From 6de00bf75fbe7d183779a82789839029889b3824 Mon Sep 17 00:00:00 2001
From: raver119
Date: Wed, 13 Nov 2019 17:15:18 +0300
Subject: [PATCH] [WIP] Weekly update of repo (#8390)

* [WIP] Fix compilation after nd4j changes (#37)
* Fix compilation.
* Some tests fixed
* Disable tests temporarily.
* Restored test
* Tests restored.
* Test restored.
* [WIP] perf tests (#40)
* special maxpool test
Signed-off-by: raver119
* special maxpool test
Signed-off-by: raver119
* Shyrma bnorm bp (#41)
Batchnorm backprop mkldnn
* Add SameDiff memory reuse memory manager (array cache) (#39)
* Attention op comments
Signed-off-by: AlexDBlack
* ArrayCacheMemoryMgr - first pass
Signed-off-by: AlexDBlack
* Tweak array cache for use with SameDiff identity arrays
Signed-off-by: AlexDBlack
* ArrayCacheMemoryMgr javadoc and properly get max memory
Signed-off-by: AlexDBlack
* LRU cache policy + add tests
Signed-off-by: AlexDBlack
* Fixes
Signed-off-by: AlexDBlack
* Resize arrays internally if required for ArrayCacheMemoryMgr
Signed-off-by: AlexDBlack
* Test improvement
Signed-off-by: AlexDBlack
* Small polish
Signed-off-by: AlexDBlack
* SameDiff op runtime benchmarking listener (#42)
Signed-off-by: AlexDBlack
* INLINE_LOOPS for windows
Signed-off-by: raver119
* [WIP] ThreadPool (#8)
This PR removes OpenMP use in 95% of cases
---
 .../CompareTrainingImplementations.java | 1 +
 .../deeplearning4j/util/ConvolutionUtils.java | 14 +-
 libnd4j/CMakeLists.txt | 22 +-
 libnd4j/CMakeLists.txt.mkldnn.in | 2 +-
 libnd4j/blas/CMakeLists.txt | 27 +-
 libnd4j/blas/Environment.cpp | 7 +-
 libnd4j/blas/NDArray.h | 2 +-
 libnd4j/blas/NDArray.hpp | 5 +-
 libnd4j/blas/NativeOpExecutioner.h | 6 +-
 libnd4j/blas/NativeOps.h | 5 +-
 libnd4j/blas/cpu/NDArray.cpp | 122 +-
 libnd4j/blas/cpu/NDArrayLambda.hpp | 191 +-
 libnd4j/blas/cpu/NativeOpExecutioner.cpp | 441 ++--
 libnd4j/blas/cpu/NativeOps.cpp | 349 +--
 libnd4j/blas/cuda/NativeOps.cu | 43 +-
 libnd4j/buildnativeoperations.sh | 38 +-
 libnd4j/include/array/DataTypeConversions.h | 37 +-
 libnd4j/include/buffer.h | 1 +
 libnd4j/include/cnpy/cnpy.h | 30 +-
 libnd4j/include/dll.h | 3 +
 libnd4j/include/execution/BlockingQueue.h | 52 +
 libnd4j/include/execution/CallableInterface.h | 94 +
 .../include/execution/CallableWithArguments.h | 92 +
 libnd4j/include/execution/ThreadPool.h | 71 +
 libnd4j/include/execution/Threads.h | 160 ++
 libnd4j/include/execution/Ticket.h | 67 +
 .../include/execution/impl/BlockingQueue.cpp | 73 +
 .../execution/impl/CallableInterface.cpp | 213 ++
 .../execution/impl/CallableWithArguments.cpp | 103 +
 libnd4j/include/execution/impl/ThreadPool.cpp | 194 ++
 libnd4j/include/execution/impl/Threads.cpp | 641 +++++
 libnd4j/include/execution/impl/Ticket.cpp | 94 +
 libnd4j/include/graph/Node.h | 1 +
 libnd4j/include/graph/impl/Graph.cpp | 3 +-
 libnd4j/include/graph/impl/Node.cpp | 69 +-
 libnd4j/include/helpers/Loops.h | 924 +++----
 libnd4j/include/helpers/TAD.h | 2 +-
 .../helpers/benchmark/MatrixBenchmark.h | 1 -
 libnd4j/include/helpers/cpu/MmulHelper.cpp | 76 +-
 .../helpers/cpu/TrueBroadcastHelper.cpp | 1 +
 .../helpers/cpu/loops/IndexReductionLoops.cpp | 266 +-
 .../helpers/cpu/loops/Reduction3Loops_0.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_1.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_2.cpp | 24 +-
 .../helpers/cpu/loops/Reduction3Loops_3.cpp | 24 +-
 .../helpers/cpu/loops/ReductionLoops.hpp | 1 +
 .../helpers/cpu/loops/ReductionLoops_bool.cpp | 12 +-
 .../cpu/loops/ReductionLoops_float_0.cpp | 13 +-
 .../cpu/loops/ReductionLoops_float_1.cpp | 13 +-
 .../cpu/loops/ReductionLoops_float_2.cpp
| 13 +- .../cpu/loops/ReductionLoops_float_3.cpp | 13 +- .../helpers/cpu/loops/ReductionLoops_long.cpp | 13 +- .../helpers/cpu/loops/ReductionLoops_same.cpp | 12 +- .../helpers/cuda/TrueBroadcastHelper.cu | 1 + .../include/helpers/impl/AttentionHelper.cpp | 10 +- libnd4j/include/helpers/impl/BlasHelper.cpp | 24 +- libnd4j/include/helpers/impl/DebugHelper.cpp | 18 +- libnd4j/include/helpers/impl/GradCheck.cpp | 2 - .../include/helpers/impl/OmpLaunchHelper.cpp | 6 +- libnd4j/include/loops/aggregates.h | 66 - libnd4j/include/loops/broadcasting.h | 19 +- libnd4j/include/loops/broadcasting_bool.h | 19 +- libnd4j/include/loops/broadcasting_int.h | 19 +- libnd4j/include/loops/cpu/broadcasting.hpp | 130 +- .../include/loops/cpu/broadcasting_bool.cpp | 117 +- .../include/loops/cpu/broadcasting_int.cpp | 137 +- libnd4j/include/loops/cpu/indexreduce.cpp | 62 +- libnd4j/include/loops/cpu/pairwise.hpp | 200 +- libnd4j/include/loops/cpu/pairwise2.hpp | 106 - libnd4j/include/loops/cpu/pairwise_bool.cpp | 201 +- libnd4j/include/loops/cpu/pairwise_int.cpp | 201 +- libnd4j/include/loops/cpu/random.cpp | 139 +- .../include/loops/cpu/reduce/reduce_bool.cpp | 104 +- .../include/loops/cpu/reduce/reduce_float.cpp | 121 +- .../include/loops/cpu/reduce/reduce_long.cpp | 117 +- .../include/loops/cpu/reduce/reduce_same.cpp | 123 +- libnd4j/include/loops/cpu/reduce3.cpp | 101 +- libnd4j/include/loops/cpu/scalar.hpp | 112 +- libnd4j/include/loops/cpu/scalar_bool.cpp | 116 +- libnd4j/include/loops/cpu/scalar_int.cpp | 118 +- .../include/loops/cpu/summarystatsreduce.cpp | 54 +- .../loops/cpu/transform/transform_any.cpp | 18 +- .../loops/cpu/transform/transform_bool.cpp | 18 +- .../loops/cpu/transform/transform_float.cpp | 16 +- .../loops/cpu/transform/transform_same.cpp | 14 +- .../loops/cpu/transform/transform_strict.cpp | 17 +- libnd4j/include/loops/cuda/aggregates.cu | 145 -- libnd4j/include/loops/cuda/broadcasting.cu | 78 - .../include/loops/cuda/broadcasting_bool.cu | 70 - .../include/loops/cuda/broadcasting_int.cu | 69 - libnd4j/include/loops/cuda/indexreduce.cu | 26 - libnd4j/include/loops/cuda/pairwise.cu | 52 - libnd4j/include/loops/cuda/pairwise_bool.cu | 57 - libnd4j/include/loops/cuda/pairwise_int.cu | 57 - libnd4j/include/loops/cuda/random.cu | 33 - libnd4j/include/loops/cuda/reduce3.chpp | 2 +- libnd4j/include/loops/cuda/reduce3.cu | 49 - libnd4j/include/loops/cuda/scalar_bool.cu | 35 - libnd4j/include/loops/cuda/scalar_int.cu | 34 - .../include/loops/cuda/summarystatsreduce.cu | 67 - .../loops/cuda/transform/transform_any.cu | 11 - .../loops/cuda/transform/transform_bool.cu | 11 - .../loops/cuda/transform/transform_float.cu | 12 - .../loops/cuda/transform/transform_same.cu | 11 - .../loops/cuda/transform/transform_strict.cu | 11 - .../include/loops/impl/type_conversions.cpp | 42 +- libnd4j/include/loops/indexreduce.h | 7 +- libnd4j/include/loops/legacy_ops.h | 3 - libnd4j/include/loops/pairwise_bool.h | 25 +- libnd4j/include/loops/pairwise_int.h | 24 +- libnd4j/include/loops/pairwise_transform.h | 22 +- libnd4j/include/loops/random.h | 3 +- libnd4j/include/loops/reduce3.h | 20 +- libnd4j/include/loops/reduce_bool.h | 13 +- libnd4j/include/loops/reduce_float.h | 13 +- libnd4j/include/loops/reduce_long.h | 13 +- libnd4j/include/loops/reduce_same.h | 14 +- libnd4j/include/loops/scalar.h | 15 +- libnd4j/include/loops/scalar_bool.h | 15 +- libnd4j/include/loops/scalar_int.h | 18 +- libnd4j/include/loops/summarystatsreduce.h | 4 +- libnd4j/include/loops/transform_any.h | 15 +- 
libnd4j/include/loops/transform_bool.h | 15 +- libnd4j/include/loops/transform_float.h | 14 +- libnd4j/include/loops/transform_same.h | 14 +- libnd4j/include/loops/transform_strict.h | 17 +- libnd4j/include/msvc.h | 39 + libnd4j/include/op_boilerplate.h | 3 +- libnd4j/include/openmp_pragmas.h | 40 +- libnd4j/include/ops/aggregate_ops.h | 996 ------- libnd4j/include/ops/declarable/BooleanOp.h | 1 - .../include/ops/declarable/BroadcastableOp.h | 1 - .../ops/declarable/DeclarableCustomOp.h | 1 - .../include/ops/declarable/DeclarableListOp.h | 3 +- libnd4j/include/ops/declarable/DeclarableOp.h | 2 +- .../ops/declarable/DeclarableReductionOp.h | 1 - libnd4j/include/ops/declarable/LegacyOp.h | 1 + libnd4j/include/ops/declarable/LogicOp.h | 1 - libnd4j/include/ops/declarable/OpTuple.h | 2 +- .../ops/declarable/generic/blas/axpy.cpp | 20 +- .../ops/declarable/generic/datatypes/cast.cpp | 8 - .../ops/declarable/generic/nn/batchnorm.cpp | 164 +- .../nn/multi_head_dot_product_attention.cpp | 21 +- .../declarable/generic/parity_ops/argmax.cpp | 2 +- .../declarable/generic/parity_ops/argmin.cpp | 2 +- .../recurrent/dynamicBidirectionalRNN.cpp | 6 +- .../generic/transforms/reverseSequence.cpp | 16 +- .../declarable/helpers/cpu/BarnesHutTsne.cpp | 39 +- .../declarable/helpers/cpu/activations.cpp | 103 +- .../ops/declarable/helpers/cpu/addBias.cpp | 65 +- .../ops/declarable/helpers/cpu/adjust_hue.cpp | 58 +- .../helpers/cpu/adjust_saturation.cpp | 62 +- .../declarable/helpers/cpu/batched_gemm.cpp | 34 +- .../ops/declarable/helpers/cpu/batchnorm.cpp | 20 +- .../ops/declarable/helpers/cpu/betaInc.cpp | 12 +- .../ops/declarable/helpers/cpu/col2im.cpp | 90 +- .../declarable/helpers/cpu/compare_elem.cpp | 43 +- .../ops/declarable/helpers/cpu/confusion.cpp | 18 +- .../declarable/helpers/cpu/convolutions.cpp | 1419 +++++----- .../ops/declarable/helpers/cpu/cross.cpp | 17 +- .../ops/declarable/helpers/cpu/d_t_s.cpp | 67 +- .../ops/declarable/helpers/cpu/diag.cpp | 1 - .../ops/declarable/helpers/cpu/dilation2d.cpp | 40 +- .../ops/declarable/helpers/cpu/dropout.cpp | 34 +- .../ops/declarable/helpers/cpu/dynamic.cpp | 39 +- .../helpers/cpu/extract_patches.cpp | 65 +- .../ops/declarable/helpers/cpu/gather.cpp | 33 +- .../ops/declarable/helpers/cpu/hamming.cpp | 47 +- .../ops/declarable/helpers/cpu/hashcode.cpp | 45 +- .../helpers/cpu/histogramFixedWidth.cpp | 20 +- .../ops/declarable/helpers/cpu/im2col.cpp | 76 +- .../declarable/helpers/cpu/image_resize.cpp | 149 +- .../helpers/cpu/image_suppression.cpp | 3 +- .../ops/declarable/helpers/cpu/ismax.cpp | 15 +- .../declarable/helpers/cpu/legacy_helper.cpp | 1 + .../ops/declarable/helpers/cpu/lrn.cpp | 378 +-- .../ops/declarable/helpers/cpu/lstm.cpp | 14 +- .../declarable/helpers/cpu/matrixSetDiag.cpp | 29 +- .../helpers/cpu/matrix_diag_part.cpp | 13 +- .../declarable/helpers/cpu/nth_element.cpp | 14 +- .../ops/declarable/helpers/cpu/one_hot.cpp | 63 +- .../ops/declarable/helpers/cpu/percentile.cpp | 2 +- .../ops/declarable/helpers/cpu/polyGamma.cpp | 11 +- .../ops/declarable/helpers/cpu/range.cpp | 10 +- .../ops/declarable/helpers/cpu/reverse.cpp | 117 +- .../ops/declarable/helpers/cpu/s_t_b.cpp | 112 +- .../ops/declarable/helpers/cpu/s_t_d.cpp | 73 +- .../ops/declarable/helpers/cpu/scatter.cpp | 105 +- .../ops/declarable/helpers/cpu/segment.cpp | 323 ++- .../declarable/helpers/cpu/sequence_mask.cpp | 14 +- .../ops/declarable/helpers/cpu/sg_cb.cpp | 310 +-- .../ops/declarable/helpers/cpu/sru.cpp | 206 +- .../ops/declarable/helpers/cpu/stack.cpp | 18 +- 
.../ops/declarable/helpers/cpu/top_k.cpp | 25 +- .../ops/declarable/helpers/cpu/transforms.cpp | 519 ++-- .../ops/declarable/helpers/cpu/zeta.cpp | 10 +- .../include/ops/declarable/helpers/cross.h | 18 +- .../ops/declarable/helpers/cuda/col2im.cppc | 138 - .../ops/declarable/helpers/cuda/im2col.cppc | 129 - .../declarable/helpers/cuda/legacy/relu.cu | 1 + .../declarable/helpers/cuda/legacy/tanh.cu | 1 + .../declarable/helpers/cuda/legacy_helper.cu | 1 + .../ops/declarable/helpers/cuda/transforms.cu | 3 +- .../include/ops/declarable/helpers/helpers.h | 1 + .../ops/declarable/helpers/impl/choose.cpp | 1 + .../ops/declarable/helpers/impl/unique.cpp | 15 +- .../include/ops/declarable/helpers/matmul.h | 1 - .../include/ops/declarable/impl/BooleanOp.cpp | 4 - .../ops/declarable/impl/BroadcastableOp.cpp | 4 - .../declarable/impl/DeclarableCustomOp.cpp | 4 - .../ops/declarable/impl/DeclarableListOp.cpp | 4 - .../declarable/impl/DeclarableReductionOp.cpp | 8 +- .../ops/declarable/impl/LegacyReduce3Op.cpp | 5 +- .../declarable/impl/LegacyReduceBoolOp.cpp | 5 +- .../declarable/impl/LegacyReduceFloatOp.cpp | 5 +- .../declarable/impl/LegacyReduceLongOp.cpp | 5 +- .../declarable/impl/LegacyReduceSameOp.cpp | 3 +- .../ops/declarable/impl/LegacyStatsOp.cpp | 5 +- .../declarable/platform/mkldnn/batchnorm.cpp | 130 +- .../ops/declarable/platform/mkldnn/conv3d.cpp | 3 + libnd4j/include/ops/impl/gemm.cpp | 87 +- libnd4j/include/ops/impl/specials.cpp | 252 +- libnd4j/include/ops/ops.h | 36 - .../include/ops/special_accumulation_ops.h | 213 -- libnd4j/include/ops/special_ops.h | 2293 ----------------- libnd4j/include/ops/special_random_ops.h | 176 +- libnd4j/include/ops/specials.h | 7 +- .../benchmarking/impl/FullBenchmarkSuit.cpp | 3 +- .../benchmarking/impl/LightBenchmarkSuit.cpp | 19 +- libnd4j/include/pointercast.h | 1 + libnd4j/include/templatemath.h | 43 +- libnd4j/pom.xml | 2 + .../layers_tests/BooleanOpsTests.cpp | 2 +- .../layers_tests/BroadcastableOpsTests.cpp | 8 +- .../tests_cpu/layers_tests/BrodcastTests.cpp | 2 +- libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 21 +- .../layers_tests/ConditionalTests.cpp | 1 - .../layers_tests/ConstantShapeHelperTests.cpp | 4 +- .../layers_tests/ConvolutionTests1.cpp | 180 +- .../layers_tests/DataTypesValidationTests.cpp | 4 +- .../layers_tests/DeclarableOpsTests1.cpp | 53 +- .../layers_tests/DeclarableOpsTests10.cpp | 42 +- .../layers_tests/DeclarableOpsTests11.cpp | 11 - .../layers_tests/DeclarableOpsTests12.cpp | 18 - .../layers_tests/DeclarableOpsTests13.cpp | 1 - .../layers_tests/DeclarableOpsTests14.cpp | 12 +- .../layers_tests/DeclarableOpsTests15.cpp | 1 - .../layers_tests/DeclarableOpsTests16.cpp | 11 + .../layers_tests/DeclarableOpsTests2.cpp | 2 - .../layers_tests/DeclarableOpsTests4.cpp | 8 - .../layers_tests/DeclarableOpsTests5.cpp | 72 +- .../layers_tests/DeclarableOpsTests6.cpp | 49 +- .../layers_tests/DeclarableOpsTests7.cpp | 83 +- .../layers_tests/DeclarableOpsTests8.cpp | 1008 ++++---- .../layers_tests/DeclarableOpsTests9.cpp | 115 +- libnd4j/tests_cpu/layers_tests/EmptyTests.cpp | 3 - .../tests_cpu/layers_tests/HelpersTests1.cpp | 5 +- .../tests_cpu/layers_tests/IndexingTests.cpp | 5 - .../layers_tests/JavaInteropCudaTests.cu | 2 - .../layers_tests/JavaInteropTests.cpp | 25 +- libnd4j/tests_cpu/layers_tests/LambdaTests.cu | 9 - .../tests_cpu/layers_tests/LegacyOpsTests.cpp | 24 +- .../layers_tests/NDArrayCudaBasicsTests.cu | 42 +- .../tests_cpu/layers_tests/NDArrayTests.cpp | 5 - .../tests_cpu/layers_tests/NDArrayTests2.cpp | 19 - 
.../tests_cpu/layers_tests/NativeOpsTests.cpp | 7 +- .../layers_tests/OmpLaunchHelperTests.cpp | 28 - libnd4j/tests_cpu/layers_tests/OpsArena.cpp | 200 -- .../tests_cpu/layers_tests/ParityOpsTests.cpp | 6 +- .../layers_tests/PerformanceTests.cpp | 95 + .../layers_tests/PlaygroundTests.cpp | 191 +- libnd4j/tests_cpu/layers_tests/RNGTests.cpp | 1 - .../tests_cpu/layers_tests/ReduceTests.cpp | 6 +- .../tests_cpu/layers_tests/ShapeTests2.cpp | 1 - libnd4j/tests_cpu/layers_tests/TadTests.cpp | 7 - .../tests_cpu/layers_tests/ThreadsTests.cpp | 233 ++ .../tests_cpu/layers_tests/WorkspaceTests.cpp | 2 - .../tests_cpu/libnd4j_tests/CMakeLists.txt | 6 +- libnd4j/tests_cpu/run_tests.sh | 25 +- .../functions/DifferentialFunction.java | 2 +- .../debugging/OpBenchmarkListener.java | 189 ++ .../samediff/internal/InferenceSession.java | 42 +- .../internal/memory/ArrayCacheMemoryMgr.java | 292 +++ .../api/ops/impl/reduce3/EqualsWithEps.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCuda.java | 2 +- .../cpu/nativecpu/CpuMemoryManager.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 66 +- .../nd4j/autodiff/samediff/MemoryMgrTest.java | 119 + nd4s/build.sbt | 2 +- nd4s/src/main/scala/org/nd4s/Implicits.scala | 2 +- .../org/nd4s/samediff/ConstructionTest.scala | 6 +- .../scala/org/nd4s/samediff/MathTest.scala | 14 +- .../org/nd4s/samediff/SameDiffTest.scala | 21 +- 293 files changed, 9700 insertions(+), 12064 deletions(-) create mode 100644 libnd4j/include/execution/BlockingQueue.h create mode 100644 libnd4j/include/execution/CallableInterface.h create mode 100644 libnd4j/include/execution/CallableWithArguments.h create mode 100644 libnd4j/include/execution/ThreadPool.h create mode 100644 libnd4j/include/execution/Threads.h create mode 100644 libnd4j/include/execution/Ticket.h create mode 100644 libnd4j/include/execution/impl/BlockingQueue.cpp create mode 100644 libnd4j/include/execution/impl/CallableInterface.cpp create mode 100644 libnd4j/include/execution/impl/CallableWithArguments.cpp create mode 100644 libnd4j/include/execution/impl/ThreadPool.cpp create mode 100644 libnd4j/include/execution/impl/Threads.cpp create mode 100644 libnd4j/include/execution/impl/Ticket.cpp delete mode 100644 libnd4j/include/loops/aggregates.h delete mode 100644 libnd4j/include/loops/cpu/pairwise2.hpp delete mode 100644 libnd4j/include/loops/cuda/aggregates.cu create mode 100644 libnd4j/include/msvc.h delete mode 100644 libnd4j/include/ops/aggregate_ops.h delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc delete mode 100644 libnd4j/include/ops/special_accumulation_ops.h delete mode 100644 libnd4j/include/ops/special_ops.h delete mode 100644 libnd4j/tests_cpu/layers_tests/OpsArena.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java create mode 100644 nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java 
b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java index 12564f01a..fa0fc335f 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java @@ -98,6 +98,7 @@ public class CompareTrainingImplementations extends BaseDL4JTest { SDVariable diff = sd.f().squaredDifference(a1, label); SDVariable lossMse = diff.mean(); + lossMse.markAsLoss(); IUpdater updater; double lr; diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java index d5c8ee1f6..56421bc00 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java @@ -35,6 +35,7 @@ import org.nd4j.linalg.api.ops.Op; import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastCopyOp; import org.nd4j.linalg.api.ops.impl.layers.convolution.MaxPooling2D; import org.nd4j.linalg.api.ops.impl.layers.convolution.config.Pooling2DConfig; +import org.nd4j.linalg.api.ops.impl.transforms.custom.Assign; import org.nd4j.linalg.api.shape.Shape; import org.nd4j.linalg.exception.ND4JArraySizeException; import org.nd4j.linalg.factory.NDArrayFactory; @@ -482,23 +483,12 @@ public class ConvolutionUtils { return reshape5dTo2d(format, mask, workspaceMgr, type); } else { //Need to broadcast first - IntArrayList broadcastDims = new IntArrayList(); - for(int i=0; i) endif() + #if(WIN32) + # message("CPU on Windows: enabling /EHsc") + # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") + # SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14") + #endif() + # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. 
same applies to BLAS + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() target_link_libraries(${LIBND4J_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) if ("${LIBND4J_ALL_OPS}" AND "${LIBND4J_BUILD_MINIFIER}") diff --git a/libnd4j/blas/Environment.cpp b/libnd4j/blas/Environment.cpp index 0c23f61be..90c391cf1 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/blas/Environment.cpp @@ -24,6 +24,8 @@ #include #include "Environment.h" #include +#include +#include #ifdef _OPENMP @@ -49,6 +51,7 @@ namespace nd4j { _precBoost.store(false); _leaks.store(false); _dataType.store(nd4j::DataType::FLOAT32); + _maxThreads = std::thread::hardware_concurrency(); #ifndef ANDROID const char* omp_threads = std::getenv("OMP_NUM_THREADS"); @@ -86,9 +89,7 @@ namespace nd4j { cudaSetDevice(0); delete[] devProperties; #else -#ifdef _OPENMP - omp_set_nested(1); -#endif + #endif } diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 10847f882..de2488f9d 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1678,7 +1679,6 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// size_t NDArray::sizeOfT() const { - return DataTypeUtils::sizeOfElement(_dataType); } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 2a601033a..c4a631cf5 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -2478,7 +2478,6 @@ double NDArray::getTrace() const { double sum = 0.; -PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < minDim; ++i) sum += e(i * offset); @@ -3275,7 +3274,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { // regular numeric types NDArray tmp(nd4j::DataType::FLOAT32, getContext()); // scalar = 0 - ExtraArguments extras({eps}); + ExtraArguments extras({0.0, 0.0, eps}); NDArray::prepareSpecialUse({&tmp}, {this, other}); NativeOpExecutioner::execReduce3Scalar(getContext(), reduce3::EqualsWithEps, getBuffer(), getShapeInfo(), @@ -3288,7 +3287,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { synchronize("NDArray::equalsTo"); - if (tmp.e(0) > 0) + if (tmp.e(0) != 0) return false; return true; diff --git a/libnd4j/blas/NativeOpExecutioner.h b/libnd4j/blas/NativeOpExecutioner.h index cae7a4e56..fb2ca58f0 100644 --- a/libnd4j/blas/NativeOpExecutioner.h +++ b/libnd4j/blas/NativeOpExecutioner.h @@ -24,10 +24,10 @@ #include #include -#include #include #include #include +#include /** * Native op executioner: @@ -624,10 +624,6 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *vrealArguments, int numRealArguments) { - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - - functions::aggregate::AggregatedFunction::exec(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); } diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index b2679f537..b10b3807a 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -55,7 +55,6 @@ #define ND4J_EXPORT #endif #include -#include /* int tad_threshold = 1; @@ -1430,7 +1429,11 @@ static const char* getNpyArrayNameFromMap(void *map, int index){ for(; it != end; ++it, ++cnt){ if (cnt == index){ // FIXME: @fariz, 
this is a leak! +#ifdef _MSC_VER + return const_cast(_strdup(it->first.c_str())); +#else return const_cast(strdup(it->first.c_str())); +#endif } } throw std::runtime_error("No array at index."); diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 03c7c53e1..dc9d09231 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -98,24 +98,27 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target->getShapeInfo()); - std::vector coords(zRank); - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, target->getShapeInfo(), coords); + const auto zOffset = shape::getOffset(target->getShapeInfo(), coords); - shape::index2coords(i, target->getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data()); + // if( (row + upper < col) || (row + lower > col) ) + if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) + z[zOffset] = value; + else if (this != target) { // when this and target are different arrays + if (xRank != zRank) + coords[0] = coords[1]; - // if( (row + upper < col) || (row + lower > col) ) - if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) - z[zOffset] = value; - else if(this != target) { // when this and target are different arrays - if(xRank != zRank) - coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(getShapeInfo(), coords); + z[zOffset] = x[xOffset]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void NDArray::fillAsTriangular, (const float val, int lower, int upper, const char direction, NDArray* target), LIBND4J_TYPES); @@ -140,7 +143,7 @@ void NDArray::setIdentity() { minDim = shape[i]; float v = 1.0f; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + for(int i = 0; i < minDim; ++i) templatedSet(buffer(), i*offset, this->dataType(), &v); } @@ -151,12 +154,15 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { auto x = reinterpret_cast(xBuffer); auto y = reinterpret_cast(yBuffer); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static)) - for (Nd4jLong i = 0; i < length; ++i) { - auto temp = x[i]; - x[i] = y[i]; - y[i] = temp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = x[i]; + x[i] = y[i]; + y[i] = temp; + } + }; + + samediff::Threads::parallel_for(func, 0, length); } BUILD_SINGLE_TEMPLATE(template void templatedSwap, (void *xBuffer, void *yBuffer, Nd4jLong length), LIBND4J_TYPES); @@ -262,21 +268,26 @@ NDArray NDArray::tile(const std::vector& reps) const { auto xType = this->dataType(); if(result.ordering() == 'c') { // ews == 1 always here - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < resultLen; ++i) { - auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); - BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign, (result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; - } + samediff::Threads::parallel_for(func, 0, resultLen); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i=0; itemplate templatedAssign, (result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = result.getOffset(i); + auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; + + samediff::Threads::parallel_for(func, 0, resultLen); } result.tickWriteHost(); return result; @@ -337,14 +348,7 @@ void NDArray::tile(NDArray& target) const { // looping through _buffer goes automatically by means of getSubArrayIndex applying const auto ews = target.ews(); const auto targetLen = target.lengthOf(); - if(target.ordering() == 'c' && ews == 1) { // ews == 1 always here - - for (Nd4jLong i = 0; i < targetLen; ++i) { - auto yOffset = shape::subArrayOffset(i, target.getShapeInfo(), getShapeInfo()); - BUILD_DOUBLE_SELECTOR(target.dataType(), dataType(), templatedDoubleAssign, (target.getBuffer(), i, getBuffer(), yOffset), LIBND4J_TYPES, LIBND4J_TYPES); - } - } - else if(target.ordering() == 'c' && ews > 1) { + if(target.ordering() == 'c' && ews >= 1) { for(Nd4jLong i=0; i coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong 
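// --------------------------------------------------------------------------
// A minimal, self-contained sketch of the loop-parallelization pattern used in
// the conversions above: PRAGMA_THREADS_FOR wraps the loop body in a lambda
// taking (thread_id, start, stop, increment), and samediff::Threads::parallel_for
// splits [start, stop) into per-thread chunks. The real implementation lives in
// libnd4j/include/execution/Threads.h and the ThreadPool added by this PR; the
// helper below only illustrates the idea with std::thread, and its name,
// signature and chunking policy are assumptions, not the library's actual API.
#include <thread>
#include <vector>
#include <functional>
#include <cstdint>
#include <algorithm>

static void sketch_parallel_for(const std::function<void(int, int64_t, int64_t, int64_t)> &func,
                                int64_t start, int64_t stop, int64_t increment = 1,
                                int numThreads = (int) std::thread::hardware_concurrency()) {
    const int64_t span = stop - start;
    // never launch more threads than there are loop iterations, and at least one
    numThreads = (int) std::max<int64_t>(1, std::min<int64_t>(numThreads, span));
    const int64_t chunk = span / numThreads;

    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        const int64_t s = start + t * chunk;
        const int64_t e = (t == numThreads - 1) ? stop : s + chunk;  // last thread takes the remainder
        workers.emplace_back(func, t, s, e, increment);
    }
    for (auto &w : workers)
        w.join();
}

// Usage, mirroring the templatedSwap conversion above:
//   auto func = [&](int thread_id, int64_t start, int64_t stop, int64_t increment) {
//       for (auto i = start; i < stop; i += increment)
//           std::swap(x[i], y[i]);
//   };
//   sketch_parallel_for(func, 0, length);
// --------------------------------------------------------------------------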
coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); - - if(repSize > 1) { - for (uint j = 0; j < repSize; ++j) { - coords[axis] -= repeats[j]; - if (coords[axis] < 0) { - coords[axis] = j; - break; + if (repSize > 1) { + for (uint j = 0; j < repSize; ++j) { + coords[axis] -= repeats[j]; + if (coords[axis] < 0) { + coords[axis] = j; + break; + } } - } - } - else - coords[axis] /= repeats[0]; + } else + coords[axis] /= repeats[0]; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/blas/cpu/NDArrayLambda.hpp b/libnd4j/blas/cpu/NDArrayLambda.hpp index ecf2aa9ed..6ce8e6823 100644 --- a/libnd4j/blas/cpu/NDArrayLambda.hpp +++ b/libnd4j/blas/cpu/NDArrayLambda.hpp @@ -32,33 +32,40 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std:: if (this->ordering() == second->ordering() && this->ordering() == third->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == second->ews() && this->ews() == third->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e], t[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e], t[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = third->getOffset(e); + f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); + auto zOffset = target->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = third->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -103,31 +110,38 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e]); + }; + + 
samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func(f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -161,29 +175,36 @@ void NDArray::applyLambda(const std::function& func, NDArray* target) { if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) - z[e] = func(f[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(f[xOffset]); + } + }; - f[xOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset]); + } + }; - z[zOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -217,29 +238,36 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(e, f[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(e, f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(e, f[xOffset]); + } + }; - f[xOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(e, 
f[xOffset]); + } + }; - z[zOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -282,31 +310,38 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func((Nd4jLong) e, f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func((Nd4jLong) e, f[e], s[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index 22fd9eca4..dc27c1cce 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -20,6 +20,8 @@ #include "NativeOpExecutioner.h" #include +#include + #include #include #include @@ -50,11 +52,14 @@ #include #include #include +#include +#include #ifdef _OPENMP #include +#include #endif @@ -78,9 +83,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int op void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -111,9 +114,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -149,9 +150,7 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -160,7 +159,16 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, 
tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -179,9 +187,7 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); -#ifdef _OPENMP - omp_set_nested(1); -#endif + if (!nd4j::Environment::getInstance()->isExperimentalBuild()) if ((yType != xType && yType != nd4j::DataType::BOOL) || xType != zType) @@ -190,7 +196,15 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -208,15 +222,21 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void 
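// --------------------------------------------------------------------------
// Why numTads = xLen / yLen in the broadcast paths above: broadcasting y over x
// along the chosen dimensions splits x into sub-arrays (TADs), each with y's
// shape, and samediff::Threads::parallel_tad hands whole TADs to threads rather
// than single elements. A small worked example (the shapes are illustrative only):
#include <cstdint>
#include <cassert>

int main() {
    const int64_t xLen = 32 * 128;  // x: [32, 128]
    const int64_t yLen = 128;       // y: [128], broadcast along dimension 1
    const int64_t numTads = xLen / yLen;
    assert(numTads == 32);          // 32 rows, each processed as one TAD
    // parallel_tad(func, 0, numTads) then gives each thread a contiguous range
    // of rows; the inverse-broadcast variants swap the roles of x and y, which
    // is why they compute numTads = yLen / xLen instead.
    return 0;
}
// --------------------------------------------------------------------------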
NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, @@ -231,9 +251,7 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -243,7 +261,15 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, if (yType != xType || nd4j::DataType::BOOL != zType) throw nd4j::datatype_exception::build("NativeOps::execInverseBroadcastBool both operands must have same data type", xType, yType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -260,9 +286,7 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -274,7 +298,15 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, @@ -289,21 +321,27 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform", zType, xType, yType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt", zType, xType, yType); if 
(!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt,::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } //////////////////////////////////////////////////////////////////////// @@ -328,9 +366,7 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -339,7 +375,15 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, + ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), + LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -353,9 +397,7 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -367,7 +409,13 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, 
nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -380,9 +428,7 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -394,7 +440,13 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execSPairwiseInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -417,14 +469,22 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
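// --------------------------------------------------------------------------
// The thread-count heuristic used in the pairwise/scalar paths above: request
// roughly one thread per 1024 output elements, clamped to [1, maxThreads], so
// small arrays stay single-threaded and large ones do not oversubscribe the
// pool. (The reduce paths make a similar choice: when LoopKind deduces
// SMALLARR2DX they fall back to a single thread.) A standalone sketch of the
// arithmetic; the 1024 granularity is taken from the calls above, everything
// else here is illustrative:
#include <cstdint>
#include <algorithm>
#include <cstdio>

static int threadsFor(int64_t zLen, int maxThreads) {
    return (int) std::max<int64_t>(1, std::min<int64_t>(zLen / 1024, maxThreads));
}

int main() {
    std::printf("%d\n", threadsFor(500, 16));      // 1  -> below one chunk, run serially
    std::printf("%d\n", threadsFor(8192, 16));     // 8  -> 8 chunks of 1024 elements
    std::printf("%d\n", threadsFor(1000000, 16));  // 16 -> capped at maxThreads
    return 0;
}
// --------------------------------------------------------------------------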
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -437,14 +497,22 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -457,14 +525,22 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -477,14 +553,22 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -503,9 +587,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -521,9 +603,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); @@ -539,9 +619,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -557,9 +635,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -591,10 +667,6 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -623,15 +695,13 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, 
hYShapeInfo, hZ, hZShapeInfo, nullptr, 1), LIBND4J_TYPES, FLOAT_TYPES); - + //BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, nullptr, 0), LIBND4J_TYPES, FLOAT_TYPES); + NativeOpExecutioner::execReduce3Scalar(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo); } //////////////////////////////////////////////////////////////////////// @@ -647,14 +717,31 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadOnlyShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -671,15 +758,19 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + + // TODO: make it 2d + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } //////////////////////////////////////////////////////////////////////// @@ -696,15 +787,31 @@ void 
NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -729,9 +836,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, void *hScalar, Nd4jLong *hScalarShapeInfo, void *dScalar, Nd4jLong *dScalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -743,7 +848,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, allowParallelism), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform,::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -760,9 +871,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -774,7 +883,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); + #endif } @@ -789,9 +904,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -803,7 +916,13 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
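// Editor's note: the execScalar / execScalarBool / execScalarInt hunks above all cap
// parallelism at roughly one thread per 1024 output elements, clamped to
// [1, maxThreads], and force a single thread when allowParallelism is false. A minimal
// sketch of that heuristic as a standalone helper; the helper name is hypothetical and
// not part of the patch.
#include <algorithm>
#include <cstdint>

static int pickThreadCount(int64_t zLen, int maxThreads, bool allowParallelism) {
    if (!allowParallelism)
        return 1;
    return static_cast<int>(std::max<int64_t>(1, std::min<int64_t>(zLen / 1024, maxThreads)));
}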
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -819,9 +938,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -833,7 +950,12 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -847,9 +969,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -861,7 +981,13 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", nd4j::DataType::INT32, zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -877,9 +1003,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -891,7 +1015,12 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -912,9 +1041,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -940,9 +1067,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -972,10 +1097,6 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1002,14 +1123,14 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, 
nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1021,14 +1142,14 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, BOOL_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1040,14 +1161,14 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), LIBND4J_TYPES, LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1059,14 +1180,14 @@ void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1078,14 +1199,14 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - 
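// Editor's note: unlike the reduce and scalar hunks, the execTransform* hunks above use
// a PRAGMA_THREADS_DO closure: each worker receives only (thread_id, numThreads) and the
// transform kernel partitions the elements internally instead of being handed a
// precomputed [start, stop) span. The sketch below shows that shape with std::thread; the
// strided partitioning is one plausible scheme and not necessarily the exact one the
// Transform* kernels use.
#include <thread>
#include <vector>
#include <cstdint>

static void transformChunk(uint64_t threadId, uint64_t numThreads, const float *x, float *z, int64_t n) {
    // each worker touches every numThreads-th element, starting at its own id
    for (int64_t i = static_cast<int64_t>(threadId); i < n; i += static_cast<int64_t>(numThreads))
        z[i] = x[i] * 2.0f;   // placeholder element-wise op
}

static void parallelDoSketch(const float *x, float *z, int64_t n, uint64_t numThreads) {
    std::vector<std::thread> workers;
    for (uint64_t t = 0; t < numThreads; t++)
        workers.emplace_back(transformChunk, t, numThreads, x, z, n);
    for (auto &w : workers)
        w.join();
}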
BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), FLOAT_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1095,9 +1216,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1116,9 +1235,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1139,9 +1256,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index 7449bb022..151f5c883 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -36,6 +35,7 @@ #include #include #include +#include #include @@ -75,6 +75,7 @@ bool experimentalSupport = false; #include #include #include +#include #ifdef CPU_FEATURES #include @@ -1152,10 +1153,7 @@ void initializeFunctions(Nd4jPointer *functions) { * @param flags optional parameter */ Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) { - Nd4jPointer pointer = (Nd4jPointer) malloc(memorySize); - if (pointer == 0) - return 0L; - return pointer; + return reinterpret_cast(new int8_t[memorySize]); } /** @@ -1179,7 +1177,7 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) { * @param pointer pointer that'll be freed */ int freeHost(Nd4jPointer pointer) { - free(reinterpret_cast(pointer)); + delete[] reinterpret_cast(pointer); return 1L; } @@ -1364,37 +1362,37 @@ void pullRowsGeneric(void *vx, int elementsPerThread = n / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int idx = 0; idx < n; idx++) { - auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; - auto zTadOffsetForBlock = zTadOffsets[idx]; + auto func = PRAGMA_THREADS_FOR { + for (auto idx = start; idx < stop; idx += increment) { + auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; + auto zTadOffsetForBlock = zTadOffsets[idx]; - auto rX = hX + xTadOffsetForBlock; - auto rZ = hZ + zTadOffsetForBlock; + auto rX = hX + xTadOffsetForBlock; + auto rZ = hZ + zTadOffsetForBlock; - if (xEWS == 1 && zEWS == 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { - rZ[i] = rX[i]; - } - } else if (xEWS >= 1 && zEWS >= 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { 
- rZ[i * zEWS] = rX[i * xEWS]; + if (xEWS == 1 && zEWS == 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i] = rX[i]; + } + } else if (xEWS >= 1 && zEWS >= 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i * zEWS] = rX[i * xEWS]; + } + } else { + for (int i = 0; i < tadLength; i++) { + auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo); + auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); + hZ[zOffset] = hX[xOffset]; + } } } - else { - for (int i = 0; i < tadLength; i++) { - auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo); - auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); - hZ[zOffset] = hX[xOffset]; - } - } - } + }; + + samediff::Threads::parallel_tad(func, 0, n, 1, _threads); } void pullRows(Nd4jPointer *extraPointers, @@ -1433,30 +1431,29 @@ void tearGeneric(void *vx, auto zEWS = shape::elementWiseStride(hZShapeInfo); auto numTads = shape::length(hXShapeInfo) / tadLength; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < numTads; i++) { - auto hZ = reinterpret_cast(targets[i]); - auto s = hX + tadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto hZ = reinterpret_cast(targets[i]); + auto s = hX + tadOffsets[i]; - if (zEWS == 1 && tadEWS == 1) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j] = s[j]; - } - } else if (zEWS > 0 && tadEWS > 0) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j * zEWS] = s[j * tadEWS]; + if (zEWS == 1 && tadEWS == 1) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j] = s[j]; + } + } else if (zEWS > 0 && tadEWS > 0) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j * zEWS] = s[j * tadEWS]; + } + } else { + for (Nd4jLong j = 0; j < tadLength; j++) + hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; } } - else { + }; - for (Nd4jLong j = 0; j < tadLength; j++) - hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; - } - } + samediff::Threads::parallel_tad(func,0, numTads); } void tear(Nd4jPointer *extraPointers, @@ -1557,57 +1554,60 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS auto dX = reinterpret_cast(hX); auto dZ = reinterpret_cast(dz); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(N) - for (int f = 0; f < N; f++) { - auto hX = reinterpret_cast(dX[f]); - //auto hZ = reinterpret_cast(dZ[f]); + auto func = PRAGMA_THREADS_FOR { + for (auto f = start; f < stop; f += increment) { + auto hX = reinterpret_cast(dX[f]); + //auto hZ = reinterpret_cast(dZ[f]); - auto xShapeInfo = hXShapeInfo[f]; - auto tadOffset = reinterpret_cast(tadOffsets[f]); + auto xShapeInfo = hXShapeInfo[f]; + auto tadOffset = reinterpret_cast(tadOffsets[f]); - const auto tadLength = shape::length(tadOnlyShapeInfo[f]); - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); - auto tadRank = shape::rank(tadOnlyShapeInfo[f]); - auto numTads = shape::length(hXShapeInfo[f]) / tadLength; + const auto tadLength = shape::length(tadOnlyShapeInfo[f]); + auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); + auto tadRank = shape::rank(tadOnlyShapeInfo[f]); + auto numTads = shape::length(hXShapeInfo[f]) / tadLength; - auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); - auto tadStride = shape::stride(tadOnlyShapeInfo[f]); + auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); + auto tadStride = 
shape::stride(tadOnlyShapeInfo[f]); - if (shape::rank(xShapeInfo) == 1) { - auto xLength = shape::length(xShapeInfo); - auto ews = shape::elementWiseStride(xShapeInfo); - for (Nd4jLong r = 0; r < xLength; r++) { - auto swapIdx = shuffleMap[r]; - if (swapIdx < 0) - continue; + if (shape::rank(xShapeInfo) == 1) { + auto xLength = shape::length(xShapeInfo); + auto ews = shape::elementWiseStride(xShapeInfo); + for (Nd4jLong r = 0; r < xLength; r++) { + auto swapIdx = shuffleMap[r]; + if (swapIdx < 0) + continue; - nd4j::math::nd4j_swap(hX[r*ews], hX[swapIdx*ews]); - } - } else { - for (Nd4jLong r = 0; r < numTads; r++) { - if (shuffleMap[r] < 0) - continue; + nd4j::math::nd4j_swap(hX[r * ews], hX[swapIdx * ews]); + } + } else { + for (Nd4jLong r = 0; r < numTads; r++) { + if (shuffleMap[r] < 0) + continue; - auto oldOffset = tadOffset[r]; - auto newOffset = tadOffset[shuffleMap[r]]; + auto oldOffset = tadOffset[r]; + auto newOffset = tadOffset[shuffleMap[r]]; - auto rX = hX + oldOffset; - auto rY = hX + newOffset; + auto rX = hX + oldOffset; + auto rY = hX + newOffset; - if (tadEWS == 1) { - for (Nd4jLong i = 0; i < tadLength; i++) { - nd4j::math::nd4j_swap(rX[i], rY[i]); - } - } else { - for (Nd4jLong i = 0; i < tadLength; i++) { - auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); - nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + if (tadEWS == 1) { + for (Nd4jLong i = 0; i < tadLength; i++) { + nd4j::math::nd4j_swap(rX[i], rY[i]); + } + } else { + for (Nd4jLong i = 0; i < tadLength; i++) { + auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); + nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + } } } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void shuffle(Nd4jPointer *extras, @@ -1772,72 +1772,9 @@ void execAggregate(Nd4jPointer *extraPointers,int opNum, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, NativeOpExecutioner::execAggregate, (nullptr, opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } -template -void _batchExecutor(Nd4jPointer *extraPointers, - int numAggregates, - int opNum, - int maxArgs, - int maxShapes, - int maxIntArrays, - int maxIntArraySize, - int maxIdx, - int maxReals, - void *ptrToArguments, - nd4j::DataType dtype) { - // probably, we don't want too much threads as usually - int _threads = nd4j::math::nd4j_min(numAggregates, omp_get_max_threads()); - - nd4j::PointersHelper helper(ptrToArguments, - numAggregates, - maxArgs, - maxShapes, - maxIntArrays, - maxIntArraySize, - maxIdx, - maxReals); - - // special case here, we prefer spread arrangement here, all threads are detached from each other - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int i = 0; i < numAggregates; i++) { - auto intArrays = new int *[maxIntArrays]; - - auto arguments = helper.getArguments(i); - auto shapes = helper.getShapeArguments(i); - auto idxArg = helper.getIndexArguments(i); - auto realArg = helper.getRealArguments(i); - - for (int e = 0; e < maxIntArrays; e++) { - intArrays[e] = helper.getIntArrayArguments(i, e); - } - - execAggregate(extraPointers, - opNum, - reinterpret_cast(arguments), - helper.getNumArguments(i), - shapes, - 
helper.getNumShapeArguments(i), - idxArg, - helper.getNumIndexArguments(i), - intArrays, - helper.getNumIntArrayArguments(i), - realArg, - helper.getNumRealArguments(i), - dtype); - - delete [] intArrays; - } -} -BUILD_SINGLE_TEMPLATE(template void _batchExecutor, (Nd4jPointer *extraPointers, int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype), FLOAT_TYPES); - void batchExecutor(Nd4jPointer *extraPointers, int numAggregates, int opNum, @@ -1849,12 +1786,7 @@ void batchExecutor(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } void execAggregateBatch(Nd4jPointer *extraPointers, @@ -1868,12 +1800,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } @@ -2094,27 +2021,21 @@ const char* getAllCustomOps() { template FORCEINLINE int estimateThresholdGeneric(Nd4jPointer *extraPointers, Nd4jPointer hX, int N, T threshold) { auto buffer = reinterpret_cast(hX); - int span = (N / 6) + 8; - int cnt = 0; - - PRAGMA_OMP_PARALLEL_REDUCTION(+:cnt) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int stop = span * (tid + 1); - if (stop > N) - stop = N; + auto func = PRAGMA_REDUCE_LONG { + int64_t cnt = 0; PRAGMA_OMP_SIMD - for (int e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { auto v = nd4j::math::nd4j_abs(buffer[e]); if (v >= threshold) cnt++; } - } - return cnt; + return cnt; + }; + + return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N); } @@ -2776,58 +2697,51 @@ static void _scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSub void* vIindexes, Nd4jLong* hIndicesShapeInfo, void* dIindexes, Nd4jLong* dIndicesShapeInfo) { auto hIindexes = reinterpret_cast(vIindexes); - - int numThreads = omp_get_max_threads(); - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { - for (int i = 0; i < numOfSubArrs; ++i) { - - int threadIndex = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + for (int i = 0; i < numOfSubArrs; ++i) { + int threadIndex = thread_id; const auto xIndex = hIindexes[i]; const bool isOwner = xIndex < numThreads ? 
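// Editor's note: estimateThresholdGeneric (further below in this file's hunks) is
// rewritten from an OpenMP reduction into a PRAGMA_REDUCE_LONG body that returns a
// per-span count, combined through samediff::Threads::parallel_long with an
// "_old + _new" aggregator. This is a self-contained sketch of that reduction shape
// using std::thread and explicit per-thread partials, not the library machinery.
#include <thread>
#include <vector>
#include <cmath>
#include <cstdint>

static int64_t countAboveThreshold(const float *buffer, int64_t n, float threshold, int numThreads) {
    std::vector<int64_t> partials(numThreads, 0);
    std::vector<std::thread> workers;
    int64_t span = n / numThreads;

    for (int t = 0; t < numThreads; t++) {
        int64_t start = t * span;
        int64_t stop = (t == numThreads - 1) ? n : start + span;
        workers.emplace_back([=, &partials]() {
            int64_t cnt = 0;
            for (int64_t e = start; e < stop; e++)
                if (std::fabs(buffer[e]) >= threshold)
                    cnt++;
            partials[t] = cnt;
        });
    }
    for (auto &w : workers)
        w.join();

    int64_t total = 0;
    for (auto p : partials)   // aggregator step: _old + _new
        total += p;
    return total;
}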
threadIndex == xIndex : threadIndex == xIndex % numThreads; if (!isOwner) continue; - NDArray inSubArr( - reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), - hXShapeInfo); - NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), - hYShapeInfo); + NDArray inSubArr(reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), hXShapeInfo); + NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), hYShapeInfo); if (inSubArr.lengthOf() != updSubArr.lengthOf()) { continue; } - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: - continue; + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } - } + }; + samediff::Threads::parallel_do(func); } //////////////////////////////////////////////////////////////////////// @@ -2847,6 +2761,7 @@ void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSubArrs, } } + void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo) { try { auto p = reinterpret_cast(debugInfo); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 2db1aa128..2af0e3783 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,8 @@ #include #include #include -// FIXME: we need cuda-specific implementations #include +#include #include #include #include @@ -1723,11 +1724,7 @@ void execScalarTad(Nd4jPointer *extraPointers, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, - 
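// Editor's note: the _scatterUpdate hunk above runs every worker over the full index
// list but lets a worker apply an update only when it "owns" the target index
// (direct mapping for small index spaces, modulo wrap-around otherwise), so two updates
// aimed at the same sub-array are always serialized on one thread. Sketch of that
// ownership rule; applyUpdate is a hypothetical stand-in for the applyPairwiseTransform
// calls.
#include <cstdint>

static bool ownsIndex(int64_t xIndex, int threadId, int numThreads) {
    return xIndex < numThreads ? threadId == xIndex
                               : threadId == xIndex % numThreads;
}

static void scatterWorker(int threadId, int numThreads, const int64_t *indices, int64_t numOfSubArrs) {
    for (int64_t i = 0; i < numOfSubArrs; i++) {
        if (!ownsIndex(indices[i], threadId, numThreads))
            continue;
        // applyUpdate(indices[i], i);   // hypothetical: add/sub/mul/... on the owned sub-array
    }
}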
::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, - dZShapeInfo, dScalars, extraParams, dimension, - dimensionLength, tadShapeInfo, tadOffsets, - tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); #endif DEBUG_KERNEL(stream, opNum); @@ -1750,23 +1747,7 @@ void execAggregate(Nd4jPointer *extraPointers, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numBlocks, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateKernelGeneric(launchDims, stream, opNum, arguments, numArguments, shapes, - numShapes, indexArguments, numIndexArguments, intArrays, - numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "execAggregateFloat(...) failed"); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } void batchExecutor(Nd4jPointer *extraPointers, @@ -1788,25 +1769,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - // not implemented yet - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numAggregates, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateBatchKernelGeneric(launchDims, stream, opNum, numAggregates, maxArgs, - maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, - ptrToArguments), FLOAT_TYPES); - - DEBUG_KERNEL(stream, opNum); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index 599c4f250..56e225a5d 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -53,6 +53,7 @@ CLEAN="false" MINIFIER="false" TESTS="false" VERBOSE="false" +VERBOSE_ARG="VERBOSE=1" HELPER= NAME= while [[ $# > 0 ]] @@ -291,38 +292,37 @@ case "$OS" in macosx*) # Do something under Mac OS X platform - if [ "$CHIP" == "cuda" ]; then + #if [ "$CHIP" == "cuda" ]; then export CC=clang export CXX=clang++ - PARALLEL="false" - else - export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" - export CXX="$(ls -1 /usr/local/bin/g++-? | head -n 1)" PARALLEL="true" - fi + #else + # export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" + # export CXX="$(ls -1 /usr/local/bin/g++-? 
| head -n 1)" + # PARALLEL="true" + #fi export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_MACOSX_RPATH=ON -DAPPLE_BUILD=true" ;; windows*) - # Do something under Windows NT platform - if [ "$CHIP" == "cuda" ]; then + # Do something under Windows NT platform + if [ "$CHIP" == "cuda" ]; then export CMAKE_COMMAND="cmake -G \"Ninja\"" export MAKE_COMMAND="ninja" export CC="cl.exe" export CXX="cl.exe" PARALLEL="true" - else + VERBOSE_ARG="-v" + else export CMAKE_COMMAND="cmake -G \"MSYS Makefiles\"" export MAKE_COMMAND="make" - - # Sam, do we really need this? export CC=/mingw64/bin/gcc export CXX=/mingw64/bin/g++ PARALLEL="true" + fi - fi - # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something - if [ -z "${VCINSTALLDIR:-}" ]; then + # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something + if [ -z "${VCINSTALLDIR:-}" ]; then export VisualStudioVersion=12.0 export VSINSTALLDIR="C:\\Program Files (x86)\\Microsoft Visual Studio $VisualStudioVersion" export VCINSTALLDIR="$VSINSTALLDIR\\VC" @@ -332,10 +332,10 @@ case "$OS" in export LIB="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\lib\\winv6.3\\um\\x64" export LIBPATH="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\References\\CommonConfiguration\\Neutral" export PATH="$PATH:$VCINSTALLDIR\\BIN\\amd64:$WindowsSdkDir\\bin\\x64:$WindowsSdkDir\\bin\\x86" - fi - # Make sure we are using 64-bit MinGW-w64 - export PATH=/mingw64/bin/:$PATH - # export GENERATOR="MSYS Makefiles" + fi + # Make sure we are using 64-bit MinGW-w64 + export PATH=/mingw64/bin/:/mingw64/lib:$PATH + # export GENERATOR="MSYS Makefiles" ;; esac @@ -534,6 +534,6 @@ if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi if [ "$VERBOSE" == "true" ]; then - MAKE_ARGUMENTS="$MAKE_ARGUMENTS VERBOSE=1" + MAKE_ARGUMENTS="$MAKE_ARGUMENTS $VERBOSE_ARG" fi eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../.. diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index 677401954..3af77ca39 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace nd4j { template @@ -50,9 +51,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -105,9 +109,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -130,9 +137,12 @@ namespace nd4j { #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? 
static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } @@ -153,9 +163,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } diff --git a/libnd4j/include/buffer.h b/libnd4j/include/buffer.h index e2aa70046..79197753d 100755 --- a/libnd4j/include/buffer.h +++ b/libnd4j/include/buffer.h @@ -26,6 +26,7 @@ #ifdef __CUDACC__ #include #include +#include #endif #include diff --git a/libnd4j/include/cnpy/cnpy.h b/libnd4j/include/cnpy/cnpy.h index ac7fef863..06ff3336d 100644 --- a/libnd4j/include/cnpy/cnpy.h +++ b/libnd4j/include/cnpy/cnpy.h @@ -97,10 +97,10 @@ namespace cnpy { * @param t * @return */ - char mapType(const std::type_info &t); + ND4J_EXPORT char mapType(const std::type_info &t); template - char mapType(); + ND4J_EXPORT char mapType(); /** * @@ -111,7 +111,7 @@ namespace cnpy { * @return */ template - std::vector createNpyHeader(const void *data, + ND4J_EXPORT std::vector createNpyHeader(const void *data, const unsigned int *shape, const unsigned int ndims, unsigned int wordSize = 4); @@ -126,7 +126,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeader(FILE *fp, + ND4J_EXPORT void parseNpyHeader(FILE *fp, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -143,7 +143,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeaderPointer( + ND4J_EXPORT void parseNpyHeaderPointer( const char *header, unsigned int& word_size, unsigned int*& shape, @@ -156,7 +156,7 @@ namespace cnpy { * @param global_header_size * @param global_header_offset */ - void parseZipFooter(FILE *fp, + ND4J_EXPORT void parseZipFooter(FILE *fp, unsigned short &nrecs, unsigned int &global_header_size, unsigned int &global_header_offset); @@ -167,14 +167,14 @@ namespace cnpy { * @param varname * @return */ - NpyArray npzLoad(std::string fname, std::string varname); + ND4J_EXPORT NpyArray npzLoad(std::string fname, std::string varname); /** * * @param fname * @return */ - NpyArray npyLoad(std::string fname); + ND4J_EXPORT NpyArray npyLoad(std::string fname); /** * Parse the numpy header from @@ -187,7 +187,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeaderStr(std::string header, + ND4J_EXPORT void parseNpyHeaderStr(std::string header, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -199,14 +199,14 @@ namespace cnpy { * @param fp * @return */ - int * shapeFromFile(FILE *fp); + ND4J_EXPORT int* shapeFromFile(FILE *fp); /** * * @param data * @return */ - int * shapeFromPointer(char *data); + ND4J_EXPORT int* shapeFromPointer(char *data); /** * Load the numpy array from the given file. 
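// Editor's note: the cnpy.h hunks above only add ND4J_EXPORT to declarations that were
// already present, so the symbols stay visible across the shared-library boundary. A
// minimal sketch of the usual export-macro pattern such an annotation relies on, under
// the assumption that the non-Windows branch expands to nothing (dll.h, shown below,
// defines the real ND4J_EXPORT); MY_EXPORT and exportedFunction are hypothetical names.
#ifdef _WIN32
  #define MY_EXPORT __declspec(dllexport)
#else
  #define MY_EXPORT
#endif

MY_EXPORT int exportedFunction(int x);   // visible to consumers of the DLL / shared object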
@@ -250,7 +250,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeader(std::string header, + ND4J_EXPORT void parseNpyHeader(std::string header, unsigned int &word_size, unsigned int *&shape, unsigned int &ndims, @@ -273,7 +273,7 @@ namespace cnpy { template - void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); + ND4J_EXPORT void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); } @@ -284,8 +284,8 @@ namespace cnpy { * @param rhs * @return */ -template -std::vector& operator+=(std::vector& lhs, const T rhs); + template + ND4J_EXPORT std::vector& operator+=(std::vector& lhs, const T rhs); #endif diff --git a/libnd4j/include/dll.h b/libnd4j/include/dll.h index 4b5a71eec..91d5a7677 100644 --- a/libnd4j/include/dll.h +++ b/libnd4j/include/dll.h @@ -20,6 +20,9 @@ #ifndef NATIVEOPERATIONS_DLL_H #define NATIVEOPERATIONS_DLL_H + +#include + #ifdef _WIN32 //#include # define ND4J_EXPORT __declspec(dllexport) diff --git a/libnd4j/include/execution/BlockingQueue.h b/libnd4j/include/execution/BlockingQueue.h new file mode 100644 index 000000000..a78196dfc --- /dev/null +++ b/libnd4j/include/execution/BlockingQueue.h @@ -0,0 +1,52 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_BLOCKINGQUEUE_H +#define SAMEDIFF_BLOCKINGQUEUE_H + +#include +#include +#include +#include +#include + +namespace samediff { + template + class BlockingQueue { + private: + std::queue _queue; + std::mutex _lock; + std::atomic _size; + std::atomic _available; + + std::condition_variable _condition; + public: + BlockingQueue(int queueSize); + ~BlockingQueue() = default; + T poll(); + void put(const T &t); + + bool available(); + void markAvailable(); + void markUnavailable(); + }; +} + +#endif //DEV_TESTS_BLOCKINGQUEUE_H diff --git a/libnd4j/include/execution/CallableInterface.h b/libnd4j/include/execution/CallableInterface.h new file mode 100644 index 000000000..7e5502af1 --- /dev/null +++ b/libnd4j/include/execution/CallableInterface.h @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_CALLABLEINTERFACE_H +#define SAMEDIFF_CALLABLEINTERFACE_H + +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + /** + * This class is suited for passing functions to execution threads without queues + */ + class CallableInterface { + private: + // parallel_for functions + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + // parallel function + FUNC_DO _function_do; + + // reduction functions + FUNC_RL _function_rl; + FUNC_RD _function_rd; + + std::array _arguments; + + volatile int _branch = 0; + volatile uint32_t _thread_id = 0; + volatile uint32_t _num_threads = 0; + + std::atomic _finished; + std::atomic _filled; + std::atomic _available; + + std::condition_variable _starter; + std::condition_variable _finisher; + + int64_t* _lptr = nullptr; + double* _dptr = nullptr; + + std::mutex _ms; + std::mutex _mf; + public: + CallableInterface(); + ~CallableInterface() = default; + + void waitForTask(); + void waitForCompletion(); + + void fill(int thread_id, int num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void fill(int thread_id, int num_threads, FUNC_DO func); + void fill(int thread_id, int num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void fill(int thread_id, int num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + + bool available(); + void markAvailable(); + void markUnavailable(); + + void finish(); + + void execute(); + }; +} + + +#endif //DEV_TESTS_CALLABLEINTERFACE_H diff --git a/libnd4j/include/execution/CallableWithArguments.h b/libnd4j/include/execution/CallableWithArguments.h new file mode 100644 index 000000000..ebf1f0019 --- /dev/null +++ b/libnd4j/include/execution/CallableWithArguments.h @@ -0,0 +1,92 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef DEV_TESTS_CALLABLEWITHARGUMENTS_H +#define DEV_TESTS_CALLABLEWITHARGUMENTS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class CallableWithArguments { + FUNC_DO _function_do; + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + std::vector _arguments; + + std::atomic _finished; + + std::condition_variable _condition; + + std::mutex _lock; + + int _dimensions = 0; + + uint64_t _threadId; + uint64_t _numThreads; + public: + CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads); + CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x); + CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y); + CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z); + + + /** + * This method returns number of dimensions + * @return + */ + int dimensions(); + + /** + * This method checks if this callable is finished + * @return + */ + bool finished(); + + /** + * this method marks this Callable as finished + */ + void finish(); + + /** + * This method blocks until callable is finished + */ + void waitUntilFinished(); + + std::vector& arguments(); + FUNC_DO function_do(); + FUNC_1D function_1d(); + FUNC_2D function_2d(); + FUNC_3D function_3d(); + + + uint64_t threadId(); + + uint64_t numThreads(); + }; +} + + +#endif //DEV_TESTS_CALLABLEWITHARGUMENTS_H diff --git a/libnd4j/include/execution/ThreadPool.h b/libnd4j/include/execution/ThreadPool.h new file mode 100644 index 000000000..e17b4b540 --- /dev/null +++ b/libnd4j/include/execution/ThreadPool.h @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_THREADPOOL_H +#define SAMEDIFF_THREADPOOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadPool { + private: + static ThreadPool* _INSTANCE; + + std::vector _threads; + std::vector*> _queues; + std::vector _interfaces; + + std::mutex _lock; + std::atomic _available; + std::queue _tickets; + protected: + ThreadPool(); + ~ThreadPool(); + public: + static ThreadPool* getInstance(); + + /** + * This method returns list of pointers to threads ONLY if num_threads of threads were available upon request, returning empty list otherwise + * @param num_threads + * @return + */ + Ticket* tryAcquire(int num_threads); + + /** + * This method marks specified number of threads as released, and available for use + * @param num_threads + */ + void release(int num_threads = 1); + + void release(Ticket *ticket); + }; +} + + +#endif //DEV_TESTS_THREADPOOL_H diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h new file mode 100644 index 000000000..683220b61 --- /dev/null +++ b/libnd4j/include/execution/Threads.h @@ -0,0 +1,160 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#ifndef SAMEDIFF_THREADS_H +#define SAMEDIFF_THREADS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadsHelper { + public: + static int numberOfThreads(int maxThreads, uint64_t numberOfElements); + static int numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y); + static int numberOfThreads3d(int maxThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + static int pickLoop2d(int numThreads, uint64_t iters_x, uint64_t iters_y); + static int pickLoop3d(int numThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + }; + + class Span { + private: + int64_t _startX, _stopX, _incX; + public: + Span(int64_t start_x, int64_t stop_x, int64_t inc_x); + ~Span() = default; + + int64_t startX() const; + int64_t stopX() const; + int64_t incX() const; + + static Span build(uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x); + }; + + class Span2 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + public: + Span2(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + ~Span2() = default; + + int64_t startX() const; + int64_t startY() const; + + int64_t stopX() const; + int64_t stopY() const; + + int64_t incX() const; + int64_t incY() const; + + static Span2 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + }; + + class Span3 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + int64_t _startZ, _stopZ, _incZ; + public: + Span3(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + ~Span3() = default; + + int64_t startX() const; + int64_t startY() const; + int64_t startZ() const; + + int64_t stopX() const; + int64_t stopY() const; + int64_t stopZ() const; + + int64_t incX() const; + int64_t incY() const; + int64_t incZ() const; + + static Span3 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + }; + + class Threads { + public: + /** + * This function executes 1 dimensional loop for a given number of threads + * PLEASE NOTE: this function can use smaller number of threads than requested. 
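// Illustrative sketch: the 1D thread-count heuristic declared above (implemented in
// Threads.cpp further down) is min(max(1, numberOfElements / 1024), maxThreads), i.e.
// roughly one thread per 1024 elements, capped at the configured maximum. Include path assumed.
#include <execution/Threads.h>
#include <cassert>

static void sketch_thread_count_heuristic() {
    using samediff::ThreadsHelper;
    assert(ThreadsHelper::numberOfThreads(16, 500)     == 1);   // loop too small to split
    assert(ThreadsHelper::numberOfThreads(16, 10000)   == 9);   // 10000 / 1024 == 9
    assert(ThreadsHelper::numberOfThreads(16, 1000000) == 16);  // capped at maxThreads
}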
+ * + * @param function + * @param numThreads + * @param start + * @param stop + * @param increment + * @return + */ + static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @return + */ + static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads(), bool debug = false); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @param start_z + * @param stop_z + * @param inc_z + * @return + */ + static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @return + */ + static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + }; +} + + +#endif //SAMEDIFF_THREADS_H diff --git a/libnd4j/include/execution/Ticket.h b/libnd4j/include/execution/Ticket.h new file mode 100644 index 000000000..e4152b66a --- /dev/null +++ b/libnd4j/include/execution/Ticket.h @@ -0,0 +1,67 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
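// Illustrative sketch: a typical call into the 1D frontend declared above. The functor
// receives (thread_id, start, stop, increment) and must touch only its own sub-range;
// the return value is the number of threads that actually ran, which may be smaller
// than requested. Assumes FUNC_1D accepts a std::function-compatible lambda.
#include <execution/Threads.h>
#include <vector>

static void sketch_parallel_for_1d(std::vector<float> &data) {
    samediff::Threads::parallel_for(
        [&data](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) {
            for (auto i = start; i < stop; i += inc)
                data[i] *= 2.0f;                 // each thread scales its own slice
        },
        0, static_cast<int64_t>(data.size()));   // increment and numThreads use the defaults
}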
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_TICKET_H +#define SAMEDIFF_TICKET_H + +#include +#include +#include +#include +#include +#include + +namespace samediff { + class Ticket { + private: + bool _acquired = false; + std::vector*> _queues; + std::vector _callables; + std::vector _interfaces; + + uint32_t _acquiredThreads = 0; + public: + explicit Ticket(const std::vector*> &queues); + Ticket(); + ~Ticket() = default; + + bool acquired(); + + void acquiredThreads(uint32_t threads); + + void attach(uint32_t thread_id, CallableInterface *interface); + + // deprecated one + void enqueue(int thread_id, CallableWithArguments* callable); + + void enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, double *lpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_, int64_t stop_z, int64_t inc_z); + + void waitAndRelease(); + }; +} + + +#endif //DEV_TESTS_TICKET_H diff --git a/libnd4j/include/execution/impl/BlockingQueue.cpp b/libnd4j/include/execution/impl/BlockingQueue.cpp new file mode 100644 index 000000000..ff483fd28 --- /dev/null +++ b/libnd4j/include/execution/impl/BlockingQueue.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +namespace samediff { + template + BlockingQueue::BlockingQueue(int queueSize) { + _size = 0; + _available = true; + } + + template + T BlockingQueue::poll() { + // locking untill there's something within queue + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return this->_size.load() != 0; }); + + T t(std::move(_queue.front())); + _queue.pop(); + _size--; + return t; + } + + template + void BlockingQueue::put(const T &t) { + { + // locking before push, unlocking after + std::unique_lock lock(_lock); + _queue.push(t); + _size++; + } + + // notifying condition + _condition.notify_one(); + } + + template + bool BlockingQueue::available() { + return _available.load(); + } + + template + void BlockingQueue::markAvailable() { + _available = true; + } + + template + void BlockingQueue::markUnavailable() { + _available = false; + } + + template class BlockingQueue; +} diff --git a/libnd4j/include/execution/impl/CallableInterface.cpp b/libnd4j/include/execution/impl/CallableInterface.cpp new file mode 100644 index 000000000..a719af848 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableInterface.cpp @@ -0,0 +1,213 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
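// Illustrative sketch: the queue above is a minimal blocking channel - put() pushes the
// item under the lock and notifies, poll() blocks on the condition variable until _size
// becomes non-zero. Note the .cpp appears to instantiate the template only for its
// internal callable pointer type, so the int payload here is purely illustrative.
#include <execution/BlockingQueue.h>
#include <cstdio>
#include <thread>

static void sketch_blocking_queue() {
    samediff::BlockingQueue<int> queue(2);   // queueSize is currently a hint only

    std::thread consumer([&queue]() {
        auto v = queue.poll();               // blocks until the producer calls put()
        printf("got %i\n", v);
    });

    queue.put(42);
    consumer.join();
}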
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include + +namespace samediff { + CallableInterface::CallableInterface() { + // initial state is available + _available = true; + _filled = false; + _finished = false; + } + + bool CallableInterface::available() { + return _available.load(); + } + + void CallableInterface::markUnavailable() { + _available = false; + } + + void CallableInterface::markAvailable() { + _available = true; + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_DO func) { + _function_do = std::move(func); + + _branch = 0; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_1D func, int64_t startX, int64_t stopX, int64_t incX) { + _function_1d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _branch = 1; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_2D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _function_2d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + + _branch = 2; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_3D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) { + _function_3d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + _arguments[6] = start_z; + _arguments[7] = stop_z; + _arguments[8] = inc_z; + + _branch = 3; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, int64_t *lptr, FUNC_RL func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rl = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _lptr = lptr; + + _branch = 4; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, double *dptr, FUNC_RD func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rd = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _dptr = dptr; + + _branch = 5; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::waitForTask() { + // block until task is available + std::unique_lock lock(_ms); + _starter.wait(lock, [&]{ return _filled.load(); }); + } + + void 
CallableInterface::waitForCompletion() { + //while (!_finished.load()); + + // block until finished + std::unique_lock lock(_mf); + _finisher.wait(lock, [&] { return _finished.load(); }); + } + + void CallableInterface::finish() { + // mark as finished + { + std::unique_lock l(_mf); + _finished.store(true); + } + _finisher.notify_one(); + } + + void CallableInterface::execute() { + // mark it as consumed + _filled = false; + + // actually executing op + switch (_branch) { + case 0: + _function_do(_thread_id, _num_threads); + break; + case 1: + _function_1d(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 2: + _function_2d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5]); + break; + case 3: + _function_3d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5], _arguments[6], _arguments[7], _arguments[8]); + break; + case 4: + _lptr[0] = _function_rl(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 5: + _dptr[0] = _function_rd(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + } + + // notify that thread finished the job + this->finish(); + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/CallableWithArguments.cpp b/libnd4j/include/execution/impl/CallableWithArguments.cpp new file mode 100644 index 000000000..8f17622b7 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableWithArguments.cpp @@ -0,0 +1,103 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
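// Illustrative sketch: the two-condition-variable handshake implemented above, for a
// single task. The submitter fills the interface and waits on the finish condition; the
// worker waits for a task, executes it, and finish() (called at the end of execute())
// signals completion. Assumes FUNC_1D accepts a std::function-compatible lambda.
#include <execution/CallableInterface.h>
#include <thread>

static void sketch_callable_interface() {
    samediff::CallableInterface ci;

    // worker side: this mirrors executionLoopWithInterface_ below, but for one task only
    std::thread worker([&ci]() {
        ci.waitForTask();    // blocks until fill() sets _filled and notifies _starter
        ci.execute();        // runs the stored functor, then calls finish()
    });

    // submitter side: this is what Ticket::enqueue() does for a FUNC_1D job
    ci.fill(/*threadID=*/0, /*numThreads=*/1,
            [](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) {
                for (auto i = start; i < stop; i += inc) { /* per-element work */ }
            },
            /*startX=*/0, /*stopX=*/128, /*incX=*/1);

    ci.waitForCompletion();  // blocks until execute() called finish()
    worker.join();
}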
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include + +namespace samediff { + CallableWithArguments::CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads) { + _function_do = func; + _finished = false; + _threadId = thread_id; + _numThreads = numThreads; + _dimensions = 0; + } + + CallableWithArguments::CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z) { + _function_3d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y, start_z, stop_z, increment_z}; + _finished = false; + _threadId = thread_id; + _dimensions = 3; + } + + CallableWithArguments::CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x) { + _function_1d = func; + _arguments = {start_x, stop_x, increment_x}; + _finished = false; + _threadId = thread_id; + _dimensions = 1; + } + + CallableWithArguments::CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y) { + _function_2d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y}; + _finished = false; + _threadId = thread_id; + _dimensions = 2; + } + + int CallableWithArguments::dimensions() { + return _dimensions; + } + + std::vector& CallableWithArguments::arguments() { + return _arguments; + } + + bool CallableWithArguments::finished() { + return _finished.load(); + } + + void CallableWithArguments::finish() { + std::lock_guard lock(_lock); + _finished = true; + _condition.notify_one(); + } + + void CallableWithArguments::waitUntilFinished() { + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return _finished.load(); }); + } + + + FUNC_1D CallableWithArguments::function_1d() { + return _function_1d; + } + + FUNC_2D CallableWithArguments::function_2d() { + return _function_2d; + } + + FUNC_DO CallableWithArguments::function_do() { + return _function_do; + } + + FUNC_3D CallableWithArguments::function_3d() { + return _function_3d; + } + + uint64_t CallableWithArguments::threadId() { + return _threadId; + } + + uint64_t CallableWithArguments::numThreads() { + return _numThreads; + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/ThreadPool.cpp b/libnd4j/include/execution/impl/ThreadPool.cpp new file mode 100644 index 000000000..5d9e2d5eb --- /dev/null +++ b/libnd4j/include/execution/impl/ThreadPool.cpp @@ -0,0 +1,194 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +//#include +#endif + +namespace samediff { + + // this function executed once per thread, it polls functions from queue, and executes them via wrapper + static void executionLoop_(int thread_id, BlockingQueue *queue) { + while (true) { + // this method blocks until there's something within queue + auto c = queue->poll(); + //nd4j_printf("ThreadPool: starting thread %i\n", c->threadId()); + switch (c->dimensions()) { + case 0: { + c->function_do()(c->threadId(), c->numThreads()); + c->finish(); + } + break; + case 1: { + auto args = c->arguments(); + c->function_1d()(c->threadId(), args[0], args[1], args[2]); + c->finish(); + } + break; + case 2: { + auto args = c->arguments(); + c->function_2d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5]); + c->finish(); + //nd4j_printf("ThreadPool: finished thread %i\n", c->threadId()); + } + break; + case 3: { + auto args = c->arguments(); + c->function_3d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); + c->finish(); + } + break; + default: + throw std::runtime_error("Don't know what to do with provided Callable"); + } + } + } + + static void executionLoopWithInterface_(int thread_id, CallableInterface *c) { + while (true) { + // blocking here until there's something to do + c->waitForTask(); + + // execute whatever we have + c->execute(); + } + } + + ThreadPool::ThreadPool() { + // TODO: number of threads must reflect number of cores for UMA system. In case of NUMA it should be per-device pool + // FIXME: on mobile phones this feature must NOT be used + _available = nd4j::Environment::getInstance()->maxThreads(); + + _queues.resize(_available.load()); + _threads.resize(_available.load()); + _interfaces.resize(_available.load()); + + // creating threads here + for (int e = 0; e < _available.load(); e++) { + _queues[e] = new BlockingQueue(2); + _interfaces[e] = new CallableInterface(); + _threads[e] = new std::thread(executionLoopWithInterface_, e, _interfaces[e]); + _tickets.push(new Ticket()); + // _threads[e] = new std::thread(executionLoop_, e, _queues[e]); + + // TODO: add other platforms here as well + // now we must set affinity, and it's going to be platform-specific thing +#ifdef LINUX_BUILD + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(e, &cpuset); + int rc = pthread_setaffinity_np(_threads[e]->native_handle(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) + throw std::runtime_error("Failed to set pthread affinity"); +#endif + /* +#if defined(_WIN32) || defined(_WIN64) + // we can't set affinity to more than 64 cores + if (e <= 64) { + auto mask = (static_cast(1) << e); + auto result = SetThreadAffinityMask(_threads[e]->native_handle(), mask); + if (!result) + throw std::runtime_error("Failed to set pthread affinity"); + } + + // that's fine. 
no need for time_critical here + SetThreadPriority(_threads[e]->native_handle(), THREAD_PRIORITY_HIGHEST); +#endif + */ + } + } + + ThreadPool::~ThreadPool() { + // TODO: implement this one properly + for (int e = 0; e < _queues.size(); e++) { + // stop each and every thread + + // release queue and thread + //delete _queues[e]; + //delete _threads[e]; + } + } + + static std::mutex _lmutex; + + ThreadPool* ThreadPool::getInstance() { + std::unique_lock lock(_lmutex); + if (!_INSTANCE) + _INSTANCE = new ThreadPool(); + + return _INSTANCE; + } + + void ThreadPool::release(int numThreads) { + _available += numThreads; + } + + Ticket* ThreadPool::tryAcquire(int numThreads) { + //std::vector*> queues; + + Ticket *t = nullptr; + // we check for threads availability first + bool threaded = false; + { + // we lock before checking availability + std::unique_lock lock(_lock); + if (_available >= numThreads) { + threaded = true; + _available -= numThreads; + + // getting a ticket from the queue + t = _tickets.front(); + _tickets.pop(); + + // ticket must contain information about number of threads for the current session + t->acquiredThreads(numThreads); + + // filling ticket with executable interfaces + for (int e = 0, i = 0; e < _queues.size() && i < numThreads; e++) { + if (_interfaces[e]->available()) { + t->attach(i++, _interfaces[e]); + _interfaces[e]->markUnavailable(); + } + } + } + } + + // we either dispatch tasks to threads, or run single-threaded + if (threaded) { + return t; + } else { + // if there's no threads available - return nullptr + return nullptr; + } + } + + void ThreadPool::release(samediff::Ticket *ticket) { + // returning ticket back to the queue + std::unique_lock lock(_lock); + _tickets.push(ticket); + } + + + ThreadPool* ThreadPool::_INSTANCE = 0; +} diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp new file mode 100644 index 000000000..f5ae5b5eb --- /dev/null +++ b/libnd4j/include/execution/impl/Threads.cpp @@ -0,0 +1,641 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#include +#include +#include +#include +#include +#include +#include + + +namespace samediff { + + int ThreadsHelper::numberOfThreads(int maxThreads, uint64_t numberOfElements) { + // let's see how many threads we actually need first + auto optimalThreads = nd4j::math::nd4j_max(1, numberOfElements / 1024); + + // now return the smallest value + return nd4j::math::nd4j_min(optimalThreads, maxThreads); + } + + Span3::Span3(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + _startX = startX; + _startY = startY; + _startZ = startZ; + _stopX = stopX; + _stopY = stopY; + _stopZ = stopZ; + _incX = incX; + _incY = incY; + _incZ = incZ; + } + + Span3 Span3::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span3(s, e, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span3(startX, stopX, incX, s, e, incY, startZ, stopZ, incZ); + } + break; + case 3: { + auto span = (stopZ - startZ) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopZ; + + return Span3(startX, stopX, incX, startY, stopY, incY, s, e, incZ); + } + break; + default: + throw std::runtime_error(""); + } + return Span3(startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + + Span::Span(int64_t startX, int64_t stopX, int64_t incX) { + _startX = startX; + _stopX = stopX; + _incX = incX; + } + + Span Span::build(uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX) { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span(s, e, incX); + } + + Span2::Span2(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + _startX = startX; + _startY = startY; + _stopX = stopX; + _stopY = stopY; + _incX = incX; + _incY = incY; + } + + + Span2 Span2::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span2(s, e, incX, startY, stopY, incY); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span2(startX, stopX, incX, s, e, incY); + } + break; + default: + throw std::runtime_error(""); + } + } + + int64_t Span::startX() const { + return _startX; + } + + int64_t Span::stopX() const { + return _stopX; + } + + int64_t Span::incX() const { + return _incX; + } + + int64_t Span2::startX() const { + return _startX; + } + + int64_t Span2::startY() const { + return _startY; + } + + 
int64_t Span2::stopX() const { + return _stopX; + } + + int64_t Span2::stopY() const { + return _stopY; + } + + int64_t Span2::incX() const { + return _incX; + } + + int64_t Span2::incY() const { + return _incY; + } + + int64_t Span3::startX() const { + return _startX; + } + + int64_t Span3::startY() const { + return _startY; + } + + int64_t Span3::startZ() const { + return _startZ; + } + + int64_t Span3::stopX() const { + return _stopX; + } + + int64_t Span3::stopY() const { + return _stopY; + } + + int64_t Span3::stopZ() const { + return _stopZ; + } + + int64_t Span3::incX() const { + return _incX; + } + + int64_t Span3::incY() const { + return _incY; + } + + int64_t Span3::incZ() const { + return _incZ; + } + + int ThreadsHelper::pickLoop2d(int numThreads, uint64_t itersX, uint64_t itersY) { + // if one of dimensions is definitely too small - we just pick the other one + if (itersX < numThreads && itersY >= numThreads) + return 2; + if (itersY < numThreads && itersX >= numThreads) + return 1; + + // next step - we pick the most balanced dimension + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto splitY = itersY / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + if (remY == 0) + return 2; + + // if there's no loop without a remainder - we're picking one with smaller remainder + if (remX < remY) + return 1; + if (remY < remX && splitY >= 64) // we don't want too small splits over last dimension, or vectorization will fail + return 2; + // if loops are equally sized - give the preference to the first thread + return 1; + } + + + static int threads_(int maxThreads, uint64_t elements) { + + if (elements == maxThreads) { + return maxThreads; + } + else if (elements > maxThreads) { + // if we have full load across thread, or at least half of threads can be utilized + auto rem = elements % maxThreads; + if (rem == 0 || rem >= maxThreads / 3) + return maxThreads; + else + return threads_(maxThreads - 1, elements); + + } + else if (elements < maxThreads) { + return elements; + } + + return 1; + } + + int ThreadsHelper::numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y) { + // in some cases there's nothing to think about, part 1 + if (iters_x < maxThreads && iters_y < maxThreads) + return nd4j::math::nd4j_max(iters_x, iters_y); + + auto remX = iters_x % maxThreads; + auto remY = iters_y % maxThreads; + + // in some cases there's nothing to think about, part 2 + if ((iters_x >= maxThreads && remX == 0 )|| (iters_y >= maxThreads && remY == 0)) + return maxThreads; + + // at this point we suppose that there's no loop perfectly matches number of our threads + // so let's pick something as equal as possible + if (iters_x > maxThreads || iters_y > maxThreads) + return maxThreads; + else + return numberOfThreads2d(maxThreads - 1, iters_x, iters_y); + } + + int ThreadsHelper::numberOfThreads3d(int maxThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + // we don't want to run underloaded threads + if (itersX * itersY * itersZ <= 32) + return 1; + + auto remX = itersX % maxThreads; + auto remY = itersY % maxThreads; + auto remZ = itersZ % maxThreads; + + // if we have perfect balance across one of dimensions - just go for it + if ((itersX >= maxThreads && remX == 0) || (itersY >= maxThreads && remY == 0) || (itersZ >= maxThreads && remZ == 0)) + return maxThreads; + + int threadsX = 0, threadsY = 0, threadsZ = 0; + + // 
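// Illustrative worked example: the recursive threads_() heuristic above keeps shrinking
// the candidate thread count until elements % threads is either zero or at least a third
// of the thread count, so no worker is left nearly idle:
//   threads_(8, 24) -> 8   (24 % 8 == 0, perfectly balanced)
//   threads_(8, 20) -> 8   (remainder 4 >= 8 / 3)
//   threads_(8,  9) -> 7   (remainder 1 is too small; retry with 7: 9 % 7 == 2 >= 7 / 3)
//   threads_(8,  5) -> 5   (fewer elements than threads: one element per thread)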
now we look into possible number of + threadsX = threads_(maxThreads, itersX); + threadsY = threads_(maxThreads, itersY); + threadsZ = threads_(maxThreads, itersZ); + + // we want to split as close to outer loop as possible, so checking it out first + if (threadsX >= threadsY && threadsX >= threadsZ) + return threadsX; + else if (threadsY >= threadsX && threadsY >= threadsZ) + return threadsY; + else if (threadsZ >= threadsX && threadsZ >= threadsY) + return threadsZ; + + return 1; + } + + int ThreadsHelper::pickLoop3d(int numThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto remZ = itersZ % numThreads; + + auto splitX = itersX / numThreads; + auto splitY = itersY / numThreads; + auto splitZ = itersZ / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + else if (remY == 0) + return 2; + else if (remZ == 0) // TODO: we don't want too smal splits over last dimension? or we do? + return 3; + + if (itersX > numThreads) + return 1; + else if (itersY > numThreads) + return 2; + else if (itersZ > numThreads) + return 3; + + return 1; + } + + int Threads::parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + if (numThreads > delta) + numThreads = delta; + + if (numThreads == 0) + return 0; + + // shortcut + if (numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + // if we got our threads - we'll run our jobs here + auto span = delta / numThreads; + + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = start_ + span; + + // last thread will process tail + if (e == numThreads - 1) + stop_ = stop; + + // putting the task into the queue for a given thread + ticket->enqueue(e, numThreads, function, start_, stop_, increment); + } + + // block and wait till all threads finished the job + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, start, stop, increment); + + // we tell that parallelism request declined + return 1; + } + } + + int Threads::parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + // in some cases we just fire func as is + if (delta == 0 || numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it in parallel_tad. 
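// Illustrative worked example: how parallel_tad() above partitions the range before
// enqueueing. With start=0, stop=103, increment=1 and 4 acquired threads, span = 103 / 4 = 25
// and the last thread absorbs the remainder:
//   thread 0 -> [ 0, 25)    thread 1 -> [25, 50)
//   thread 2 -> [50, 75)    thread 3 -> [75, 103)
// If tryAcquire() returns nullptr, the whole range runs inline as function(0, 0, 103, 1).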
+ numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + return parallel_tad(function, start, stop, increment, numThreads); + } + + int Threads::parallel_for(FUNC_2D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, uint64_t numThreads, bool debug) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + // total number of iterations + auto iters_t = itersX * itersY; + + // we are checking the case of number of requested threads was smaller + numThreads = ThreadsHelper::numberOfThreads2d(numThreads, itersX, itersY); + + // basic shortcut for no-threading cases + if (numThreads == 1) { + function(0, startX, stopX, incX, startY, stopY, incY); + return 1; + } + + // We have couple of scenarios: + // either we split workload along 1st loop, or 2nd + auto splitLoop = ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + + // for debug mode we execute things inplace, without any threads + if (debug) { + for (int e = 0; e < numThreads; e++) { + auto span = Span2::build(splitLoop, e, numThreads, startX, stopX, incX, startY, stopY, incY); + + function(e, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // but we still mimic multithreaded execution + return numThreads; + } else { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + + for (int e = 0; e < numThreads; e++) { + auto threadId = numThreads - e - 1; + auto span = Span2::build(splitLoop, threadId, numThreads, startX, stopX, incX, startY, stopY, incY); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // block until all threads finish their job + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY); + + // we tell that parallelism request declined + return 1; + } + }; + } + + + int Threads::parallel_for(FUNC_3D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ, uint64_t numThreads) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + if (startZ > stopZ) + throw std::runtime_error("Threads::parallel_for got startZ > stopZ"); + + auto delta_x = stopX - startX; + auto delta_y = stopY - startY; + auto delta_z = stopZ - startZ; + + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + auto itersZ = delta_z / incZ; + + numThreads = 1; //ThreadsHelper::numberOfThreads3d(numThreads, itersX, itersY, itersZ); + if (numThreads == 1) { + // loop is too small - executing function as is + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + auto splitLoop = ThreadsHelper::pickLoop3d(numThreads, itersX, itersY, itersZ); + + for (int e = 0; e < 
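// Illustrative sketch: a typical call into the 2D frontend above. pickLoop2d() decides
// which of the two loops to split and each thread receives a Span2, so only one of the
// two ranges is narrowed per thread. Assumes FUNC_2D accepts a std::function-compatible lambda.
#include <execution/Threads.h>

static void sketch_parallel_for_2d(float *matrix, int64_t rows, int64_t cols) {
    samediff::Threads::parallel_for(
        [matrix, cols](uint64_t /*thread_id*/,
                       int64_t startX, int64_t stopX, int64_t incX,
                       int64_t startY, int64_t stopY, int64_t incY) {
            for (auto r = startX; r < stopX; r += incX)
                for (auto c = startY; c < stopY; c += incY)
                    matrix[r * cols + c] += 1.0f;
        },
        0, rows, 1,    // outer (x) loop
        0, cols, 1);   // inner (y) loop
}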
numThreads; e++) { + auto thread_id = numThreads - e - 1; + auto span = Span3::build(splitLoop, thread_id, numThreads, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY(), span.startZ(), span.stopZ(), span.incZ()); + } + + // block until we're done + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + // we tell that parallelism request declined + return 1; + } + + } + + int Threads::parallel_do(FUNC_DO function, uint64_t numThreads) { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket != nullptr) { + + // submit tasks one by one + for (uint64_t e = 0; e < numThreads - 1; e++) + ticket->enqueue(e, numThreads, function); + + function(numThreads - 1, numThreads); + + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there's no threads available - we'll execute function sequentially one by one + for (uint64_t e = 0; e < numThreads; e++) + function(e, numThreads); + + return numThreads; + } + + + return numThreads; + } + + int64_t Threads::parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_long got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return function(0, start, stop, increment); + + // create temporary array + int64_t intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + + double Threads::parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_long got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return function(0, start, stop, 
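// Illustrative sketch: parallel_long() above runs a per-thread partial reduction and then
// folds the partials with the aggregator on the calling thread. A sum over [0, n),
// assuming FUNC_RL / FUNC_AL accept std::function-compatible lambdas:
#include <execution/Threads.h>

static int64_t sketch_parallel_sum(const int64_t *values, int64_t n) {
    return samediff::Threads::parallel_long(
        [values](uint64_t /*thread_id*/, int64_t start, int64_t stop, int64_t inc) -> int64_t {
            int64_t local = 0;                    // per-thread partial sum
            for (auto i = start; i < stop; i += inc)
                local += values[i];
            return local;
        },
        [](int64_t a, int64_t b) -> int64_t {     // aggregator folds two partials
            return a + b;
        },
        0, n);
}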
increment); + + // create temporary array + double intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/Ticket.cpp b/libnd4j/include/execution/impl/Ticket.cpp new file mode 100644 index 000000000..5bf911fd0 --- /dev/null +++ b/libnd4j/include/execution/impl/Ticket.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include +#include + +namespace samediff { + Ticket::Ticket(const std::vector*> &queues) { + _acquired = true; + _queues = queues; + } + + Ticket::Ticket() { + _acquired = true; + _interfaces.resize(nd4j::Environment::getInstance()->maxThreads()); + } + + bool Ticket::acquired() { + return _acquired; + } + + void Ticket::enqueue(int thread_id, samediff::CallableWithArguments *callable) { + _queues[thread_id]->put(callable); + _callables.emplace_back(callable); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func) { + _interfaces[thread_id]->fill(thread_id, num_threads, func); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, lpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, dpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _interfaces[thread_id]->fill(thread_id, num_threads, std::move(func), start_x, stop_x, inc_x, start_y, stop_y, inc_y); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, 
int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x, start_y, stop_y, inc_y, start_z, stop_z, inc_z); + } + + void Ticket::acquiredThreads(uint32_t threads) { + _acquiredThreads = threads; + } + + void Ticket::waitAndRelease() { + for (uint32_t e = 0; e < this->_acquiredThreads; e++) { + // block until finished + _interfaces[e]->waitForCompletion(); + + // mark available + _interfaces[e]->markAvailable(); + + // increment availability counter + ThreadPool::getInstance()->release(); + } + + // return this ticket back to the pool + ThreadPool::getInstance()->release(this); + } + + + void Ticket::attach(uint32_t thread_id, samediff::CallableInterface *interface) { + _interfaces[thread_id] = interface; + } +} \ No newline at end of file diff --git a/libnd4j/include/graph/Node.h b/libnd4j/include/graph/Node.h index 3eac03e07..b57998e38 100644 --- a/libnd4j/include/graph/Node.h +++ b/libnd4j/include/graph/Node.h @@ -232,6 +232,7 @@ namespace nd4j { } static nd4j::ops::DeclarableOp* buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar); + static void deleteOpByType(OpType opType, void *op); }; } } diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index f4514efdb..2acedcea3 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -19,6 +19,7 @@ // #include +#include #include #include #include @@ -154,7 +155,7 @@ namespace nd4j { Nd4jLong *newShape = nullptr; // if that's scalar output - we don't care about previous node - if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == MAX_INT)) { + if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == nd4j::DataTypeUtils::max())) { newShape = new Nd4jLong[8]; newShape[0] = 2; diff --git a/libnd4j/include/graph/impl/Node.cpp b/libnd4j/include/graph/impl/Node.cpp index d365ddd6a..795d9b7f0 100644 --- a/libnd4j/include/graph/impl/Node.cpp +++ b/libnd4j/include/graph/impl/Node.cpp @@ -682,8 +682,9 @@ namespace nd4j { if (_protoContext != nullptr) delete _protoContext; - if (_isDeductable && _customOp != nullptr) - delete _customOp; + if (_isDeductable && _customOp != nullptr) { + Node::deleteOpByType(_opType, _customOp); + } } int nd4j::graph::Node::getRewindNode() { @@ -710,6 +711,70 @@ namespace nd4j { return false; } + void nd4j::graph::Node::deleteOpByType(OpType opType, void *op) { + switch (opType) { + case OpType_PAIRWISE: + delete reinterpret_cast(op); + break; + case OpType_PAIRWISE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_STRICT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_SAME: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_BOOL: + delete reinterpret_cast(op); + break; + case OpType_SCALAR: + delete reinterpret_cast(op); + break; + case OpType_SCALAR_BOOL: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_3: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_SAME: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_LONG: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_INDEX_REDUCE: + 
delete reinterpret_cast(op); + break; + case OpType_SUMMARYSTATS: + delete reinterpret_cast(op); + break; + case OpType_RANDOM: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST_BOOL: + delete reinterpret_cast(op); + break; + case OpType_CUSTOM: + delete reinterpret_cast(op); + break; + default: + throw std::runtime_error("Bad opType passed in"); + } + } + nd4j::ops::DeclarableOp* nd4j::graph::Node::buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar) { switch (opType) { case OpType_PAIRWISE: diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index 392ed3edf..fb1582056 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -31,6 +31,7 @@ #include #include #include +#include namespace nd4j { @@ -40,43 +41,43 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams); + static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams, int64_t start, int64_t stop); }; template class ReductionFloatLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t 
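// Illustrative note on deleteOpByType() above: _customOp is held as a void*, and deleting
// through a void* would never invoke the op's destructor, so each case casts back to the
// concrete nd4j::ops class allocated for that OpType before deleting. Schematically
// (ConcreteLegacyOp is a hypothetical stand-in for the class matching each OpType):
//
//     case OpType_SOMETHING:
//         delete reinterpret_cast<ConcreteLegacyOp *>(op);
//         break;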
stop); }; template class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; @@ -96,8 +97,8 @@ namespace nd4j { public: - template - static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams); + template + static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams, uint64_t threadId, uint64_t numThreads); }; template @@ -105,20 +106,20 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); - static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void innerloopReduce3All(X* x, Nd4jLong* 
xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); }; @@ -265,7 +266,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, - E* extraParams) { + E* extraParams, int64_t start, int64_t stop) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); @@ -319,263 +320,170 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadLen; ++i0) - start = OpType::update(start, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint 
i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) for (uint i4 = 0; i4 < tadShape[4]; ++i4) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3] + i4*tadStride[4] ], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::X_EWSNONZERO: { uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::Z_EWSNONZERO: { uint 
castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) { auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; - - //*********************************************// - // default: { - // uint castTadShapeInfo[MAX_RANK]; - // uint castZShapeInfo[MAX_RANK]; - // const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - // const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - // PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - // for (uint i = 0; i < zLen; i++) { - // auto tad = x + tadOffsets[i]; - // auto start = OpType::startingValue(tad); - - // for (uint j = 0; j < tadLen; j++) { - // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - // start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); - // } - - // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - // } + break; //*********************************************// default: { - - Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; + auto innertadOffsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, innertadOffsets); uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; - delete []innertadOffsets; + delete[] innertadOffsets; } - - //*********************************************// - // default: { - - // Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; - // shape::calcOffsets(tadShapeInfo, innertadOffsets); - - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // Nd4jLong* offsetPerDimZ = new Nd4jLong[zRankMinusOne]; - // int* idxZ = new int[zRankMinusOne]; - - // memset(idxZ, 0, sizeof(Nd4jLong) * zRankMinusOne); - - // const Nd4jLong* shapeZ = shape::shapeOf(zShapeInfo); - // const Nd4jLong* strideZ = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // offsetPerDimZ[k] = (shapeZ[k] - 1) * strideZ[k]; - - // int dimZ = zRankMinusOne, lZ = 1; - // Nd4jLong initZ = 0, zOffset = 0, e = 1; - - // // first iteration - // auto tad = x + tadOffsets[0]; - // auto start = 
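//*********************************************//
// A minimal illustrative sketch, not taken from this patch: after this change the
// reduction loops only process the half-open TAD range [start, stop) they are handed;
// the splitting itself is expected to go through the samediff::Threads API added in this
// PR, mirroring the driver pattern used for the index-reduction loops further below.
// The function name sumPerTad is hypothetical and the include path is an assumption.

// #include <execution/Threads.h>   // assumed location of samediff::Threads / PRAGMA_THREADS_FOR

static void sumPerTad(const float* x, const Nd4jLong* tadOffsets, Nd4jLong tadLen,
                      float* z, Nd4jLong numTads) {

    // the functor receives its own [start, stop) range plus an increment
    auto func = PRAGMA_THREADS_FOR {
        for (auto i = start; i < stop; i += increment) {
            auto tad = x + tadOffsets[i];
            float s = 0.f;

            for (Nd4jLong j = 0; j < tadLen; j++)
                s += tad[j];

            z[i] = s;
        }
    };

    // parallel_tad picks the number of workers and hands each one a sub-range of TADs
    samediff::Threads::parallel_tad(func, 0, numTads);
}
//*********************************************//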
OpType::startingValue(tad); - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - // z[0] = OpType::postProcess(start, OpType::startingValue(x), extraParams); - - // // rest iterations - // while (dimZ >= 0) { - - // if(shapeZ[dimZ] == 1) { --dimZ; continue; } // ignore dimensions equal to unity - // if(dimZ == zRankMinusOne) { // last dimension - // if(lZ < shapeZ[dimZ]) { zOffset += strideZ[dimZ]; ++lZ;} - // else { lZ = 1; --dimZ; continue; } - // } - // else if(idxZ[dimZ] < shapeZ[dimZ] - 1) { initZ += strideZ[dimZ]; zOffset = initZ; ++idxZ[dimZ]; dimZ = zRankMinusOne; } - // else { initZ -= offsetPerDimZ[dimZ]; idxZ[dimZ--] = 0; continue;} - - // start = OpType::startingValue(tad); - // tad = x + tadOffsets[e++]; - - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - - // delete []innertadOffsets; - // } } } @@ -583,10 +491,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, ////////////////////////////////////////////////////////////////////////////// template - template + template void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, - E* extraParams) { + E* extraParams, uint64_t threadId, uint64_t numThreads) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); @@ -596,265 +504,176 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const Nd4jLong len = shape::length(xShapeInfo); - OmpLaunchHelper threadsInfo(len, doParallel ? -1 : 1); + if (len == 0) + return; switch (kindOfLoop) { //*********************************************// case LoopKind::EWS1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - const auto xi = x + threadOffset; - const auto zi = z + threadOffset; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i] = OpType::op(xi[i], extraParams); + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], extraParams); } - } break; //*********************************************// case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - const auto xi = x + threadOffset * xEws; - auto zi = z + threadOffset * zEws; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], extraParams); + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], extraParams); } - } break; //*********************************************// case 
LoopKind::Z_EWSNONZERO: { - const uint zEws = shape::elementWiseStride(zShapeInfo); - uint castXShapeInfo[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); + uint castXShapeInfo[MAX_RANK]; + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - auto zi = z + threadOffset * zEws; + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); if (zEws > 1) { - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i * zEws] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i * zEws] = OpType::op(x[xOffset], extraParams); } } else { - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i] = OpType::op(x[xOffset], extraParams); } } } - } break; //*********************************************// case LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - for (uint i0 = 0; i0 < len; ++i0) - z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); - } + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); + } break; //*********************************************// case LoopKind::RANK2: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); - //PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i0 = 0; i0 < uXShape0; ++i0) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0]; - auto x0 = i0 * xStride[0]; - for (uint i1 = 0; i1 < uXShape1; ++i1) - z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { + auto z0 = i0 * zStride[0]; + auto x0 = i0 * xStride[0]; + + for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) + z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + } } - } break; //*********************************************// case LoopKind::RANK3: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < 
uXShape1; ++i1) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1]; - auto x0 = i0 * xStride[0] + i1 * xStride[1]; - for (uint i2 = 0; i2 < uXShape2; ++i2) - z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); - } - } + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1]; + auto x0 = i0 * xStride[0] + i1 * xStride[1]; + + for (uint i2 = 0; i2 < uXShape2; ++i2) + z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK4: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) - z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); - } - } + for (uint i3 = 0; i3 < uXShape3; ++i3) + z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK5: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - auto uXShape4 = static_cast(xShape[4]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); + auto uXShape4 = static_cast(xShape[4]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 3) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) { + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto z0 = i0 * zStride[0] 
+ i1 * zStride[1] + i2 * zStride[2]; + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z1 = z0 + i3 * zStride[3]; - auto x1 = x0 + i3 * xStride[3]; + for (uint i3 = 0; i3 < uXShape3; ++i3) { - for (uint i4 = 0; i4 < uXShape4; ++i4) - z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + auto z1 = z0 + i3 * zStride[3]; + auto x1 = x0 + i3 * xStride[3]; + for (uint i4 = 0; i4 < uXShape4; ++i4) + z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + + } } - } - } + + } break; //*********************************************// default: { - uint xShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; + uint xShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = threadsInfo.getThreadOffset(threadNum); - auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (auto i = span.startX(); i < span.stopX(); i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], extraParams); } } - } - // default: { - - // const int xRankMinusOne = shape::rank(xShapeInfo) - 1; - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // printf("%i %i \n", xRankMinusOne, zRankMinusOne); - - // uint* xIdx = new uint[xRankMinusOne + 1]; - // uint* zIdx = new uint[zRankMinusOne + 1]; - - // Nd4jLong* xOffsetPerDim = new Nd4jLong[xRankMinusOne]; - // Nd4jLong* zOffsetPerDim = new Nd4jLong[zRankMinusOne]; - - // memset(xIdx, 0, sizeof(uint) * xRankMinusOne); - // memset(zIdx, 0, sizeof(uint) * zRankMinusOne); - - // xIdx[xRankMinusOne] = zIdx[zRankMinusOne] = 1; - - // const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); - // const Nd4jLong* zShape = shape::shapeOf(zShapeInfo); - // const Nd4jLong* xStride = shape::stride(xShapeInfo); - // const Nd4jLong* zStride = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < xRankMinusOne; ++k) - // xOffsetPerDim[k] = (xShape[k] - 1) * xStride[k]; - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // zOffsetPerDim[k] = (zShape[k] - 1) * zStride[k]; - - // Nd4jLong xInit = 0, zInit = 0, xOffset = 0, zOffset = 0; - // int jX = xRankMinusOne, jZ = zRankMinusOne; - - // // first iteration - // z[0] = OpType::op(x[0], extraParams); - - // // rest iterations - // for (uint i = 1; i < len; i++) { - - // while(true) { - // if(xShape[jX] == 1) { --jX; continue; } - // if(jX == xRankMinusOne) { - // if(xIdx[jX] < xShape[jX]) { xOffset += xStride[jX]; ++xIdx[jX]; break; } - // else { xIdx[jX] = 1; --jX; continue; } - // } - // else if(xIdx[jX] < xShape[jX] - 1) { xInit += xStride[jX]; xOffset = xInit; ++xIdx[jX]; jX = xRankMinusOne; break; } - // else { xInit -= xOffsetPerDim[jX]; 
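//*********************************************//
// A minimal illustrative sketch, not taken from this patch: loopTransform now receives an
// explicit (threadId, numThreads) pair and carves out its own slice of the iteration space
// with samediff::Span / Span2, as in the EWS1 and RANK2 branches above, instead of relying
// on OpenMP scheduling. The helper names copyEws1 and copy2d are hypothetical, and the 2-D
// variant assumes a dense c-order buffer.

template <typename T>
static void copyEws1(const T* x, T* z, Nd4jLong len, uint64_t threadId, uint64_t numThreads) {
    // Span::build splits [0, len) with stride 1 into numThreads contiguous chunks
    auto span = samediff::Span::build(threadId, numThreads, 0, len, 1);

    for (auto i = span.startX(); i < span.stopX(); i++)
        z[i] = x[i];
}

template <typename T>
static void copy2d(const T* x, T* z, uint rows, uint cols, uint64_t threadId, uint64_t numThreads) {
    // pickLoop2d decides whether the outer or the inner dimension should be split,
    // Span2 then yields this thread's [startX, stopX) x [startY, stopY) tile
    auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, rows, cols);
    auto span = samediff::Span2::build(loop, threadId, numThreads, 0, rows, 1, 0, cols, 1);

    for (auto i0 = span.startX(); i0 < span.stopX(); i0++)
        for (auto i1 = span.startY(); i1 < span.stopY(); i1++)
            z[i0 * cols + i1] = x[i0 * cols + i1];
}
//*********************************************//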
xIdx[jX--] = 0; continue; } - // } - - // while(true) { - // if(zShape[jZ] == 1) { --jZ; continue; } - // if(jZ == zRankMinusOne) { - // if(zIdx[jZ] < zShape[jZ]) { zOffset += zStride[jZ]; ++zIdx[jZ]; break; } - // else { zIdx[jZ] = 1; --jZ; continue; } - // } - // else if(zIdx[jZ] < zShape[jZ] - 1) { zInit += zStride[jZ]; zOffset = zInit; ++zIdx[jZ]; jZ = zRankMinusOne; break; } - // else { zInit -= zOffsetPerDim[jZ]; zIdx[jZ--] = 0; continue; } - // } - // z[zOffset] = OpType::op(x[xOffset], extraParams); - // } - - // delete []xIdx; - // delete []zIdx; - // delete []xOffsetPerDim; - // delete []zOffsetPerDim; - // } } } @@ -866,12 +685,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const Nd4jLong xLen = shape::length(xShapeInfo); const Nd4jLong yLen = shape::length(yShapeInfo); @@ -921,139 +739,128 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1061,29 +868,27 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1092,68 +897,62 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } } } @@ -1167,12 +966,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXYZ(xTadShapeInfo, yTadShapeInfo, zShapeInfo); @@ -1195,159 +993,146 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads); switch (kindOfLoop) { - //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[zInd] = OpType::postProcess(start, tadLen, extraParams); + z[zInd] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, 
OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * 
numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1355,32 +1140,30 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1389,7 +1172,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, 
OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } @@ -1397,66 +1180,61 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, } z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } } } diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index 9888bb1fd..fb52e639c 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -721,7 +721,7 @@ namespace shape { INLINEDEF void TAD::createOffsets() { this->tadOffsets = new Nd4jLong[this->numTads]; uint nT = this->numTads; - PRAGMA_OMP_PARALLEL_FOR_SIMD + for(uint i = 0; i 
< nT; i++) this->tadOffsets[i] = this->tadOffset(i); } diff --git a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h index fe64b364f..7c1330648 100644 --- a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h +++ b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h @@ -19,7 +19,6 @@ // #include "../OpBenchmark.h" -#include #include #ifndef DEV_TESTS_MATRIXBENCHMARK_H diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp index fbf2fbc20..fca40d564 100644 --- a/libnd4j/include/helpers/cpu/MmulHelper.cpp +++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -74,26 +75,28 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // } // } - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)) - for(uint row = 0; row < M; ++row) { - for(uint col = 0; col < N; ++col) { - - T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); - T3 val = 0; + auto func = PRAGMA_THREADS_FOR_2D { ; + for (auto row = start_x; row < stop_x; row += inc_x) { + for (auto col = start_y; col < stop_y; col += inc_y) { + T3 *c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); + T3 val = 0; - PRAGMA_OMP_SIMD - for(uint i = 0; i < K; ++i) { - T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); - T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); - val += alphaZ * a * b; + PRAGMA_OMP_SIMD + for (uint i = 0; i < K; ++i) { + T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); + T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); + val += alphaZ * a * b; + } + + if (betaZ) + *c = val + betaZ * *c; + else + *c = val; } - - if(betaZ) - *c = val + betaZ * *c; - else - *c = val; - } - } + } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } ////////////////////////////////////////////////////////////////////////////// @@ -108,24 +111,27 @@ static void usualGemv(const char aOrder, const int M, const int N, const double const bool flagA = aOrder == 'f'; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) - for(int row = 0; row < M; ++row) { - - T3* y = Y + row * incy; - T3 val = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto row = start; row < stop; row += increment) { - PRAGMA_OMP_SIMD - for(int i = 0; i < N; ++i) { - T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i); - T3 x = *(X + i * incx); - val += alphaZ * a * x; + T3 *y = Y + row * incy; + T3 val = 0; + + PRAGMA_OMP_SIMD + for (int i = 0; i < N; ++i) { + T3 a = flagA ? 
*(A + row + i * lda) : *(A + row * lda + i); + T3 x = *(X + i * incx); + val += alphaZ * a * x; + } + + if (betaZ) + *y = val + betaZ * *y; + else + *y = val; } - - if(betaZ) - *y = val + betaZ * *y; - else - *y = val; - } + }; + + samediff::Threads::parallel_for(func, 0, M); } ////////////////////////////////////////////////////////////////////////////// @@ -141,7 +147,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX, T3 sum = 0; PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum)) for(int i = 0; i < length; ++i) - sum = sum + X[i * incx] * Y[i * incy]; + sum += X[i * incx] * Y[i * incy]; *Z = alphaZ * sum + betaZ * *Z; } diff --git a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp index 5f8789077..c4c2fa995 100644 --- a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp +++ b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp @@ -19,6 +19,7 @@ // #include +#include using namespace simdOps; diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp index 22ff3e6b1..4bd456da2 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp @@ -44,62 +44,67 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const Nd4jLong* tadShape = shape::shapeOf(const_cast(tadShapeInfo)); const Nd4jLong* tadStride = shape::stride(const_cast(tadShapeInfo)); - int tadsPerThread = zLen / TAD_THRESHOLD; - int numThreads = nd4j::math::nd4j_max(1, tadsPerThread); - numThreads = nd4j::math::nd4j_min(numThreads, omp_get_max_threads()); - switch (kindOfLoop) { //*********************************************// case nd4j::LoopKind::EWS1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::EWSNONZERO: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) 
indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadLen; ++i0) { - functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadLen; ++i0) { + functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -108,22 +113,25 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[2]; shape::updateStrides(2, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; - const auto tadIndex = i0 * newStride[0] + i1; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; + const auto tadIndex = i0 * newStride[0] + i1; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -132,24 +140,27 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[3]; shape::updateStrides(3, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; + const auto 
tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -158,26 +169,29 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[4]; shape::updateStrides(4, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; + const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -186,28 +200,31 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[5]; shape::updateStrides(5, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + for (uint i4 = 0; i4 < tadShape[4]; ++i4) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]; + const auto tadIndex = i0 * newStride[0] 
+ i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -216,19 +233,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -237,19 +257,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -260,20 +283,23 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - 
functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } } diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp index 895afccfd..b8405553e 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, 
PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp index d8c24e096..44ccea08c 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, 
Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp index 4ecc0e370..ec261a7ea 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* 
xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp index 218c335ca..3b1efadc9 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp @@ -28,24 +28,32 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); +#endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); +#endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); +#endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y 
*z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp index 4a223a0f2..0709e5f3c 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp @@ -19,3 +19,4 @@ // #include +#include diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp index 35ae99afb..151bc6a82 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp @@ -26,16 +26,20 @@ namespace nd4j { template template - void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionBoolLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *extraParams) { - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_BOOL_OPS); + X *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_BOOL_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionBoolLoops, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp index c7b1f6ff8..af8b0b451 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* 
z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp index 76c1141bf..137ffc011 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp index 7288816ad..79b11b419 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, 
extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp index 251624076..ddedd6c18 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp @@ -28,16 +28,19 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { - - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp index a6dd992c6..2e7708497 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp @@ -33,16 +33,19 @@ namespace nd4j { template template - void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionLongLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, X *extraParams) { - - 
DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_LONG_OPS); + Nd4jLong *tadOffsets, X *extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_LONG_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionLongLoops, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp index 623d97e79..08a67ec59 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp @@ -26,20 +26,24 @@ namespace nd4j { template template - void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); +#endif } template void ReductionSameLoops::wrapper(const int opNum, X *vx, Nd4jLong *xShapeInfo, X *vz, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *vextraParams) { + X *vextraParams, int64_t start, int64_t stop) { +#ifndef INLINE_LOOPS auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_SAME_OPS); + DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_SAME_OPS); +#endif } BUILD_SINGLE_TEMPLATE(template class ReductionSameLoops, , LIBND4J_TYPES); diff --git a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu index 152e74652..8f67f0004 100644 --- a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu +++ b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu @@ -24,6 +24,7 @@ #include #include #include +#include // #include // #include diff --git a/libnd4j/include/helpers/impl/AttentionHelper.cpp b/libnd4j/include/helpers/impl/AttentionHelper.cpp index 4e7393a8e..3cfee1c08 100644 --- a/libnd4j/include/helpers/impl/AttentionHelper.cpp +++ b/libnd4j/include/helpers/impl/AttentionHelper.cpp @@ -34,16 +34,16 @@ namespace nd4j { auto numHeads = projectionMatrix->sizeAt(0); auto projectedSize = projectionMatrix->sizeAt(1); - auto inputPerm = input->permute({1, 0, 2}); - auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); - auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); + auto inputPerm = input->permute({1, 0, 2}); //[batch, nIn, timeSteps] -> [nIn, batch, timeSteps] + auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); //[nIn, batch*timeSteps] + auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); //[nHeads, hS, nIn] -> 
[nHeads*hS, nIn] - NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); + NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); //[nHeads*hS, batch*timeSteps] nd4j::ops::matmul mmul; mmul.execute({&projectionPrep, &inputPrep}, {&projected}, {}, {}, {}); projected.reshapei({numHeads, projectedSize, miniBatchSize, seqLength}); - projected.permutei({2, 0, 1, 3}); + projected.permutei({2, 0, 1, 3}); //[minibatch, numHeads, projectedSize, seqLength] return projected; } diff --git a/libnd4j/include/helpers/impl/BlasHelper.cpp b/libnd4j/include/helpers/impl/BlasHelper.cpp index 61b542697..bf52fe2c6 100644 --- a/libnd4j/include/helpers/impl/BlasHelper.cpp +++ b/libnd4j/include/helpers/impl/BlasHelper.cpp @@ -74,7 +74,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; @@ -83,7 +83,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -132,14 +132,14 @@ namespace nd4j { bool BlasHelper::hasGEMV(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -150,7 +150,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; @@ -159,7 +159,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -208,14 +208,14 @@ namespace nd4j { bool BlasHelper:: hasGEMM(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -276,14 +276,14 @@ namespace nd4j { } CblasSgemv BlasHelper::sgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__)|| defined(HAVE_OPENBLAS) return (CblasSgemv)&cblas_sgemv; #else return this->cblasSgemv; #endif } CblasDgemv BlasHelper::dgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemv)&cblas_dgemv; #else return this->cblasDgemv; @@ -291,7 +291,7 @@ namespace nd4j { } CblasSgemm BlasHelper::sgemm() { -#if defined(__EXTERNAL_BLAS__) || 
defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasSgemm)&cblas_sgemm; #else return this->cblasSgemm; @@ -299,7 +299,7 @@ namespace nd4j { } CblasDgemm BlasHelper::dgemm() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemm)&cblas_dgemm; #else return this->cblasDgemm; diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp index f1ba8a755..704c463e6 100644 --- a/libnd4j/include/helpers/impl/DebugHelper.cpp +++ b/libnd4j/include/helpers/impl/DebugHelper.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { DebugInfo DebugHelper::debugStatistics(NDArray const* input) { @@ -88,11 +89,18 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_m } *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount}; _stdDevValue = 0; //math::nd4j_sqrt(info->_stdDevValue / (input->lengthOf() - 1)); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue)) - for (Nd4jLong e = 0; e < input->lengthOf(); e++) { - double current = input->e(e); - _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; - } + + auto func = PRAGMA_REDUCE_DOUBLE { + auto _stdDevValue = 0.0; + for (auto e = start; e < stop; e++) { + double current = input->e(e); + _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; + } + + return _stdDevValue; + }; + _stdDevValue = samediff::Threads::parallel_double(func, LAMBDA_AD { return _old + _new; }, 0, input->lengthOf()); + info->_stdDevValue = math::nd4j_sqrt(_stdDevValue / input->lengthOf()); } diff --git a/libnd4j/include/helpers/impl/GradCheck.cpp b/libnd4j/include/helpers/impl/GradCheck.cpp index a3ae7d1ac..8b24e5f16 100644 --- a/libnd4j/include/helpers/impl/GradCheck.cpp +++ b/libnd4j/include/helpers/impl/GradCheck.cpp @@ -33,13 +33,11 @@ void GradCheck::fillGradArrays(const LossFunc loss, const std::vector& switch(loss) { case MEAN: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1. 
/ gradArrs[i]->lengthOf(); break; case SUM: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1.; break; diff --git a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp index a4b9c4000..80e456e29 100644 --- a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp +++ b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp @@ -45,7 +45,7 @@ OmpLaunchHelper::OmpLaunchHelper(const Nd4jLong N, float desiredNumThreads) { else desiredNumThreads = nd4j::math::nd4j_min(omp_get_max_threads(), desiredNumThreads); #else - desiredNumThreads = 1; + desiredNumThreads = nd4j::Environment::getInstance()->maxThreads(); #endif _numThreads = nd4j::math::nd4j_min(N / maxItersPerThread, desiredNumThreads); } @@ -75,7 +75,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP return betterThreads(N, omp_get_max_threads()); #else - return 1; + return betterThreads(N, nd4j::Environment::getInstance()->maxThreads());; #endif } @@ -92,7 +92,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP auto maxThreads = omp_get_max_threads(); #else - auto maxThreads = 1; + auto maxThreads = nd4j::Environment::getInstance()->maxThreads(); #endif // if there's only 1 thread allowed - nothing to do here diff --git a/libnd4j/include/loops/aggregates.h b/libnd4j/include/loops/aggregates.h deleted file mode 100644 index 8fbdefcaf..000000000 --- a/libnd4j/include/loops/aggregates.h +++ /dev/null @@ -1,66 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_AGGREGATES_H -#define LIBND4J_AGGREGATES_H - -#include -#include -#include - -namespace functions { -namespace aggregate { - - template - class AggregatedFunction { - - public: -#ifdef __CUDACC__ - template - __device__ static void execCuda(X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void execCuda(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void aggregateBatch(int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, int numAggregates, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, void **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, void *realArguments, int numRealArguments); - -#endif - - template - inline static void exec(X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - OpClass::executeAggregate(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - } - - inline static void exec(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); - } - }; -} -} - -#ifdef __CUDACC__ - - -#endif - -#endif //LIBND4J_AGGREGATES_H diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index cc0331549..a38e79c3f 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -91,7 +91,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void execInverse(int opNum, void *x, @@ -105,7 +105,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void exec(int opNum, void *x, @@ -119,7 +121,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, 
Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -144,7 +148,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -158,7 +164,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_bool.h b/libnd4j/include/loops/broadcasting_bool.h index a3098abbb..3b0958be1 100644 --- a/libnd4j/include/loops/broadcasting_bool.h +++ b/libnd4j/include/loops/broadcasting_bool.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_int.h b/libnd4j/include/loops/broadcasting_int.h index 84bc0f949..92e4ca7dd 100644 --- a/libnd4j/include/loops/broadcasting_int.h +++ b/libnd4j/include/loops/broadcasting_int.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong 
*tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 3bd619827..37dbf833f 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(execInverse, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -131,10 +138,6 @@ namespace functions { auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); - int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); - auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); @@ -142,19 +145,17 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + for (auto i = start; i < stop; i++) { + auto oX = x + tadOffsets[i]; + auto oZ = z + zTadOffset[i]; - PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) - oZ[f] = OpType::op(oX[f], y[f]); + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < tadLength; f++) + oZ[f] = OpType::op(oX[f], y[f]); } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO){ - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -164,13 +165,10 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -182,70 +180,61 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint 
tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -253,17 +242,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], 
y[yOffset]); } } @@ -285,7 +272,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -319,7 +308,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -328,8 +317,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if(kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (unsigned int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -339,24 +327,20 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oY = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -365,73 +349,63 @@ namespace functions { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < 
tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; @@ -439,20 +413,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index bca423e3e..7a3eb1e31 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong 
*xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,10 +149,9 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) @@ -153,101 +159,86 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; 
i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; + } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +246,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } - } + }; } } @@ -286,7 +275,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +311,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,8 +320,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -340,8 +330,7 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - 
PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -355,14 +344,10 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); @@ -377,15 +362,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } @@ -398,15 +381,13 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } @@ -419,16 +400,14 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -442,9 +421,7 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp 
b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 375d7577a..9dcce7545 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,112 +149,95 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], y[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint 
tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +245,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], 
y[yOffset]); } - } + }; } } @@ -286,7 +274,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +310,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,46 +319,39 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(x[f], oY[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (uint f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { @@ -377,64 +360,54 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; - PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - 
+ for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; @@ -442,9 +415,7 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; @@ -455,7 +426,7 @@ namespace functions { auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } diff --git a/libnd4j/include/loops/cpu/indexreduce.cpp b/libnd4j/include/loops/cpu/indexreduce.cpp index 23286ecd9..df3fd64a9 100644 --- a/libnd4j/include/loops/cpu/indexreduce.cpp +++ b/libnd4j/include/loops/cpu/indexreduce.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -44,8 +45,7 @@ void IndexReduce::exec(const int opNum, void *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - -DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); } //////////////////////////////////////////////////////////////////////// @@ -64,42 +64,41 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + IndexValue intermediatery[64]; + for (int e = 0; e < maxThreads; e++) + intermediatery[e].index = -1; if (xEws == 1) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingIndexValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); + auto func = 
PRAGMA_THREADS_FOR {
+                    intermediatery[thread_id] = OpType::startingIndexValue(x);
-                    auto ulen = info.getItersPerThread(threadNum);
-
-                    for (Nd4jLong i = 0; i < ulen; i++) {
-                        IndexValue curr(x[i + threadOffset], i + threadOffset);
-                        local = OpType::update(local, curr, extraParams);
+                    for (auto i = start; i < stop; i += increment) {
+                        IndexValue curr(x[i], i);
+                        intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
                     }
+                };
+
+                maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads);
+
+                for (int e = 0; e < maxThreads; e++)
+                    startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams);
-                PRAGMA_OMP_CRITICAL
-                startingIndex = OpType::update(startingIndex, local, extraParams);
-                }
             } else {
-                PRAGMA_OMP_PARALLEL_THREADS(info._numThreads)
-                {
-                    auto local = OpType::startingIndexValue(x);
-                    auto threadNum = omp_get_thread_num();
-                    auto threadOffset = info.getThreadOffset(threadNum);
+                auto func = PRAGMA_THREADS_FOR {
+                    intermediatery[thread_id] = OpType::startingIndexValue(x);
-                    auto ulen = info.getItersPerThread(threadNum);
-
-                    for (Nd4jLong i = 0; i < ulen; i++) {
-                        auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX);
-                        IndexValue curr(x[offset], threadOffset + i);
-                        local = OpType::update(local, curr, extraParams);
+                    for (auto i = start; i < stop; i += increment) {
+                        auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
+                        IndexValue curr(x[offset], i);
+                        intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
                     }
+                };
-                PRAGMA_OMP_CRITICAL
-                startingIndex = OpType::update(startingIndex, local, extraParams);
-                }
+                maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads);
+
+                for (int e = 0; e < maxThreads; e++)
+                    startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams);
             }
             return startingIndex.index;
 }
@@ -124,9 +123,10 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo,
             if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
                 return;
             const auto indexValue = OpType::startingIndexValue(x);
-            PRAGMA_OMP_PARALLEL_FOR_IF(zLen > nd4j::Environment::getInstance()->elementwiseThreshold())
+
             for (uint i = 0; i < zLen; i++)
-                z[i] = (Z) indexValue.index;;
+                z[i] = (Z) indexValue.index;
+            return;
 }
diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp
index 9dfa129aa..1fc85e5d8 100644
--- a/libnd4j/include/loops/cpu/pairwise.hpp
+++ b/libnd4j/include/loops/cpu/pairwise.hpp
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 using namespace simdOps;
@@ -42,7 +43,9 @@ namespace functions {
                          void *z, Nd4jLong zEws,
                          void *extraParams,
-                         Nd4jLong n) {
+                         Nd4jLong n,
+                         const uint64_t start,
+                         const uint64_t stop) {
             DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x,
                                                xEws,
                                                y,
@@ -50,7 +53,7 @@ namespace functions {
                                                z,
                                                zEws,
                                                extraParams,
-                                               n), PAIRWISE_TRANSFORM_OPS);
+                                               n, start, stop), PAIRWISE_TRANSFORM_OPS);
         };
@@ -61,48 +64,24 @@ namespace functions {
                                           void *vy, Nd4jLong yEws,
                                           void *vz, Nd4jLong zEws,
                                           void *vextraParams,
-                                          const Nd4jLong n) {
+                                          const Nd4jLong n,
+                                          const uint64_t start,
+                                          const uint64_t stop) {
             auto x = reinterpret_cast(vx);
             auto y = reinterpret_cast(vy);
             auto z = reinterpret_cast(vz);
             auto extraParams = reinterpret_cast(vextraParams);
-            nd4j::OmpLaunchHelper info(n);
-
             if (xEws == 1 && yEws == 1 && zEws == 1) {
-
-                PRAGMA_OMP_PARALLEL_THREADS(info._numThreads)
-                {
-                    auto threadNum = omp_get_thread_num();
-                    auto threadOffset = 
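// ---------------------------------------------------------------------------
// Illustrative sketch, not from this patch: how the samediff::Threads API used
// in the indexreduce hunk above drives a start/stop kernel. Assumptions are
// inferred only from this diff: PRAGMA_THREADS_FOR builds a functor that
// receives thread_id, start, stop and increment, and
// samediff::Threads::parallel_for(func, start, stop, increment, maxThreads)
// splits the range across threads and returns how many threads actually ran.
// The buffer/partial/total names below are hypothetical.
//
//   #include <execution/Threads.h>
//
//   float buffer[4096];        // data to reduce (assume it is filled elsewhere)
//   float partial[64];         // one slot per worker, like intermediatery[64] above
//   float total = 0.f;
//   int   maxThreads = 64;
//
//   auto func = PRAGMA_THREADS_FOR {
//       partial[thread_id] = 0.f;                          // thread-local accumulator
//       for (auto i = start; i < stop; i += increment)     // this thread's chunk of [0, 4096)
//           partial[thread_id] += buffer[i];
//   };
//
//   // run the functor over the full range; cap the merge loop by the thread count used
//   auto used = samediff::Threads::parallel_for(func, 0, 4096, 1, maxThreads);
//   for (int e = 0; e < used; e++)
//       total += partial[e];
// ---------------------------------------------------------------------------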
info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -115,14 +94,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_TRANSFORM_OPS); }; @@ -136,7 +117,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -148,7 +131,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); if (shape::isScalar(yShapeInfo)) { @@ -156,38 +138,22 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ 
-198,96 +164,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + 
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -295,20 +228,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise2.hpp b/libnd4j/include/loops/cpu/pairwise2.hpp deleted file mode 100644 index 17acd35b7..000000000 --- a/libnd4j/include/loops/cpu/pairwise2.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by remote on 2018-09-20. -// - -#include -#include -#include -#include -#include -#include -#include - -using namespace simdOps; - -namespace functions { - namespace pairwise_transforms { - - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong xEws, - void *y, - Nd4jLong yEws, - void *z, - Nd4jLong zEws, - void *extraParams, - Nd4jLong n) { - DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, - xEws, - y, - yEws, - z, - zEws, - extraParams, - n), PAIRWISE_TRANSFORM_OPS); - }; - - - - template - template - void PairWiseTransform::exec(void *vx, Nd4jLong xEws, - void *vy, Nd4jLong yEws, - void *vz, Nd4jLong zEws, - void *vextraParams, - const Nd4jLong n) { - - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - nd4j::OmpLaunchHelper info(n); - - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } - } - else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } - } - } - } -} diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 8feabb98a..2259c37b0 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_BOOL_OPS); + n, start, stop), PAIRWISE_BOOL_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - 
PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_BOOL_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseBoolTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,8 +126,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; @@ -150,37 +133,22 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +157,62 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == 
nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const 
bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index 63b9dc8c8..673951d6a 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_INT_OPS); + n, start, stop), PAIRWISE_INT_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - 
PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_INT_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseIntTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,46 +126,28 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +156,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || 
kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], 
extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/random.cpp b/libnd4j/include/loops/cpu/random.cpp index 5abc1447a..d4c808719 100644 --- a/libnd4j/include/loops/cpu/random.cpp +++ b/libnd4j/include/loops/cpu/random.cpp @@ -52,28 +52,22 @@ namespace functions { auto length = shape::length(zShapeInfo); -// nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (auto i = start; i < 
stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { @@ -82,19 +76,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -103,19 +94,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { @@ -124,19 +112,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], 
y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { @@ -147,20 +132,17 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } }; @@ -184,41 +166,34 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } } @@ -232,25 +207,21 @@ namespace functions { auto length = shape::length(zShapeInfo); - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); uint 
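// Assumption-laden sketch: PRAGMA_THREADS_FOR is defined in execution/Threads.h, which is
// added elsewhere in this PR and is not visible in these hunks. Judging by the loop bodies
// above, it produces a callable receiving (thread_id, start, stop, increment). The emulation
// below (parallel_for_emul is a hypothetical stand-in, not samediff::Threads::parallel_for)
// only illustrates that calling convention and the returned "threads actually used" count.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using ThreadsFor = std::function<void(uint64_t thread_id, int64_t start,
                                      int64_t stop, int64_t increment)>;

static int parallel_for_emul(const ThreadsFor& func, int64_t begin, int64_t end,
                             int64_t increment, int maxThreads) {
    const int64_t span = end - begin;
    int numThreads = std::max<int>(1, static_cast<int>(std::min<int64_t>(maxThreads, span > 0 ? span : 1)));
    const int64_t chunk = (span + numThreads - 1) / numThreads;

    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        const int64_t start = begin + t * chunk;
        const int64_t stop  = std::min<int64_t>(end, start + chunk);
        if (start >= stop) { numThreads = t; break; }
        workers.emplace_back(func, static_cast<uint64_t>(t), start, stop, increment);
    }
    for (auto& w : workers) w.join();
    return numThreads;            // like the real call above, report how many threads ran
}

// usage mirroring the hunks above (x, y, z, op are placeholders):
//   auto func = [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) {
//       for (auto i = start; i < stop; i += increment) z[i] = op(x[i], y[i]);
//   };
//   parallel_for_emul(func, 0, length, 1, maxThreads);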
zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[offset] = OpClass::op(i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } template diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 246d18ac4..882b1740e 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 +65,14 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + z[0] = OpType::postProcess(startingValue, length, extraParams); } } @@ -102,23 +91,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; 
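// Sketch, with assumptions: in the random hunks the op now receives the absolute element
// index i (previously reconstructed as i + threadOffset), so the produced values should not
// depend on how parallel_for splits the range -- provided, as the code suggests, that
// nd4j::graph::RandomGenerator derives each value from (state, index) rather than from
// mutable per-thread state. A tiny stand-alone analogue of index-keyed generation using a
// splitmix64-style mix (not the generator libnd4j actually uses):
#include <cstdint>
#include <vector>

static uint64_t mix64(uint64_t v) {            // stateless: same (seed, index) -> same value
    v += 0x9E3779B97F4A7C15ULL;
    v = (v ^ (v >> 30)) * 0xBF58476D1CE4E5B9ULL;
    v = (v ^ (v >> 27)) * 0x94D049BB133111EBULL;
    return v ^ (v >> 31);
}

static void fill_random(std::vector<double>& z, uint64_t seed,
                        uint64_t start, uint64_t stop) {
    for (auto i = start; i < stop; i++)         // any [start, stop) split yields identical z
        z[i] = (mix64(seed ^ i) >> 11) * (1.0 / 9007199254740992.0);   // 53-bit mantissa -> [0, 1)
}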
e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -150,8 +130,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_BOOL_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_BOOL_OPS); } template @@ -164,7 +144,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -176,7 +156,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -205,9 +185,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -227,49 +207,33 @@ namespace functions { template template Z _CUDA_H ReduceBoolFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) { - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - } - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = 
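// Sketch (not the libnd4j implementation): the dimensional exec() overloads above now carry
// an extra [start, stop) pair straight through to loopReduce, so the range presumably indexes
// output elements / TADs and the surrounding executioner splits those across threads.
// A stand-alone illustration of that shape; reduce_tads(), tadLength and the sum op are
// hypothetical stand-ins for the templated OpType machinery.
#include <cstdint>

static void reduce_tads(const float* x, const int64_t* tadOffsets, int64_t tadLength,
                        float* z, int64_t start, int64_t stop) {
    for (auto r = start; r < stop; r++) {       // one output value per TAD in [start, stop)
        const float* tad = x + tadOffsets[r];
        float acc = 0.f;                        // stand-in for OpType::startingValue
        for (int64_t i = 0; i < tadLength; i++)
            acc += tad[i];                      // stand-in for OpType::update(OpType::op(...))
        z[r] = acc;                             // stand-in for OpType::postProcess
    }
}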
omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); + // return result + return OpType::postProcess(intermediate[0], length, extraParams); } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp index a94a19b25..112656852 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp @@ -59,9 +59,10 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } @@ -69,25 +70,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -105,23 +110,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - 
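// Sketch of the accumulate-then-merge pattern used by the rewritten execScalar above: one
// intermediate slot per thread (capped at 64, mirroring nd4j_min(64, ...->maxThreads())),
// each worker folds its [start, stop) slice into its own slot, and only the slots that were
// actually used are merged serially afterwards. Self-contained emulation with std::thread;
// reduce_scalar() and the sum op are stand-ins, not libnd4j code.
#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

static double reduce_scalar(const float* x, uint64_t length) {
    int maxThreads = std::min<int>(64, std::max(1u, std::thread::hardware_concurrency()));
    double intermediate[64];
    for (int e = 0; e < maxThreads; e++)
        intermediate[e] = 0.0;                       // stand-in for OpType::startingValue

    const uint64_t chunk = (length + maxThreads - 1) / maxThreads;
    std::vector<std::thread> workers;
    int used = 0;                                    // plays the role of the returned maxThreads
    for (int t = 0; t < maxThreads; t++) {
        const uint64_t start = t * chunk;
        const uint64_t stop  = std::min<uint64_t>(length, start + chunk);
        if (start >= stop) break;
        used++;
        workers.emplace_back([&, t, start, stop] {
            for (auto i = start; i < stop; i++)
                intermediate[t] += x[i];             // stand-in for OpType::update(OpType::op(...))
        });
    }
    for (auto& w : workers) w.join();

    for (int e = 1; e < used; e++)                   // merge step, exactly as in the hunk above
        intermediate[0] += intermediate[e];
    return intermediate[0];                          // stand-in for OpType::postProcess
}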
PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -153,7 +149,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, @@ -162,7 +158,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_FLOAT_OPS); } @@ -176,7 +172,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -188,7 +184,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = std::is_same>::value ? nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(x)); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -222,9 +218,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -245,49 +241,34 @@ namespace functions { template Z _CUDA_H ReduceFloatFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); - int nt = info._numThreads; + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, 
extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceFloatFunction, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 1a148805e..76dc209f6 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 +65,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = 
OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,23 +107,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -152,8 +147,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_LONG_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_LONG_OPS); } template @@ -166,7 +161,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -178,7 +173,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -212,9 +207,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -235,48 +230,34 @@ namespace functions { template Z _CUDA_H ReduceLongFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - 
nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceLongFunction, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 0dfff5e73..cbd7e6e12 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -57,7 +57,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -67,25 +67,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = 
OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, length, extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,26 +107,15 @@ namespace functions { if (xEws >= 1) { return execScalar(x, xEws, length, extraParams); - } - else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + } else { + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -154,7 +147,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, extraParams, @@ -163,7 +156,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_SAME_OPS); } @@ -177,7 +170,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -189,7 +182,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(zLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < zLength; i++) z[i] = startingVal; return; @@ -223,9 +216,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, 
zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -246,48 +239,34 @@ namespace functions { template X _CUDA_H ReduceSameFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ReduceSameFunction, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce3.cpp b/libnd4j/include/loops/cpu/reduce3.cpp index fd09dc0e1..dbe93620a 100644 --- a/libnd4j/include/loops/cpu/reduce3.cpp +++ b/libnd4j/include/loops/cpu/reduce3.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -51,72 +52,82 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } Z extraParamsVals[3] = {(Z) 0.0f, (Z) 0.0f, (Z) 0.0f}; - // it's possible case for EqualsWithEps op - if (extraParams != nullptr) - extraParamsVals[2] = extraParams[0]; uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); Z startingVal = 
OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - nd4j::OmpLaunchHelper t(length, maxThreads); - Z intermediate[256]; - Z extraParamsLocal[3 * 256]; + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; + Z extraParamsLocal[3 * 64]; PRAGMA_OMP_SIMD for (int e = 0; e < maxThreads; e++) intermediate[e] = startingVal; - memset(extraParamsLocal, 0, 3 * 256 * sizeof(Z)); - if (extraParams != nullptr) + memset(extraParamsLocal, 0, 3 * 64 * sizeof(Z)); + if (extraParams != nullptr) { PRAGMA_OMP_SIMD - for (int e = 0; e < maxThreads; e++) - extraParamsLocal[3 * e + 2] = extraParams[0]; + // mostly for future reference + for (int e = 0; e < maxThreads; e++) { + extraParamsLocal[3 * e] = extraParams[0]; + extraParamsLocal[3 * e + 1] = extraParams[1]; + extraParamsLocal[3 * e + 2] = extraParams[2]; + } + } nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, yShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[i], y[i], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else { uint yShapeInfoCast[MAX_RANK]; const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], 
extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } // merge step for (int e = 0; e < maxThreads; e++) OpType::aggregateExtraParams(extraParamsVals, extraParamsLocal + 3 * e); + for (int e = 0; e < maxThreads; e++) startingVal = OpType::update(startingVal, intermediate[e], extraParamsVals); + // writing out result z[0] = OpType::postProcess(startingVal, length, extraParamsVals); } @@ -139,7 +150,7 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int *dimension, int dimensionLength, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -151,9 +162,9 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, return; } #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -165,16 +176,16 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -188,7 +199,7 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -196,9 +207,9 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams); + nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, 
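// Sketch of the per-thread extra-params slabs used by the reduce3 execScalar above: three
// values per thread (extraParamsLocal[3 * 64]), with each worker touching only
// extraParamsLocal + 3 * thread_id, and the slabs folded after the join. The fold below is a
// plain element-wise sum standing in for OpType::aggregateExtraParams, which these hunks do
// not define; the seeding copies all three incoming values per slab, as the new code does.
#include <cstring>

static void seed_slabs(double* extraParamsLocal /*[3 * 64]*/, int maxThreads,
                       const double* extraParams /*[3] or nullptr*/) {
    std::memset(extraParamsLocal, 0, 3 * 64 * sizeof(double));
    if (extraParams != nullptr)
        for (int e = 0; e < maxThreads; e++)
            for (int k = 0; k < 3; k++)
                extraParamsLocal[3 * e + k] = extraParams[k];
}

static void fold_slabs(double* extraParamsVals /*[3]*/, const double* extraParamsLocal,
                       int usedThreads) {
    for (int e = 0; e < usedThreads; e++)
        for (int k = 0; k < 3; k++)
            extraParamsVals[k] += extraParamsLocal[3 * e + k];  // stand-in for aggregateExtraParams
}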
yOffsets, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #endif } @@ -209,9 +220,9 @@ void Reduce3::exec( const int opNum, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int *dimension, int dimensionLength, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, start, stop), REDUCE3_OPS); } @@ -223,9 +234,9 @@ void Reduce3::exec( const int opNum, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), REDUCE3_OPS); } @@ -238,9 +249,9 @@ void Reduce3::execAll(const int opNum, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), REDUCE3_OPS); } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 79e53e4a2..071913e22 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -39,7 +40,8 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -63,29 +65,27 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for 
(auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -98,9 +98,10 @@ void ScalarTransform::transform(int opNum, void *scalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -110,9 +111,10 @@ void ScalarTransform::transform(const int opNum, void *z, Nd4jLong zStride, void *scalar, void *extraParams, - const Nd4jLong n, bool allowParallelism) { + const uint64_t n, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -121,9 +123,10 @@ void ScalarTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams, bool allowParallelism) { + void *extraParams, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -132,7 +135,8 @@ template void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams, bool allowParallelism) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -146,48 +150,30 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len, allowParallelism); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); } else { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len, allowParallelism ? 
-1 : 1); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } } @@ -199,44 +185,22 @@ void ScalarTransform::transform(void *vx, Nd4jLong xEws, void *vz, Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len, bool allowParallelism) { + const uint64_t len, const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len, allowParallelism ? 
-1 : 1); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index b37bdd6ef..d6dce445b 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,8 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_BOOL_OPS); + Nd4jLong *zTadOffsets, const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_BOOL_OPS); } @@ -116,8 +116,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, 
z, zEws, scalar, extraParams, n), SCALAR_BOOL_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_BOOL_OPS); } template @@ -127,8 +128,9 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_BOOL_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_BOOL_OPS); } template @@ -138,7 +140,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +152,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +191,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto 
extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 9e73e2756..5f2308418 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,10 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_INT_OPS); + Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { + + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_INT_OPS); } @@ -116,8 +118,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - 
DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n), SCALAR_INT_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_INT_OPS); } template @@ -127,8 +130,9 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_INT_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_INT_OPS); } template @@ -138,7 +142,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +154,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +193,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = 
reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index 1f5a7c339..a8f766f6a 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -90,8 +91,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (Nd4jLong i = 0; i < length; i++) { - + for (uint64_t i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; @@ -123,7 +123,7 @@ namespace functions { return; SummaryStatsData comp; comp.initWithValue(x[0]); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = OpType::getValue(biasCorrected, comp); return; @@ -157,35 +157,37 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; const bool canCast = tadEWS == 1 && tadOrder == 'c' ? 
false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < resultLength; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { - auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; - auto tx = x + tadOffsetForBlock; - SummaryStatsData comp; - comp.initWithValue(tx[0]); + auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; + auto tx = x + tadOffsetForBlock; + SummaryStatsData comp; + comp.initWithValue(tx[0]); - if (tadEWS == 1 && tadOrder == 'c') { - for (int i = 1; i < tadLength; i ++) { - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[i]); + if (tadEWS == 1 && tadOrder == 'c') { + for (int i = 1; i < tadLength; i++) { + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[i]); - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } + } else { + for (int i = 1; i < tadLength; i++) { + auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[xOffset]); + + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } } + + z[r] = OpType::getValue(biasCorrected, comp); } - else { - for (int i = 1; i < tadLength; i ++) { - auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + }; - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[xOffset]); - - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); - } - } - - z[r] = OpType::getValue(biasCorrected, comp); - } + samediff::Threads::parallel_tad(func, 0, resultLength, 1); } diff --git a/libnd4j/include/loops/cpu/transform/transform_any.cpp b/libnd4j/include/loops/cpu/transform/transform_any.cpp index 5727c096d..5b3c4a0f8 100644 --- a/libnd4j/include/loops/cpu/transform/transform_any.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_any.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, bool allowParallelism) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), TRANSFORM_ANY_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_ANY_OPS); } ///////////////////////////////////////////////////////////////////// @@ -47,22 +46,13 @@ template template void _CUDA_H TransformAny::exec(void *vx, Nd4jLong *xShapeInfo, void *vz,Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo,Nd4jLong *tadOffsets, bool allowParallelism) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - if (allowParallelism) - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); - else - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } diff --git a/libnd4j/include/loops/cpu/transform/transform_bool.cpp b/libnd4j/include/loops/cpu/transform/transform_bool.cpp 
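// ---------------------------------------------------------------------------
// Reviewer note, not part of the patch: the summarystatsreduce.cpp hunk above
// shows the recurring rewrite in this PR. The per-TAD loop body is wrapped into
// a callable with PRAGMA_THREADS_FOR and handed to
// samediff::Threads::parallel_tad(), which partitions [0, resultLength) across
// the new ThreadPool instead of relying on an OpenMP pragma. A minimal sketch
// of that pattern follows, assuming the macro exposes thread_id, start, stop
// and increment to the lambda (as the call sites in this patch suggest); the
// header path and the per-TAD worker processTad() are assumptions for
// illustration only.

#include <execution/Threads.h>          // header providing samediff::Threads (path assumed)

static void processTad(int64_t r);      // hypothetical: the former serial loop body

static void reduceAllTads(int64_t numTads) {
    auto func = PRAGMA_THREADS_FOR {
        // each worker receives its own [start, stop) slice of the TAD range
        for (auto r = start; r < stop; r += increment)
            processTad(r);
    };
    // parallel_tad(func, from, to, step) splits the range across the pool and
    // returns once the workers are done, mirroring the removed parallel-for pragma
    samediff::Threads::parallel_tad(func, 0, numTads, 1);
}
// ---------------------------------------------------------------------------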
index 3560c85fe..fdfde93f5 100644 --- a/libnd4j/include/loops/cpu/transform/transform_bool.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_bool.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_BOOL_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_BOOL_OPS); } template @@ -49,20 +48,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_float.cpp b/libnd4j/include/loops/cpu/transform/transform_float.cpp index 922a76265..8e164a90f 100644 --- a/libnd4j/include/loops/cpu/transform/transform_float.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_float.cpp @@ -36,9 +36,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_FLOAT_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_FLOAT_OPS); } template @@ -48,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_same.cpp b/libnd4j/include/loops/cpu/transform/transform_same.cpp index f821d73bc..67f7762f0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_same.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_same.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_SAME_OPS); + void 
*extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_SAME_OPS); } template @@ -47,18 +45,14 @@ namespace functions { void _CUDA_H TransformSame::exec(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_strict.cpp b/libnd4j/include/loops/cpu/transform/transform_strict.cpp index e600d2fb8..29964e3e0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_strict.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_strict.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_STRICT_OPS); + void *extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_STRICT_OPS); } template @@ -49,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/aggregates.cu b/libnd4j/include/loops/cuda/aggregates.cu deleted file mode 100644 index 9ced20e51..000000000 --- a/libnd4j/include/loops/cuda/aggregates.cu +++ /dev/null @@ -1,145 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
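// ---------------------------------------------------------------------------
// Reviewer note, not part of the patch: the transform_{any,bool,float,same,
// strict} hunks above change the kernel contract instead of passing a range.
// Each exec() now receives (threadId, numThreads) and
// TransformLoops::loopTransform() slices the work itself, so the caller that
// launches these kernels owns the parallel fan-out. A self-contained
// illustration of that contract using plain std::thread; the kernel body and
// the interleaved slicing scheme are hypothetical, chosen only to show how a
// (threadId, numThreads) pair can drive the split.

#include <cstdint>
#include <thread>
#include <vector>

static void execKernel(float *z, const float *x, int64_t len,
                       uint64_t threadId, uint64_t numThreads) {
    // the kernel, not the caller, decides how its share of [0, len) is chosen;
    // here: every numThreads-th element starting at threadId
    for (int64_t i = static_cast<int64_t>(threadId); i < len;
         i += static_cast<int64_t>(numThreads))
        z[i] = x[i] + 1.0f;
}

static void launch(float *z, const float *x, int64_t len, uint64_t numThreads) {
    std::vector<std::thread> workers;
    for (uint64_t t = 0; t < numThreads; t++)
        workers.emplace_back(execKernel, z, x, len, t, numThreads);
    for (auto &w : workers)
        w.join();
}
// ---------------------------------------------------------------------------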
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// @author Yurii Shyrma, created on 27.11.2018 -// - -#include "../aggregates.h" - -namespace functions { -namespace aggregate { - -/////////////////////////////////////////////////////////////////////// -template -template -__device__ void AggregatedFunction::execCuda(X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - OpClass::executeAggregateCuda(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::execCuda(int opNum, - X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - DISPATCH_BY_OPNUM_T(execCuda, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateKernel(int opNum, - void **varguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *vrealArguments, int numRealArguments) { - - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, - void **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *realArguments, int numRealArguments) { - - execAggregateKernel<<>>(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateKernelGeneric(...) 
failed"); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::aggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - nd4j::PointersHelper helper(ptrToArguments, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals); - - // TODO: we probably should lift this restriction - __shared__ int *intArrays[32]; - - __shared__ X **arguments; - __shared__ Nd4jLong **shapes; - __shared__ int *idxArg; - __shared__ X *realArg; - - for(int r = blockIdx.x; r < numAggregates; r += gridDim.x) { - if (threadIdx.x == 0) { - arguments = helper.getArguments(r); - shapes = helper.getShapeArguments(r); - idxArg = helper.getIndexArguments(r); - realArg = helper.getRealArguments(r); - } - - // we fill intArrays param in parallel within block - if (threadIdx.x < 32 && threadIdx.x < maxIntArrays) { - intArrays[threadIdx.x] = helper.getIntArrayArguments(r, threadIdx.x); - } - __syncthreads(); - - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, helper.getNumArguments(r), shapes, helper.getNumShapeArguments(r), idxArg, helper.getNumIndexArguments(r), intArrays, helper.getNumIntArrayArguments(r), realArg, helper.getNumRealArguments(r)); - } -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - functions::aggregate::AggregatedFunction::aggregateBatch(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - execAggregateBatch<<>>(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateBatchKernel(...) 
failed"); -} - - - - - -BUILD_SINGLE_TEMPLATE(template class AggregatedFunction, , FLOAT_TYPES); -} -} diff --git a/libnd4j/include/loops/cuda/broadcasting.cu b/libnd4j/include/loops/cuda/broadcasting.cu index 8028db2ba..8846e5473 100644 --- a/libnd4j/include/loops/cuda/broadcasting.cu +++ b/libnd4j/include/loops/cuda/broadcasting.cu @@ -32,84 +32,6 @@ namespace functions { namespace broadcast { - template - void Broadcast::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - template - void Broadcast::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - /** - * CPU execution - * @param x the input - * @param xShapeInfo the x shape information - * @param y the y data - * @param yShapeInfo the y shape information - * @param result the result - * @param resultShapeInfo the result shape information - * @param dimension the dimension to broadcast along long - * @param dimensionLength the length of the dimension buffer - */ - template - template - void Broadcast::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - - - template - template - void Broadcast::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index aaec44690..af354a2e2 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -224,76 +224,6 @@ namespace functions { } - template - void BroadcastBool::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastBool::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - 
Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index fc613a438..f183c009e 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -217,75 +217,6 @@ namespace functions { } } - - template - void BroadcastInt::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastInt::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT BroadcastInt, , INTEGER_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 8a560e416..1bd5d10cb 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -359,32 +359,6 @@ namespace functions { } } - - - - template - Nd4jLong IndexReduce::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - void IndexReduce::exec(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - template - template - Nd4jLong IndexReduce:: execScalar(void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - template - _CUDA_H void IndexReduce::exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES, INDEXING_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise.cu b/libnd4j/include/loops/cuda/pairwise.cu index 17f8537e5..4833d32d0 100644 --- a/libnd4j/include/loops/cuda/pairwise.cu +++ b/libnd4j/include/loops/cuda/pairwise.cu @@ -22,58 +22,6 @@ namespace functions { namespace pairwise_transforms { - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *z, - Nd4jLong *zShapeInfo, - void *extraParams) { - } - - template - void PairWiseTransform::exec( - 
const int opNum, - void *x, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *z, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong len) { - - } - - - template - template - void PairWiseTransform:: exec( - void *vx, - Nd4jLong* xShapeInfo, - void *vy, - Nd4jLong* yShapeInfo, - void *vresult, - Nd4jLong* zShapeInfo, - void *vextraParams) { - - } - - template - template - void PairWiseTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong len) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 414aadd30..05adbbce4 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -110,63 +110,6 @@ void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_ DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseBoolTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseBoolTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT PairWiseBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 2bedb4a82..85dce56f2 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -109,63 +109,6 @@ void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t * DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_INT_OPS); } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseIntTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseIntTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT PairWiseIntTransform, , INTEGER_TYPES); } } diff --git 
a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 3bf06ae91..47ced2769 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -442,39 +442,6 @@ namespace functions { DEBUG_KERNEL(stream, opNum); } - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT RandomFunction, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index fa1ab2e17..ac1d1adc3 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -132,7 +132,7 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, extraZ[1] = (Z) 0.0f; if (extraParams != nullptr) - extraZ[2] = *(static_cast(extraParams)); + extraZ[2] = static_cast(extraParams)[2]; else extraZ[2] = (Z) 0.0f; } diff --git a/libnd4j/include/loops/cuda/reduce3.cu b/libnd4j/include/loops/cuda/reduce3.cu index 1ad94beee..4f0e0457c 100644 --- a/libnd4j/include/loops/cuda/reduce3.cu +++ b/libnd4j/include/loops/cuda/reduce3.cu @@ -27,56 +27,7 @@ namespace functions { namespace reduce3 { - template - template - void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo) { - } - - - template - void Reduce3::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParamsVals, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - template - void Reduce3::execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } - - - template - void Reduce3::exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - void Reduce3::exec(const int opNum, 
void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - void Reduce3::execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index 37939b9b9..bb498c3a9 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -231,41 +231,6 @@ void ScalarBoolTransform::executeCudaAlongDimension(dim3& launchDims, cudaS } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ScalarBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index 44c73fcb4..f25beca82 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -230,40 +230,6 @@ void ScalarIntTransform::executeCudaAlongDimension(dim3& launchDims, cudaStre BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ScalarIntTransform, , INTEGER_TYPES); - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - void ScalarIntTransform::transform(const 
int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index 4867f5de1..e505929e6 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -414,73 +414,6 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa } - template - Y SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - void SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - - } - - template - void SummaryStatsReduce::exec(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, int dimensionLength) { - - } - - template - template - Y SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - template - void SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - // - } - - - template - template - void SummaryStatsReduce::exec(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT SummaryStatsReduce, , LIBND4J_TYPES, FLOAT_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 18b53cea7..5ca6f0067 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -114,17 +114,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformAny(...) failed"); } - template - void TransformAny::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - - template - template - void TransformAny::exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformAny, , LIBND4J_TYPES, LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index e88a4274b..0f56020b0 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -120,17 +120,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformBool(...) 
failed"); } - template - void TransformBool::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformBool::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 44ddb0246..49d6ab26f 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -142,18 +142,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformFloat(...) failed"); } - template - void TransformFloat::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformFloat::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index e59381fba..4c587111b 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -118,17 +118,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) failed"); } - template - void TransformSame::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformSame::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 0befdf35f..1136ef695 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -119,17 +119,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformStrict(...) 
failed"); } - template - void TransformStrict::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformStrict::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index dc85b9554..5a4a9db41 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -79,10 +80,13 @@ namespace nd4j { auto amin = nd4j::math::nd4j_abs(min); // now we actually apply quantization - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < N; e++) { - rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); + } + }; + + samediff::Threads::parallel_for(func, 0, N); } template @@ -172,12 +176,15 @@ PRAGMA_OMP_ATOMIC_ARGS(write) // we use 3 as offset, since first 12 bytes are occupied with header int flimit = limit + 4; - PRAGMA_OMP_PARALLEL_FOR_IF(flimit > Environment::getInstance()->elementwiseThreshold()) - for (int e = 4; e < flimit; e++) { - int el = x[e]; - int ael = nd4j::math::nd4j_abs(el) - 1; - z[ael] += el > 0 ? threshold : -threshold; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + int el = x[e]; + int ael = nd4j::math::nd4j_abs(el) - 1; + z[ael] += el > 0 ? 
threshold : -threshold; + } + }; + + samediff::Threads::parallel_for(func, 4, flimit); } /** @@ -194,19 +201,12 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(static_cast(x[i])); } - } else { - - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though - z[i] = static_cast(static_cast(x[i])); - } - } + }; + samediff::Threads::parallel_for(func, 0, N); }; template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); diff --git a/libnd4j/include/loops/indexreduce.h b/libnd4j/include/loops/indexreduce.h index 792ed16a9..ad4472dec 100755 --- a/libnd4j/include/loops/indexreduce.h +++ b/libnd4j/include/loops/indexreduce.h @@ -37,10 +37,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include @@ -70,7 +66,7 @@ namespace functions { static _CUDA_H void executeIndexReduceScalar(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); static _CUDA_H void executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); -#endif +#else static Nd4jLong execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams); @@ -81,6 +77,7 @@ namespace functions { template static _CUDA_H void exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset); +#endif }; } } diff --git a/libnd4j/include/loops/legacy_ops.h b/libnd4j/include/loops/legacy_ops.h index 0e5200321..92fd58d7a 100644 --- a/libnd4j/include/loops/legacy_ops.h +++ b/libnd4j/include/loops/legacy_ops.h @@ -92,8 +92,6 @@ (5, TimesOneMinus), \ (6, Cube), \ (7, OneMinus), \ - (8, Col2Im), \ - (9, Im2col),\ (11, Reciprocal), \ (12, Square), \ (13, CompareAndSetTransform) ,\ @@ -101,7 +99,6 @@ (17, Ceiling), \ (18, Floor), \ (19, ClipByValue) ,\ - (20, Reverse), \ (21, Copy) #define TRANSFORM_ANY_OPS \ diff --git a/libnd4j/include/loops/pairwise_bool.h b/libnd4j/include/loops/pairwise_bool.h index 0ff4ebdee..f7a65c3f5 100644 --- a/libnd4j/include/loops/pairwise_bool.h +++ b/libnd4j/include/loops/pairwise_bool.h @@ -40,11 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -68,8 +63,7 @@ namespace functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +73,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong 
*resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static void exec( const int opNum, @@ -90,7 +86,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +99,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +111,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_int.h b/libnd4j/include/loops/pairwise_int.h index 14d273285..aa6437d17 100644 --- a/libnd4j/include/loops/pairwise_int.h +++ b/libnd4j/include/loops/pairwise_int.h @@ -40,10 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -68,8 +64,7 @@ namespace functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +74,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong *resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static void exec( const int opNum, @@ -90,7 +87,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +100,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +112,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_transform.h b/libnd4j/include/loops/pairwise_transform.h index 4fe3eb0cc..0109b309f 100755 --- a/libnd4j/include/loops/pairwise_transform.h +++ b/libnd4j/include/loops/pairwise_transform.h @@ -41,12 +41,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - - namespace functions { namespace pairwise_transforms { @@ -76,7 +70,9 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams); + void *extraParams, + uint64_t start, + uint64_t stop); static void exec( const int opNum, @@ -87,7 +83,9 @@ namespace functions { void *z, Nd4jLong resultStride, void *extraParams, - Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); template @@ -98,7 +96,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vresult, Nd4jLong* zShapeInfo, - void *vextraParams); + void *vextraParams, + uint64_t start, + uint64_t stop); template static void exec(void *vx, @@ -108,7 +108,9 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); }; } } diff --git a/libnd4j/include/loops/random.h 
b/libnd4j/include/loops/random.h index 620187b82..5048e5ce0 100644 --- a/libnd4j/include/loops/random.h +++ b/libnd4j/include/loops/random.h @@ -52,7 +52,7 @@ namespace functions { static _CUDA_H void executeCudaSingle(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); -#endif +#else template static void execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); @@ -66,6 +66,7 @@ namespace functions { static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); +#endif }; } } diff --git a/libnd4j/include/loops/reduce3.h b/libnd4j/include/loops/reduce3.h index 781a17bb7..178bac7c2 100755 --- a/libnd4j/include/loops/reduce3.h +++ b/libnd4j/include/loops/reduce3.h @@ -44,10 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -114,7 +110,7 @@ class Reduce3 { -#endif +#else template static void execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo); @@ -124,25 +120,25 @@ class Reduce3 { template - static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); template - static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); template - static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); + static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void 
*extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); - static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); - + static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); +#endif }; diff --git a/libnd4j/include/loops/reduce_bool.h b/libnd4j/include/loops/reduce_bool.h index 89df1330f..540a6041d 100644 --- a/libnd4j/include/loops/reduce_bool.h +++ b/libnd4j/include/loops/reduce_bool.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -77,7 +72,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -121,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -145,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -178,8 +173,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_float.h b/libnd4j/include/loops/reduce_float.h index 9856e1d8e..ff2c0e668 100644 --- a/libnd4j/include/loops/reduce_float.h +++ b/libnd4j/include/loops/reduce_float.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define 
omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -79,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShape, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -123,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -147,7 +142,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -180,8 +175,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_long.h b/libnd4j/include/loops/reduce_long.h index 193160074..a5d2a9498 100644 --- a/libnd4j/include/loops/reduce_long.h +++ b/libnd4j/include/loops/reduce_long.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -78,7 +72,7 @@ namespace functions { static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -122,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -146,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -179,6 +173,7 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_same.h b/libnd4j/include/loops/reduce_same.h index c7f5f9173..e828ecf46 100644 --- a/libnd4j/include/loops/reduce_same.h +++ b/libnd4j/include/loops/reduce_same.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -80,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); 
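    // A minimal usage sketch of the recurring CPU-side change in the reduce_*, pairwise_* and
    // scalar_* headers above and below: the added "start/stop" (int64_t or uint64_t) parameters
    // hand each worker a half-open slice of the element/TAD range, so the new samediff thread
    // pool, rather than OpenMP, decides how work is split. The PRAGMA_THREADS_FOR lambda shape
    // and samediff::Threads::parallel_for call are taken from other hunks in this patch
    // (openmp_pragmas.h and type_conversions.cpp); "process" and "length" are placeholders,
    // not names from the library. Callers are presumably expected to forward the thread's
    // [start, stop) slice straight into the sliced exec/transform overloads:
    //
    //   auto func = PRAGMA_THREADS_FOR {
    //       // this lambda body runs once per worker with its own [start, stop) slice
    //       for (auto e = start; e < stop; e += increment) {
    //           process(e);   // placeholder for the per-element or per-TAD work
    //       }
    //   };
    //   samediff::Threads::parallel_for(func, 0, length);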
static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -124,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -148,7 +142,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -181,6 +175,8 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); + +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar.h b/libnd4j/include/loops/scalar.h index b2ee46dba..0f32dedf3 100755 --- a/libnd4j/include/loops/scalar.h +++ b/libnd4j/include/loops/scalar.h @@ -70,15 +70,15 @@ namespace functions { __host__ static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); @@ -101,7 +101,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool 
allowParallelism); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -117,7 +117,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_bool.h b/libnd4j/include/loops/scalar_bool.h index ddc039d89..a5931ddfb 100644 --- a/libnd4j/include/loops/scalar_bool.h +++ b/libnd4j/include/loops/scalar_bool.h @@ -86,15 +86,15 @@ namespace functions { /* #include "cuda/scalar_temp.cu" */ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); @@ -117,7 +117,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +133,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_int.h 
b/libnd4j/include/loops/scalar_int.h index f873d5419..509d7574f 100644 --- a/libnd4j/include/loops/scalar_int.h +++ b/libnd4j/include/loops/scalar_int.h @@ -83,18 +83,15 @@ namespace functions { static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -/* -#include "cuda/scalar_temp.cu" -*/ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); @@ -117,7 +114,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +130,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/summarystatsreduce.h b/libnd4j/include/loops/summarystatsreduce.h index 915293904..afaee9c47 100755 --- a/libnd4j/include/loops/summarystatsreduce.h +++ b/libnd4j/include/loops/summarystatsreduce.h @@ -286,7 +286,7 @@ namespace functions { static _CUDA_H void execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong 
*hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); -#endif +#else static Z execScalar(int opNum, bool biasCorrected, @@ -335,7 +335,7 @@ namespace functions { Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength); - +#endif }; } } diff --git a/libnd4j/include/loops/transform_any.h b/libnd4j/include/loops/transform_any.h index ab9ad47c4..d97e3e90e 100644 --- a/libnd4j/include/loops/transform_any.h +++ b/libnd4j/include/loops/transform_any.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -69,12 +64,12 @@ class TransformAny { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } diff --git a/libnd4j/include/loops/transform_bool.h b/libnd4j/include/loops/transform_bool.h index ee416ea87..4c87ae58c 100644 --- a/libnd4j/include/loops/transform_bool.h +++ b/libnd4j/include/loops/transform_bool.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -78,12 +73,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, 
void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_float.h b/libnd4j/include/loops/transform_float.h index 66547ee79..ae28e069f 100644 --- a/libnd4j/include/loops/transform_float.h +++ b/libnd4j/include/loops/transform_float.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -102,11 +97,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_same.h b/libnd4j/include/loops/transform_same.h index ef646a1b6..ae5b498e6 100644 --- a/libnd4j/include/loops/transform_same.h +++ b/libnd4j/include/loops/transform_same.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,13 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_strict.h b/libnd4j/include/loops/transform_strict.h index 
fe520743e..96917ebc1 100644 --- a/libnd4j/include/loops/transform_strict.h +++ b/libnd4j/include/loops/transform_strict.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,16 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + + + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); + +#endif }; } } diff --git a/libnd4j/include/msvc.h b/libnd4j/include/msvc.h new file mode 100644 index 000000000..c884736f3 --- /dev/null +++ b/libnd4j/include/msvc.h @@ -0,0 +1,39 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_MSVC_H +#define SAMEDIFF_MSVC_H + +#if defined(_MSC_VER) + +#pragma warning( disable : 4244 ) +#pragma warning( disable : 4267 ) +#pragma warning( disable : 4251 ) +#pragma warning( disable : 4101 ) +#pragma warning( disable : 4305 ) +#pragma warning( disable : 4309 ) +#pragma warning( disable : 4333 ) +#pragma warning( disable : 4146 ) +#pragma warning( disable : 4018 ) +#pragma warning( disable : 4297 ) + +#endif + +#endif //DEV_TESTS_MSVC_H diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h index 4f70d9bf2..102a1776a 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/op_boilerplate.h @@ -1461,7 +1461,7 @@ #ifdef _RELEASE -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else @@ -1528,6 +1528,7 @@ #elif _MSC_VER #define FORCEINLINE __forceinline #elif __GNUC__ +#define INLINE_LOOPS #define FORCEINLINE __attribute__((always_inline)) inline #elif __CUDACC__ #define FORCEINLINE __forceinline__ inline diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/openmp_pragmas.h index f1d4a8f67..667f54521 100644 --- a/libnd4j/include/openmp_pragmas.h +++ b/libnd4j/include/openmp_pragmas.h @@ -23,7 +23,7 @@ #if defined(_MSC_VER) -#define OMP_STRINGIFY(args) +#define OMP_STRINGIFY(args) #args #define OMP_IF(args) #define OMP_SCHEDULE(args) #define OMP_MAXT @@ -32,7 +32,7 @@ #define PRAGMA_OMP_ATOMIC #define PRAGMA_OMP_ATOMIC_ARGS(args) #define PRAGMA_OMP_CRITICAL -#define PRAGMA_OMP_SIMD +#define PRAGMA_OMP_SIMD __pragma(omp simd) #define PRAGMA_OMP_SIMD_ARGS(args) #define PRAGMA_OMP_SIMD_SUM(args) #define PRAGMA_OMP_SIMD_MAX(args) @@ -61,6 +61,7 @@ #else + #define OMP_STRINGIFY(args) #args #define OMP_IF(args) if(args) #define OMP_SCHEDULE(args) schedule(args) @@ -99,4 +100,39 @@ #endif +// reductions +#define FUNC_RL std::function +#define FUNC_AL std::function + +// aggregation functions +#define FUNC_RD std::function +#define FUNC_AD std::function + +// parallel block +#define FUNC_DO std::function + +// parallel_for block +#define FUNC_1D std::function +#define FUNC_2D std::function +#define FUNC_3D std::function + +// aggregation lambda +#define LAMBDA_AL [&](int64_t _old, int64_t _new) -> int64_t +#define LAMBDA_AD [&](double _old, double _new) -> double + +#define LAMBDA_SUML LAMBDA_AL {return _old + _new; } +#define LAMBDA_SUMD LAMBDA_AD 
{return _old + _new; } + +// reduction lambda +#define PRAGMA_REDUCE_LONG [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> int64_t +#define PRAGMA_REDUCE_DOUBLE [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> double + +// paralllel block lambda +#define PRAGMA_THREADS_DO [&](uint64_t thread_id, uint64_t numThreads) -> void + +// paralllel_for lambdas +#define PRAGMA_THREADS_FOR [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> void +#define PRAGMA_THREADS_FOR_2D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) -> void +#define PRAGMA_THREADS_FOR_3D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) -> void + #endif //DEV_TESTS_OPENMP_PRAGMAS_H diff --git a/libnd4j/include/ops/aggregate_ops.h b/libnd4j/include/ops/aggregate_ops.h deleted file mode 100644 index a10a2912e..000000000 --- a/libnd4j/include/ops/aggregate_ops.h +++ /dev/null @@ -1,996 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// -#ifndef LIBND4J_AGGREGATE_OPS_H -#define LIBND4J_AGGREGATE_OPS_H - -#include -#include - -#define HS_MAX_EXP 6.0f - -#ifdef __CUDACC__ -#define aggregate_def __device__ inline static -#else -#include -#define aggregate_def inline static -#endif -/* - * - * - * Aggregate Ops are special things suited for CUDA mostly. They are meant to be executed within single block ONLY. - * So, when batched, they should provide proper parallelism levels on poorly parallel tasks otherwise. 
- * - * On CPU aggregate ops are trying to minimize OpenMP multi-threading use, only SIMD is enforced - * - * - */ -namespace aggregateOps { - - template - class GEMM { - public: -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // no-op - } -#endif - -#ifndef __CUDACC__ - static CBLAS_ORDER convertOrder(int from) { - switch(from) { - //'c' - case 99: - return CblasRowMajor; - //'C' - case 67: return CblasRowMajor; - //'f' - case 102: return CblasColMajor; - //'F' - case 70: return CblasColMajor; - default: return CblasColMajor; - - } - } - - - static CBLAS_TRANSPOSE convertTranspose(int from) { - switch(from) { - //'t' - case 116: return CblasTrans; - //'T' - case 84: return CblasTrans; - //'n' - case 110: return CblasNoTrans; - //'N' - case 78: return CblasNoTrans; - //'c' - case 99: return CblasConjTrans; - //'C' - case 67: return CblasConjTrans; - default: return CblasNoTrans; - } - } -#endif - -#ifndef __CUDACC__ - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int M = indexArguments[0]; - int N = indexArguments[1]; - int K = indexArguments[2]; - int lda = indexArguments[3]; - int ldb = indexArguments[4]; - int ldc = indexArguments[5]; - int TransA = indexArguments[6]; - int TransB = indexArguments[7]; - int Order = indexArguments[8]; - - T alpha = realArguments[0]; - T beta = realArguments[1]; - - T *A = arguments[0]; - T *B = arguments[1]; - T *C = arguments[2]; - - nd4j::blas::GEMM::op(convertOrder(Order), convertTranspose(TransA), convertTranspose(TransB),M,N,K,(T) alpha,A,lda,B,ldb,(T) beta,C,ldc); - } -#else - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // stub for nvcc - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class HierarchicSoftmax { - private: - - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1 = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - - T dot(0.0f); - T g(0.0f); - T f(0.0f); - T alpha = realArguments[0]; - - //nd4j_printf("Vector length: [%i]; expLength: [%i]; Code: [%i]; Inf: [%i]\n", vectorLength, expLength, code, isInference); - - -// shape::printArray(syn0, vectorLength, "syn0"); -// shape::printArray(syn1, vectorLength, "syn1"); -// shape::printArray(neu1e, vectorLength, "neu1e"); - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1[x]; - } - - // gradient - if (dot < (T) - HS_MAX_EXP || dot >= (T) HS_MAX_EXP) { - return; - } - - int idx = 
static_cast((dot + HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0f)); - - if (idx >= expLength || idx < 0) { - return; - } - - f = expTable[idx]; - g = (static_cast(1.0f) - static_cast(code) - f) * alpha; - - //nd4j_printf("dot: [%f]; idx: [%i]; f: [%f]; g: [%f]\n", (float) dot, idx, (float) f, (float) g); - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T f; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - if (dot < - (T) HS_MAX_EXP || dot >= (T) HS_MAX_EXP) - return; - - int idx = (int) ((dot + HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - - if (idx >= expLength) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - f = expTable[idx]; - g = ((T) 1.0f - (T) code - f) * alpha; - } - __syncthreads(); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class NegativeSampling { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1Neg = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - T dot = (T) 0.0f; - T g = (T) 0.0f; - T alpha = realArguments[0]; - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1Neg[x]; - } - - if (dot > HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0)); - if (idx >= expLength) - return; - - if (idx < 0) - return; - - g = ((T) 
code - expTable[idx]) * alpha; - } - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1Neg = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1Neg[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - if (idx >= expLength && dot <= (T) HS_MAX_EXP && dot >= (T) -HS_MAX_EXP) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - if (dot > (T) HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - - - g = ((T) code - expTable[idx]) * alpha; - } - - // printf("dot: [%f]; g: [%f]\n", dot, g); - } - __syncthreads(); - - // printf("before syn1Neg[%i]: [%f], dot: [%f]; g: [%f]; vectorLength: [%i]\n", threadIdx.x, syn1Neg[threadIdx.x], dot, g, vectorLength); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - - // printf("after syn1Neg[%i]: [%f]\n", threadIdx.x, syn1Neg[threadIdx.x]); - - } -#endif - }; - - template - class Dot { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - T dot = (T) 0.0f; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD_SUM(dot) - for (int x = 0; x < vectorLength; x++) { - dot += vecX[x] * vecY[x]; - } - - vecZ[0] = dot; - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - int vectorLength = indexArguments[0]; - - __shared__ T dot; - if (threadIdx.x == 0) - dot = (T) 0.0f; - __syncthreads(); - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) 
{ - T prod = vecX[x] * vecY[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - __syncthreads(); - - if (threadIdx.x == 0) - vecZ[0] = dot; - } -#endif - }; - - template - class Axpy { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - __syncthreads(); - } -#endif - }; - - - template - class SkipGram { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int syn0Row = indexArguments[0]; - int vectorLength = indexArguments[1]; - int hsRounds = indexArguments[2]; - int ngRounds = indexArguments[3]; - int expLength = indexArguments[4]; - int vocabSize = indexArguments[5]; - int ngStarter = indexArguments[6]; - int negTableLength = indexArguments[7]; - int isInference = indexArguments[8]; - - - auto neu1e = new T[vectorLength]; - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *args[4]; - int idxArgs[4]; - - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *syn0 = isInference == 1 ? 
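// A minimal, self-contained sketch of the gradient step used by the HierarchicSoftmax /
// NegativeSampling aggregates above: the sigmoid is read from a precomputed exp table,
// the gradient coefficient g is formed from the code bit and the learning rate, and two
// axpy updates propagate it into the error accumulator and the output row. The helper
// names (buildExpTable, hsStep, hsMaxExp) are illustrative, not library API.
#include <cmath>
#include <vector>

static std::vector<float> buildExpTable(int expLength, float hsMaxExp) {
    std::vector<float> table(expLength);
    for (int i = 0; i < expLength; i++) {
        float e = std::exp((i / (float) expLength * 2.0f - 1.0f) * hsMaxExp);
        table[i] = e / (e + 1.0f);                                 // sigmoid sampled on [-hsMaxExp, hsMaxExp]
    }
    return table;
}

static void hsStep(const float *syn0, float *syn1, float *neu1e, int vectorLength,
                   const std::vector<float> &expTable, float hsMaxExp,
                   int code, float alpha, bool isInference) {
    float dot = 0.0f;
    for (int x = 0; x < vectorLength; x++)
        dot += syn0[x] * syn1[x];

    float g;
    if (dot >= hsMaxExp)        g = (code - 1) * alpha;            // sigmoid saturated at 1
    else if (dot <= -hsMaxExp)  g = (code - 0) * alpha;            // sigmoid saturated at 0
    else {
        int idx = (int) ((dot + hsMaxExp) * (expTable.size() / hsMaxExp / 2.0f));
        g = (code - expTable[idx]) * alpha;
    }

    for (int x = 0; x < vectorLength; x++)                         // axpy1: accumulate error
        neu1e[x] += g * syn1[x];

    if (!isInference)
        for (int x = 0; x < vectorLength; x++)                     // axpy2: update output row
            syn1[x] += g * syn0[x];
}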
inferenceVector : arguments[0] + (syn0Row * vectorLength); - - args[0] = syn0;// syn0 - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - //nd4j_printf("syn0Row: [%i]; vecLen: [%i]; hsRounds: [%i]; ngRounds: [%i]; expLength: [%i]; vocabSize: [%i]; ngStarter: [%i]; negTableLength: [%i]; isInf: [%i]\n", syn0Row, vectorLength, hsRounds, ngRounds, expLength, vocabSize, ngStarter, negTableLength, isInference); - - auto next_random = static_cast(realArguments[1]); - - if (hsRounds > 0) { - for (int r = 0; r < hsRounds; r++) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength); // syn1 row - idxArgs[2] = codes[r]; // code for row - - //nd4j_printf("idx syn1: [%i]; code: [%i]\n", idxSyn1[r], idxArgs[2]); - - HierarchicSoftmax::executeAggregate(args, 4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - - - int target = ngStarter; - if (ngRounds > 0) { - for (int r = 0; r < ngRounds + 1; r++) { - if (r == 0) { - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - NegativeSampling::executeAggregate(args, 4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - //nd4j_printf("applying...\n",""); - - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn0[x] += neu1e[x]; - } - } else { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - inferenceVector[x] += neu1e[x]; - } - } - - delete[] neu1e; - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - __shared__ int syn0Row; - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int isInference; - - __shared__ T *neu1e; - - __shared__ T *args[4]; - __shared__ int idxArgs[4]; - - - __shared__ unsigned long long next_random; - - __shared__ T *negTable; - T *syn1Neg = arguments[3]; - __shared__ T *inferenceVector; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - neu1e = (T *) shmem; - - syn0Row = indexArguments[0]; - vectorLength = indexArguments[1]; - hsRounds = indexArguments[2]; - ngRounds = indexArguments[3]; - expLength = indexArguments[4]; - vocabSize = indexArguments[5]; - ngStarter = indexArguments[6]; - negTableLength = indexArguments[7]; - isInference = indexArguments[8]; - - inferenceVector = arguments[5]; - - next_random = (unsigned long long) realArguments[1]; - - args[0] = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength); // syn0 - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - negTable = arguments[4]; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - T *syn0 = isInference ? 
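// A minimal sketch of the negative-sampling target draw used in the SkipGram aggregate
// above: a linear congruential generator (the classic word2vec constants 25214903917
// and 11) indexes a precomputed unigram table, with a fallback when the drawn id falls
// outside the vocabulary. drawNegative and its parameters are illustrative; note the
// aggregate itself skips the round via `continue` when the draw equals the positive
// target instead of redrawing as done here.
static int drawNegative(const int *negTable, int negTableLength,
                        int vocabSize, int positiveTarget,
                        unsigned long long &nextRandom) {
    int target;
    do {
        nextRandom = nextRandom * 25214903917ULL + 11ULL;
        target = negTable[(nextRandom >> 16) % negTableLength];
        if (target <= 0 || target >= vocabSize)
            target = (int) (nextRandom % (vocabSize - 1)) + 1;     // keep id in [1, vocabSize)
    } while (target == positiveTarget);
    return target;
}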
inferenceVector : arguments[0] + (syn0Row * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i+=blockDim.x) { - neu1e[i] = (T) 0.0f; - } - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - - for (int r = 0; r < hsRounds; r++) { - if (threadIdx.x == 0) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength);// syn1 row - idxArgs[2] = codes[r]; // code for row - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - __syncthreads(); - - - __shared__ int target; - if (ngRounds > 0) - for (int r = 0; r < ngRounds + 1; r++) { - if (threadIdx.x == 0) { - if (r == 0) { - // this line isn't a mistake - target = ngStarter; - - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long)25214903917 + 11 + blockIdx.x; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); - } - __syncthreads(); - - // we put it here, to make sure all threads pick up continue call - if (r != 0 && target == ngStarter) - continue; - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - - - - // final axpy with 1.0f as alpha - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - syn0[x] += neu1e[x]; - } - else - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - inferenceVector[x] += neu1e[x]; - } - } -#endif - }; - - template - class CBOW { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int hsRounds = indexArguments[1]; - int ngRounds = indexArguments[2]; - int expLength = indexArguments[3]; - int vocabSize = indexArguments[4]; - int ngStarter = indexArguments[5]; - int negTableLength = indexArguments[6]; - int idxSyn0Length = indexArguments[7]; - //int initialIdx = indexArguments[8]; - int numLabels = indexArguments[9]; - int trainWords = indexArguments[10]; - int isInference = indexArguments[11]; - - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - - T *neu1 = new T[vectorLength]; - T *neu1e = new T[vectorLength]; - std::memset(neu1, 0, sizeof(T) * vectorLength); - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *args[4]; - int idxArgs[4]; - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - unsigned long long next_random = (unsigned long long) realArguments[1]; - - // building neu1 for current window - for (int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += syn0word[i]; - } - } - - // for inference we use additional inference vector - if (isInference) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += inferenceVector[i]; - } - } - - - // average neu1 - if (idxSyn0Length > 0) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] /= 
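// A minimal sketch of how the CBOW aggregate above builds its averaged context vector
// neu1: sum the syn0 rows of the context words, optionally add the inference vector,
// then divide by the number of contributors. buildNeu1 and its arguments are
// illustrative, not library API.
#include <cstring>

static void buildNeu1(const float *syn0, const int *contextIdx, int contextLen,
                      const float *inferenceVector, bool isInference,
                      float *neu1, int vectorLength) {
    std::memset(neu1, 0, sizeof(float) * vectorLength);

    for (int c = 0; c < contextLen; c++) {
        const float *row = syn0 + contextIdx[c] * vectorLength;
        for (int i = 0; i < vectorLength; i++)
            neu1[i] += row[i];
    }

    if (isInference)
        for (int i = 0; i < vectorLength; i++)
            neu1[i] += inferenceVector[i];

    if (contextLen > 0) {
        float denom = (float) (contextLen + (isInference ? 1 : 0));
        for (int i = 0; i < vectorLength; i++)
            neu1[i] /= denom;                                      // average over all contributors
    }
}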
idxSyn0Length + isInference; - } - } - - args[0] = neu1; - args[2] = expTable; - args[3] = neu1e; - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - - HierarchicSoftmax::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - int target = ngStarter; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if (i == 0) { - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - //printf("Negative round: target: [%i]; code: [%i]; neu1e[0]: [%f]\n", target, idxArgs[4], neu1e[0]); - - NegativeSampling::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - // propagate neu1e -> syn0 - if (!isInference) { - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - syn0word[i] += neu1e[i]; - } - } - } else { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - inferenceVector[i] += neu1e[i]; - } - } - - - - delete[] neu1; - delete[] neu1e; - } - - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int idxSyn0Length; - __shared__ int initialIdx; - __shared__ int numLabels; - __shared__ int trainWords; - __shared__ int isInference; - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - __shared__ T *neu1; - __shared__ T *neu1e; - - __shared__ T *args[5]; - __shared__ int idxArgs[4]; - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - //T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - hsRounds = indexArguments[1]; - ngRounds = indexArguments[2]; - expLength = indexArguments[3]; - vocabSize = indexArguments[4]; - ngStarter = indexArguments[5]; - negTableLength = indexArguments[6]; - idxSyn0Length = indexArguments[7]; - initialIdx = indexArguments[8]; - numLabels = indexArguments[9]; - trainWords = indexArguments[10]; - isInference = indexArguments[11]; - - extern __shared__ unsigned char shmem[]; - neu1 = (T *) shmem; - neu1e = neu1 + vectorLength; - - args[0] = neu1; - args[2] = arguments[2]; //expTable - args[3] = neu1e; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] = (T) 0.0f; - neu1e[i] = (T) 0.0f; - } - - unsigned long long next_random = (unsigned long long) realArguments[1]; - for 
(int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += syn0word[i]; - } - } - - if (isInference) - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += inferenceVector[i]; - } - - // average neu1 - if (idxSyn0Length > 0) { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] /= idxSyn0Length + + isInference; - } - } - __syncthreads(); - - - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - if (threadIdx.x == 0) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - __shared__ int target; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if (threadIdx.x == 0) { - if (i == 0) { - target = ngStarter; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - idxArgs[2] = i == 0 ? 1 : 0; - } - __syncthreads(); - - if (i != 0 && target == ngStarter) - continue; - - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - - //printf("Negative round: target: [%i]; code: [%i]; neu1[%i]: [%f]; neu1e[%i]: [%f]\n", target, idxArgs[2], threadIdx.x, neu1[threadIdx.x], threadIdx.x, neu1e[threadIdx.x]); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - if (!isInference) - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - syn0word[i] += neu1e[i]; - } - } - else { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - inferenceVector[i] += neu1e[i]; - } - } - - } -#endif - }; - -} - -#endif //LIBND4J_AGGREGATE_OPS_H diff --git a/libnd4j/include/ops/declarable/BooleanOp.h b/libnd4j/include/ops/declarable/BooleanOp.h index b341ce394..b741c61c4 100644 --- a/libnd4j/include/ops/declarable/BooleanOp.h +++ b/libnd4j/include/ops/declarable/BooleanOp.h @@ -35,7 +35,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BooleanOp(const char *name, int numInputs, bool scalar); - ~BooleanOp(); bool evaluate(std::initializer_list args); bool evaluate(std::vector& args); diff --git a/libnd4j/include/ops/declarable/BroadcastableOp.h b/libnd4j/include/ops/declarable/BroadcastableOp.h index bc2cddc59..39435195b 100644 --- a/libnd4j/include/ops/declarable/BroadcastableOp.h +++ b/libnd4j/include/ops/declarable/BroadcastableOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BroadcastableOp(const char *name, int numTArgs, int numIArgs); - ~BroadcastableOp(); ShapeList *calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/DeclarableCustomOp.h b/libnd4j/include/ops/declarable/DeclarableCustomOp.h index 38cc20e71..49d3735d4 100644 --- a/libnd4j/include/ops/declarable/DeclarableCustomOp.h +++ b/libnd4j/include/ops/declarable/DeclarableCustomOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus 
validateAndExecute(Context& block) override = 0; public: DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableCustomOp(); ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) override = 0; }; diff --git a/libnd4j/include/ops/declarable/DeclarableListOp.h b/libnd4j/include/ops/declarable/DeclarableListOp.h index 6fa4fe086..2d6115027 100644 --- a/libnd4j/include/ops/declarable/DeclarableListOp.h +++ b/libnd4j/include/ops/declarable/DeclarableListOp.h @@ -34,13 +34,12 @@ namespace nd4j { protected: Nd4jStatus validateAndExecute(Context& block) override = 0; - nd4j::NDArray* getZ(Context& block, int inputId); + nd4j::NDArray* getZ(Context& block, int inputId) ; void setupResult(NDArray* array, Context& block); void setupResultList(NDArrayList* arrayList, Context& block); public: DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs); - ~DeclarableListOp(); Nd4jStatus execute(Context* block) override; diff --git a/libnd4j/include/ops/declarable/DeclarableOp.h b/libnd4j/include/ops/declarable/DeclarableOp.h index f8c96d400..5da74860b 100644 --- a/libnd4j/include/ops/declarable/DeclarableOp.h +++ b/libnd4j/include/ops/declarable/DeclarableOp.h @@ -126,7 +126,7 @@ namespace nd4j { DeclarableOp(const char *name, bool isLogical); // default testructor - ~DeclarableOp(); + virtual ~DeclarableOp(); // this method returns OpDescriptor, describing this Op instance OpDescriptor *getOpDescriptor(); diff --git a/libnd4j/include/ops/declarable/DeclarableReductionOp.h b/libnd4j/include/ops/declarable/DeclarableReductionOp.h index 4a75c5daf..5306f60eb 100644 --- a/libnd4j/include/ops/declarable/DeclarableReductionOp.h +++ b/libnd4j/include/ops/declarable/DeclarableReductionOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableReductionOp(); ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/LegacyOp.h b/libnd4j/include/ops/declarable/LegacyOp.h index 951f60165..a7c7ad055 100644 --- a/libnd4j/include/ops/declarable/LegacyOp.h +++ b/libnd4j/include/ops/declarable/LegacyOp.h @@ -45,6 +45,7 @@ namespace nd4j { public: LegacyOp(int numInputs); LegacyOp(int numInputs, int opNum); + ~LegacyOp() = default; // All Op classes provide own specific implementation for this method ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override = 0; diff --git a/libnd4j/include/ops/declarable/LogicOp.h b/libnd4j/include/ops/declarable/LogicOp.h index 026afe634..70fa3a6ff 100644 --- a/libnd4j/include/ops/declarable/LogicOp.h +++ b/libnd4j/include/ops/declarable/LogicOp.h @@ -37,7 +37,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(nd4j::graph::Context& block) override; public: LogicOp(const char *name); - ~LogicOp() = default; ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; }; diff --git a/libnd4j/include/ops/declarable/OpTuple.h b/libnd4j/include/ops/declarable/OpTuple.h index e0296dd9c..fc0fd594a 100644 --- a/libnd4j/include/ops/declarable/OpTuple.h +++ b/libnd4j/include/ops/declarable/OpTuple.h @@ -29,7 +29,7 @@ namespace nd4j { namespace ops { class ND4J_EXPORT OpTuple { public: - const char * _opName; + std::string _opName; std::vector 
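// A minimal sketch of the destructor cleanup applied to the op headers above: once the
// base class declares a virtual destructor, deleting a derived op through a base pointer
// is well defined, and the empty destructors in the derived classes can simply be
// removed. Class names here are illustrative.
struct BaseOp {
    virtual ~BaseOp() = default;      // virtual: derived destructors run on delete via base*
};

struct ConcreteOp : BaseOp {
    // no destructor declared: the implicitly generated one is enough
};

void destroyThroughBase() {
    BaseOp *op = new ConcreteOp();
    delete op;                        // calls ~ConcreteOp() then ~BaseOp()
}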
_inputs; std::vector _outputs; std::vector _tArgs; diff --git a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp index 986b93019..1b949eb35 100644 --- a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp @@ -30,9 +30,10 @@ namespace nd4j { auto y = INPUT_VARIABLE(1); auto z = OUTPUT_VARIABLE(0); - REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape") + REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape"); + REQUIRE_TRUE(x->dataType() == y->dataType() && x->dataType() == z->dataType(), 0, "Axpy: all arguments must have the same data type"); - double a = (double) 1.0f; + double a = 1.0; if (block.width() > 2) { auto alpha = INPUT_VARIABLE(2); @@ -41,15 +42,6 @@ namespace nd4j { a = T_ARG(0); } - /* - auto lambda = LAMBDA_TT(_y, _x, a) { - return a * _x + _y; - }; - - y->applyPairwiseLambda(x, lambda, z); - */ - - // FIXME: set proper extras here ExtraArguments arguments({a}); y->applyPairwiseTransform(pairwise::Axpy, x, z, &arguments); @@ -59,9 +51,9 @@ namespace nd4j { DECLARE_TYPES(axpy) { getOpDescriptor() - ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp index 6897f7f77..ad7b7fee2 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp @@ -30,14 +30,6 @@ namespace nd4j { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - // TODO: once we add support for multiple dtypes - uncommend this - /* - int it = INT_ARG(0); - DataType newType = DataTypeUtils::fromInt(it); - - input->cast(output, newType); - */ - if(input->isEmpty()){ REQUIRE_TRUE(output->isEmpty(), 0, "If input is empty, output array must also be empty"); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index 5641bab43..8b6bd24bc 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
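// A minimal reference for what the axpy op above computes once its shape and dtype
// checks pass: z[i] = a * x[i] + y[i] over same-shaped, same-typed buffers. The
// templated helper below is illustrative only.
template <typename T>
static void axpyReference(const T *x, const T *y, T *z, long long length, double a) {
    for (long long i = 0; i < length; i++)
        z[i] = static_cast<T>(a) * x[i] + y[i];
}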
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -88,8 +89,27 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); // formula: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // auto v = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // auto m = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + helpers::batchnorm(input, mean, variance, gamma, beta, output, axes, epsilon); + // NDArray stdInv = *v + epsilon; + // stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // if(applyScale) + // stdInv *= *gamma; + + // // empty array with same shape as input + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, m, output); + // output->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); + + // if(applyOffset) + // output->applyBroadcast(nd4j::broadcast::Add, axes, beta); + + // delete v; + // delete m; + return Status::OK(); } @@ -113,10 +133,9 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { NDArray* input = INPUT_VARIABLE(0); NDArray* mean = INPUT_VARIABLE(1); NDArray* variance = INPUT_VARIABLE(2); - NDArray* dLdO = INPUT_VARIABLE(3); // next epsilon NDArray* gamma = nullptr; NDArray* beta = nullptr; - + NDArray* dLdO = INPUT_VARIABLE(block.width() - 1); // next epsilon NDArray* dLdI = OUTPUT_VARIABLE(0); NDArray* dLdM = OUTPUT_VARIABLE(1); @@ -129,11 +148,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -172,67 +191,120 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); // types of all input arrays should be the same (except dLdO) - for(int i = 1; i < block.width() - 1; ++i) - if(i != 3) - REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); + for(int i = 1; i < block.width() - 2; ++i) + REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); // ***** calculations ***** // - // formula for forward step: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) - // consider mean and variance as constants (since we get them as inputs and don't calculate them) - // dLdI = (dLdO * gamma) / (variance + epsilon)^0.5 - // dLdV = (-0.5 * gamma * (dLdO * (x - mean))_sum) / (variance + epsilon)^1.5 - // dLdM = - (dLdO_sum * gamma) / (variance + epsilon)^0.5 - // dLdG = (dLdO * (x - mean))_sum / (variance + epsilon)^0.5 - // dLdB = 
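// A minimal per-element sketch of the batchnorm forward formula referenced in the hunk
// above: output = gamma * (input - mean) / sqrt(variance + epsilon) + beta, broadcast
// over the non-excluded axes. The scalar helper below is illustrative.
#include <cmath>

static float batchnormForwardElem(float x, float mean, float variance,
                                  float gamma, float beta, float eps) {
    float stdInv = 1.0f / std::sqrt(variance + eps);
    return gamma * (x - mean) * stdInv + beta;
}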
dLdO_sum + // derivatives: + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // dfdx = gamma*stdInv*g; + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = gamma * ( stdInv * (g - g_sum/N) + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + // dLdG = (g * (x - m))_sum * stdInv + // dLdB = g_sum + + // variance = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // mean = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); const auto excludedAxes = ShapeUtils::evalDimsToExclude(inRank, axes); - - NDArray temp1 = *variance + epsilon; - temp1.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) - auto temp2 = temp1.transform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 - if(applyScale) - temp2 *= *gamma; // gamma / (variance + epsilon)^0.5 - - NDArray temp3(input); // empty array with same shape as input - input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &temp3); // input - mean - temp3 *= *dLdO; // (input - mean) * dLdO - const bool keepUnitiesInShape = inRank == mean->rankOf(); - // dLdI - dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, &temp2, dLdI); + // inverse batch size 1/N + const float Ninv = 1.f * shape::tadLength(input->getShapeInfo(), axes.data(), axes.size()) / input->lengthOf(); - // dLdM - dLdO->reduceAlongDimension(reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); // dLdO sum over excluded axes + // input - mean + NDArray xMinusMean(input); // empty array with same shape as input + input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dvdm (use dLdM as storage for dvdm) + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); + *dLdM *= -Ninv; + + // g_sum + auto gSum = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape); // dLdB if(applyOffset) - dLdB->assign(dLdM); + dLdB->assign(gSum); - // dLdM - // dLdM->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); - // dLdM->applyTransform(nd4j::transform::Neg); - *dLdM = 0; // put zeros so far + // stdInv * (g - g_sum/N) (use dLdI as storage for this expression) + gSum *= Ninv; + dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, &gSum, dLdI); + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); - //dLdV - temp3.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // ((input - mean) * dLdO)_sum + // dLdV <- [g*(x - m)]_sum + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // dLdG - if(applyScale) { - dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, &temp2, dLdG); - // dLdV->assign(dLdG); - dLdG->applyPairwiseTransform(nd4j::pairwise::Divide, *gamma); - } - else - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); + *dLdV *= stdInv; + if(applyScale) + dLdG->assign(dLdV); - // dLdV - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp1); - // *dLdV *= -0.5; + // (2 / N) * dfdv (use dLdV as storage for dfdv) + *dLdV *= stdInv*stdInv; // dLdV*stdInv * stdInv^2 + *dLdV *= -Ninv; // -0.5f * 
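// A minimal 1-D reference for the gradient the rewritten batchnorm_bp computes above.
// Assuming mean and variance are the batch statistics of x over N elements (so
// sum(x - m) == 0), the chain rule written out in the comments collapses to the
// textbook form below. This sketch is for checking the math only; names and signature
// are illustrative.
#include <cmath>
#include <vector>

static void batchnormBackward1D(const std::vector<float> &x, const std::vector<float> &g,
                                float mean, float variance, float gamma, float eps,
                                std::vector<float> &dLdx, float &dLdGamma, float &dLdBeta) {
    const int N = (int) x.size();
    const float stdInv = 1.0f / std::sqrt(variance + eps);

    float gSum = 0.0f, gxSum = 0.0f;                               // sum(g) and sum(g * xhat)
    for (int i = 0; i < N; i++) {
        float xhat = (x[i] - mean) * stdInv;
        gSum  += g[i];
        gxSum += g[i] * xhat;
    }

    dLdBeta  = gSum;                                               // dLdB = g_sum
    dLdGamma = gxSum;                                              // dLdG = (g * (x - m))_sum * stdInv

    dLdx.resize(N);
    for (int i = 0; i < N; i++) {
        float xhat = (x[i] - mean) * stdInv;
        // dLdI = gamma * stdInv * (g - g_sum/N - xhat * (g*xhat)_sum/N)
        dLdx[i] = gamma * stdInv * (g[i] - gSum / N - xhat * gxSum / N);
    }
}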
(2 / N); + + // dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dLdM); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + + // dLdI + *dLdI += xMinusMean; + if(applyScale) + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma); + + *dLdM = 0; // put zeros so far *dLdV = 0; // put zeros so far + // java code + // NDArray std = *variance + epsilon; + // std.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // std.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // NDArray xMu(input); + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMu); + // NDArray xHat(input); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, &std, &xHat); + // NDArray dxhat(input); + // dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma, &dxhat); + // NDArray temp = dxhat*xMu; + // temp.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); + // *dLdV *= -0.5f * std*std*std; + // NDArray* dxmu1 = dxhat.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu1 *= -std; + // NDArray* dxmu2 = xMu.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu2 *= *dLdV * (-2.f/N); + // NDArray dLdmu = *dxmu1 + *dxmu2; + // dLdmu *= (1.f /N); + // *dLdV *= (2.f/N); + // dxhat.applyBroadcast(nd4j::broadcast::Multiply, axes, &std); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + // dxhat += xMu; + // dxhat.applyBroadcast(nd4j::broadcast::Add, axes, &dLdmu, dLdI); + // delete dxmu1; + // delete dxmu2; + // xHat *= *dLdO; + // xHat.reduceAlongDimension(reduce::Sum, dLdG, excludedAxes, keepUnitiesInShape); + return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp index 45324300d..2123317b5 100644 --- a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp @@ -28,13 +28,13 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(multi_head_dot_product_attention, 7, -1, false, 0, 2) { - auto queries = INPUT_VARIABLE(0); - auto keys = INPUT_VARIABLE(1); - auto values = INPUT_VARIABLE(2); - auto Wq = INPUT_VARIABLE(3); - auto Wk = INPUT_VARIABLE(4); - auto Wv = INPUT_VARIABLE(5); - auto Wo = INPUT_VARIABLE(6); + auto queries = INPUT_VARIABLE(0); //[batch, nIn, timeSteps] + auto keys = INPUT_VARIABLE(1); //[batch, nIn, timeSteps] + auto values = INPUT_VARIABLE(2); //[batch, nIn, timeSteps] + auto Wq = INPUT_VARIABLE(3); //[nHeads, headSize, nIn] + auto Wk = INPUT_VARIABLE(4); //[nHeads, headSize, nIn] + auto Wv = INPUT_VARIABLE(5); //[nHeads, headSize, nIn] + auto Wo = INPUT_VARIABLE(6); //[nHeads * headSize, nOut] auto mask = block.width() > 7 ? 
INPUT_VARIABLE(7) : nullptr; @@ -93,11 +93,12 @@ namespace ops { // Project queries, keys, values - auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); - auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); - auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); + auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] // Apply Attention + // attnResults = [minibatch, numHeads, projectedSize, seqLenth NDArray attnResults('c', {projectedQueries.sizeAt(0), projectedValues.sizeAt(1), projectedValues.sizeAt(2), projectedQueries.sizeAt(3)}, projectedValues.dataType(), block.launchContext()); nd4j::ops::dot_product_attention attention; attention.execute({&projectedQueries, &projectedKeys, &projectedValues, mask}, {&attnResults, weights ? OUTPUT_VARIABLE(1) : nullptr}, {}, {normalization, weights}, {}); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp index bdfdfb6c6..3fd5e2250 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp @@ -78,7 +78,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp index a80194eb2..91e9d5a41 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp @@ -77,7 +77,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp index f027bfca3..eb1a01861 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp @@ -95,11 +95,9 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { seqLen->assign(time); // set each element of seqLen to be equal to time } - std::initializer_list dimsForReverse = timeMajor ? std::initializer_list{0,1} : std::initializer_list{1,0}; - // reverse x nd4j::ops::reverse_sequence reverse; - auto resultsIn = reverse.execute({x, seqLen}, {}, dimsForReverse, {}, false, x->dataType()); + auto resultsIn = timeMajor ? 
reverse.execute({x, seqLen}, {}, {0, 1}, {}, false, x->dataType()) : reverse.execute({x, seqLen}, {}, {1, 0}, {}, false, x->dataType()); REQUIRE_TRUE (resultsIn->status() == ND4J_STATUS_OK, 0, "dynamic_bidirectional_rnn: there is a problem with reverse on the sequence."); auto revInput = resultsIn->at(0); @@ -109,7 +107,7 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { hBWFinal->assign(resultsBW->at(1)); // reverse hBWtemp - auto resultsOut = reverse.execute({hBWtemp, seqLen}, {}, dimsForReverse, {}); + auto resultsOut = timeMajor ? reverse.execute({hBWtemp, seqLen}, {}, {0, 1}, {}) : reverse.execute({hBWtemp, seqLen}, {}, {1, 0}, {}); hBW->assign(resultsOut->at(0)); delete resultsOut; diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp index fef13d44b..b3c2a93d4 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp @@ -28,7 +28,7 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { - + auto input = INPUT_VARIABLE(0); auto seqLengths = INPUT_VARIABLE(1); auto output = OUTPUT_VARIABLE(0); @@ -39,13 +39,13 @@ CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { REQUIRE_TRUE(input->rankOf() > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", input->rankOf()); REQUIRE_TRUE(seqLengths->rankOf() == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLengths->rankOf()); REQUIRE_TRUE(seqLengths->lengthOf() == input->sizeAt(batchDim), 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLengths->lengthOf(), input->sizeAt(batchDim)); - REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they are %i and %i correspondingly !", seqDim, batchDim); + REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they both are equal to %i !", batchDim); REQUIRE_TRUE(batchDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, input->rankOf()); - REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); + REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); auto maxElem = seqLengths->reduceNumber(reduce::Max); REQUIRE_TRUE(maxElem.e(0) <= input->sizeAt(seqDim), 0, "REVERSE_SEQUENSE operation: max element in seqLengths array must be not greater than value of seqDim dimension of input array !"); - + helpers::reverseSequence(block.launchContext(), input, seqLengths, output, seqDim, batchDim); return Status::OK(); @@ -65,15 +65,15 @@ DECLARE_SHAPE_FN(reverse_sequence) { int seqDim = INT_ARG(0); int batchDim = block.numI() > 1 ? 
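// A minimal sketch of what reverse_sequence does for the common 2-D case
// (seqDim = 1, batchDim = 0): for every batch row, the first seqLengths[b] entries are
// reversed and the tail is copied unchanged. reverseSequence2D is illustrative; the real
// helper works on arbitrary ranks and dimension choices.
#include <vector>

static void reverseSequence2D(const std::vector<std::vector<float>> &input,
                              const std::vector<int> &seqLengths,
                              std::vector<std::vector<float>> &output) {
    output = input;                                                // tail beyond seqLengths[b] stays as-is
    for (size_t b = 0; b < input.size(); b++) {
        int len = seqLengths[b];
        for (int t = 0; t < len; t++)
            output[b][t] = input[b][len - 1 - t];                  // reverse the first len elements
    }
}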
INT_ARG(1) : 0; + REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); + REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); REQUIRE_TRUE(inShapeInfo[0] > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", inShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[0] == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLenShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[1] == inShapeInfo[batchDim+1], 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLenShapeInfo[1], inShapeInfo[batchDim+1]); - REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); - REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); - + Nd4jLong* outShapeInfo = nullptr; COPY_SHAPE(inShapeInfo, outShapeInfo); - + return SHAPELIST(CONSTANT(outShapeInfo)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index 5b6e6122e..a7123d42f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -150,26 +151,30 @@ namespace helpers { // auto shift = 0; auto rowSize = sizeof(T) * colCount; - PRAGMA_OMP_PARALLEL_FOR - for (int n = 0; n < N; n++) { - int start = rowP->e(n); - int end = rowP->e(n+1); - int shift = n * colCount; - for (int i = start; i < end; i++) { - T const* thisSlice = dataP + colP->e(i) * colCount; - T res = 1; - for (int k = 0; k < colCount; k++) { - auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; - res += tempVal * tempVal; + auto func = PRAGMA_THREADS_FOR { + for (auto n = start; n < stop; n += increment) { + int s = rowP->e(n); + int end = rowP->e(n + 1); + int shift = n * colCount; + for (int i = s; i < end; i++) { + T const *thisSlice = dataP + colP->e(i) * colCount; + T res = 1; + + for (int k = 0; k < colCount; k++) { + auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; + res += tempVal * tempVal; + } + + res = vals[i] / res; + for (int k = 0; k < colCount; k++) + outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); } - - res = vals[i] / res; - for (int k = 0; k < colCount; k++) - outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); + //shift += colCount; } - //shift += colCount; - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index bd29094ec..ba0f36eb5 100644 --- 
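// The hunk above shows the recurring refactor of this patch: an OpenMP
// PRAGMA_OMP_PARALLEL_FOR loop becomes a functor built with PRAGMA_THREADS_FOR (which,
// as used throughout the diff, supplies start/stop/increment bounds) and is handed to
// samediff::Threads::parallel_tad or parallel_for. The sketch below illustrates the
// pattern with a plain lambda and std::thread; it is not the ThreadPool implementation
// added by this PR, and the chunking policy is assumed.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using RangeFunc = std::function<void(int64_t start, int64_t stop, int64_t increment)>;

static void parallelForSketch(const RangeFunc &func, int64_t start, int64_t stop,
                              int64_t increment = 1, int numThreads = 4) {
    int64_t numElements = (stop - start + increment - 1) / increment;
    int64_t perThread   = (numElements + numThreads - 1) / numThreads;   // ceil split, aligned to increment
    std::vector<std::thread> workers;
    for (int t = 0; t < numThreads; t++) {
        int64_t s = start + t * perThread * increment;
        int64_t e = std::min<int64_t>(stop, s + perThread * increment);
        if (s >= e) break;
        workers.emplace_back([&func, s, e, increment] { func(s, e, increment); });
    }
    for (auto &w : workers) w.join();
}

// usage: each worker gets a contiguous sub-range, mirroring the converted loops above
// parallelForSketch([&](int64_t start, int64_t stop, int64_t increment) {
//     for (auto n = start; n < stop; n += increment) { /* per-row work */ }
// }, 0, N);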
a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -44,11 +45,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, if (inEWS == 1 && outEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; @@ -60,11 +59,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, } else { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { T r = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); outBuff[i * outEWS] = r; @@ -89,19 +86,17 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, T sum = 0.; int length = shape::length(inShapeInfo); -PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); max = nd4j::math::nd4j_max(max, inBuff[offset]); } -PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } -PRAGMA_OMP_SIMD + for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] /= sum; @@ -151,7 +146,6 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr auto length = shape::length(inShapeInfo); if (inEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); @@ -212,7 +206,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra } else if(input.isSameShapeStrict(&output)) { - TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {dimension}); + TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimension); Nd4jLong* tadShapeInfo = tadPack.primaryShapeInfo(); Nd4jLong* tadOffsets = tadPack.primaryOffsets(); const uint numOfSubArrs = tadPack.numberOfTads(); @@ -220,27 +214,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra if(shape::elementWiseStride(tadShapeInfo) == 1){ - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T *inBuff = input.bufferAsT() + tadOffsets[i]; + T *outBuff = output.bufferAsT() + tadOffsets[i]; - T max = -DataTypeUtils::max(); - T sum = 0; + T max = -DataTypeUtils::max(); + T sum = 0; - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; } + }; - 
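// A minimal sketch of the numerically stable softmax the vector path above implements:
// subtract the running maximum before exponentiating, then normalize by the sum.
// softmaxVector is illustrative; the real code also handles strided (ews != 1) buffers.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

static void softmaxVector(const std::vector<float> &in, std::vector<float> &out) {
    float max = -std::numeric_limits<float>::max();
    for (float v : in) max = std::max(max, v);                     // guard against overflow in exp

    float sum = 0.0f;
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) {
        out[i] = std::exp(in[i] - max);
        sum += out[i];
    }
    for (float &v : out) v /= sum;
}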
for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } + samediff::Threads::parallel_tad(func,0, numOfSubArrs); } else { @@ -250,29 +247,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra auto offsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, offsets); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inBuff = input.bufferAsT() + tadOffsets[i]; + auto outBuff = output.bufferAsT() + tadOffsets[i]; - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T max = -DataTypeUtils::max(); + T sum = 0.f; - T max = -DataTypeUtils::max(); - T sum = 0.f; + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); + outBuff[offsets[j]] = temp; + sum += temp; + } - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) + outBuff[offsets[j]] /= sum; } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } delete []offsets; } } @@ -299,16 +297,19 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a const Nd4jLong* inputShapeInfo = input.getShapeInfo(); const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); - PRAGMA_OMP_PARALLEL_FOR_IF(inputLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < inputLen; ++i) { - // FIXME: double! - double x = input.e(i); - if(x < 0.0) { - // FIXME: double - output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); - } else - output.p(i, x); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + // FIXME: double! 
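// A minimal sketch of the PReLU rule applied element-wise by the prelu helper converted
// here: negative inputs are scaled by the (broadcast) alpha value, non-negative inputs
// pass through unchanged. preluElem is illustrative.
static double preluElem(double x, double alpha) {
    return x < 0.0 ? x * alpha : x;
}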
+ double x = input.e(i); + if (x < 0.0) { + // FIXME: double + output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); + } else + output.p(i, x); + } + }; + + samediff::Threads::parallel_for(func, 0, inputLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 0e6e1f777..a36330fbe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -62,12 +63,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } else { @@ -76,12 +80,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } } else if(output.rankOf() == 5) { @@ -98,13 +105,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } else { @@ -114,13 +124,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? 
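// A minimal sketch of the strided bias add the 4-D addBias branch above parallelizes
// over (batch, channel, height): the output index is rebuilt from per-axis strides, so
// the same loop serves both NCHW and NHWC layouts. addBias4D and its stride arguments
// are illustrative.
static void addBias4D(float *z, const float *bias,
                      int bS, int C, int oH, int oW,
                      long long zStrideB, long long zStrideC,
                      long long zStrideH, long long zStrideW,
                      long long biasStrideC) {
    for (int b = 0; b < bS; b++)
        for (int c = 0; c < C; c++)
            for (int h = 0; h < oH; h++)
                for (int w = 0; w < oW; w++)
                    z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW]
                        += bias[c * biasStrideC];
}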
input.stridesOf()[3] : input.stridesOf()[2]; const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[4] : input.stridesOf()[3]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + d * xStrideD + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } } else { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 5484d822d..ae76f0289 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -21,6 +21,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -38,50 +39,55 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + hsvToRgb(h, s, v, z[i], z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - T h, s, v; + T h, s, v; - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + 
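// A minimal sketch of the hue shift at the core of adjust_hue above: the hue channel is
// rotated by delta * 360 degrees and wrapped back into [0, 360) before converting back
// to RGB. wrapHue is illustrative; the RGB<->HSV conversions themselves live in the
// library's helpers.
static float wrapHue(float h, float delta) {
    h += delta * 360.0f;
    if (h > 360.0f)      h -= 360.0f;
    else if (h < 0.0f)   h += 360.0f;
    return h;
}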
hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - } + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index 9a5141a82..d4b0de398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -39,50 +40,51 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + s *= factor; + if (s > 1.f) + s = 1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, z[i], z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } - } - else { - - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); + } else { + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + T h, s, v; - T h, s, v; + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + s *= factor; + if (s > 1.f) + s = 1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + } + }; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index ffd75e435..b408da720 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -92,25 +93,28 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st int vaSize = vA.size(); - PRAGMA_OMP_PARALLEL_FOR - for (int p = 0; p < vaSize; ++p) { - auto A = reinterpret_cast(vA.at(p)->buffer()); - auto B = reinterpret_cast(vB.at(p)->buffer()); - auto C = reinterpret_cast(vC.at(p)->buffer()); - auto alpha = 
alphas->e(p); - auto beta = betas->e(p); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - T c_mnp = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto p = start; p < stop; p += increment) { + auto A = reinterpret_cast(vA.at(p)->buffer()); + auto B = reinterpret_cast(vB.at(p)->buffer()); + auto C = reinterpret_cast(vC.at(p)->buffer()); + auto alpha = alphas->e(p); + auto beta = betas->e(p); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + T c_mnp = 0; - PRAGMA_OMP_SIMD - for (int k = 0; k < K; ++k) - c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? (k + n * ldb) : (k * ldb + n)]; + PRAGMA_OMP_SIMD + for (int k = 0; k < K; ++k) + c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? (k + n * ldb) : (k * ldb + n)]; - C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, vaSize); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index a0847f704..7a0d8b97b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -71,9 +72,8 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* if(beta != nullptr) { const T* betaBuff = beta->bufferAsT(); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; @@ -98,17 +98,17 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } else { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; for (int j = 0; j < lenSmall; ++j) { - const bool isOwner = j < info._numThreads ? 
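
In the batchnorm hunk the OpenMP parallel region with omp_get_thread_num() becomes a PRAGMA_THREADS_DO functor that receives a thread_id, plus a samediff::Threads::parallel_do(func, info._numThreads) call. The sketch below is not part of the patch; it assumes parallel_do simply launches the same callable once per requested thread and joins, with work claimed inside the callable.

    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for samediff::Threads::parallel_do(func, numThreads): the same
    // callable runs on every thread and receives its thread_id plus the thread count.
    static void parallel_do_sketch(const std::function<void(uint64_t, uint64_t)>& body,
                                   uint64_t numThreads) {
        std::vector<std::thread> workers;
        for (uint64_t t = 0; t < numThreads; ++t)
            workers.emplace_back(body, t, numThreads);   // body(thread_id, numThreads)
        for (auto& w : workers)
            w.join();
    }

Each worker then keeps only the TADs it owns; the isOwner expression in the hunk is effectively a thread_id == j % numThreads check, with the j < numThreads branch as the trivial special case.
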
threadNum == j : threadNum == j % info._numThreads; if (!isOwner) continue; @@ -128,7 +128,9 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index bba3e8acb..ddd1ad892 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -84,7 +85,7 @@ static T continuedFraction(const T a, const T b, const T x) { return f; } - return 1.f / 0.f; // no convergence, more iterations is required + return std::numeric_limits::infinity(); // no convergence, more iterations is required } /////////////////////////////////////////////////////////////////// @@ -121,9 +122,12 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } /////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index b4a54ad7a..5aad38da8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -56,64 +57,77 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - T *col, *im; - int imRow, imCol; // if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { if (false) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + im = imBuff + b 
* imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im += *col; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(firstprivate(imRow, imCol)) - for (int b = 0; b < bS; ++b) { - T* im0 = imBuff + b*imStride0; - T* col4 = colBuff + b*colStride0; - for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { - T* col5 = col4; - for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { - T* col1 = col5; - T* im1 = im0; - for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { - int imRow = (-pH + colH*sH); - T* col2 = col1; - T* im2 = im1 + imRow*imStride2; - for (int kRow = 0; kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH*imStride2) { - int imCol =-pW + colW*sW; - T* col3 = col2; - T* im3 = im2 + imCol*imStride3; - for (int kCol = 0; kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW*imStride3) { + auto func = PRAGMA_THREADS_FOR { + T *col, *im; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im3 += *col3; + for (uint b = start; b < stop; b += increment) { + T *im0 = imBuff + b * imStride0; + T *col4 = colBuff + b * colStride0; + for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { + T *col5 = col4; + for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { + T *col1 = col5; + T *im1 = im0; + for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { + int imRow = (-pH + colH * sH); + T *col2 = col1; + T *im2 = im1 + imRow * imStride2; + for (int kRow = 0; + kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH * imStride2) { + int imCol = -pW + colW * sW; + T *col3 = col2; + T *im3 = im2 + imCol * imStride3; + for (int kCol = 0; + kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW * imStride3) { + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im3 += *col3; + } } } } - } + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp index 50a11f767..5f7fbf694 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp @@ -15,6 +15,7 @@ ******************************************************************************/ #include +#include namespace nd4j { namespace ops { @@ -26,26 +27,38 @@ namespace nd4j { int elementsPerThread = length / ELEMENT_THRESHOLD; int num_threads = nd4j::math::nd4j_max(1, elementsPerThread); num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - Nd4jLong sum = 0; + Nd4jLong sumt = 0; if(isStrictlyIncreasing) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 >= val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 >= val1 ? 
-1 : 0; + } + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 > val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 > val1 ? -1 : 0; + } + + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } - output = (sum > -1); + nd4j_printf("Sum: %lld\n", sumt) + + output = (sumt > -1); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 859330a9d..e2d24c591 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -30,13 +31,16 @@ namespace helpers { std::unique_ptr arrs(output->allTensorsAlongDimension({1})); int lLen = labels->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(lLen > Environment::getInstance()->elementwiseThreshold()) - for (int j = 0; j < lLen; ++j){ - auto label = labels->e(j); - auto pred = predictions->e(j); - T value = (weights == nullptr ? (T)1.0f : weights->e(j)); - (*arrs->at(label)).p(pred, value); - } + auto func = PRAGMA_THREADS_FOR { + for (int j = start; j < stop; j += increment) { + auto label = labels->e(j); + auto pred = predictions->e(j); + T value = (weights == nullptr ? (T) 1.0f : weights->e(j)); + (*arrs->at(label)).p(pred, value); + } + }; + + samediff::Threads::parallel_for(func, 0, lLen); } void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 93d00220e..0829bcbe6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -62,32 +63,34 @@ namespace nd4j { T* colBuff = columns.bufferAsT(); T* volBuff = const_cast(volume).bufferAsT(); - T *col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; ++b) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_3D { + T *col, *vol; + int volDep, volRow, volCol; - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) 
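
The compare_elem hunk turns the OpenMP sum reduction into a PRAGMA_REDUCE_LONG functor plus samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1): each thread reduces its own slice to a partial Nd4jLong, and the partials are folded with the combiner (LAMBDA_SUML is taken here to be a plain addition lambda). The sketch below is not part of the patch; names and the even-split policy are local assumptions.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    // Illustrative stand-in for samediff::Threads::parallel_long: per-thread partial
    // reductions over [start, stop), folded with a user-supplied combiner.
    static int64_t parallel_long_sketch(const std::function<int64_t(int64_t, int64_t, int64_t)>& body,
                                        const std::function<int64_t(int64_t, int64_t)>& combine,
                                        int64_t start, int64_t stop,
                                        unsigned numThreads = std::thread::hardware_concurrency()) {
        const int64_t span = stop - start;
        if (span <= 0)
            return 0;
        numThreads = std::max<unsigned>(1, (unsigned) std::min<int64_t>(numThreads, span));

        std::vector<int64_t> partials(numThreads);
        std::vector<std::thread> workers;
        const int64_t chunk = span / numThreads;
        for (unsigned t = 0; t < numThreads; ++t) {
            const int64_t s = start + t * chunk;
            const int64_t e = (t == numThreads - 1) ? stop : s + chunk;
            workers.emplace_back([&, t, s, e] { partials[t] = body(s, e, 1); });
        }
        for (auto& w : workers)
            w.join();

        int64_t result = partials[0];
        for (unsigned t = 1; t < numThreads; ++t)
            result = combine(result, partials[t]);
        return result;
    }

The monotonicity check itself is unchanged: every adjacent pair that violates the required ordering contributes -1 to the sum, so the final output is sum > -1, i.e. true only when no violation was found.
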
{ + for (int kDep = start_z; kDep < stop_z; kDep += inc_z) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -96,31 +99,36 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, kD, 1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *vol; + int volDep, volRow, volCol; + for (int b = start_x; b < stop_x; b++) { + for (int colD = start_y; colD < stop_y; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.f); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -129,7 +137,11 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oD, 
1); + //func(0, 0, bS, 1, 0, oD, 1); + } } ////////////////////////////////////////////////////////////////////////// @@ -168,29 +180,31 @@ namespace nd4j { T* volBuff = volume.bufferAsT(); T* colBuff = const_cast(columns).bufferAsT(); - T* col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - volDep = -pD + kDep * dD + colD * sD; - volRow = -pH + kRow * dH + colH * sH; - volCol = -pW + kCol * dW + colW * sW; + for (int b = start; b < stop; b++) { + for (int c = 0; c < iC; c++) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + volDep = -pD + kDep * dD + colD * sD; + volRow = -pH + kRow * dH + colH * sH; + volCol = -pW + kCol * dW + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *vol += *col; + } } } } @@ -199,28 +213,34 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_tad(func, 0, bS); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - 
vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + for (int b = start; b < stop; b++) { + for (int colD = 0; colD < oD; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *vol += *col; + } } } } @@ -229,7 +249,10 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); + } } @@ -568,22 +591,24 @@ namespace nd4j { const Nd4jLong zStride2 = output.stridesOf()[dimIH]; const Nd4jLong zStride3 = output.stridesOf()[dimIH + 1]; - uint xCoord2, xCoord3; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4) private(xCoord2, xCoord3)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < oW; ++w) { + xCoord2 = h / factorH; + xCoord3 = w / factorW; - xCoord2 = h / factorH; - xCoord3 = w / factorW; - - z[b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3]; + z[b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3] = x[b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oH, 1); } ////////////////////////////////////////////////////////////////////////// @@ -616,25 +641,31 @@ namespace nd4j { const Nd4jLong zStride3 = output.stridesOf()[dimID + 1]; const Nd4jLong zStride4 = output.stridesOf()[dimID + 2]; - uint xCoord2, xCoord3, xCoord4; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5) private(xCoord2, xCoord3, xCoord4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < oD ; ++d) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3, xCoord4; - xCoord2 = d / factorD; - xCoord3 = h / factorH; - xCoord4 = w / factorW; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < oH; ++h) { + for (uint w = 0; w < oW; ++w) { - z[b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3 + xCoord4*xStride4]; + xCoord2 = d / factorD; + xCoord3 = h / factorH; + xCoord4 = w / factorW; + + z[b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4] = x[ + b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3 + + xCoord4 * 
xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -668,23 +699,26 @@ namespace nd4j { const Nd4jLong zStride3 = gradI.stridesOf()[dimIH + 1]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3; + const auto zOffset = b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xh*xStride2 + xw*xStride3]; + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xh * xStride2 + xw * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iH, 1); } ////////////////////////////////////////////////////////////////////////// @@ -723,26 +757,29 @@ namespace nd4j { const Nd4jLong zStride4 = gradI.stridesOf()[dimID + 2]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < iD; ++d) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < iH; ++h) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4; + const auto zOffset = b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xd = d * factorD; xd < d * factorD + factorD; ++xd) - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xd*xStride2 + xh*xStride3 + xw*xStride4]; + for (uint xd = d * factorD; xd < d * factorD + factorD; ++xd) + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xd * xStride2 + xh * xStride3 + xw * xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -779,142 +816,156 @@ namespace nd4j { const Nd4jLong iStep3 = dW*iStride3; const int kProd = kH*kW; - Nd4jLong hstart, wstart, hend, wend; - T *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + 
for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T max = -DataTypeUtils::max(); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > max) - max = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + T max = -DataTypeUtils::max(); + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T val = pIn[kh + kw]; + if (val > max) + max = val; + } + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); 
//(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; + T sum = static_cast(0.f); - if (extraParam0 == 0) { //Exclude padding - int a = (hend-hstart)/iStep2 + ((hend-hstart) % iStep2 == 0 ? 0 : 1); - int b = (wend-wstart)/iStep3 + ((wend-wstart) % iStep3 == 0 ? 0 : 1); - sum /= static_cast(a * b); // Accounts for dilation + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += pIn[kh + kw]; + + if (extraParam0 == 0) { //Exclude padding + int a = (hend - hstart) / iStep2 + ((hend - hstart) % iStep2 == 0 ? 0 : 1); + int r = (wend - wstart) / iStep3 + ((wend - wstart) % iStep3 == 0 ? 0 : 1); + sum /= static_cast(a * r); // Accounts for dilation + } else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } - else if (extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= 
iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + T sum = static_cast(0.f); - sum = nd4j::math::nd4j_pow(sum, static_cast((T)1.f) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + sum = nd4j::math::nd4j_pow(sum, static_cast((T) 1.f) / extraParam0); + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); @@ -961,176 +1012,192 @@ namespace nd4j { const Nd4jLong iStep4 = dW*iStride4; const int kProd = kD*kH*kW; - Nd4jLong dstart, hstart, wstart, dend, hend, wend; - T sum, *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + 
dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = -DataTypeUtils::max(); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T val = pIn[kd + kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + sum = -DataTypeUtils::max(); + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T val = pIn[kd + kh + kw]; + if (val > sum) + sum = val; + } + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += pIn[kd + kh + kw]; + sum = static_cast(0.); - if (extraParam0 == 0) //Exclude padding - sum /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / 
static_cast(iStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - sum /= kProd; + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += pIn[kd + kh + kw]; - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + if (extraParam0 == 0) //Exclude padding + sum /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(iStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + sum = static_cast(0.); - sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += 
nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { nd4j_printf("ConvolutionUtils::pooling3d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect poooling3d mode"); } } @@ -1182,191 +1249,230 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3; - Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, hstart, wstart, hend, wend, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T valIn = pIn[kh + kw]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + // we set these to default values + maxKH = hstart; + maxKW = wstart; 
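
All of the pooling and pooling-BP hunks follow the same pattern: the b/c loops (and od for 3-D pooling) move into a PRAGMA_THREADS_FOR_2D / _3D functor, and only those outer dimensions are handed to Threads::parallel_for, while the spatial and kernel loops stay sequential inside each thread. The sketch below is not part of the patch; it shows one plausible 2-D splitter that parallelises the outer axis only and passes the second range through whole, ignoring increment alignment since the calls here use an increment of 1. The real tiling policy lives in execution/impl/Threads.cpp.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <thread>
    #include <vector>

    using Func2D = std::function<void(int64_t, int64_t, int64_t,    // start_x, stop_x, inc_x
                                      int64_t, int64_t, int64_t)>;  // start_y, stop_y, inc_y

    // Illustrative stand-in for the 2-D Threads::parallel_for overload: split the outer (x)
    // range across threads, pass the inner (y) range to every tile unchanged.
    static void parallel_for_2d_sketch(const Func2D& body,
                                       int64_t sx, int64_t ex, int64_t ix,
                                       int64_t sy, int64_t ey, int64_t iy,
                                       unsigned numThreads = std::thread::hardware_concurrency()) {
        const int64_t spanX = ex - sx;
        if (spanX <= 0)
            return;
        numThreads = std::max<unsigned>(1, std::min<unsigned>(numThreads, (unsigned) spanX));
        const int64_t chunk = spanX / numThreads;

        std::vector<std::thread> workers;
        for (unsigned t = 0; t < numThreads; ++t) {
            const int64_t s = sx + t * chunk;
            const int64_t e = (t == numThreads - 1) ? ex : s + chunk;
            workers.emplace_back(body, s, e, ix, sy, ey, iy);   // one batch-slice per thread
        }
        for (auto& w : workers)
            w.join();
    }

Under that reading, parallel_for(func, 0, bS, 1, 0, iC, 1) gives each thread a contiguous batch slice together with the full channel range, so no two threads touch the same output (or gradient) element in these pooling kernels.
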
+ + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T valIn = pIn[kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKH + maxKW] += valO; - } - else { + gI[pIn - in + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kh * iStride2 + kw * iStride3]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kh * iStride2 + kw * iStride3]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= gIStride2; - hend *= gIStride2; - wstart *= gIStride3; - wend *= gIStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + hstart *= gIStride2; + hend *= gIStride2; + wstart *= gIStride3; + wend *= gIStride3; - if ((int) extraParam0 == 0) //Exclude padding - valO /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep2))) * 
static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) - pgI[kh + kw] += valO; + if ((int) extraParam0 == 0) //Exclude padding + valO /= static_cast(nd4j::math::nd4j_ceil( + static_cast(hend - hstart) / static_cast(gIStep2))) * + static_cast(nd4j::math::nd4j_ceil( + static_cast(wend - wstart) / + static_cast(gIStep3))); //Accounts for dilation + else if ((int) extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) + pgI[kh + kw] += valO; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, pgI, sum, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = sameStrides ? 
gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.f); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = static_cast(0.f); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - valO *= nd4j::math::nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - pgI[kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(pIn[kh + kw]); - } - else { + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. - extraParam0) / extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + pgI[kh + kw] += valO * nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * + nd4j::math::nd4j_sgn(pIn[kh + kw]); + } else { - valO *= nd4j::math::nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), + extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) { - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kh * iStride2 + kw * iStride3]; - pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. 
- extraParam0) / extraParam0); + + for (Nd4jLong kh = hstart; kh < hend; kh += dH) { + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kh * iStride2 + kw * iStride3]; + pgI[kh * gIStride2 + kw * gIStride3] += valO * + nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs( + inVal), + extraParam0 - 1.f) * + nd4j::math::nd4j_sgn( + inVal); + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect pooling2dBP mode"); } } @@ -1425,226 +1531,239 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4; - Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - maxKD = dstart; - maxKH = hstart; - maxKW = wstart; + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T valIn = pIn[kd + kh + kw]; - if (valIn > sum) { - 
sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + maxKD = dstart; + maxKH = hstart; + maxKW = wstart; + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T valIn = pIn[kd + kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKD + maxKH + maxKW] += valO; - } - else { + gI[pIn - in + maxKD + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; - maxKD = dstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; + maxKD = dstart; - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; - if (valIn > sum) { - sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= gIStride2; - dend *= gIStride2; - hstart *= gIStride3; - hend *= gIStride3; - wstart *= gIStride4; - wend *= gIStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW 
* ((wend - iW + dW - 1) / dW); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + dstart *= gIStride2; + dend *= gIStride2; + hstart *= gIStride3; + hend *= gIStride3; + wstart *= gIStride4; + wend *= gIStride4; - if (extraParam0 == 0) //Exclude padding - valO /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) - pgI[kd + kh + kw] += valO; + if (extraParam0 == 0) //Exclude padding + valO /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(gIStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) + pgI[kd + kh + kw] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, pgI, valO, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = gI + (pIn - in); + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = gI + (pIn - in); - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - 
if(sameStrides) { + sum = static_cast(0.); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); - } - else { + valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); + } else { + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; - pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); - } + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; + pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + } + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { nd4j_printf("ConvolutionUtils::pooling3dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp 
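
The pooling2dBP/pooling3dBP hunks above all follow the same conversion: the OpenMP pragma with its private(...) clause is replaced by a lambda whose loop-local temporaries are declared inside the body, and only the outer batch/channel(/depth) loops are handed to samediff::Threads::parallel_for. A minimal sketch of that pattern on a hypothetical 2D helper (scaleChannels is illustrative only and not part of this patch; it assumes PRAGMA_THREADS_FOR_2D supplies start_x/stop_x/inc_x and start_y/stop_y/inc_y, as the lambdas above suggest):

    #include <execution/Threads.h>

    // Illustrative only -- scaleChannels is not part of this patch. It assumes the
    // PRAGMA_THREADS_FOR_2D macro provides start_x/stop_x/inc_x and start_y/stop_y/inc_y,
    // matching the lambdas in the pooling hunks above.
    static void scaleChannels(float *data, int bS, int iC, int spatialLen, float factor) {
        auto func = PRAGMA_THREADS_FOR_2D {
            for (auto b = start_x; b < stop_x; b += inc_x)
                for (auto c = start_y; c < stop_y; c += inc_y)
                    for (int i = 0; i < spatialLen; ++i)
                        data[(b * iC + c) * spatialLen + i] *= factor;  // each thread owns disjoint (b, c) slices
        };

        // same call shape as pooling2dBP above: start/stop/increment per parallelized dimension
        samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
    }

Keeping the inner spatial loops serial mirrors the pooling kernels: only the outermost dimensions are split across threads, so each thread touches a disjoint slice of the gradient buffer and no synchronization is needed.
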
b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index f61a53f30..3150c0cfd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -38,14 +38,17 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index 55cc57d3e..f041452ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -44,45 +45,51 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * output_height * output_width * output_depth; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int out_idx = 0; out_idx < total_count; out_idx++) { - const int d = out_idx % output_depth; - const int out_idx2 = out_idx / output_depth; - const int w = out_idx2 % output_width; - const int out_idx3 = out_idx2 / output_width; - const int h = out_idx3 % output_height; - const int b = out_idx3 / output_height; + auto func = PRAGMA_THREADS_FOR { + for (auto out_idx = start; out_idx < stop; out_idx += increment) { + const int d = out_idx % output_depth; + const int out_idx2 = out_idx / output_depth; + const int w = out_idx2 % output_width; + const int out_idx3 = out_idx2 / output_width; + const int h = out_idx3 % output_height; + const int b = out_idx3 / output_height; - const int in_h = h / block_size; - const int offset_h = h % block_size; - const int in_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * output_depth; - const int in_d = d + offset_d; - const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); - (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; - } + const int in_h = h / block_size; + const int offset_h = h % block_size; + const int in_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * output_depth; + const int in_d = d + offset_d; + const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); + (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * input_depth_by_input_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int input_idx = 0; input_idx < total_count; input_idx++) { - const int n_bY_bX_oC_iY = input_idx / input_width; - const int iX = input_idx - n_bY_bX_oC_iY * input_width; + auto func = PRAGMA_THREADS_FOR { + for (int input_idx = start; input_idx < stop; input_idx += increment) { + const int n_bY_bX_oC_iY = input_idx / input_width; + const int iX = input_idx - n_bY_bX_oC_iY * input_width; - const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; - const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; + const int 
n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; + const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; - const int n_bY = n_bY_bX / block_size; - const int bX = n_bY_bX - n_bY * block_size; + const int n_bY = n_bY_bX / block_size; + const int bX = n_bY_bX - n_bY * block_size; - const int n = n_bY / block_size; - const int bY = n_bY - n * block_size; + const int n = n_bY / block_size; + const int bY = n_bY - n * block_size; - const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); + const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); - (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; - } + (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp index 3a687981e..f2f2033c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp @@ -34,7 +34,6 @@ static void _diagFunctor(const NDArray* input, NDArray* output) { const int inLength = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLength > Environment::getInstance()->elementwiseThreshold()) for(int i = 0; i < inLength; ++i) output->p(i * (inLength + 1), (*input).e(i)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index c75bbf131..f5c0fe71c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -52,33 +53,36 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const const uint oH = output->sizeAt(1); const uint oW = output->sizeAt(2); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint oh = 0; oh < oH; ++oh) { - for (uint ow = 0; ow < oW; ++ow) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint oh = start_y; oh < stop_y; oh += inc_y) { + for (uint ow = 0; ow < oW; ++ow) { + for (uint c = 0; c < iC; ++c) { - X max = -DataTypeUtils::max(); + X max = -DataTypeUtils::max(); - for (uint kh = 0; kh < kH; ++kh) { - const int ih = oh * sH - pH + kh * dH; - if (ih < 0 || ih >= iH) continue; + for (uint kh = 0; kh < kH; ++kh) { + const int ih = oh * sH - pH + kh * dH; + if (ih < 0 || ih >= iH) continue; - for (uint kw = 0; kw < kW; ++kw) { - const int iw = ow * sW - pW + kw * dW; - if(iw < 0 || iw >= iW) continue; + for (uint kw = 0; kw < kW; ++kw) { + const int iw = ow * sW - pW + kw * dW; + if (iw < 0 || iw >= iW) continue; - const X val = x[shape::getOffset(xShapeInfo, {b,(uint)ih,(uint)iw,c})] + y[shape::getOffset(yShapeInfo, {kh,kw,c})]; - if (val > max) - max = val; + const X val = x[shape::getOffset(xShapeInfo, {b, (uint) ih, (uint) iw, c})] + y[shape::getOffset(yShapeInfo, {kh, kw, c})]; + if (val > max) + max = val; + } } - } - z[shape::getOffset(zShapeInfo, {b,oh,ow,c})] = static_cast(max); + z[shape::getOffset(zShapeInfo, {b, oh, ow, c})] = static_cast(max); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray 
*weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 7b40d0fa7..9db974b36 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -33,13 +34,16 @@ namespace helpers { nd4j::graph::RandomGenerator nodeRng(3019L, seed); int inLen = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLen > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < inLen; ++e) { - float val = nodeRng.relativeT(e, T(0.f), T(1.f)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float val = nodeRng.relativeT(e, T(0.f), T(1.f)); - if (val < probValue) - output->p(e, input->e(e) / probValue); - } + if (val < probValue) + output->p(e, input->e(e) / probValue); + } + }; + + samediff::Threads::parallel_for(func, 0, inLen); } BUILD_SINGLE_TEMPLATE(template void dropoutSimple, (NDArray const* input, NDArray* output, double probValue, int seed), FLOAT_TYPES); @@ -59,7 +63,6 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit)) for( int i = 0; i < dims.size(); i++ ) { if (fit) { dims[i] = reduceShape->e(i); @@ -126,14 +129,17 @@ namespace helpers { //input->template applyRandom>(rng, nullptr, output, probValueArr); nd4j::graph::RandomGenerator nodeRng(3019L, seed); - PRAGMA_OMP_PARALLEL_FOR_IF(input->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); - float xVal = input->e(e); - output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); + float xVal = input->e(e); + output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); + } + }; - return ND4J_STATUS_OK; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); + + return Status::OK(); } template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 2a2b631c8..073167f18 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -18,6 +18,7 @@ // Created by george on 05.04.18. 
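
The dropoutSimple hunk just above shows the one-dimensional form of the same rewrite: PRAGMA_THREADS_FOR exposes start, stop and increment, the body walks a flat element range, and the loop is dispatched with samediff::Threads::parallel_for; helpers whose iterations each own a whole sub-array (the dynamic.cpp, extract_patches.cpp and gather.cpp hunks that follow) use samediff::Threads::parallel_tad with the same lambda shape instead. A hedged, self-contained sketch of the element-wise case (applyScalar is illustrative only and not part of this patch):

    #include <execution/Threads.h>

    // Illustrative only -- applyScalar is not part of this patch. Nd4jLong comes from the
    // project's common headers; the lambda bounds (start, stop, increment) follow the
    // dropoutSimple hunk above.
    static void applyScalar(float *buffer, Nd4jLong length, float value) {
        auto func = PRAGMA_THREADS_FOR {
            for (auto e = start; e < stop; e += increment)
                buffer[e] += value;                        // plain element-wise work
        };

        samediff::Threads::parallel_for(func, 0, length);  // element-wise range -> parallel_for
        // per-sub-array work would instead use samediff::Threads::parallel_tad(func, 0, numTads)
    }
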
// #include +#include namespace nd4j { namespace ops { @@ -61,14 +62,17 @@ namespace nd4j { } else { unsigned int outSize = outputList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(outSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < outSize; i++) { - outputs[i].first = outputList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - outputs[i].first->p(outputs[i].second++, input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = outputList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + if (indices->e(e) == i) + outputs[i].first->p(outputs[i].second++, input->e(e)); + } + }; + + samediff::Threads::parallel_tad(func, 0, outSize); } } template @@ -165,14 +169,17 @@ namespace nd4j { auto output = outputList[0]; unsigned int gradsSize = inputGradientList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(gradsSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < gradsSize; i++) { - outputs[i].first = inputGradientList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - output->p(e, outputs[i].first->e(outputs[i].second++)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = inputGradientList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + if (indices->e(e) == i) + output->p(e, outputs[i].first->e(outputs[i].second++)); + } + }; + + samediff::Threads::parallel_tad(func, 0, gradsSize); } outputList[1]->assign(indices); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index f450584d7..f3fe89103 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -47,37 +48,41 @@ namespace helpers { rowCast = 0; if (sizeCol * rateCol < 3) colCast = 0; - //Nd4jLong outputLastDim = output->sizeAt(3); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong batch = 0; batch < batchCount; batch++) { - auto patch = listOfMatricies->at(batch); - auto outMatrix = listOfOutputs->at(batch); - for (Nd4jLong i = 0; i < outRowDim; i++) { - for (Nd4jLong j = 0; j < outColDim; j++) { - Nd4jLong pos = 0; - //for (Nd4jLong k = 0; k < outputLastDim; k++) { - auto rowStart = i * strideRow - (theSame?rowCast:0); - auto colStart = j * strideCol - (theSame?colCast:0); - auto rowEnd = rowStart + sizeRow * rateRow; - auto colEnd = colStart + sizeCol * rateCol; - if (!theSame) { - rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); - colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); - } - //auto pixel = 0LL; - for (auto row = rowStart; row < rowEnd; row += rateRow) - for (auto col = colStart; col < colEnd; col += rateCol) - for (auto pixel = 0; pixel < lastDim; pixel++) { - bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); - if (setUp) { - outMatrix->t(i, j, pos) = patch->e(row, col, pixel); - } - pos++; - } - } - } - } + auto func = PRAGMA_THREADS_FOR { + for (auto batch = 0; batch < stop; batch += increment) { + auto patch = listOfMatricies->at(batch); + auto outMatrix = listOfOutputs->at(batch); + + for (Nd4jLong i = 0; i < outRowDim; i++) { + for (Nd4jLong j = 0; j < outColDim; j++) { + Nd4jLong pos 
= 0; + //for (Nd4jLong k = 0; k < outputLastDim; k++) { + auto rowStart = i * strideRow - (theSame ? rowCast : 0); + auto colStart = j * strideCol - (theSame ? colCast : 0); + auto rowEnd = rowStart + sizeRow * rateRow; + auto colEnd = colStart + sizeCol * rateCol; + if (!theSame) { + rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); + colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); + } + //auto pixel = 0LL; + for (auto row = rowStart; row < rowEnd; row += rateRow) + for (auto col = colStart; col < colEnd; col += rateCol) + for (auto pixel = 0; pixel < lastDim; pixel++) { + bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || + (!theSame); + if (setUp) { + outMatrix->t(i, j, pos) = patch->e(row, col, pixel); + } + pos++; + } + } + } + } + }; + + samediff::Threads::parallel_tad(func, 0, batchCount); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 1a43fb250..3fb7c290d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -56,12 +57,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -72,12 +77,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 687dc0bde..9e3bdf885 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -46,7 +47,7 @@ namespace nd4j { Nd4jLong distance = 0; auto lengthOf = x.lengthOf(); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); + int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); Nd4jLong intermediate[256]; // nullify temp values @@ -54,30 +55,38 @@ namespace nd4j { intermediate[e] = 0; if (xEws == 1 && yEws == 1 && x.ordering() == 
y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e]); - auto _y = static_cast(yBuffer[e]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e]); + auto _y = static_cast(yBuffer[e]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e * xEws]); - auto _y = static_cast(yBuffer[e * yEws]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e * xEws]); + auto _y = static_cast(yBuffer[e * yEws]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(x.e(e)); - auto _y = static_cast(y.e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(x.e(e)); + auto _y = static_cast(y.e(e)); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } // accumulate intermediate variables into output array diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index b254788a8..04df86c36 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -40,18 +41,20 @@ namespace nd4j { auto tempResult = tempBufferB; // we divide array into 32 element chunks, and store intermediate results once - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = buffer + b * numBlocks; + auto func = PRAGMA_THREADS_FOR { + for (auto b = 0; b < stop; b += increment) { + auto blockBuffer = buffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempBuffer[b] = r; } - - tempBuffer[b] = r; - } + }; + samediff::Threads::parallel_tad(func, 0, numBlocks); // we replace pointer with intermediate one, and repeat only one chunk left int iterationCount = 0; @@ -60,18 +63,20 @@ namespace nd4j { numBlocks = lastLength / blockSize + ((lastLength % blockSize == 0) ? 
0 : 1); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = tempBuffer + b * numBlocks; + auto func2 = PRAGMA_THREADS_FOR { + for (auto b = start; b < stop; b += increment) { + auto blockBuffer = tempBuffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempResult[b] = r; } - - tempResult[b] = r; - } + }; + samediff::Threads::parallel_tad(func2, 0, numBlocks); iterationCount++; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp index 349d0381a..1ffb59824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp @@ -42,29 +42,17 @@ void histogramFixedWidth_(const NDArray& input, const NDArray& range, NDArray& o Nd4jLong inputLength = input.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR + // FIXME: make this one parallel without CRITICAL section for(Nd4jLong i = 0; i < inputLength; ++i) { - const T value = input.e(i); if(value < secondEdge) { - - PRAGMA_OMP_CRITICAL - { - output.p(0, output.e(0) + 1); - } + output.p(0, output.e(0) + 1); } else if(value >= lastButOneEdge) { - PRAGMA_OMP_CRITICAL - { - output.p(nbins - 1, output.e(nbins - 1) + 1); - } + output.p(nbins - 1, output.e(nbins - 1) + 1); } else { Nd4jLong currInd = static_cast((value - leftEdge) / binWidth); - - PRAGMA_OMP_CRITICAL - { - output.p(currInd, output.e(currInd) + 1); - } + output.p(currInd, output.e(currInd) + 1); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 002c68226..7be34e6ca 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -59,64 +60,71 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra const Nd4jLong imStride2 = imStride[2]; const Nd4jLong imStride3 = imStride[3]; - T *col, *im; - int imRow, imCol; if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + for (int b = start_x; b < stop_x; b++) { + for (int c = start_y; c < stop_y; c++) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + int imRow = (-pH + kRow * dH) + colH * sH; + int imCol = (-pW + kCol * dW) + colW * sW; - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + auto col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * 
colStride4 + colW * colStride5; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + auto im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int colH = start_y; colH < stop_y; colH += inc_y) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 2ac679fc5..11bc1ecaa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -83,7 +84,7 @@ namespace helpers { return top + (bottom - top) * yVal; }; - PRAGMA_OMP_PARALLEL_FOR_SIMD + // FIXME: fix parallelism here for (Nd4jLong b = 0; b < batchSize; ++b) { for (Nd4jLong y = 0; y < outHeight; ++y) { const T *ys_input_lower_ptr = input_b_ptr + ys[y].bottomIndex * inRowSize; @@ -149,11 +150,13 @@ namespace helpers { int xsSize = xs.size(); // Scale x interpolation weights to avoid a multiplication during iteration. - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int i = 0; i < xsSize; ++i) { - xs[i].bottomIndex *= channels; - xs[i].topIndex *= channels; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + xs[i].bottomIndex *= channels; + xs[i].topIndex *= channels; + } + }; + samediff::Threads::parallel_for(func, 0, xsSize); resizeImage(images, batchSize, inHeight, inWidth, outHeight, outWidth, channels, xs, ys, output); return ND4J_STATUS_OK; @@ -184,24 +187,22 @@ namespace helpers { double heightScale = center ? (inHeight - 1.) 
/ double(outHeight - 1.0) : (inHeight / double(outHeight)); double widthScale = center ? (inWidth - 1.) / double(outWidth - 1.0) : (inWidth / double(outWidth)); - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int b = 0; b < batchSize; ++b) { - for (int y = 0; y < outHeight; ++y) { - Nd4jLong inY = nd4j::math::nd4j_min( - (center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor( - y * heightScale)), inHeight - 1); - for (int x = 0; x < outWidth; ++x) { - Nd4jLong inX = nd4j::math::nd4j_min( - (center) ? static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor( - x * widthScale)), inWidth - 1); - for (Nd4jLong e = 0; e < channels; e++) - output->p(b, y, x, e, images->e(b, inY, inX, e)); -// std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + Nd4jLong inY = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor(y * heightScale)), inHeight - 1); + + for (int x = 0; x < outWidth; ++x) { + Nd4jLong inX = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor(x * widthScale)),inWidth - 1); + for (Nd4jLong e = 0; e < channels; e++) + output->p(b, y, x, e, images->e(b, inY, inX, e)); + } } } - } + }; + samediff::Threads::parallel_for(func, 0, batchSize, 1, 0, outHeight, 1); - return ND4J_STATUS_OK; + return Status::OK(); } void resizeImage(NDArray const *images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, @@ -263,67 +264,73 @@ namespace helpers { T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int y = 0; y < cropHeight; ++y) { - const float inY = (cropHeight > 1) - ? y1 * (imageHeight - 1) + y * heightScale - : 0.5 * (y1 + y2) * (imageHeight - 1); - if (inY < 0 || inY > imageHeight - 1) { - for (int x = 0; x < cropWidth; ++x) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - } - continue; - } - if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); - const float y_lerp = inY - topYIndex; + auto func = PRAGMA_THREADS_FOR { + for (int y = start; y < stop; y += increment) { + const float inY = (cropHeight > 1) + ? y1 * (imageHeight - 1) + y * heightScale + : 0.5 * (y1 + y2) * (imageHeight - 1); - for (int x = 0; x < cropWidth; ++x) { - const float in_x = (cropWidth > 1) - ? 
x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (in_x < 0 || in_x > imageWidth - 1) { + if (inY < 0 || inY > imageHeight - 1) { + for (int x = 0; x < cropWidth; ++x) { for (int d = 0; d < depth; ++d) { crops->p(b, y, x, d, extrapolationVal); } - continue; - } - int left_x_index = math::p_floor(in_x); - int right_x_index = math::p_ceil(in_x); - T x_lerp = in_x - left_x_index; - - for (int d = 0; d < depth; ++d) { - const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); - const float topRight(images->e(bIn, topYIndex, right_x_index, d)); - const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); - const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); - const float top = topLeft + (topRight - topLeft) * x_lerp; - const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; - crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } + continue; } - } else { // method is "nearest neighbor" - for (int x = 0; x < cropWidth; ++x) { - const float inX = (cropWidth > 1) - ? x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (inX < 0 || inX > imageWidth - 1) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); + if (method == 0 /* bilinear */) { + const int topYIndex = nd4j::math::p_floor(inY); + const int bottomYIndex = nd4j::math::p_ceil(inY); + const float y_lerp = inY - topYIndex; + + for (int x = 0; x < cropWidth; ++x) { + const float in_x = (cropWidth > 1) + ? x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (in_x < 0 || in_x > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + int left_x_index = math::p_floor(in_x); + int right_x_index = math::p_ceil(in_x); + T x_lerp = in_x - left_x_index; + + for (int d = 0; d < depth; ++d) { + const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); + const float topRight(images->e(bIn, topYIndex, right_x_index, d)); + const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); + const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); + const float top = topLeft + (topRight - topLeft) * x_lerp; + const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; + crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } - continue; } - const int closestXIndex = roundf(inX); - const int closestYIndex = roundf(inY); - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, (F)images->e(bIn, closestYIndex, closestXIndex, d)); + } else { // method is "nearest neighbor" + for (int x = 0; x < cropWidth; ++x) { + const float inX = (cropWidth > 1) + ? 
x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (inX < 0 || inX > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + const int closestXIndex = roundf(inX); + const int closestYIndex = roundf(inY); + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, (F) images->e(bIn, closestYIndex, closestXIndex, d)); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, cropHeight); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index f4fb98b2a..ab48ebb32 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -72,7 +72,8 @@ namespace helpers { for (int i = 0; i < numBoxes; ++i) { bool shouldSelect = numSelected < output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(numSelected)) + + // FIXME: add parallelism here for (int j = numSelected - 1; j >= 0; --j) { if (shouldSelect) if (needToSuppressWithThreshold(*boxes, indices[i], indices[selectedIndices[j]], T(overlapThreshold))) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index def210457..4bc9d3304 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -144,14 +145,8 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int span = (tads / num_threads) + 8; - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; @@ -198,7 +193,9 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, tads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp index 09cb2df2e..62f8316ce 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index 0d0705104..c9b833cf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,76 +61,80 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j 
- depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = prev + x[s] * x[s]; - y[j] = prev; + // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s] * x[s]; + y[j] = prev; + } else if (begin == 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * x[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * x[begin - 1]; - else - y[j] = prev; + } + }; - if(j != 0) - prev = y[j]; - - y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = 0; i < numOfTads; ++i) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = prev + x[s*inTadEws] * x[s*inTadEws]; - y[j*outTadEws] = prev; + // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s * inTadEws] * x[s * inTadEws]; + y[j * outTadEws] = prev; + } else if (begin == 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * outTadEws] = prev - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * outTadEws] = prev; + + if (j != 0) + prev = y[j * outTadEws]; + + y[j * outTadEws] = x[j * inTadEws] / nd4j::math::nd4j_pow(tbias + 
alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*outTadEws] = prev - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*outTadEws] = prev; + } + }; - if(j != 0) - prev = y[j*outTadEws]; - - y[j*outTadEws] = x[j*inTadEws] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } return Status::OK(); } @@ -173,141 +178,146 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s] * x[s]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s] * x[s]; + } else if (begin == 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; + else + y[j] = y[j - 1]; } - else if (begin == 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; - else - y[j] = y[j - 1]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); + prev = prev + x[s] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * 
factor[end - 1] - x[begin - 1] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * factor[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = factor[j] * init - 2 * x[j] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); - prev = prev + x[s] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1] - x[begin - 1] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * factor[begin - 1]; - else - y[j] = prev; - - if(j != 0) - prev = y[j]; - - y[j] = factor[j] * init - 2 * x[j] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s*inTadEws] * x[s*inTadEws]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s * inTadEws] * x[s * inTadEws]; + } else if (begin == 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - + x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * gradITadEws] = y[(j - 1) * gradITadEws]; } - else if (begin == 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * 
x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*gradITadEws] = y[(j - 1)*gradITadEws]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j * gradITadEws]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s * gradITadEws], -tbeta - 1); + prev = prev + x[s * inTadEws] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1] - + x[(begin - 1) * inTadEws] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = prev - x[(begin - 1) * inTadEws] * factor[begin - 1]; + else + y[j * gradITadEws] = prev; + + if (j != 0) + prev = y[j * gradITadEws]; + + y[j * gradITadEws] = factor[j] * init - 2 * x[j * inTadEws] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j*gradITadEws]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s*gradITadEws], -tbeta - 1); - prev = prev + x[s*inTadEws] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1] - x[(begin - 1)*inTadEws] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = prev - x[(begin - 1)*inTadEws] * factor[begin - 1]; - else - y[j*gradITadEws] = prev; - - if(j != 0) - prev = y[j*gradITadEws]; - - y[j*gradITadEws] = factor[j] * init - 2 * x[j*inTadEws] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } gradI *= gradO; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 743aab40a..922fdc3a9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -122,11 +123,14 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto cLast_ = cLast->bufferAsT(); auto h_ = 
h->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint e = 0; e < uLen; e++) { - c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); - h_[e] = nd4j::math::nd4j_tanh(c_[e]); - } + auto func = PRAGMA_THREADS_FOR { + for (uint e = start; e < stop; e += increment) { + c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); + h_[e] = nd4j::math::nd4j_tanh(c_[e]); + } + }; + + samediff::Threads::parallel_for(func, 0, uLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index 9a2034fd0..25605d77e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -20,6 +20,7 @@ #include "ResultSet.h" #include +#include namespace nd4j { namespace ops { @@ -47,22 +48,22 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp const int xRank = input.rankOf(); const auto xLen = input.lengthOf(); - std::vector coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (Nd4jLong i = 0; i < xLen; ++i) { + shape::index2coords(i, xShapeInfo, coords); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for (Nd4jLong i = 0; i < xLen; ++i) { + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords); - shape::index2coords(i, xShapeInfo, coords.data()); - - const auto xOffset = shape::getOffset(xShapeInfo, coords.data()); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data()); - - // condition to be on diagonal of innermost matrix - if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())]; - else - z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; - } + // condition to be on diagonal of innermost matrix + if (coords[xRank - 2] == coords[xRank - 1]) + z[zOffset] = y[shape::getOffset(yShapeInfo, coords)]; + else + z[zOffset] = zeroPad ? 
static_cast(0) : x[xOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, xLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index c26637bd8..e0e487e82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -21,6 +21,7 @@ #include "ResultSet.h" #include #include +#include namespace nd4j { namespace ops { @@ -43,10 +44,14 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lastDimension = nd4j::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); // TODO: tune this properlys int lO = listOut->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < lO; ++i) - for(int j = 0; j < lastDimension; ++j) - listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + for (int j = 0; j < lastDimension; ++j) + listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + }; + + samediff::Threads::parallel_tad(func, 0, lO); delete listOut; delete listDiag; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index daffa8f17..8c5332be6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -53,11 +54,14 @@ namespace helpers { std::unique_ptr rows(sortedVals.allTensorsAlongDimension(lastDims)); Nd4jLong oL = output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < oL; e++) { - auto row = rows->at(e); - output->p(e, row->e(n)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto row = rows->at(e); + output->p(e, row->e(n)); + } + }; + + samediff::Threads::parallel_for(func, 0, oL); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index a83518899..3e18d6d14 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "../one_hot.h" namespace nd4j { @@ -47,41 +48,47 @@ namespace nd4j { Z one = static_cast(on); if (tadEws >= 1) { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = idx == t ? 
one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } else { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp index 6ebbb784b..5c1f3c28d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp @@ -66,7 +66,7 @@ static void _percentile(const NDArray& input, NDArray& output, std::vector& position = len - position - 1; // FIXME: our sort impl should be used instead, so this operation might be implemented as generic - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(flattenedArr)) + // FIXME: parallelism ! for(int i=0; isize(); ++i) { T* buff = reinterpret_cast(flattenedArr.getBuffer()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index 6290de6ad..cb97ffe1e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,6 @@ static FORCEINLINE T getFactorial(const int n) { T result = (T)1.f; - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(prodT : result) for(int i = 2; i <= n; ++i) result *= i; @@ -74,9 +74,12 @@ static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const ND NDArray& result = output; int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < x.lengthOf(); ++i) - result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + }; + samediff::Threads::parallel_for(func, 0, x.lengthOf()); // return result; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index 33ba9575d..bb0e7e24e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -37,10 +38,11 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto s = start.e(0); auto d = delta.e(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < len; ++i) - buff[i] = s + i * d; - + auto func = 
PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + buff[i] = s + i * d; + }; + samediff::Threads::parallel_for(func, 0, len); } void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 83deeca88..9f424606d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -52,36 +53,36 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // two step phase here if (inArr == outArr) { if (inEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx = sLength - e; - swap(inArr, e, idx); -// T tmp = inArr[e]; -// inArr[e] = inArr[idx]; -// inArr[idx] = tmp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx = sLength - e; + swap(inArr, e, idx); + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else if (inEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx1 = (sLength - e) * inEWS; - Nd4jLong idx2 = e * inEWS; -// T tmp = inArr[idx2]; -// inArr[idx2] = inArr[idx1]; -// inArr[idx1] = tmp; - swap(inArr, idx1, idx2); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx1 = (sLength - e) * inEWS; + Nd4jLong idx2 = e * inEWS; + swap(inArr, idx1, idx2); + } + }; + + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); + swap(outArr, inOffset, outOffset); + } + }; - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); - //outArr[outOffset] = inArr[inOffset]; - swap(outArr, inOffset, outOffset); - } + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } } else { @@ -91,47 +92,57 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[sLength - e] = inArr[e]; + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) + outArr[sLength - e] = inArr[e]; + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e] = inArr[e]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e] = inArr[e]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + }; + 
samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e * outEWS] = inArr[e * inEWS]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e * outEWS] = inArr[e * inEWS]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); - outArr[outOffset] = inArr[inOffset]; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(e, outShapeBuffer); - outArr[outOffset] = inArr[inOffset]; - } + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } } @@ -140,7 +151,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * /////////////////////////////////////////////////////////////////// template -static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ +static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; if(input->isVector() || shape::isLikeVector(input->getShapeInfo(), posOfNonUnityDim)) { @@ -184,7 +195,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input } void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { - BUILD_SINGLE_SELECTOR(input->dataType(), _reverseSequence, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), reverseSequence_, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// @@ -208,7 +219,7 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu delete listIn; } -BUILD_SINGLE_TEMPLATE(template void _reverseSequence, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseSequence_, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, void *inArr, Nd4jLong *inShapeBuffer, void *outArr, Nd4jLong *outShapeBuffer, int 
numOfElemsToReverse), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 5b4c44874..5422d04c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -20,6 +20,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -53,21 +54,22 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop const uint iC = xShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = cropBottom; h < iH - cropTop; ++h) { - for (uint w = cropLeft; w < iW - cropRight; ++w) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = start_z; w < stop_z; w += inc_z) { + for (uint c = 0; c < iC; ++c) { + const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; + const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; - - const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - - z[zOffset] = x[xOffset]; + z[zOffset] = x[xOffset]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, cropBottom, iH - cropTop, 1, cropLeft, iW - cropRight, 1); } BUILD_SINGLE_TEMPLATE(template void batchToSpace_, (const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight), LIBND4J_TYPES); @@ -109,23 +111,24 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { - for (Nd4jLong i = 0; i < zLen; ++i) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + // evaluate spatial coordinates for x + for (uint j = 1; j <= numOfSpatialDims; ++j) + coords[j] += crop.e(j - 1, 0); // add crop left - // evaluate spatial coordinates for x - for(uint j = 1; j <= numOfSpatialDims; ++j) - coords[j] += crop.e(j - 1, 0); // add crop left + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); @@ -212,24 +215,26 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB const uint iC = zShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = 0; h < oH; ++h) { - for (uint w = 0; w < oW; ++w) { - for (uint c = 
0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = 0; w < oW; ++w) { + for (uint c = 0; c < iC; ++c) { - const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; + const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; - if(h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { - const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; - z[zOffset] = x[xOffset]; + if (h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { + const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; + z[zOffset] = x[xOffset]; + } else + z[zOffset] = 0.f; } - else - z[zOffset] = 0.f; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } BUILD_SINGLE_TEMPLATE(template void spaceToBatch_, (const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight), LIBND4J_TYPES); @@ -292,36 +297,37 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; - bool within = true; + for (uint j = 1; j <= numOfSpatialDims; ++j) { - for(uint j = 1; j <= numOfSpatialDims; ++j) { + const auto padLeft = padding.e(j - 1, 0); + const auto padRight = padding.e(j - 1, 1); - const auto padLeft = padding.e(j - 1, 0); - const auto padRight = padding.e(j - 1, 1); + within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); - within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); + if (!within) + break; - if(!within) - break; + coords[j] -= padLeft; // get coordinates for x + } - coords[j] -= padLeft; // get coordinates for x + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = 0.f; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = 0.f; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index af9a74b68..fd285ed9c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -46,47 +47,53 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * input_height * input_width * input_depth; - 
PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++){ - // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) - const int d = inp_idx % input_depth; - const int inp_idx2 = inp_idx / input_depth; - const int w = inp_idx2 % input_width; - const int inp_idx3 = inp_idx2 / input_width; - const int h = inp_idx3 % input_height; - const int b = inp_idx3 / input_height; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) + const int d = inp_idx % input_depth; + const int inp_idx2 = inp_idx / input_depth; + const int w = inp_idx2 % input_width; + const int inp_idx3 = inp_idx2 / input_width; + const int h = inp_idx3 % input_height; + const int b = inp_idx3 / input_height; - const int out_h = h / block_size; - const int offset_h = h % block_size; - const int out_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * input_depth; - const int out_d = d + offset_d; - - const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); - *(output_ptr + out_idx) = *(input_ptr + inp_idx); - } + const int out_h = h / block_size; + const int offset_h = h % block_size; + const int out_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * input_depth; + const int out_d = d + offset_d; + + const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); + *(output_ptr + out_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * output_depth_by_output_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++) { - const int n_iC_oY_bY_oX = inp_idx / block_size; - const int bX = inp_idx - n_iC_oY_bY_oX * block_size; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + const int n_iC_oY_bY_oX = inp_idx / block_size; + const int bX = inp_idx - n_iC_oY_bY_oX * block_size; - const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; - const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; + const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; + const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; - const int n_iC_oY = n_iC_oY_bY / block_size; - const int bY = n_iC_oY_bY - n_iC_oY * block_size; + const int n_iC_oY = n_iC_oY_bY / block_size; + const int bY = n_iC_oY_bY - n_iC_oY * block_size; - const int n = n_iC_oY / input_depth_by_output_height; - const int iC_oY = n_iC_oY - n * input_depth_by_output_height; + const int n = n_iC_oY / input_depth_by_output_height; + const int iC_oY = n_iC_oY - n * input_depth_by_output_height; - const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; - - *(output_ptr + output_idx) = *(input_ptr + inp_idx); - } + const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; + + *(output_ptr + output_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index 0b16ac989..99605e7cc 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -34,16 +35,16 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind const Nd4jLong indLen = indices.lengthOf(); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } else { // outRank > 1 @@ -54,17 +55,16 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) std::vector dimsToExcludeUpd(sizeOfDims); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - shape::printIntArray(dimsToExcludeUpd.data(),dimsToExcludeUpd.size()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray outSubArr = output(indices.e(i), std::vector({0})); + NDArray updSubArr = updates(i, dimsToExcludeUpd); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug ! -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); + } + }; - NDArray outSubArr = output(indices.e(i), std::vector({0})); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -77,40 +77,41 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i const Nd4jLong indLastDim = indices.sizeAt(-1); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } else { - std::vector dimsToExcludeInd = ShapeUtils::evalDimsToExclude(indRank, {indRank-1}); std::vector dimsToExcludeUpd(indRank - 1); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - std::vector idxRangeOut(2*outRank, 0); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided) firstprivate(idxRangeOut)) - for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) { + auto func = PRAGMA_THREADS_FOR { + std::vector idxRangeOut(2*outRank, 0); - NDArray indSubArr = indices(i, dimsToExcludeInd); + for (auto i = start; i < stop; i += increment) { + NDArray indSubArr = indices(i, dimsToExcludeInd); - for(Nd4jLong j = 0; j < indLastDim; ++j) { - idxRangeOut[2*j] = indSubArr.e(j); - idxRangeOut[2*j + 1] = idxRangeOut[2*j] + 1; + for (Nd4jLong j = 0; j < indLastDim; ++j) { + idxRangeOut[2 * j] = indSubArr.e(j); + idxRangeOut[2 * j + 1] = idxRangeOut[2 * j] + 1; + } + + NDArray outSubArr = output(idxRangeOut); + NDArray updSubArr = updates(i, dimsToExcludeUpd); + + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); } + }; - NDArray outSubArr = output(idxRangeOut); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen / indLastDim, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -125,20 +126,24 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1}); if(!calcGrad) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + output.p(i, subArr.e(indices.e(i))); + } + }; - auto subArr = updates(i, dimsToExclude); - output.p(i, subArr.e(indices.e(i))); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } else { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + auto ind = indices.e(i); + subArr.p(ind, subArr.e(ind) - 1.); + } + }; - auto subArr = updates(i, dimsToExclude); - auto ind = indices.e(i); - subArr.p(ind, subArr.e(ind) - 1.); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e13cfb177..2884107f3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -21,6 +21,9 @@ #include #include +#include +#include + namespace nd4j { namespace ops { namespace helpers { @@ -167,10 +170,13 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < meanT->lengthOf(); e++) { - meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, meanT->lengthOf()); + count++; } else { @@ -221,10 +227,12 @@ namespace 
helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -270,10 +278,12 @@ namespace helpers { sumT->assign(listOfTensors->at(0)); for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -463,7 +473,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); int loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (size_t idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -477,11 +488,12 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); + // FIXME: parallelism here? for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loopSize = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong idx = 1; idx < loopSize; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *outputT += *current; @@ -501,7 +513,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (Nd4jLong idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -518,7 +531,8 @@ namespace helpers { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + + // FIXME: parallelism here? 
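                // Hedged sketch for the FIXME above (assumption, not a drop-in change): parallelising the
                // accumulation below directly over idx would race on *outputT, since every iteration does a
                // read-modify-write of the same array. A samediff-based variant would instead split the
                // element range of outputT, mirroring the PRAGMA_THREADS_FOR pattern used elsewhere in this
                // patch; the e<T>()/p()/lengthOf() accessors and the scalar type T are assumed from how the
                // surrounding code in this file uses them.
                //auto acc = PRAGMA_THREADS_FOR {
                //    for (auto e = start; e < stop; e += increment) {
                //        T sum = outputT->e<T>(e);                              // element 0 was assigned above
                //        for (Nd4jLong idx = 1; idx < loop_size; ++idx)
                //            sum += listOfTensors->at(fi->second.at(idx))->e<T>(e);
                //        outputT->p(e, sum);                                    // each output element owned by one thread
                //    }
                //};
                //samediff::Threads::parallel_for(acc, 0, outputT->lengthOf());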
for (Nd4jLong idx = 1; idx < loop_size; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *(outputT) += *current; @@ -619,12 +633,15 @@ namespace helpers { segmentMaxFunctor_(input, indices, tempRes); if (input->isVector()) { Nd4jLong loop_size = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < loop_size; ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) - output->p(e, gradOut->e(classNum)); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, loop_size); } else { std::vector restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -637,18 +654,21 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) - currentOut->p(e, currentGradOut->e(e)); + for (uint64_t e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -664,12 +684,14 @@ namespace helpers { std::unique_ptr tempRes(gradOut->dup()); segmentMinFunctor(context, input, indices, tempRes.get()); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) - output->p(e, gradOut->e(classNum)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -684,17 +706,22 @@ namespace helpers { output->assign(0.); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < 1.e-5) - currentOut->p(e, currentGradOut->e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = 
listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < + 1.e-5) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -730,17 +757,20 @@ namespace helpers { //std::vector> outputs(numOfClasses); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / classCount[classNum]); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / classCount.at(classNum)); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -762,16 +792,20 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int segmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { @@ -794,16 +828,19 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - NDArray* currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -861,12 +898,15 @@ namespace helpers { 
unsortedSegmentMinFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) - output->t(e) = gradOut->t(classNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) + output->t(e) = gradOut->t(classNum); + } + }; + + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -876,21 +916,21 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) - currentOut->t(e) = currentGradOut->t(e); + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) + currentOut->t(e) = currentGradOut->t(e); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -955,17 +995,19 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - //NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int unsortedSegmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { @@ -973,11 +1015,14 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) * tempRes->e(classNum)/ input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + output->p(e, 
gradOut->e(classNum) * tempRes->e(classNum) / input->e(e)); + } + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -987,19 +1032,22 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - auto currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; - return ND4J_STATUS_OK; + return Status::OK(); } // template @@ -1016,11 +1064,14 @@ namespace helpers { // if input is a vector: (as if in doc sample) if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < indices->lengthOf(); e++) { + auto classNum = indices->e(e); + output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -1029,22 +1080,22 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp index 03f61d453..bf3463afe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -26,11 +27,14 @@ namespace helpers { template static void sequenceMask_(NDArray* input, NDArray* output, int maxIndex) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (Nd4jLong i = 0; i < maxIndex; i++) - for(Nd4jLong k = 0; k < input->lengthOf(); k++) - if (i < input->t(k)) - output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i += inc_x) + for (auto k = start_y; k < stop_y; k += inc_y) + if (i < input->t(k)) + output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + }; + + samediff::Threads::parallel_for(func, 0, maxIndex, 1, 0, input->lengthOf(), 1); } void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index b0fd449c7..59c257c28 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -20,6 +20,7 @@ #include #include +#include #define HS_MAX_EXP 6.0f @@ -350,8 +351,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 0 : codes.sizeAt(1); @@ -362,64 +361,71 @@ namespace nd4j { auto bIndices = indices.bufferAsT(); auto bCodes = codes.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1e)) - for (int t = 0; t < numTargets; t++) { - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - memset(neu1e, 0, vectorLength * sizeof(T)); + auto func = PRAGMA_THREADS_FOR { + T sneu1e[600]; - auto target = bTarget[t]; - auto alpha = lr.e(t); - unsigned long long randomValue = nextRandom.e(t); + for (auto t = start; t < stop; t += increment) { + T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; + memset(neu1e, 0, vectorLength * sizeof(T)); - auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); + auto target = bTarget[t]; + auto alpha = lr.e(t); + unsigned long long randomValue = nextRandom.e(t); - if (hsRounds > 0) { - int irow = 0; - auto cShift = t * idxShift; + auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); - for (int e = 0; e < hsRounds; e++) { - irow = bIndices[e + cShift]; - if (irow < 0 || irow >= vocabSize) - continue; + if (hsRounds > 0) { + int irow = 0; + auto cShift = t * idxShift; - auto syn1row = s1.bufferWithOffset(irow * vectorLength); - auto code = bCodes[e + cShift]; + for (int e = 0; e < hsRounds; e++) { + irow = bIndices[e + cShift]; + if (irow < 0 || irow >= vocabSize) + continue; + + auto syn1row = s1.bufferWithOffset(irow * vectorLength); + auto code = bCodes[e + cShift]; //nd4j_printf("syn0: [%i]; syn1: [%i]; code: [%i]\n", target, irow, code); - hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, expLength, false); - } - } - - - if (nsRounds > 0) { - int irow = negStarters.e(t); - int nsStarter = irow; - for (int r = 0; r < nsRounds + 1; r++) { - if (r == 0) { - // target is known in advance - } else { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? 
-1 : static_cast(negTable[idx]); - - if (irow < 0 || irow >= vocabSize) - irow = randomValue % (vocabSize - 1) + 1; - - if (irow == nsStarter) - continue; + hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, + expLength, false); } - - nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); } + + + if (nsRounds > 0) { + int irow = negStarters.e(t); + int nsStarter = irow; + for (int r = 0; r < nsRounds + 1; r++) { + if (r == 0) { + // target is known in advance + } else { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) + irow = randomValue % (vocabSize - 1) + 1; + + if (irow == nsStarter) + continue; + } + + nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + } + + for (int e = 0; e < vectorLength; e++) + syn0row[e] += neu1e[e]; + + // optionally release temp arrays + if (vectorLength > 600) + delete[] neu1e; } + }; - for (int e = 0; e < vectorLength; e++) - syn0row[e] += neu1e[e]; - - // optionally release temp arrays - if (vectorLength > 600) - delete[] neu1e; - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void skipgramBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool preciseMode, const int numThreads), FLOAT_TYPES); @@ -434,9 +440,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1[600]; - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 0 : codes.sizeAt(1); @@ -450,122 +453,131 @@ namespace nd4j { const auto bStarters = negStarters.bufferAsT(); const auto numIndices = indices.isEmpty() ? 0 : indices.sizeAt(1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1, sneu1e)) - for (int e = 0; e < numTargets; e++){ - T* neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; + auto func = PRAGMA_THREADS_FOR { + T sneu1[600]; + T sneu1e[600]; - // optionally we nullify temp arrays after successful (and on first) cycle - memset(neu1, 0, sizeof(T) * vectorLength); - memset(neu1e, 0, sizeof(T) * vectorLength); + for (int e = start; e < stop; e += increment) { + T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; + T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - auto alpha = lr.e(e); - auto numLabels = nLabels.isEmpty() ? 0 : nLabels.e(e); + // optionally we nullify temp arrays after successful (and on first) cycle + memset(neu1, 0, sizeof(T) * vectorLength); + memset(neu1e, 0, sizeof(T) * vectorLength); - int actualContext = 0; + auto alpha = lr.e(e); + auto numLabels = nLabels.isEmpty() ? 
0 : nLabels.e(e); - // building neu1 for current window - for (int c = 0; c < contextWidth; c++) { - // getting next context word - auto cContext = bContext[c + (e * contextWidth)]; + int actualContext = 0; - // skipping padded values - if (cContext < 0) - continue; + // building neu1 for current window + for (int c = 0; c < contextWidth; c++) { + // getting next context word + auto cContext = bContext[c + (e * contextWidth)]; - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be >= vocab size"); - - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - neu1[i] += syn0word[i]; - - actualContext++; - } - - if (infVector != nullptr) - actualContext++; - - if (actualContext > 1) { - for (int i = 0; i < vectorLength; i++) - neu1[i] /= actualContext; - } - - // hierarchic softmax step - if (!indices.isEmpty()) { - for (int i = 0; i < numIndices; i++) { - const int cIndex = bIndices[(e * numIndices) + i]; - const int cCode = bCodes[(e * numIndices) + i]; - - // we're skipping padded values - if (cIndex < 0) + // skipping padded values + if (cContext < 0) continue; - if (cIndex >= vocabSize) - throw std::runtime_error("Index can't be > vocab size"); + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be >= vocab size"); - hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, cCode, expLength, false); + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + neu1[i] += syn0word[i]; + + actualContext++; } - } - // negative sampling step - if (!negStarters.isEmpty() && nsRounds > 0) { - int irow = bStarters[e]; - const int nsStarter = irow; - unsigned long long randomValue = nextRandom.e(e); + if (infVector != nullptr) + actualContext++; - for (int r = 0; r < nsRounds + 1; r++) { - // we're skipping rng on 0 step - if (r != 0) { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + if (actualContext > 1) { + for (int i = 0; i < vectorLength; i++) + neu1[i] /= actualContext; + } - if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; - if (irow == nsStarter) + // hierarchic softmax step + if (!indices.isEmpty()) { + for (int i = 0; i < numIndices; i++) { + const int cIndex = bIndices[(e * numIndices) + i]; + const int cCode = bCodes[(e * numIndices) + i]; + + // we're skipping padded values + if (cIndex < 0) continue; - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); - } else { - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 
1 : 0, expLength, infVector != nullptr); - } + if (cIndex >= vocabSize) + throw std::runtime_error("Index can't be > vocab size"); - //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, + cCode, expLength, false); + } + } + + // negative sampling step + if (!negStarters.isEmpty() && nsRounds > 0) { + int irow = bStarters[e]; + const int nsStarter = irow; + unsigned long long randomValue = nextRandom.e(e); + + for (int r = 0; r < nsRounds + 1; r++) { + // we're skipping rng on 0 step + if (r != 0) { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; + if (irow == nsStarter) + continue; + + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } else { + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + + //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + } + } + + + // if we're skipping labels + int starter = trainWords == 1 ? 0 : contextWidth - numLabels; + + // applying previously averaged results + for (int c = starter; c < contextWidth; c++) { + // getting context + auto cContext = bContext[c + (e * contextWidth)]; + auto cLock = bLocker[c + (e * contextWidth)]; + + // skipping padded values + if (cContext < 0 || cLock == 1) + continue; + + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be > vocab size"); + + // one word from context + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + syn0word[i] += neu1e[i]; + + } + + // optionally release temp arrays + if (vectorLength > 600) { + delete[] neu1; + delete[] neu1e; } } + }; - - // if we're skipping labels - int starter = trainWords == 1 ? 
0 : contextWidth - numLabels; - - // applying previously averaged results - for (int c = starter; c < contextWidth; c++) { - // getting context - auto cContext = bContext[c + (e * contextWidth)]; - auto cLock = bLocker[c + (e * contextWidth)]; - - // skipping padded values - if (cContext < 0 || cLock == 1) - continue; - - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be > vocab size"); - - // one word from context - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - syn0word[i] += neu1e[i]; - - } - - // optionally release temp arrays - if (vectorLength > 600) { - delete[] neu1; - delete[] neu1e; - } - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void cbowBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &context, NDArray &lockedWords, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, NDArray &nLabels, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool trainWords, const int numThreads), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index a80e65999..1fea14824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -141,47 +142,49 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pHt = ht->bufferAsT(); T* pCt = ct->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + const auto colNum = col % d2; + bool flip = colNum >= K; + T maskVal = mask ? *(pMask + col) : T(1); + T cur = *(pInit + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pIVal = pI + col; + T *pHtVal = pHt + col; + T *pCtVal = pCt + col; - const auto colNum = col % d2; - bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1); - T cur = *(pInit + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pIVal = pI + col; - T* pHtVal = pHt + col; - T* pCtVal = pCt + col; + if (flip) { + const auto step = (time - 1) * ncols; + pIVal += step; + pHtVal += step; + pCtVal += step; + pWiVal += (time - 1) * ncolsWi; + } - if (flip) { - const auto step = (time - 1) * ncols; - pIVal += step; - pHtVal += step; - pCtVal += step; - pWiVal += (time - 1) * ncolsWi; + auto ncolsRev = flip ? -ncols : ncols; + auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); + T rt = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); + + cur = (cur - *pWiVal) * ft + *pWiVal; + *pCtVal = cur; + T val = nd4j::math::nd4j_tanh(cur); + *pHtVal = (val * maskVal - *pIVal) * rt + *pIVal; + + pIVal += ncolsRev; + pWiVal += ncolsWiRev; + pCtVal += ncolsRev; + pHtVal += ncolsRev; + } } + }; - auto ncolsRev = flip ? -ncols : ncols; - auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = (1.)/(1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); - T rt = (1.)/(1. 
+ nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); - - cur = (cur - *pWiVal)*ft + *pWiVal; - *pCtVal = cur; - T val = nd4j::math::nd4j_tanh(cur); - *pHtVal = (val*maskVal - *pIVal)*rt + *pIVal; - - pIVal += ncolsRev; - pWiVal += ncolsWiRev; - pCtVal += ncolsRev; - pHtVal += ncolsRev; - } - } + samediff::Threads::parallel_tad(func, 0, ncols); } ////////////////////////////////////////////////////////////////////////// @@ -232,72 +235,75 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradBias = gradBias.bufferAsT(); T* pGradInit = gradC0->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { - T gbF = 0.f; - T gbR = 0.f; - const auto colNum = col % d2; - const bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1.); - T cur = *(pInGradCt + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pInputVal = pInput + col; - T* pStateVal = pState + col; - T* pInGradHtVal = pInGradHt + col; - T* pGradWiVal = pGradWi + 3*col; - T* pGradInputVal = pGradInput + col; + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + T gbF = 0.f; + T gbR = 0.f; + const auto colNum = col % d2; + const bool flip = colNum >= K; + T maskVal = mask ? *(pMask + col) : T(1.); + T cur = *(pInGradCt + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pInputVal = pInput + col; + T *pStateVal = pState + col; + T *pInGradHtVal = pInGradHt + col; + T *pGradWiVal = pGradWi + 3 * col; + T *pGradInputVal = pGradInput + col; - if (!flip) { - const auto stepI = (time - 1) * ncols; - const auto stepW = (time - 1) * ncolsWi; - pInputVal += stepI; - pStateVal += stepI; - pInGradHtVal += stepI; - pGradInputVal += stepI; - pWiVal += stepW; - pGradWiVal += stepW; + if (!flip) { + const auto stepI = (time - 1) * ncols; + const auto stepW = (time - 1) * ncolsWi; + pInputVal += stepI; + pStateVal += stepI; + pInGradHtVal += stepI; + pGradInputVal += stepI; + pWiVal += stepW; + pGradWiVal += stepW; + } + + Nd4jLong ncolsRev = flip ? -ncols : ncols; + Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); + T rt = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); + + T val = nd4j::math::nd4j_tanh(*pStateVal); + T prevVal = (t < time - 1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col)); + // grad wrt input + *pGradInputVal = *pInGradHtVal - (*pInGradHtVal) * rt; + // grad wrt rt, wiR and bR + T grt = (*pInGradHtVal) * (val * maskVal - *pInputVal) * (rt - rt * rt); + *(pGradWiVal + 2) = grt; + gbR += grt; + // grad wrt state + T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt * val * val) + cur; + // grad wrt wi0 + *pGradWiVal = gradSateVal - gradSateVal * ft; + // grad wrt ft, wi1, and bF + T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft * ft); + *(pGradWiVal + 1) = gft; + gbF += gft; + // grad wrt c_previous + cur = gradSateVal * ft; + + pInputVal -= ncolsRev; + pWiVal -= ncolsWiRev; + pStateVal -= ncolsRev; + pGradWiVal -= ncolsWiRev; + pGradInputVal -= ncolsRev; + pInGradHtVal -= ncolsRev; + } + *(pGradBias + col) = gbF; + *(pGradBias + col + ncols) = gbR; + *(pGradInit + col) = cur; } + }; - Nd4jLong ncolsRev = flip ? -ncols : ncols; - Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = ((T)1.)/((T)1. 
+ nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); - T rt = ((T)1.)/((T)1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); - - T val = nd4j::math::nd4j_tanh(*pStateVal); - T prevVal = (t < time-1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col)); - // grad wrt input - *pGradInputVal = *pInGradHtVal - (*pInGradHtVal)*rt ; - // grad wrt rt, wiR and bR - T grt = (*pInGradHtVal) * (val*maskVal - *pInputVal) * (rt - rt*rt); - *(pGradWiVal + 2) = grt; - gbR += grt; - // grad wrt state - T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt*val*val) + cur; - // grad wrt wi0 - *pGradWiVal = gradSateVal - gradSateVal*ft; - // grad wrt ft, wi1, and bF - T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft*ft); - *(pGradWiVal + 1) = gft; - gbF += gft; - // grad wrt c_previous - cur = gradSateVal * ft; - - pInputVal -= ncolsRev; - pWiVal -= ncolsWiRev; - pStateVal -= ncolsRev; - pGradWiVal -= ncolsWiRev; - pGradInputVal -= ncolsRev; - pInGradHtVal -= ncolsRev; - } - *(pGradBias + col) = gbF; - *(pGradBias + col + ncols) = gbR; - *(pGradInit + col) = cur; - } + samediff::Threads::parallel_tad(func, 0, ncols); // gradB gradBias.reduceAlongDimension(reduce::Sum, gradB, {0}); // [4*K] diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index 55de117a5..b974a236b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -35,9 +36,12 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c if(inArrs[0]->rankOf() == 0) { int inSize = inArrs.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(inSize > Environment::getInstance()->tadThreshold()) - for(int i=0; i < inSize; ++i) - outArr->p(i, inArrs[i]->t(0)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + outArr->p(i, inArrs[i]->t(0)); + }; + + samediff::Threads::parallel_for(func, 0, inSize); } else { @@ -45,9 +49,11 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c auto list = outArr->allTensorsAlongDimension(dimsToExclude); // list.size() == block.width() int listSize = list->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(listSize > Environment::getInstance()->tadThreshold()) - for(int i=0; iat(i)->assign(inArrs[i]); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + list->at(i)->assign(inArrs[i]); + }; + samediff::Threads::parallel_tad(func, 0, listSize); delete list; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index f05647589..e38232928 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -148,19 +149,21 @@ namespace helpers { int status = topKFunctor(context, input, values, indices.get(), k, true); result->assign(0); if (status == ND4J_STATUS_OK) { - bool condition = target->lengthOf() > Environment::getInstance()->tadThreshold(); - PRAGMA_OMP_PARALLEL_FOR_IF(condition) - for (int e = 0; e < target->lengthOf(); e++) { - bool found = false; - for (int j = 0; j < k; j++) { - if (target->e(e) == indices->e(e * k + j)) { - found = true; - break; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + bool found = false; + for (int j = 0; j < k; j++) { + if (target->e(e) == indices->e(e * k + j)) { + found = 
true; + break; + } } + if (found) + result->p(e, true); } - if (found) - result->p(e, true); - } + }; + + samediff::Threads::parallel_tad(func, 0, target->lengthOf()); } return status; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 9e04ed4df..ea2fb348a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -42,11 +42,13 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N const_cast(input).fillAsTriangular(0, diagonal, dOdI.sizeAt(-1), 'b', &dOdI); int dLen = dOdI.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(dLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < dLen; ++i) { - if(dOdI.t(i) != static_cast(0.f)) - dOdI.t(i) = static_cast(1.f); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + if (dOdI.t(i) != static_cast(0.f)) + dOdI.t(i) = static_cast(1.f); + } + }; + samediff::Threads::parallel_for(func, 0, dLen); // FIXME: !!! gradI.assign(dOdI * gradO); // chain rule: dLoss/dI = dO/dI * dLoss/dO @@ -59,14 +61,14 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N ////////////////////////////////////////////////////////////////////////// template static void trace_(const NDArray& input, NDArray& output) { - const int inRank = input.rankOf(); - auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(setOfSubArrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < setOfSubArrs->size(); ++i) - output.p(i, setOfSubArrs->at(i)->getTrace()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.p(i, setOfSubArrs->at(i)->getTrace()); + }; + samediff::Threads::parallel_for(func, 0, setOfSubArrs->size()); delete setOfSubArrs; } @@ -107,7 +109,8 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerato std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); output.p(Nd4jLong(0), input.e(0)); - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + + // FIXME: parallelism!! 
for(int i = firstDim-1; i > 0; --i) { int r = rng.relativeInt(i) % i; output.t(i) = input.t(indices[r]); @@ -184,54 +187,61 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray const auto zLen = output.lengthOf(); - std::vector coords(rank); // we use the same coordinates storage both for input and output since their ranks are the same - if(mode == 0) { // CONSTANT case const T padVal = padValue.e(0); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; + for (int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + const auto left = paddings.e(j, 0); + if (coords[j] < left || coords[j] >= left + xShape[j]) { + within = false; + break; + } + else { coords[j] = coords[j] - left; } + } - bool within = true; - for(int j = rankMinusOne; j >= 0; --j) { - if(xShape[j] == zShape[j]) continue; - const auto left = paddings.e(j, 0); - if(coords[j] < left || coords[j] >= left + xShape[j]) {within = false; break;} - else {coords[j] = coords[j] - left;} + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = padVal; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = padVal; - } + samediff::Threads::parallel_tad(func, 0, zLen); } else { // REFLECT and SYMMETRIC cases const Nd4jLong shift1 = mode == 1 ? 0 : 1; // REFLECT : SYMMETRIC const Nd4jLong shift2 = mode == 1 ? 
2 : 1; // REFLECT : SYMMETRIC - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + for (int j = rankMinusOne; j >= 0; --j) { - for(int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) + if (coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left + else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + } - if(xShape[j] == zShape[j]) continue; - coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) - if(coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left - else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + const auto xOffset = shape::getOffset(input.getShapeInfo(), coords); + z[zOffset] = x[xOffset]; } + }; - const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } @@ -558,50 +568,49 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { const int yLastDim = indices.sizeAt(-1); - std::vector coords(maxRank); + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK * 3]; + for (auto i = start; i < stop; i += increment) { + Nd4jLong *zCoordStart, *xCoordStart; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + if (yLastDim == xRank) { + zCoordStart = coords; + xCoordStart = coords; + } else if (zRank >= xRank) { + zCoordStart = coords; + xCoordStart = coords + zRank - xRank; + } else { + zCoordStart = coords + xRank - zRank; + xCoordStart = coords; + } - Nd4jLong *zCoordStart, *xCoordStart; + shape::index2coords(i, output.getShapeInfo(), zCoordStart); - if(yLastDim == xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data(); - } - else if(zRank >= xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data() + zRank - xRank; - } - else { - zCoordStart = coords.data() + xRank - zRank; - xCoordStart = coords.data(); + const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); + + // last y coordinate + uint coordToRestore; + if (yLastDim != xRank) + coordToRestore = static_cast(zCoordStart[yRank - 1]); + + zCoordStart[yRank - 1] = 0; + const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); + + //restore z coordinate + if (yLastDim != xRank) + zCoordStart[yRank - 1] = coordToRestore; + + // construct coordinates for x + for (uint j = 0; j < yLastDim; ++j) + xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride + + const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); + + z[zOffset] = x[xOffset]; } + }; - shape::index2coords(i, output.getShapeInfo(), zCoordStart); - - const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); - - // last y coordinate - uint coordToRestore; - if(yLastDim != xRank) - coordToRestore 
= static_cast(zCoordStart[yRank - 1]); - - zCoordStart[yRank - 1] = 0; - const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); - - //restore z coordinate - if(yLastDim != xRank) - zCoordStart[yRank - 1] = coordToRestore; - - // construct coordinates for x - for(uint j = 0; j < yLastDim; ++j) - xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride - - const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); - - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } //////////////////////////////////////////////////////////////////////// @@ -644,21 +653,28 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else if (input->rankOf() == 1 && indices->isVector()) { // special case - PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->tadThreshold()) - for (int e = 0; e < indices->lengthOf(); e++) - output->p(e, input->e(indices->e(e))); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + output->p(e, input->e(indices->e(e))); + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... indices->rankOf()-1 const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -673,12 +689,16 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else { // vector case const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } @@ -693,9 +713,12 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { const int rank = output.rankOf(); auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(arrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < arrs->size(); ++i) - arrs->at(i)->setIdentity(); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + arrs->at(i)->setIdentity(); + }; + + samediff::Threads::parallel_tad(func, 0, arrs->size()); delete arrs; } @@ -719,41 +742,43 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat for (; e < intArgs->size(); e++) indices.push_back((*intArgs)[e]); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < indices.size(); ++i) { + auto func = 
PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(indices[i], dimsToExclude, true); + auto updSubArr = updates(i, dimsToExclude, true); - auto inSubArr = input(indices[i], dimsToExclude, true); - auto updSubArr = updates(i, dimsToExclude, true); - - if (inSubArr.lengthOf() != updSubArr.lengthOf()) - continue; - - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: + if (inSubArr.lengthOf() != updSubArr.lengthOf()) continue; + + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices.size()); } @@ -766,11 +791,14 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input switch (opId) { case 6: { // copy - PRAGMA_OMP_PARALLEL_FOR_IF(len > Environment::getInstance()->elementwiseThreshold()) - for(uint i = 0; i < len; ++i) { - auto inSubArr = input(i, dimensions); - inSubArr.p(indices.t(i), updates.e(i)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(i, dimensions); + inSubArr.p(indices.t(i), updates.e(i)); + } + }; + + samediff::Threads::parallel_for(func, 0, len); } break; @@ -786,70 +814,79 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - Nd4jLong idx = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + Nd4jLong idx = 0; - for (int i = 0; i < numArgs; i++){ - - T v = inArrs[i]->e(e); - if (v > max) { - max = v; - idx = i; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) { + max = v; + idx = i; + } } + output.p(e, idx); } - output.p(e, idx); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& 
output) { + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); } - void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeMax_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - if (v > max) - max = v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) + max = v; + } + output.p(e, max); } - output.p(e, max); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); } - void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeAvg_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); const T factor = 1.f / numArgs; auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T sum = 0.; - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - sum += v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = 0.; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + sum += v; + } + output.p(e, sum * factor); } - output.p(e, sum * factor); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); } - void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// @@ -859,16 +896,17 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = (T) 0.f; + for (int i = 0; i < numArgs; i++) + sum += inArrs[i]->e(e); - T sum = (T) 0.f; + output.p(e, sum); + } + }; - for (int i = 0; i < numArgs; i++) - sum += inArrs[i]->e(e); - - output.p(e, sum); - } + samediff::Threads::parallel_for(func, 0, x->lengthOf()); } void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (inArrs, output), 
LIBND4J_TYPES); @@ -895,14 +933,15 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T iNormActual = norm2.e(i); + if (iNormActual > normClip) + *listOfInSubArrs->at(i) *= normClip / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size()); - const T iNormActual = norm2.e(i); - - if (iNormActual > normClip) - *listOfInSubArrs->at(i) *= normClip / iNormActual; - } delete listOfInSubArrs; } } @@ -920,18 +959,19 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inputSubArr = listOfInSubArrs->at(i); + auto outputSubArr = listOfOutSubArrs->at(i); + outputSubArr->assign(inputSubArr); - auto inputSubArr = listOfInSubArrs->at(i); - auto outputSubArr = listOfOutSubArrs->at(i); - outputSubArr->assign(inputSubArr); + const T iNormActual = norm2.e(i); - const T iNormActual = norm2.e(i); - - if (iNormActual > clipNorm.e(0)) - *outputSubArr *= clipNorm / iNormActual; - } + if (iNormActual > clipNorm.e(0)) + *outputSubArr *= clipNorm / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size()); delete listOfInSubArrs; delete listOfOutSubArrs; @@ -1028,31 +1068,29 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < gradISubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T N = norm2.e(i); - T N = norm2.e(i); + auto gradOSubArr = gradOSubArrs->at(i); + auto gradISubArr = gradISubArrs->at(i); - auto gradOSubArr = gradOSubArrs->at(i); - auto gradISubArr = gradISubArrs->at(i); + if (N > cn) { + auto inputSubArr = inputSubArrs->at(i); + const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar + const T factor1 = static_cast(1.f) / N; + const T factor3 = factor1 / (N * N); // 1 / (N*N*N) - if (N > cn) { + auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { + return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); + }; - auto inputSubArr = inputSubArrs->at(i); - - const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar - const T factor1 = static_cast(1.f) / N; - const T factor3 = factor1 / (N * N) ; // 1 / (N*N*N) - - auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { - return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); - }; - - inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + } else + gradISubArr->assign(gradOSubArr); } - else - gradISubArr->assign(gradOSubArr); - } + }; + samediff::Threads::parallel_tad(func, 0, gradISubArrs->size()); delete gradISubArrs; delete gradOSubArrs; @@ -1165,34 +1203,35 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o } else { - std::vector inIdx(rank), outIdx(rank); + auto func = 
PRAGMA_THREADS_FOR { + Nd4jLong inIdx[MAX_RANK]; + Nd4jLong outIdx[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), outIdx); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx)) - for(int i = 0; i < outLen; ++i) { + for (int j = 0; j < rank; ++j) { + const Nd4jLong inLen = input.sizeAt(j); + const auto leftSide = paddings.e(j, 0); + const auto leftSideCorrected = leftSide - reflBorder; + const Nd4jLong len = 2 * (inLen - 1) + leftSide + reflBorder; - shape::index2coords(i, output.getShapeInfo(), outIdx.data()); + if (outIdx[j] < leftSide) // left side + inIdx[j] = leftSideCorrected - outIdx[j]; - for(int j = 0; j < rank; ++j) { + else if (outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle + inIdx[j] = outIdx[j] - leftSide; - const Nd4jLong inLen = input.sizeAt(j); - const auto leftSide = paddings.e(j, 0); - const auto leftSideCorrected = leftSide - reflBorder; - const Nd4jLong len = 2*(inLen-1) + leftSide + reflBorder; + else // right side + inIdx[j] = len - outIdx[j]; + } - if(outIdx[j] < leftSide) // left side - inIdx[j] = leftSideCorrected - outIdx[j]; - - else if(outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle - inIdx[j] = outIdx[j] - leftSide; - - else // right side - inIdx[j] = len - outIdx[j]; + auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx); + auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx); + reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; } + }; - auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data()); - auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data()); - reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; - } + samediff::Threads::parallel_for(func, 0, outLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index a365d8135..5d4ed9f2e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -62,9 +63,12 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray //auto result = NDArray(&x, false, context); int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - z.p(i, zetaScalar(x.e(i), q.e(i))); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + z.p(i, zetaScalar(x.e(i), q.e(i))); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 27caedd0c..d087a4849 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -66,14 +67,17 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = 
tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc deleted file mode 100644 index 63e406cc6..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ /dev/null @@ -1,138 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 30.11.17. -// - -#include - -namespace nd4j { -namespace ops { -namespace helpers { - -// [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] -template -void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - - auto imBuff = output.bufferAsT(); - auto colBuff = input.bufferAsT(); - auto imShapeBuffer = output.getShapeInfo(); - auto colShapeBuffer = input.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - // initial zeroing of image content - const auto imEWS = shape::elementWiseStride(imShapeBuffer); - if(imEWS == 1) { - memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - } - else if (imEWS > 1) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) - imBuff[i] = static_cast(0.f); - } - else { - const auto len = shape::length(imShapeBuffer); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < len; i++) - imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast(0.f); - } - - T *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int 
b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } -} - - -void col2im(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - BUILD_SINGLE_SELECTOR(input.dataType(), col2im_, (context, input, output, sH, sW, pH, pW, iH, iW, dH, dW), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void col2im_, (nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW), LIBND4J_TYPES); - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc deleted file mode 100644 index 67f5650bd..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc +++ /dev/null @@ -1,129 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author Yurii Shyrma (iuriish@yahoo.com), created on 19.09.2018 -// - -#include - - -namespace nd4j { -namespace ops { -namespace helpers { - -// input [bS, iC, iH, iW] is convoluted to output [bS, iC, kH, kW, oH, oW] -template -static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - - auto imBuff = static_cast(input.getBuffer()); - auto colBuff = static_cast(output.getBuffer()); - auto imShapeBuffer = input.getShapeInfo(); - auto colShapeBuffer = output.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const T zeroPadVal = arrZeroPadVal.e(0); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } -} - - -void im2col(nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const 
int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - BUILD_SINGLE_SELECTOR(im.dataType(), im2col_, (context, im, col, kH, kW, sH, sW, pH, pW, dH, dW, arrZeroPadVal), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void im2col_, (nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal), LIBND4J_TYPES); - - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu index c2dd4919d..753c8ae64 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu @@ -19,6 +19,7 @@ // #include +#include #include #include diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu index 017180b38..3a09f9a80 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu @@ -20,6 +20,7 @@ #include #include +#include #include namespace nd4j { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu index 8db1f66d4..fa97a3de2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index 0a707ffb3..8a9986e23 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -644,7 +644,6 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr // apply Fisher-Yates shuffle if(isInplace) { - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; @@ -658,7 +657,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); bool isZeroShuffled = false; - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; subArrsListOut->at(i)->assign(subArrsListIn->at(indices[r])); diff --git a/libnd4j/include/ops/declarable/helpers/helpers.h b/libnd4j/include/ops/declarable/helpers/helpers.h index 0914d2d49..f2e19063e 100644 --- a/libnd4j/include/ops/declarable/helpers/helpers.h +++ b/libnd4j/include/ops/declarable/helpers/helpers.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #endif // CUDACC diff --git a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp index 47ca64d3b..4fb32e2f8 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 5a73e0a00..8ef63101e 100644 --- 
a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -67,12 +68,14 @@ namespace helpers { } } - PRAGMA_OMP_PARALLEL_FOR_IF(values->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (int e = 0; e < values->lengthOf(); e++) { - values->p(e, static_cast(valuesVector[e])); - if (counts != nullptr) - counts->p(e, countsMap[valuesVector[e]]); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + values->p(e, static_cast(valuesVector[e])); + if (counts != nullptr) + counts->p(e, countsMap[valuesVector[e]]); + } + }; + samediff::Threads::parallel_for(func, 0, values->lengthOf()); for (int e = 0; e < indices->lengthOf(); e++) { auto posI = std::find(valuesVector.begin(), valuesVector.end(), input->e(e)); diff --git a/libnd4j/include/ops/declarable/helpers/matmul.h b/libnd4j/include/ops/declarable/helpers/matmul.h index 8d253cabf..2e7cce13f 100644 --- a/libnd4j/include/ops/declarable/helpers/matmul.h +++ b/libnd4j/include/ops/declarable/helpers/matmul.h @@ -22,7 +22,6 @@ #define LIBND4J_HELPERS_MATMUL_H #include -#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp index 579fdf394..436cddda3 100644 --- a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BooleanOp::~BooleanOp() { - // - } - /** * Output shape of any BooleanOp is ALWAYS scalar */ diff --git a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp index 71c722bca..7d696c8ef 100644 --- a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BroadcastableOp::~BroadcastableOp() { - // no-op - } - ShapeList *BroadcastableOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { auto shapeList = SHAPELIST(); auto x = inputShape->at(0); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp index 691a3154d..1fd57c867 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp @@ -26,9 +26,5 @@ namespace nd4j { DeclarableCustomOp::DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : nd4j::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { // } - - DeclarableCustomOp::~DeclarableCustomOp() { - // - } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp index 7cb28e76d..624d6dbef 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp @@ -26,10 +26,6 @@ namespace nd4j { namespace ops { - DeclarableListOp::~DeclarableListOp() { - // - } - DeclarableListOp::DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs) : DeclarableOp::DeclarableOp(numInputs, numOutputs, opName, false, tArgs, iArgs) { // This kind of operations work with sets: NDArrayList 
this->getOpDescriptor()->setInputType(InputType_NUMERIC_SET); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp index ef3b04d30..98a60b28b 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -30,11 +31,6 @@ namespace nd4j { // } - DeclarableReductionOp::~DeclarableReductionOp() { - // - } - - nd4j::ShapeList* DeclarableReductionOp::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { // int numDims = INT_ARG(0); std::vector dims; @@ -55,7 +51,7 @@ namespace nd4j { std::sort(dims.begin(), dims.end()); // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { auto newShape = ConstantShapeHelper::getInstance()->scalarShapeInfo(block.dataType()); return SHAPELIST(newShape); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp index 2b83b200a..684f09262 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(), "LegacyReduce3Op"); - if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar NativeOpExecutioner::execReduce3Scalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), @@ -97,7 +98,7 @@ namespace nd4j { Nd4jLong *zShape = nullptr; - if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar case ALLOCATE(zShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); zShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp index ac4bb33b7..12a25537d 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -100,7 +101,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp index e1da0621e..2765e1b3f 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; // _axis.(block.getIArguments()->size() == 0) || - // (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) + // (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) if (block.getAxis()->empty() || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), @@ -101,7 +102,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp index 3c83df702..836564c79 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -103,7 +104,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp index 09a225b19..2340f39b0 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -98,7 +99,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceSameScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp index bb4dda4d4..08ebb80de 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -43,7 +44,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(),"LegacyStatsOp"); - if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == MAX_INT)) { + if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == nd4j::DataTypeUtils::max())) { // scalar NativeOpExecutioner::execSummaryStatsScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), biasCorrected); @@ -92,7 +93,7 @@ namespace nd4j { auto inShape = inputShape->at(0); Nd4jLong *newShape; - if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT)) { + if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max())) { // in this case we just return scalar ALLOCATE(newShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); newShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index 13e1cfe11..27f836a0e 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -55,7 +56,7 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -182,7 +183,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -308,6 +309,70 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const stream.wait(); // shape::printArray(dLdI_mkl_mem.map_data(),8); + + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) + + // formula for full derivative with respect to input (x) + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // !!! MKL CALCULATES ONLY FIRST TERM dfdx, SO WE SHOULD CALCULATE TERM (dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx)) BY OURSELF !!! + + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = dfdm / N + (2/N) * dfdv * (dvdm/2 + (x - m)) + // dLdI = gamma * ( stdInv * -g_sum/N + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + std::vector axes = {1}; + const auto excludedAxes = ShapeUtils::evalDimsToExclude(x->rankOf(), axes); + + // inversed batch size 1 / N + const auto Ninv = 1.f * mean->lengthOf() / x->lengthOf(); + + // x - mean + NDArray xMinusMean(x); // empty array with same shape as x + const_cast(x)->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dfdm / N + auto dfdm = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes); + dfdm *= stdInv; + dfdm *= -Ninv; + + // dvdm / 2 + NDArray dvdm(mean); // empty array with same shape as mean + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, &dvdm, excludedAxes); + dvdm *= -Ninv; + + // (2/N)*dfdv + NDArray dfdv(variance); // empty array with same shape as variance + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, &dfdv, excludedAxes); + dfdv *= stdInv*stdInv*stdInv; + dfdv *= -Ninv; + + // dvdm/2 + (x - m) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dvdm); + // dfdv * (dvdm/2 + (x - m)) + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &dfdv); + // add dfdm / N + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dfdm); + // * gamma + auto gamma = (*weights)({0,1, 0,0}); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &gamma); + + *dLdI += xMinusMean; } 
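// ---- editor's note: illustrative sketch, not part of this patch ----
// The block above adds the term MKL-DNN leaves out of the batchnorm gradient,
// following the comments dfdm/N, dvdm/2 and (2/N)*dfdv. As a plain-loop reference
// for that formula, here is a hypothetical scalar version for one channel with
// N values; all names (x, g, mean, var, gamma, eps, dLdI) are placeholders.
#include <cmath>

static void batchnormBpExtraTermReference(const float* x, const float* g, float mean,
                                          float var, float gamma, float eps,
                                          int N, float* dLdI) {
    const float stdInv = 1.0f / std::sqrt(var + eps);

    // per-channel sums that appear in the closed-form expressions above
    float gSum = 0.f, xmSum = 0.f, gxmSum = 0.f;
    for (int i = 0; i < N; ++i) {
        gSum   += g[i];
        xmSum  += x[i] - mean;
        gxmSum += g[i] * (x[i] - mean);
    }

    const float dfdmOverN  = -stdInv * gSum / N;                      // dfdm / N
    const float dvdmHalf   = -xmSum / N;                              // dvdm / 2
    const float dfdvScaled = -gxmSum * stdInv * stdInv * stdInv / N;  // (2/N) * dfdv

    // dLdI += gamma * ( dfdm/N + (2/N)*dfdv * (dvdm/2 + (x - m)) )
    for (int i = 0; i < N; ++i)
        dLdI[i] += gamma * (dfdmOverN + dfdvScaled * (dvdmHalf + (x[i] - mean)));
}
// ---- end of editor's note ----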
PLATFORM_IMPL(batchnorm) { @@ -371,10 +436,21 @@ PLATFORM_IMPL(batchnorm) { (*weights)({1,2, 0,0}).assign(0); } + if(axes[0] == inRank - 1 && inRank > 2) { // if nhwc or ndhwc + std::vector permut = inRank == 4 ? std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + output = new NDArray(output->permute(permut)); + } + batchnormMKLDNN(input, mean, variance, weights, epsilon, output); delete weights; + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete output; + } + return Status::OK(); } @@ -418,7 +494,7 @@ PLATFORM_CHECK(batchnorm) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && outType == DataType::FLOAT32); } @@ -558,29 +634,29 @@ PLATFORM_CHECK(batchnorm) { ////////////////////////////////////////////////////////////////////////// PLATFORM_IMPL(batchnorm_bp) { - NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw - NDArray* mean = INPUT_VARIABLE(1); // [c] - NDArray* variance = INPUT_VARIABLE(2); // [c] - NDArray* dLdO = INPUT_VARIABLE(3); // same as input - NDArray* gamma = nullptr; // [c] - NDArray* beta = nullptr; // [c] + NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw + NDArray* mean = INPUT_VARIABLE(1); // [c] + NDArray* variance = INPUT_VARIABLE(2); // [c] + NDArray* gamma = nullptr; // [c] + NDArray* beta = nullptr; // [c] + NDArray* dLdO = INPUT_VARIABLE(block.width() - 1); // same as input - NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input - NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] - NDArray* dLdV = OUTPUT_VARIABLE(2); // [c] - NDArray* dLdG = nullptr; // [c] - NDArray* dLdB = nullptr; // [c] + NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input + NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] + NDArray* dLdV = OUTPUT_VARIABLE(2); // [c] + NDArray* dLdG = nullptr; // [c] + NDArray* dLdB = nullptr; // [c] const bool applyScale = (bool)INT_ARG(0); const bool applyOffset = (bool)INT_ARG(1); const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -606,7 +682,7 @@ PLATFORM_IMPL(batchnorm_bp) { if(beta != nullptr) REQUIRE_TRUE(beta->rankOf() == 1 && beta->sizeAt(0) == input->sizeAt(axes[0]), 0, "BATCHNORM_BP_MKLDNN op: wrong shape of beta array, expected is [%lld], but got %s instead !", input->sizeAt(axes[0]), ShapeUtils::shapeAsString(beta).c_str()); - // types of all input arrays should be the same (except dLdO) + // types of all input arrays should be the same for(int i = 1; i < block.width() - 1; ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP_MKLDNN op: types of all input arrays should be the same !"); @@ -626,11 +702,19 @@ PLATFORM_IMPL(batchnorm_bp) { (*weights)({1,2, 0,0}).assign(0); } - *dLdM = 0; - *dLdV = 0; + + if(axes[0] == inRank - 1 && inRank > 2) { // if nhwc or ndhwc + std::vector permut = inRank == 4 ? 
std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + dLdO = new NDArray(dLdO->permute(permut)); + dLdI = new NDArray(dLdI->permute(permut)); + } batchnormBackPropMKLDNN(input, mean, variance, dLdO, weights, epsilon, dLdI, dLdW); + *dLdM = 0; + *dLdV = 0; + if(applyScale || applyOffset) { if(applyScale) dLdG->assign((*dLdW)({0,1, 0,0})); @@ -641,6 +725,12 @@ PLATFORM_IMPL(batchnorm_bp) { delete dLdW; } + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete dLdO; + delete dLdI; + } + return Status::OK(); } @@ -696,7 +786,7 @@ PLATFORM_CHECK(batchnorm_bp) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && dLdOType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && dLdIType == DataType::FLOAT32 && dLdGType == DataType::FLOAT32 && dLdBType == DataType::FLOAT32); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 3c334e726..3d9a79535 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -213,6 +213,9 @@ PLATFORM_IMPL(conv3dnew_bp) { ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + if(isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + int trueoD, trueoH, trueoW; // true output depth/height/width ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, isSameMode); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index e004dc379..74b832b4a 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace blas { @@ -32,15 +33,18 @@ namespace nd4j { auto source = reinterpret_cast(vsource); // handle transpose in parallel - PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); - int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + for (int c = 0; c < cols; c++) { + int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); + int xIdx = orderSource == CblasColMajor ? 
linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); - ret[zIdx] = source[xIdx]; + ret[zIdx] = source[xIdx]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, rows); return ret; } @@ -62,44 +66,49 @@ namespace nd4j { bool transBFlag = TransB == CblasTrans; if (beta == 0.0) { + Z z = 0.f; int length = M*N; if (length <= Environment::getInstance()->elementwiseThreshold()) { - PRAGMA_OMP_SIMD for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + C[r] = z; } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) + C[r] = z; + }; + samediff::Threads::parallel_for(func, 0, length); } } - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int r = 0; r < M; r++) { - for (int c = 0; c < N; c++) { - int zIdx = linearIndexF(M, N, r, c); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto r = start_x; r < stop_x; r += inc_x) { + for (auto c = start_y; c < stop_y; c += inc_y) { + int zIdx = linearIndexF(M, N, r, c); - Z dot = static_cast(0.0f); + Z dot = static_cast(0.0f); - if (alpha != 0.0) { - int bIdx; // = linearIndexF(K, N, 0, c); - int aIdx; + if (alpha != 0.0) { + int bIdx; // = linearIndexF(K, N, 0, c); + int aIdx; - for (int k = 0; k < K; k++) { - aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); - bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K,N, k, c)); - dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + for (int k = 0; k < K; k++) { + aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); + bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K, N, k, c)); + dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + } + } + + if (beta != 0.0) { + C[zIdx] = static_cast(dot + beta * C[zIdx]); + } else { + C[zIdx] = static_cast(dot); } } - - if (beta != 0.0) { - C[zIdx] = static_cast(dot + beta * C[zIdx]); - } else { - C[zIdx] = static_cast(dot); - } } - } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } @@ -120,14 +129,16 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < M; r++) { - int aIdx = linearIndexC(M, N, r, 0); - auto aX = aT + aIdx; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + int aIdx = linearIndexC(M, N, r, 0); + auto aX = aT + aIdx; - auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; - z[r] = beta == 0.0f ? dot : dot + beta * z[r]; - } + auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; + z[r] = beta == 0.0f ? dot : dot + beta * z[r]; + } + }; + samediff::Threads::parallel_for(func, 0, M); if (TRANS == CblasTrans) delete[] aT; diff --git a/libnd4j/include/ops/impl/specials.cpp b/libnd4j/include/ops/impl/specials.cpp index 85642d6c8..11cca1b15 100644 --- a/libnd4j/include/ops/impl/specials.cpp +++ b/libnd4j/include/ops/impl/specials.cpp @@ -63,22 +63,24 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint r = 0; r < numOfArrs; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + const Nd4jLong arrLen = inArrs[r]->lengthOf(); + const uint xEws = (arrLen == 1) ? 
1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + T *z = outBuff + zOffset[r]; + T *x = inArrs[r]->bufferAsT(); - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + if (outEws == 1 && xEws == 1) + for (Nd4jLong e = 0; e < arrLen; e++) + z[e] = x[e]; + else + for (Nd4jLong e = 0; e < arrLen; e++) + z[e * outEws] = x[e * xEws]; + } + }; - if(outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } + samediff::Threads::parallel_tad(func, 0, numOfArrs); return; } } @@ -96,11 +98,14 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) } - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(int i = 0; i < numOfArrs; ++i) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform, false>(inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = output(indices[i], true); + nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfArrs); } /** @@ -137,21 +142,15 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto z = reinterpret_cast(vz); auto x = reinterpret_cast(vx); - // aggregation step -#ifdef _OPENMP - int _threads = omp_get_max_threads(); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (auto ar = 0L; ar < n; ar++) { + z[i] += x[ar][i]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, length); } @@ -175,24 +174,18 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint z = x[0]; PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < length; i++) { + for (uint64_t i = 0; i < length; i++) { z[i] /= n; } -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 1; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 1; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 1; ar < n; ar++) { @@ -205,20 +198,14 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint memset(z, 0, length * sizeof(T)); // aggregation step -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif 
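// ---- editor's note: illustrative sketch, not part of this patch ----
// The replacement pattern used in this hunk (and throughout the PR) is: build a
// lambda with PRAGMA_THREADS_FOR, which supplies start/stop/increment, and hand it
// to samediff::Threads. Flat element loops go through parallel_for; loops whose
// iterations each own a whole sub-array (a TAD, or one input of a concat) go
// through parallel_tad, presumably so a single iteration is never split between
// threads. A minimal hypothetical example of the element-wise form, assuming the
// libnd4j environment (execution/Threads.h, Nd4jLong); buffer/length/factor are
// placeholder names, not code from this repository:
static void scaleInPlace(float* buffer, Nd4jLong length, float factor) {
    auto func = PRAGMA_THREADS_FOR {
        for (auto i = start; i < stop; i += increment)
            buffer[i] *= factor;   // purely element-wise work, safe to split anywhere
    };
    // dispatch the range [0, length) across the thread pool
    samediff::Threads::parallel_for(func, 0, length);
}
// ---- end of editor's note ----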
- - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 0; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 0; ar < n; ar++) { @@ -348,12 +335,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong xTadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); int numTads = xLength / xTadLength; - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < numTads; r++) { - T *dx = x + tadOffsets[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + T *dx = x + tadOffsets[r]; - quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); - } + quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); + } + }; + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -368,23 +357,25 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) float threshold = fb.f_; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 4; e < lim; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + for (int bitId = 0; bitId < 16; bitId++) { + bool hasBit = (x[e] & 1 << (bitId)) != 0; + bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; - for (int bitId = 0; bitId < 16; bitId++) { - bool hasBit = (x[e] & 1 << (bitId) ) != 0; - bool hasSign = (x[e] & 1 << (bitId + 16) ) != 0; - - if (hasBit) { - if (hasSign) - dz[(e - 4) * 16 + bitId] -= threshold; - else - dz[(e - 4) * 16 + bitId] += threshold; - } else if (hasSign) { - dz[(e - 4) * 16 + bitId] -= threshold / 2; + if (hasBit) { + if (hasSign) + dz[(e - 4) * 16 + bitId] -= threshold; + else + dz[(e - 4) * 16 + bitId] += threshold; + } else if (hasSign) { + dz[(e - 4) * 16 + bitId] -= threshold / 2; + } } } - } + }; + + samediff::Threads::parallel_for(func, 4, lim); } template @@ -392,17 +383,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - z[i] = static_cast(x[i]); - } - } else { - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(x[i]); } - } + }; + + samediff::Threads::parallel_for(func, 0, N); }; BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); @@ -410,49 +398,49 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); - Nd4jLong retVal = 0L; +//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong retVal = 0L; -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) - for (Nd4jLong x = 0; x < N; x += 16) { + for (auto x = start; x < stop; x += increment) { + int byte = 0; + int byteId = x / 16 + 4; - int byte = 0; - int byteId = x / 16 + 4; + for (int f = 0; f < 16; f++) { + Nd4jLong e = x + f; - for (int f = 0; f < 16; f++) { - Nd4jLong e = x + f; + if (e >= N) + continue; - if (e >= N) - continue; + T val = dx[e]; + T abs = nd4j::math::nd4j_abs(val); - T val = dx[e]; - T 
abs = nd4j::math::nd4j_abs(val); + int bitId = e % 16; - int bitId = e % 16; + if (abs >= (T) threshold) { + byte |= 1 << (bitId); + retVal++; - if (abs >= (T) threshold) { - byte |= 1 << (bitId); - - retVal++; - - - if (val < (T) 0.0f) { + if (val < (T) 0.0f) { + byte |= 1 << (bitId + 16); + dx[e] += threshold; + } else { + dx[e] -= threshold; + } + } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { byte |= 1 << (bitId + 16); - dx[e] += threshold; - } else { - dx[e] -= threshold; - } - } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { - byte |= 1 << (bitId + 16); - dx[e] += threshold / 2; + dx[e] += threshold / 2; - retVal++; + retVal++; + } } + + dz[byteId] = byte; } - dz[byteId] = byte; - } - - return retVal; + return retVal; + }; + return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } template @@ -637,13 +625,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } template @@ -658,13 +649,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); diff --git a/libnd4j/include/ops/ops.h b/libnd4j/include/ops/ops.h index 601481b21..ab4bfca90 100644 --- a/libnd4j/include/ops/ops.h +++ b/libnd4j/include/ops/ops.h @@ -77,42 +77,6 @@ #define SELU_ALPHA 1.6732632423543772848170429916717 #define SELU_LAMBDA 1.0507009873554804934193349852946 -#ifdef _OPENMP -#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=-MAX_FLOAT) - -#pragma omp declare reduction(minTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer (omp_priv=MAX_FLOAT) - -#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=0) - -#pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer 
(omp_priv=0) - -#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ - initializer (omp_priv=0) - -#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in + omp_out)\ - initializer (omp_priv=0) - -#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in * omp_out)\ - initializer (omp_priv=1) -#endif - namespace functions { namespace indexreduce { diff --git a/libnd4j/include/ops/special_accumulation_ops.h b/libnd4j/include/ops/special_accumulation_ops.h deleted file mode 100644 index 3f2b2ed1d..000000000 --- a/libnd4j/include/ops/special_accumulation_ops.h +++ /dev/null @@ -1,213 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_SPECIAL_ACCUMULATION_OPS_H -#define LIBND4J_SPECIAL_ACCUMULATION_OPS_H - -#include -#include -#include -//#include -//#include - -namespace simdOps { - - template - class LogSumExp { - public: - static const bool requiresSpecialAccumulation = true; - - constexpr static functions::ReduceType reduceType = functions::ReduceType::SUM; - - op_def static T startingValue(const T *input) { - return (T) 0.0f; - } - - op_def static Z merge(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static T update(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static Z op(T d1, T d2) { - return nd4j::math::nd4j_exp(d1 - d2); - } - - op_def static Z op(T d1, Z* extraParams) { - return nd4j::math::nd4j_exp(static_cast(d1) - extraParams[0]); - } - - op_def static Z postProcess(T reduction, Nd4jLong n, Z *extraParams) { - return extraParams[0] + nd4j::math::nd4j_log(reduction); - } - -#ifdef __CUDACC__ - __device__ static inline void aggregatePartials(Z *sPartials, int tid, int numItems, Z *extraParams) { - // start the shared memory loop on the next power of 2 less - // than the block size. If block size is not a power of 2, - // accumulate the intermediate sums in the remainder range. 
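// ---- editor's note: illustrative sketch, not part of this patch ----
// The deleted CUDA helper below first folds the partial sums down to the largest
// power of two, then runs the usual halving tree-reduction. A plain CPU sketch of
// the same reduction shape (using addition; values/n are hypothetical names) makes
// the index arithmetic easier to follow:
static float reducePow2Style(float* values, int n) {
    // largest power of two <= n, obtained by repeatedly clearing the lowest set bit
    int floorPow2 = n;
    while (floorPow2 & (floorPow2 - 1))
        floorPow2 &= floorPow2 - 1;

    // fold the remainder elements [floorPow2, n) onto the front of the array
    for (int i = floorPow2; i < n; i++)
        values[i - floorPow2] += values[i];

    // halving tree-reduction over the remaining power-of-two prefix
    for (int active = floorPow2 >> 1; active > 0; active >>= 1)
        for (int i = 0; i < active; i++)
            values[i] += values[i + active];

    return values[0];
}
// ---- end of editor's note ----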
- int floorPow2 = numItems; - - if (floorPow2 & (floorPow2 - 1)) { - while (floorPow2 & (floorPow2 - 1)) { - floorPow2 &= floorPow2 - 1; - } - if (tid >= floorPow2) { - sPartials[tid - floorPow2] = update(sPartials[tid - floorPow2], sPartials[tid], extraParams); - } - - __syncthreads(); - } - - - for (int activeThreads = floorPow2 >> 1; activeThreads; activeThreads >>= 1) { - if (tid < activeThreads && tid + activeThreads < numItems) { - sPartials[tid] = update(sPartials[tid], sPartials[tid + activeThreads], extraParams); - } - __syncthreads(); - } - } - - static inline __device__ void execSpecialCuda( - T *dx, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Z *reductionBuffer, - Nd4jLong *tadOnlyShapeInfo, - Nd4jLong *tadOffsets) { - - // we assume that RESULT already holds max values - - //shared memory space for storing intermediate results - __shared__ Z *sPartials; - - // __shared__ shape::TAD *tad; - __shared__ Nd4jLong tadLength; - __shared__ Nd4jLong numTads; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - sPartials = (Z *) shmem; - tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - numTads = shape::length(xShapeInfo) / tadLength; - } - __syncthreads(); - - for (int r = blockIdx.x; r < numTads; r += gridDim.x) { - auto tadOffsetForBlock = tadOffsets[r]; - - sPartials[threadIdx.x] = startingValue(dx + tadOffsetForBlock); - - for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); - sPartials[threadIdx.x] = update(sPartials[threadIdx.x], op(dx[xOffset], result[r]), extraParams); - } - __syncthreads(); - - // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), &result[r]); - - __syncthreads(); - if (threadIdx.x == 0) - result[r] = postProcess(sPartials[threadIdx.x], tadLength, &result[r]); - } - } -#endif - - static void execSpecial(T *x, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - Nd4jLong resultLength = shape::length(resultShapeInfoBuffer); - - auto tadOnlyShapeInfo = tadShapeInfo; - auto tadOffsets = tadOffset; - - if (tadOnlyShapeInfo == nullptr || tadOffsets == nullptr) { - if (dimensionLength < 1) - return; - - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - tadOnlyShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - - const Nd4jLong tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - auto numTads = shape::length(xShapeInfo) / tadLength; - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); - - int tadsPerThread = resultLength / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - if (tadEWS > 0 && (numTads == 1 || shape::isVector(tadOnlyShapeInfo) || shape::isScalar(tadOnlyShapeInfo))) { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - T *iter = x + tadOffsets[i]; - T start = startingValue(iter); - if (tadEWS == 1) { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j], result[i]), extraParams); - - } - } - else { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j * tadEWS], result[i]), extraParams); - } - } - result[i] = postProcess(start, tadLength, &result[i]); - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - auto offset = tadOffsets[i]; - T start = startingValue(x + offset); - - for (int j = 0; j < tadLength; j++) { - auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo); - start = update(start, op(x[xOffset], result[i]), extraParams); - } - - result[i] = postProcess(start, tadLength, &result[i]);; - } - } - } - }; -} - -#endif //LIBND4J_SPECIAL_ACCUMULATION_OPS_H diff --git a/libnd4j/include/ops/special_ops.h b/libnd4j/include/ops/special_ops.h deleted file mode 100644 index 8f6ef6b5b..000000000 --- a/libnd4j/include/ops/special_ops.h +++ /dev/null @@ -1,2293 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#include -#include -#endif - -namespace functions { - namespace broadcast { - template - class Broadcast; - } - - namespace transform { - template - class TransformStrict; - } - - namespace scalar { - } - - namespace reduce { - template - class ReduceFloatFunction; - - template - class ReduceSameFunction; - } -} - -namespace simdOps { - - template - class Pooling2D { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - inline __host__ __device__ -#elif defined(__GNUC__) - -#endif - static int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size + p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int kH; - __shared__ int kW; - __shared__ int sH; - __shared__ int sW; - __shared__ int pH; - __shared__ int pW; - __shared__ int dH; - __shared__ int dW; - __shared__ int poolingMode; - __shared__ Z extraParam0; - - __shared__ int batchSize; - __shared__ int inChannels; - __shared__ int outH; - __shared__ int outW; - __shared__ int inH; - __shared__ int inW; - - //__shared__ int *strideIn; - //__shared__ int *strideOut; - __shared__ int strideB; - __shared__ int strideC; - __shared__ int strideY; - __shared__ int strideX; - - __shared__ int strideOB; - __shared__ int strideOC; - __shared__ int strideOY; - __shared__ int strideOX; - - __shared__ int length; - __shared__ int kHEff; - __shared__ int kWEff; - __shared__ bool fOrder; - - - if (threadIdx.x == 0) { - kH = (int)extraParams[0]; - kW = (int)extraParams[1]; - sH = (int)extraParams[2]; - sW = (int)extraParams[3]; - pH = (int)extraParams[4]; - pW = (int)extraParams[5]; - dH = (int)extraParams[6]; //Dilation, height dimension - dW = (int)extraParams[7]; //Dilation, width dimension - poolingMode = (int)extraParams[9]; - extraParam0 = extraParams[10]; - - batchSize = shape::sizeAt(xShapeBuffer, 0); - inChannels = shape::sizeAt(xShapeBuffer, 1); - outH = shape::sizeAt(zShapeBuffer, 2); - outW = shape::sizeAt(zShapeBuffer, 3); - inH = shape::sizeAt(xShapeBuffer, 2); - inW = shape::sizeAt(xShapeBuffer, 3); - - strideB = shape::stride(xShapeBuffer)[0]; - strideC = shape::stride(xShapeBuffer)[1]; - strideY = shape::stride(xShapeBuffer)[2]; - strideX = shape::stride(xShapeBuffer)[3]; - - strideOB = shape::stride(zShapeBuffer)[0]; - strideOC = shape::stride(zShapeBuffer)[1]; - strideOY = shape::stride(zShapeBuffer)[2]; - strideOX = shape::stride(zShapeBuffer)[3]; - - length = shape::length(zShapeBuffer); - - //Replace kernel H/W with *effective* kernel H/W accounting for dilatyon - kHEff = kH + (kH-1)*(dH-1); - kWEff = kW + (kW-1)*(dW-1); - - fOrder = shape::order(zShapeBuffer) == 'f'; -/* - if (blockIdx.x == 0) { - printf("kH: %i; kW: %i; sH: %i; sW: %i; pH: %i; pW: %i; dH: %i; dW: %i; poolingMode: %i; extraParam0: %f;\n", kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, (float) extraParam0); - printf("batchSize: %i; inChannels: %i; outH: %i; 
outW: %i; inH: %i; inW: %i; strideB: %i; strideC: %i; strideY: %i; strideX: %i;\n", batchSize, inChannels, outH, outW, inH, inW, strideB, strideC, strideY, strideX); - } -*/ - } - __syncthreads(); - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = tid; index < length; index += blockDim.x * gridDim.x) { - const int pw = index % outW; - const int ph = (index / outW) % outH; - const int c = (index / outW / outH) % inChannels; - const int n = index / outW / outH / inChannels; - int hstart = sH * ph - pH; - int wstart = sW * pw - pW; - int hend = hstart + kHEff; - int wend = wstart + kWEff; - -// const int hSO = hstart; -// const int hEO = hend; - - if(hstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z)dH); - hstart += f * dH; - } - if(wstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); - wstart += f * dW; - } - if(hend > inH){ - int f = nd4j::math::nd4j_ceil((Z) (hend-inH) / (Z) dH); - hend -= f * dH; - } - if(wend > inW){ - int f = nd4j::math::nd4j_ceil((Z) (wend-inW) / (Z) dW); - wend -= f * dW; - } - //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * nd4j::math::nd4j_ceil((double) (wend-wstart) / (double) dW); - - Z sum = poolingMode == 0 ? -nd4j::DataTypeUtils::max() : static_cast(0.f); - - T *input_slice = dx + (n * strideB + c * strideC); - if (poolingMode == 0) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - Z v = static_cast(input_slice[h * strideY + w * strideX]); - if (v > sum) - sum = v; - } - } - } else if (poolingMode == 1) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += static_cast(input_slice[h * strideY + w * strideX]); - } - } - } else if (poolingMode == 2) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += nd4j::math::nd4j_pow(static_cast(nd4j::math::nd4j_abs(input_slice[h * strideY + w * strideX])), extraParam0); - } - } - } - - Z res; - - if (poolingMode == 0) { - res = sum; - } else if (poolingMode == 1) { - int divide_factor = pool_size; //Case 0: exclude padding - if ((int) extraParam0 == 1) //Case 1: include padding - divide_factor = kH * kW; - - res = sum / static_cast(divide_factor); - } else if (poolingMode == 2) { - res = nd4j::math::nd4j_pow(sum, (Z) 1.0f / extraParam0); - } - - - if (!fOrder) { - result[index] = res; - } else { - result[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = res; - } -/* - if (index >= 0 && index < 400000) { - printf("index: %i; hstart: %i; hend: %i; wstart: %i; wend: %i; ph: %i; pw: %i; hstart_orig: %i; hend_orig: %i;\n", index, hstart, hend, wstart, wend, ph, pw, hSO, hEO); - } -*/ - } - - __syncthreads(); - } -#endif - - -static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outShapeBuffer, Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - // input is [bS, iC, iH, iW] - // output is [bS, iC, oH, oW] - - const Nd4jLong kH = (int)extraParams[0]; - const Nd4jLong kW = (int)extraParams[1]; - const Nd4jLong sH = (int)extraParams[2]; - const Nd4jLong sW = (int)extraParams[3]; - const Nd4jLong pH = (int)extraParams[4]; - const Nd4jLong pW = (int)extraParams[5]; - const Nd4jLong dH = (int)extraParams[6]; - const Nd4jLong dW = (int)extraParams[7]; - Nd4jLong poolingMode = (int)extraParams[9]; - T extraParam0 = extraParams[10]; - - if(dH == 0 || dW == 0) { - printf("Special_ops pooling2d:: dilation must not be zero, but got instead {%lld, %lld} \n", dH, 
dW); - throw ""; - } - - const Nd4jLong kHEff = kH + (kH-1)*(dH-1); - const Nd4jLong kWEff = kW + (kW-1)*(dW-1); - - const int bS = shape::sizeAt(inShapeBuffer, 0); - const int iC = shape::sizeAt(inShapeBuffer, 1); - const int iH = shape::sizeAt(inShapeBuffer, 2); - const int iW = shape::sizeAt(inShapeBuffer, 3); - const int oH = shape::sizeAt(outShapeBuffer, 2); - const int oW = shape::sizeAt(outShapeBuffer, 3); - const Nd4jLong iStride0 = shape::stride(inShapeBuffer)[0]; - const Nd4jLong iStride1 = shape::stride(inShapeBuffer)[1]; - const Nd4jLong iStride2 = shape::stride(inShapeBuffer)[2]; - const Nd4jLong iStride3 = shape::stride(inShapeBuffer)[3]; - const Nd4jLong oStride0 = shape::stride(outShapeBuffer)[0]; - const Nd4jLong oStride1 = shape::stride(outShapeBuffer)[1]; - const Nd4jLong oStride2 = shape::stride(outShapeBuffer)[2]; - const Nd4jLong oStride3 = shape::stride(outShapeBuffer)[3]; - - const Nd4jLong iStep2 = dH*iStride2; - const Nd4jLong iStep3 = dW*iStride3; - const int kProd = kH*kW; - const T iStep2Inv = 1./iStep2; - const T iStep3Inv = 1./iStep3; - - Nd4jLong hstart, wstart, hend, wend; - T sum, *pIn; - - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = -nd4j::DataTypeUtils::max(); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; - - if ((int) extraParam0 == 0) //Exclude 
padding - sum /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - - sum = nd4j::math::nd4j_pow(sum, (T) 1. / extraParam0); - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } - else { - nd4j_printf("Special_ops::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; - } -} - - op_def static T op(T d1, Z *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - FORCEINLINE bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); - } - - template - class - Im2col { - public: - static const bool requiresSpecial = true; - - static _CUDA_HD int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size 
+ p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - T *result, Nd4jLong *zShapeBuffer, - T *extraParams, - int *allocationPointer, T *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - __shared__ int kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, kSize, samples, depth, height, width, strideex, stridech, strideh, stridew, height_col, width_col, n; - __shared__ T zeroPadVal; - __shared__ Nd4jLong *outShape, *outStride, *inShape, *inStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - kernelHeight = (int) extraParams[0]; - kernelWidth = (int) extraParams[1]; - strideY = (int) extraParams[2]; - strideX = (int) extraParams[3]; - padHeight = (int) extraParams[4]; - padWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation, height/y dimension - dX = (int) extraParams[7]; //Dilation, width/x dimension - kSize = kernelWidth * kernelHeight; - zeroPadVal = (T) extraParams[9]; //Value to use when value is padding. Usually 0 but not always - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - samples = (int) inShape[0]; - depth = (int) inShape[1]; - height = (int) inShape[2]; - width = (int) inShape[3]; - - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - strideh = (int) inStride[2]; - stridew = (int) inStride[3]; - - // (height + 2 * padHeight - kernelHeight) / strideX + 1; // - // (width + 2 * padWidth - kernelWidth) / strideY + 1; // - height_col = (int) outShape[4]; - width_col = (int) outShape[5]; - - n = samples * depth * height_col * width_col; - } - __syncthreads(); - - int index = blockIdx.x * blockDim.x + threadIdx.x; - for (; index < n; index += blockDim.x*gridDim.x) { - int h_index = index / width_col; - int h_col = h_index % height_col; - int w_col = index % width_col; - - int c_im = h_index / height_col; - int c_col = c_im * kSize; - - int depth_im = c_im % depth; - int num_im = c_im / depth; - int h_offset = h_col * strideY - padHeight; - int w_offset = w_col * strideX - padWidth; - - T* data_col_ptr = result; - - int i_c = (c_col * height_col + h_col) * width_col + w_col; - data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; - - T* data_im_ptr = dx; - - data_im_ptr += num_im * strideex + depth_im * stridech + h_offset * strideh + w_offset*stridew; - - for (int i = 0; i < kernelHeight; ++i) { - for (int j = 0; j < kernelWidth; ++j) { - int h_im = h_offset + i * dY; - int w_im = w_offset + j * dX; - int i_f = 0; - int i_c_temp = i_c; - for (int dim = 5; dim >= 0; dim--) { - i_f += (i_c_temp % outShape[dim]) * outStride[dim]; - i_c_temp = i_c_temp / outShape[dim]; - } - if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width){ - result[i_f] = data_im_ptr[i * dY * strideh + j * dX * stridew]; - } else result[i_f] = zeroPadVal; - - //result[i_f] = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
data_im_ptr[i * strideh + j*stridew] : 0; - data_col_ptr += height_col * width_col; - i_c += height_col * width_col; - } - } - } - } -#endif - - - static void execSpecial( - T *imBuff, - Nd4jLong *imShapeBuffer, - T *colBuff, - Nd4jLong *colShapeBuffer, - T *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - - // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - - int kH = (int)extraParams[0]; - int kW = (int)extraParams[1]; - int sH = (int)extraParams[2]; - int sW = (int)extraParams[3]; - int pH = (int)extraParams[4]; - int pW = (int)extraParams[5]; - int dH = (int)extraParams[6]; //Dilation, height/y dimension - int dW = (int)extraParams[7]; //Dilation, width/x dimension - T zeroPadVal = extraParams[9]; - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - } - - op_def static T op(T d1, T *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally 
negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - template - class Histogram { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - }; -#endif - - static void execSpecial( - T *dx, - Nd4jLong *xShapeBuffer, - Z *result, - Nd4jLong *zShapeBuffer, - Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - } - - - op_def static T op(T d1, Z *params) { - return d1; - } - }; - - template - class Col2Im { - - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * https://github.com/pjreddie/darknet/blob/master/src/col2im_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int strideex, stridech, stridekrow, stridekcol, striderow, stridecol, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX, samples, depth, imgH, imgW, height_col, width_col, n, kEffectiveW, kEffectiveH; - __shared__ Nd4jLong *inShape, *inStride, *outShape, *outStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - stridekrow = (int) inStride[2]; - stridekcol = (int) inStride[3]; - striderow = (int) inStride[4]; - stridecol = (int) inStride[5]; - - kernelHeight = (int) inShape[2]; - kernelWidth = (int) inShape[3]; - - strideY = (int) extraParams[0]; - strideX = (int) extraParams[1]; - padHeight = (int) extraParams[2]; - padWidth = (int) extraParams[3]; - imgHeight = (int) extraParams[4]; - imgWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation in height/y dimension - dX = (int) extraParams[7]; //Dilation in width/x dimension - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - samples = (int) outShape[0]; - depth = (int) outShape[1]; - imgH = (int) outShape[2]; - imgW = (int) outShape[3]; - - height_col = 
inShape[4];//(imgHeight + 2 * padHeight - kernelHeight) / strideX + 1; - width_col = inShape[5];//(imgWidth + 2 * padWidth - kernelWidth) / strideY + 1; - - n = samples * depth * imgHeight * imgWidth; - - //Effective kernel size, accounting for dilation - kEffectiveW = kernelWidth + (kernelWidth - 1) * (dX - 1); - kEffectiveH = kernelHeight + (kernelHeight - 1) * (dY - 1); - } - __syncthreads(); - - for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { - X val = 0; - int w_im = i % imgWidth + padWidth; - int h_im = (i / imgWidth) % imgHeight + padHeight; - int c_im = i / (imgWidth * imgHeight); - - int num_im = c_im / depth; - int depth_im = c_im % depth; - - // compute the start and end of the output - // These are the indexes for dimensions ??? in the 6d col matrix - int w_col_start = (w_im < kEffectiveW) ? 0 : (w_im - kEffectiveW) / strideX + 1; - int w_col_end = nd4j::math::nd4j_min(w_im / strideX + 1, width_col); - - int h_col_start = (h_im < kEffectiveH) ? 0 : (h_im - kEffectiveH) / strideY + 1; - int h_col_end = nd4j::math::nd4j_min(h_im / strideY + 1, height_col); - - - //Iterate over col entries in the 6d array... these are added up - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * strideY); - int w_k = (w_im - w_col * strideX); - - if(h_k % dY == 0 && w_k % dX == 0){ - h_k /= dY; - w_k /= dX; - - int data_col_index = num_im * strideex + depth_im * stridech + h_k * stridekrow + w_k * stridekcol + h_col * striderow + w_col * stridecol; - val += dx[data_col_index]; - } - } - } - int i_f = 0; - int i_c = i; - for (int dim = 3; dim >= 0; dim--) - { - i_f += (i_c % outShape[dim]) * outStride[dim]; - i_c = i_c / outShape[dim]; - } - result[i_f] = val; - } - } -#endif - - static void execSpecial( - X *colBuff, - Nd4jLong *colShapeBuffer, - X *imBuff, - Nd4jLong *imShapeBuffer, - X *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int sH = (int)extraParams[0]; - const int sW = (int)extraParams[1]; - const int pH = (int)extraParams[2]; - const int pW = (int)extraParams[3]; - const int iH = (int)extraParams[4]; - const int iW = (int)extraParams[5]; - const int dH = (int)extraParams[6]; - const int dW = (int)extraParams[7]; - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - auto zLength = shape::length(imShapeBuffer); - - // initial zeroing of image content - memset(imBuff, 0, zLength * sizeof(X)); - - - X *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && 
shape::strideDescendingCAscendingF(imShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - /** A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - template - class Reverse { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ Nd4jLong xLength; - __shared__ int xEWS; - __shared__ char xOrder; - __shared__ Nd4jLong sLength; - __shared__ X *shmem; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (threadIdx.x == 0) { - xLength = shape::length(xShapeBuffer); - xEWS = shape::elementWiseStride(xShapeBuffer); - xOrder = shape::order(xShapeBuffer); - sLength = xLength - 1; - - extern 
__shared__ unsigned char shrd[]; - shmem = (X *) shrd; - } - __syncthreads(); - - - - if (dx == result) { - - if (xEWS == 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx = sLength - e; - X tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS >= 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - X tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - - } else { - __shared__ int zEWS; - __shared__ char zOrder; - - if (threadIdx.x == 0) { - zEWS = shape::elementWiseStride(zShapeBuffer); - zOrder = shape::order(zShapeBuffer); - } - __syncthreads(); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - // loop for whole array - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - -#endif - - - static void execSpecial(X *dx, Nd4jLong *xShapeBuffer, X *result, Nd4jLong *zShapeBuffer, X *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - Nd4jLong xLength = shape::length(xShapeBuffer); - int xEWS = shape::elementWiseStride(xShapeBuffer); - char xOrder = shape::order(xShapeBuffer); - Nd4jLong sLength = xLength - 1; - - // two step phase here - if (dx == result) { - if (xEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx = sLength - e; - auto tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - auto tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - - result[zOffset] = dx[xOffset]; - } - } - } else { - // single step phase here - auto zEWS = shape::elementWiseStride(zShapeBuffer); - auto zOrder = shape::order(zShapeBuffer); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - template - class SoftMax { - public: - static const bool requiresSpecial = true; - 
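Both the CUDA and the CPU paths of the SoftMax specialization that follows compute a row-wise softmax in the usual numerically stable order: find the row maximum, exponentiate the shifted values, then divide by the accumulated sum. A minimal standalone C++ sketch of that sequence, using plain float buffers instead of the shape-info/TAD machinery (all names here are illustrative and not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Numerically stable softmax over one contiguous row of `len` floats.
// Subtracting the row max first keeps exp() from overflowing for large inputs.
static void softmaxRow(const float* in, float* out, std::size_t len) {
    if (len == 0) return;

    float mx = in[0];
    for (std::size_t i = 1; i < len; ++i)        // row-wise max
        mx = std::max(mx, in[i]);

    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i) {      // exp(x - max) and running sum
        out[i] = std::exp(in[i] - mx);
        sum += out[i];
    }

    for (std::size_t i = 0; i < len; ++i)        // normalize
        out[i] /= sum;
}

The per-TAD loops in the CPU branch apply this same max/exp/sum/divide sequence to each row, either directly when the element-wise stride is 1 or through a precomputed offsets table otherwise.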
-#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - - auto length = shape::length(xShapeBuffer); - - auto stride = shape::stride(xShapeBuffer); - //compute the row wise maxes - - __shared__ Nd4jLong maxShape[2]; - - // it's always 2d here - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - maxShape[0] = shape[0]; - maxShape[1] = 1; - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - } - __syncthreads(); - - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeInfo, - void *vz, - Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto x = reinterpret_cast(vx); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeInfo)) { - - if(shape::equalsStrict(xShapeInfo, zShapeInfo)) { - if (tadShapeInfo == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, 1); - tadShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - const uint tadLen = shape::length(tadShapeInfo); - const uint numOfTads = shape::length(xShapeInfo) / tadLen; - - if(shape::elementWiseStride(tadShapeInfo) == 1) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - } - else { - - uint xShapeInfoCast[MAX_RANK]; - bool canCast = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, xShapeInfoCast); - - auto offsets = new Nd4jLong[tadLen]; - 
shape::calcOffsets(tadShapeInfo, offsets); - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0.f; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } - delete []offsets; - } - } - else { - - auto shape = shape::shapeOf(xShapeInfo); - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, x, xShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, x, xShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, z, zShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, z, zShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, z, zShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - } - else if (shape::isVector(xShapeInfo)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - int elementWiseStride = shape::elementWiseStride(xShapeInfo); - int resultElementWiseStride = shape::elementWiseStride(zShapeInfo); - int length = shape::length(xShapeInfo); - if (elementWiseStride >= 1 && resultElementWiseStride >= 1) { - if (elementWiseStride == 1 && resultElementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i]); - } - - for (int i = 0; i < length; i++) { - z[i] = nd4j::math::nd4j_exp(x[i] - max); - sum += z[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - z[i] /= sum; - } - } - else { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - auto r = nd4j::math::nd4j_exp(x[i * elementWiseStride] - max); - z[i * resultElementWiseStride] = r; - sum += r; - } - - for (int i = 0; i < length; i++) { - z[i * resultElementWiseStride] /= sum; - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - - template - class LogSoftMax { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto shape = shape::shapeOf(xShapeBuffer); - auto stride 
= shape::stride(xShapeBuffer); - //iterate along rows - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - } - __syncthreads(); - //compute the row wise maxes - - Nd4jLong maxShape[2] = { shape[0], 1 }; - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Log, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, dx, xShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum 
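The CPU matrix branch of LogSoftMax produces log(softmax(x)) by dividing by the row sum and then applying Log. An equivalent formulation folds those two steps into x - max - log(sum(exp(x - max))); below is a small sketch of that variant for a single contiguous row (illustrative names only, not the code being removed here):

#include <algorithm>
#include <cmath>
#include <cstddef>

// log-softmax over one contiguous row, written as x - max - log(sum(exp(x - max))).
// This reaches the same result as the divide-then-Log sequence in the patch,
// while avoiding the intermediate division.
static void logSoftmaxRow(const float* in, float* out, std::size_t len) {
    if (len == 0) return;

    float mx = in[0];
    for (std::size_t i = 1; i < len; ++i)
        mx = std::max(mx, in[i]);

    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i)
        sum += std::exp(in[i] - mx);

    const float logSum = std::log(sum);
    for (std::size_t i = 0; i < len; ++i)
        out[i] = in[i] - mx - logSum;
}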
- functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - functions::transform::TransformStrict::exec(nd4j::transform::Log, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - - delete[] maxResultShapeBuffer; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -FLOAT_MAX_VALUE; - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - - for (int i = 0; i < length; i++) { - result[i] = nd4j::math::nd4j_exp(dx[i] - max); - sum += result[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i] /= sum; - result[i] = nd4j::math::nd4j_log(result[i]); - } - } - else if (elementWiseStride > 1) { - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = nd4j::math::nd4j_exp(dx[i * elementWiseStride] - max); - sum += result[i * elementWiseStride]; - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - result[i * elementWiseStride] = nd4j::math::nd4j_log(result[i * elementWiseStride]); - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - /** - * softmax(x) - */ - template - class SoftMaxDerivative { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - __shared__ Nd4jLong resultEWS; - - auto length = shape::length(xShapeBuffer); - - if (threadIdx.x == 0) { - resultEWS = shape::elementWiseStride(zShapeBuffer); - - maxResult = (X) 0.0; - } - __syncthreads(); - - auto tride = shape::stride(xShapeBuffer); - Nd4jLong maxShape[2] = { shape[0], 1 }; - - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, 
reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - if (resultEWS >= 1) { - for (int i = threadIdx.x; i < length; i += blockDim.x) { - result[i * resultEWS] = result[i * resultEWS] * ((X) 1.0 - result[i * resultEWS]); - } - } - else { - printf("Non element wise stride not supported right now\n"); - } - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - - auto resultEleStide = shape::elementWiseStride(zShapeBuffer); - - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - auto len = shape::length(xShapeBuffer); - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0f; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - if (resultEleStide >= 1) { - if (resultEleStide == 1) { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i] = result[i] * (static_cast(1.0f) - result[i]); - } - - } - else { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i * resultEleStide] = result[i * resultEleStide] * (static_cast(1.0f) - result[i * resultEleStide]); - } - - } - } - else { - - for (int i = 0; i < len; i++) { - Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer); - result[zOffset] = result[zOffset] * ((X) 1.0f - result[zOffset]); - } - } - - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - for (int i = 0; i < length; i++) { - result[i] -= max; - result[i] = nd4j::math::nd4j_exp(result[i]); - sum += result[i]; - } - - for (int i = 0; i < length; i++) { - 
result[i] /= sum; - } - - for (int i = 0; i < length; i++) { - result[i] = result[i] * ((X) 1.0f - result[i]); - } - } else if (elementWiseStride >= 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] -= max; - result[i * elementWiseStride] = nd4j::math::nd4j_exp(result[i * elementWiseStride]); - sum += result[i * elementWiseStride]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = result[i * elementWiseStride] * ((X) 1.0f - result[i * elementWiseStride]); - } - } else { - printf("non-ews access on row not implemented yet"); - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - template - class IsMax { - public: - static const bool requiresSpecial = true; - - -#ifdef __CUDACC__ - - static inline __device__ void doAllCuda( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - -// this code is safe to delete, it's never used -/* - __shared__ int maxIdx; - __shared__ int length; - if (threadIdx.x == 0) { - length = shape::length(zShapeBuffer); - } - __syncthreads(); - - functions::indexreduce::IndexReduce::template transform>( - dx, - xShapeBuffer, - extraParams, - result, - zShapeBuffer, - nullptr, - 1, - 1, allocationPointer, reductionPointer, nullptr, nullptr); - - __syncthreads(); - if (threadIdx.x == 0) - maxIdx = (int)result[0]; - __syncthreads(); - - for (int i = threadIdx.x; i < length; i += blockDim.x) - result[i] = 0; - __syncthreads(); - - if (threadIdx.x == 0) { - result[maxIdx] = 1.0; - } - */ - } -#endif - -#ifdef __CUDACC__ - inline __host__ - -#elif defined(__GNUC__) - - -#endif - static void doAll( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto length = shape::length(xShapeBuffer); - auto eleStride = shape::elementWiseStride(xShapeBuffer); - auto resultEleStride = shape::elementWiseStride(zShapeBuffer); - auto xOrder = shape::order(xShapeBuffer); - auto resultOrder = shape::order(zShapeBuffer); - - if (xOrder == resultOrder && xOrder == 'c') { - if (eleStride == 1 && resultEleStride == 1) { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - - result[maxIdx] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - result[i] = static_cast(0); - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx] = static_cast(1); - } - - } - else { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if 
(currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - } - - result[maxIdx * resultEleStride] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx * resultEleStride] = static_cast(1); - } - - } - } - - - else { - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = shape::shapeOf(xShapeBuffer); - auto xStride = shape::stride(xShapeBuffer); - auto resultStride = shape::stride(zShapeBuffer); - auto rank = shape::rank(xShapeBuffer); - auto originalResult = result; - if (PrepareTwoRawArrayIter(rank, - xShape, - dx, - xStride, - result, - resultStride, - &rank, - shapeIter, - &dx, - xStridesIter, - &result, - resultStridesIter) >= 0) { - auto value = dx[0]; - int idx = 0; - int maxIdx = 0; - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (dx[0] > value) { - value = dx[0]; - maxIdx = idx; - } - - idx++; - result[0] = static_cast(0); - - } - ND4J_RAW_ITER_TWO_NEXT( - dim, - rank, - coord, - shapeIter, - dx, - xStridesIter, - result, - resultStridesIter); - - //pointer to where max value would be - if (shape::order(zShapeBuffer) == 'c' || (shape::order(zShapeBuffer) == 'f' && - maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1] >= - shape::length(zShapeBuffer))) - originalResult[maxIdx] = static_cast(1); - else - originalResult[maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1]] = static_cast(1); - } - } - - - } - public: - - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, int *allocationPointer, - void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - // FIXME: MAX_DIMENSION is lower then FP16 frame - if (extraParams == nullptr || (int) extraParams[0] == MAX_DIMENSION) { - doAllCuda(dx, xShapeBuffer, result, zShapeBuffer, extraParams, allocationPointer, reductionPointer); - } - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - //FIXME: this op should be moved to CustomOps - if (extraParams == nullptr || (int)extraParams[0] == 0 || - ((int)extraParams[0] == 1 && (int)extraParams[1] == MAX_DIMENSION)) { - doAll(dx, xShapeBuffer, result, zShapeBuffer, extraParams); - } - else if (shape::isVector(xShapeBuffer)) { - auto dimensionLength = (int)extraParams[0]; - auto dimension = new int[dimensionLength]; - auto length = shape::length(xShapeBuffer); - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int)extraParams[i + 1]; - } - if (shape::shapeOf(xShapeBuffer)[dimension[0]] == 1) { - for (int i = 0; i < length; i++) { - result[i] = static_cast(1); 
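The IsMax branches in this region all implement the same one-hot rule: zero the output while scanning for the largest input value, then write a single 1 at that position. A compact strided sketch of the rule (standalone and illustrative; it corresponds to the single-threaded scan, not the parallel version with its critical-section merge):

#include <cstddef>

// One-hot "is max" over a strided buffer: zero everything while scanning,
// then mark the position of the largest element with 1.
static void isMaxStrided(const float* in, std::size_t inStride,
                         float* out, std::size_t outStride, std::size_t len) {
    if (len == 0) return;

    std::size_t maxIdx = 0;
    float currMax = in[0];

    for (std::size_t i = 0; i < len; ++i) {
        out[i * outStride] = 0.f;                // zero-fill as we go
        if (in[i * inStride] > currMax) {
            currMax = in[i * inStride];
            maxIdx = i;
        }
    }

    out[maxIdx * outStride] = 1.f;               // single 1 at the argmax
}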
- } - } - else { - auto eleStride = shape::elementWiseStride(xShapeBuffer); - if (eleStride == 1) { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - } - else { -PRAGMA_OMP_PARALLEL -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - - } - - PRAGMA_OMP_CRITICAL - { - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } - } -} - } - - result[maxIdx] = static_cast(1); - - } - - - else { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - - result[i] = static_cast(0); - } - } - else { - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - } - - result[maxIdx] = static_cast(1); - } - } - - - } - else { - auto dimensionLength = (int) extraParams[0]; - auto dimension = new int[dimensionLength]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int) extraParams[i + 1]; - } - //decompose in to several sub tads after - //moving all dimensions (in sorted order) - //to the back. - //permuted version of the x shape info for setting up the tad problem - auto tadShapeShapeInfo = tadShapeInfo; - if(tadShapeInfo==nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeBuffer, dimension, dimensionLength); - - tadShapeShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - tadShapeInfo = tadShapeShapeInfo; - } - - auto tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeBuffer, dimension, dimensionLength); - auto tads = shape::length(xShapeBuffer) / tadLength; - - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo); - auto zEWS = tadEWS; - - int span = (tads / num_threads) + 8; - - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { - if (tadEWS > 0 && zEWS > 0 && dimensionLength == 1) { - auto rX = dx + tadOffsets[r]; - auto rZ = result + tadOffsets[r]; - - auto maxValue = rX[0]; - int maxIdx = 0; - if (tadEWS == 1 && zEWS == 1) { - - for (int i = 0; i < tadLength; i++) { - if (rX[i] > maxValue) { - maxIdx = i; - maxValue = rX[i]; - } - } - - - for (int i = 0; i < tadLength; i++) { - rZ[i] = static_cast(maxIdx == i); - } - - } else { - - for (int i = 0; i < tadLength; i++) { - if (rX[i * tadEWS] > maxValue) { - maxIdx = i; - maxValue = rX[i * tadEWS]; - } - } - - for (int i = 0; i < tadLength; i++) { - rZ[i * zEWS] = static_cast(maxIdx == i); - } - } - } else { - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - 
num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto offset = tadOffsets[r]; - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = shape::shapeOf(tadShapeShapeInfo); - auto xStride = shape::stride(tadShapeShapeInfo); - auto resultStride = shape::stride(tadShapeShapeInfo); - int rank = shape::rank(tadShapeShapeInfo); - auto xPointer = dx + offset; - auto resultPointer = result + offset; - auto maxValue = xPointer[0]; - - auto maxCursor = resultPointer; - Nd4jPointer maxCursorLong = reinterpret_cast(maxCursor); - if (PrepareTwoRawArrayIter(rank, - xShape, - xPointer, - xStride, - resultPointer, - resultStride, - &rank, - shapeIter, - &xPointer, - xStridesIter, - &resultPointer, - resultStridesIter) >= 0) { - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (maxValue < xPointer[0]) { - maxCursor = resultPointer; - maxCursorLong = reinterpret_cast(resultPointer); - maxValue = xPointer[0]; - } - - resultPointer[0] = static_cast(0); - } - ND4J_RAW_ITER_TWO_NEXT(dim, - rank, - coord, - shapeIter, - xPointer, - xStridesIter, - resultPointer, - resultStridesIter); - maxCursor = reinterpret_cast(maxCursorLong); - maxCursor[0] = static_cast(1);; - } - } - } - } - - delete[] dimension; - } - } - - op_def static Z op(X d1, X *params) { - return nd4j::math::softplus(d1); - } - }; -} diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 1ae310ad4..a25aa36ec 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -25,6 +25,7 @@ #include #include #include +#include namespace randomOps { @@ -152,9 +153,9 @@ namespace randomOps { // TODO: we probably might want to skip this sum, and state that probabilities array should be real probabilities, i.e. 
should sum to 1.0 //T probSum = extraArguments[0]; - Nd4jLong xLength = shape::length(xShapeBuffer); - Nd4jLong yLength = shape::length(yShapeBuffer); - Nd4jLong zLength = shape::length(zShapeBuffer); + auto xLength = shape::length(xShapeBuffer); + auto yLength = shape::length(yShapeBuffer); + auto zLength = shape::length(zShapeBuffer); auto xEWS = shape::elementWiseStride(xShapeBuffer); auto yEWS = shape::elementWiseStride(yShapeBuffer); @@ -162,47 +163,53 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; e++) { - T prob = rng->relativeT(e); - T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { - T relProb = y[f * yEWS]; - cumProb += relProb; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + T prob = rng->relativeT(e); + T cumProb = (T) 0.0f; + for (Nd4jLong f = 0; f < yLength; f++) { + T relProb = y[f * yEWS]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { - z[e * zEWS] = x[f * xEWS]; - break; + if (prob <= cumProb || f == yLength - 1) { + z[e * zEWS] = x[f * xEWS]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong i = 0; i < zLength; i++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong i = 0; i < zLength; i++) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); - T prob = rng->relativeT(i); - T cumProb = (T) 0.0f; + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); + T prob = rng->relativeT(i); + T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { + for (Nd4jLong f = 0; f < yLength; f++) { - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); - T relProb = y[yOffset2]; - cumProb += relProb; + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); + T relProb = y[yOffset2]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { + if (prob <= cumProb || f == yLength - 1) { - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); - z[zOffset2] = x[xOffset2]; - break; + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); + z[zOffset2] = x[xOffset2]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } } }; @@ -308,7 +315,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (middle / _threads) + 8; @@ -322,25 +329,30 @@ namespace randomOps { const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < middle; e++) { - auto epm = e + middle; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + auto epm = e + middle; - // we need to get random values - T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); - T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); + // we need to get random values + T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); + T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); - T realMean0 
= y == z ? mean : y[e * yEWS]; + T realMean0 = y == z ? mean : y[e * yEWS]; - auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; - z[e * zEWS] = z0; + auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + z[e * zEWS] = z0; - if (epm < zLength) { - T realMean1 = y == z ? mean : y[epm * yEWS]; - auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; - z[epm * zEWS] = z1; + if (epm < zLength) { + T realMean1 = y == z ? mean : y[epm * yEWS]; + auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + z[epm * zEWS] = z1; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; @@ -422,21 +434,13 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - auto start = span * tid; - auto end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -453,7 +457,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -536,22 +542,14 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -568,7 +566,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -685,19 +685,22 @@ namespace randomOps { Nd4jLong middle = zLength / 2 + (zLength % 2); int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); const T epsilon = 
static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; ++e) { - if (z[e] > mean + ds || z[e] < mean - ds) { - z[e] = step(rng, mean, stddev, e, middle, z[e]); + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + if (z[e] > mean + ds || z[e] < mean - ds) { + z[e] = step(rng, mean, stddev, e, middle, z[e]); - if (z[e] > mean + ds || z[e] < mean - ds) - z[e] = mean + nd4j::DataTypeUtils::min(); + if (z[e] > mean + ds || z[e] < mean - ds) + z[e] = mean + nd4j::DataTypeUtils::min(); + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -799,7 +802,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (zLength / _threads) + 8; @@ -813,16 +816,9 @@ namespace randomOps { const T stddev = extraArguments[1]; const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > middle) - end = middle; - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong e = start; e < end; e++) { + for (uint64_t e = start; e < stop; e += increment) { auto epm = e + middle; // we need to get random values @@ -838,7 +834,9 @@ namespace randomOps { z[epm * zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean); } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; diff --git a/libnd4j/include/ops/specials.h b/libnd4j/include/ops/specials.h index 6919aa38d..d8030db0b 100644 --- a/libnd4j/include/ops/specials.h +++ b/libnd4j/include/ops/specials.h @@ -18,8 +18,8 @@ // Created by raver119 on 24.04.17. 
// -#ifndef LIBND4J_CONCAT_H -#define LIBND4J_CONCAT_H +#ifndef LIBND4J_SPECIALS_H +#define LIBND4J_SPECIALS_H #ifdef __CUDACC__ @@ -28,6 +28,7 @@ #endif #include +#include namespace nd4j { class NDArray; @@ -81,4 +82,4 @@ namespace nd4j { } -#endif //LIBND4J_CONCAT_H +#endif //LIBND4J_SPECIALS_H diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp index d35346e2b..22bb87103 100644 --- a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp @@ -21,8 +21,9 @@ #include #include #include +#include -#ifdef _RELEASE +#ifdef RELEASE_BUILD int wIterations = 4; int rIterations = 20; int gemmRegularUpperPow = 11; diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp index caad37867..9e179db7f 100644 --- a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp @@ -21,14 +21,14 @@ #include #include "performance/benchmarking/LightBenchmarkSuit.h" -#ifdef _RELEASE -#define WARMUP 3 -#define NUM_ITER 10 +#ifdef RELEASE_BUILD +#define WARMUP 5 +#define NUM_ITER 100 #else -#define WARMUP 0 -#define NUM_ITER 1 +#define WARMUP 5 +#define NUM_ITER 100 #endif @@ -592,7 +592,7 @@ namespace nd4j { } std::string LightBenchmarkSuit::runSuit() { -#ifdef _RELEASE +#ifdef RELEASE_BUILD std::vector dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); #else std::vector dtypes({nd4j::DataType::FLOAT32}); @@ -609,7 +609,7 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES); - +/* nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES); @@ -627,12 +627,13 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES); + */ } nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", ""); - result += broadcast2d(); + //result += broadcast2d(); nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", ""); - result += mismatchedOrderAssign(); + //result += mismatchedOrderAssign(); return result; } diff --git a/libnd4j/include/pointercast.h b/libnd4j/include/pointercast.h index c6161782a..e080b33b6 100644 --- a/libnd4j/include/pointercast.h +++ b/libnd4j/include/pointercast.h @@ -21,6 +21,7 @@ #ifndef NATIVEOPERATIONS_POINTERCAST_H #define NATIVEOPERATIONS_POINTERCAST_H +#include #include typedef void* Nd4jPointer; diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index 96f97f762..23f6b342d 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -44,7 +44,6 @@ #define M_PI 3.14159265358979323846 #endif - namespace nd4j { #ifdef __CUDACC__ @@ -1651,4 +1650,46 @@ inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 } +#ifdef _OPENMP + +#ifndef MAX_FLOAT +#define MAX_FLOAT 1e37 +#endif + +#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=-MAX_FLOAT) + +#pragma omp declare reduction(minTF : 
float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=MAX_FLOAT) + +#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ + initializer (omp_priv=0) + +#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in + omp_out)\ + initializer (omp_priv=0) + +#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in * omp_out)\ + initializer (omp_priv=1) + +#endif + #endif /* TEMPLATEMATH_H_ */ diff --git a/libnd4j/pom.xml b/libnd4j/pom.xml index f33f8577f..3e766b944 100644 --- a/libnd4j/pom.xml +++ b/libnd4j/pom.xml @@ -185,6 +185,8 @@ bash run_tests.sh + --chip + ${libnd4j.chip} diff --git a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp index 2cbc8513e..20469ed2d 100644 --- a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp @@ -141,7 +141,7 @@ TEST_F(BooleanOpsTests, test_where_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); + //z->printIndexedBuffer("z"); ASSERT_EQ(e, *z); diff --git a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index c6b834a33..33a8fa10a 100644 --- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -41,6 +41,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { y.linspace(1); exp.linspace(1); + //exp.printIndexedBuffer("E B"); + exp.applyBroadcast(broadcast::Add, {1}, &y); nd4j::ops::add op; @@ -50,8 +52,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { auto z = result->at(0); - // exp.printIndexedBuffer("E"); - // z->printIndexedBuffer("Z"); + //exp.printIndexedBuffer("E A"); + //z->printIndexedBuffer("Z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -717,7 +719,7 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_empty_2) { auto z = result->at(0); - z->printShapeInfo("z"); + // z->printShapeInfo("z"); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(e.isSameShape(z)); diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 0fa4d687d..9a8f09b87 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp 
+++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ -54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets); //tadOffsetZ + tad->tadOffsets, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 6f964d0ac..8a58fe3a5 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -34,7 +34,7 @@ if (CUDA_BLAS) endif() if ("${COMPUTE}" STREQUAL "all") - list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_35,code=sm_35 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) + list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) else() list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) endif() @@ -43,18 +43,19 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D__APPLE_OS__=true") -elseif(WIN32) - if (CPU_BLAS) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2 -D__APPLE_OS__=true") +elseif(WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native -mtune=native -O3") + if (CPU_BLAS AND LINUX) + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") endif() else() - - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() if (CPU_BLAS) @@ -130,6 +131,10 @@ foreach (TMP_PATH ${TEST_SOURCES}) endforeach(TMP_PATH) if (CPU_BLAS) + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() + add_executable(runtests ${TEST_SOURCES}) target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) elseif(CUDA_BLAS) diff --git a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp index 2d4f9205f..60ba4733c 100644 --- a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp @@ -160,7 +160,6 @@ TEST_F(ConditionalTests, Flat_Test_2) { auto exp = NDArrayFactory::create('c', {2, 2}, {1, 1, 1, 1}); - z->printIndexedBuffer("z"); ASSERT_TRUE(exp.equalsTo(z)); delete graph; } diff --git a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp index 383815417..9134ef0a4 100644 --- a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp @@ -140,8 +140,8 @@ TEST_F(ConstantShapeHelperTests, basic_test_5) { auto arrayA = NDArrayFactory::create(1); auto arrayB = NDArrayFactory::create_('c', {128, 256}); - arrayA.printShapeInfo("A"); - arrayB->printShapeInfo("B"); + //arrayA.printShapeInfo("A"); + //arrayB->printShapeInfo("B"); ASSERT_EQ(0, arrayA.rankOf()); ASSERT_EQ(2, arrayB->rankOf()); ASSERT_NE(arrayA.dataType(), arrayB->dataType()); diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index 853f82cda..353e51ad3 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -614,182 +614,6 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { delete result2D; } - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_1) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - x.syncToDevice(); - //ASSERT_TRUE(x.isActualOnDeviceSide()); - ASSERT_TRUE(x.isActualOnHostSide()); - //x.printBuffer("x", 64); - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, kY, kX, oY, oX}); - - ExtraArguments args({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - //im2col0.printBuffer("transformed"); - //im2col1->printBuffer("customized", 64); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 
1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_2) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - -TEST_F(ConvolutionTests1, Test_im2col_col2im_3) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - auto im2col1 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col1.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto status = op.execute({&x}, {&im2col1}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}, {}); - ASSERT_EQ(Status::OK(), status); - - ASSERT_TRUE(im2col1.isSameShape(&im2col0)); - ASSERT_TRUE(im2col1.equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? 
(double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({&im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2im; -} - - TEST_F(ConvolutionTests1, TestDeconv_bp_1) { int bS=3, iH=4,iW=4, iC=3,oC=2, kH=1,kW=1, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1; @@ -1212,8 +1036,8 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test1) { nd4j::ops::conv3dnew_bp op; auto results = op.execute({&input, &weights, &bias, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); - auto* gradI = results->at(0); - auto* gradW = results->at(1); + auto gradI = results->at(0); + auto gradW = results->at(1); ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expGradI.isSameShape(gradI)); diff --git a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp index c018e58d0..45b35eb4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp @@ -110,7 +110,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_1) { RandomGenerator gen(119, 120); RandomLauncher::fillUniform(LaunchContext::defaultContext(), gen, &x, 1, 6); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { @@ -118,7 +118,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { RandomGenerator gen(119, 120); RandomLauncher::fillGaussian(LaunchContext::defaultContext(), gen, &x, 0, 1); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, cast_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 458858c57..8dd2e7a40 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -164,9 +164,7 @@ TEST_F(DeclarableOpsTests1, ApplyGradientDescent_1) { auto result = op.execute({&x, &y}, {1.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -180,9 +178,7 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -199,11 +195,6 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_2) { ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z1 = result->at(0); auto z2 = result->at(1); -// z1->printIndexedBuffer("OUTPUT"); -// z2->printIndexedBuffer("OUTPUT"); -// -// exp1.printIndexedBuffer("EXPECT"); -// exp2.printIndexedBuffer("EXPECT"); ASSERT_TRUE(z1->equalsTo(exp1)); ASSERT_TRUE(z2->equalsTo(exp2)); @@ -220,9 +211,7 @@ TEST_F(DeclarableOpsTests1, AXpY_Test_1) { auto result = op.execute({&x, 
&y}, {2.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -265,14 +254,6 @@ TEST_F(DeclarableOpsTests1, TestTensorMmul1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // exp.printShapeInfo(); - // out->printShapeInfo(); - // exp.printBuffer(); - // out->printBuffer(); - - // PointersManager manager(x.getContext(), "scatter"); - // manager.printDevContentOnHost(out->getSpecialBuffer(), out->lengthOf()); - // manager.printDevContentOnHost(exp.getSpecialBuffer(), exp.lengthOf()); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -293,8 +274,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -315,8 +294,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -337,8 +314,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -631,8 +606,6 @@ TEST_F(DeclarableOpsTests1, ClipByValue1) { clip.execute(block); - // x->printIndexedBuffer("Result"); - // exp.printIndexedBuffer("Expect"); ASSERT_TRUE(x->equalsTo(&exp)); @@ -775,7 +748,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractMatrices1) { nd4j::ops::reversesubtract subOp; subOp.execute(block); - // x->printIndexedBuffer("Output Subtract"); + ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -814,7 +787,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { y.assign(1.f); exp.assign(-2.f); x.applyTrueBroadcast(BROADCAST(ReverseSubtract), &y, &z, true); -// x.printIndexedBuffer("ReverseSubtract Legacy"); + ASSERT_TRUE(exp.equalsTo(&z)); nd4j::ops::reversesubtract subOp; @@ -822,7 +795,6 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); - //res->at(0)->printIndexedBuffer("OUtput REVERSED SUB"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -862,8 +834,8 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_1) { y.assign(9.f); exp.assign(1.f); y.applyTrueBroadcast(BROADCAST(Mod), &x, &z, true); - // z.printIndexedBuffer("MOD1"); ASSERT_TRUE(exp.equalsTo(&z)); + x.applyTrueBroadcast(BROADCAST(ReverseMod), &y, &exp, true); ASSERT_TRUE(exp.equalsTo(&z)); @@ -899,7 +871,6 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); -// res->at(0)->printIndexedBuffer("OUtput REVERSED MOD2"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -1355,7 +1326,6 @@ TEST_F(DeclarableOpsTests1, DivideScalarScalar1) { div.execute(block); - //x->printBuffer("x"); ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -1503,10 +1473,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer("OUtput"); - // 
yExp.printIndexedBuffer("Expect"); - // z->printShapeInfo("OUt shape"); - // yExp.printShapeInfo("Exp shape"); ASSERT_TRUE(yExp.equalsTo(z)); delete result; @@ -1515,8 +1481,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestRegistrator1) { auto res = nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); - - // nd4j_printf("Ops: %s\n", res) } // ////////////////////////////////////////////////////////////////////// @@ -1555,7 +1519,6 @@ TEST_F(DeclarableOpsTests1, TestRegistrator1) { // //auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, false); // auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, nullptr, 0, false); // ASSERT_EQ(ND4J_STATUS_OK, status); -// // z->printIndexedBuffer("Output add"); // ASSERT_NEAR(2.0f, y->meanNumber().e(0), 1e-5); // ASSERT_NEAR(1.0f, x->meanNumber().e(0), 1e-5); // ASSERT_NEAR(3.0f, z->meanNumber().e(0), 1e-5); @@ -1636,8 +1599,6 @@ TEST_F(DeclarableOpsTests1, TestGemv1) { nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); - //z->printBuffer(); - ASSERT_TRUE(z->equalsTo(exp)); delete []xBuffer; delete []xShape; delete x; delete []yBuffer; delete []yShape; delete y; delete z; delete []expBuffer; delete exp; @@ -2020,8 +1981,6 @@ TEST_F(DeclarableOpsTests1, TestCustomShape1) { auto inshapes = new ShapeList(input->getShapeInfo()); auto shapes = test.calculateOutputShape(inshapes, *block); - //input.printShapeInfo("input"); - //shape::printShapeInfoLinear(shape); ASSERT_EQ(input->getShapeInfo()[0] , shapes->at(0)[0]); ASSERT_EQ(input->getShapeInfo()[1] * 2, shapes->at(0)[1]); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 3fd9d26c6..f0ae83168 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -130,7 +130,7 @@ TEST_F(DeclarableOpsTests10, Test_Not_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::BOOL); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); - res->printBuffer("OUtput NOT"); + ASSERT_TRUE(e.equalsTo(res)); delete result; @@ -163,7 +163,7 @@ TEST_F(DeclarableOpsTests10, MirrorPad_SGO_Test_1) { auto res = op.execute({&in, &pad}, {10.0}, {0}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(res->status(), ND4J_STATUS_OK); - res->at(0)->printIndexedBuffer("Mirror pad:"); + ASSERT_TRUE(exp.equalsTo(res->at(0))); delete res; } @@ -180,9 +180,6 @@ TEST_F(DeclarableOpsTests10, Unique_SGO_Test_1) { auto res1 = res->at(0); auto res2 = res->at(1); - res1->printIndexedBuffer("Unique values"); - res2->printIndexedBuffer("Unique idxs"); - ASSERT_TRUE(exp.equalsTo(res1)); ASSERT_TRUE(expIdx.equalsTo(res2)); delete res; @@ -215,8 +212,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_02) { auto res = op.execute({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); - resA->printIndexedBuffer("Where02"); - resA->printBuffer("Where02lINEAR"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -329,8 +325,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_5) { ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = 
res->at(0); //ASSERT_TRUE(resA->isEmpty()); - resA->printIndexedBuffer("Result A"); - //resA->printShapeInfo("ShapeA"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -658,8 +653,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -669,8 +663,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -693,8 +686,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -704,8 +696,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -1022,8 +1013,6 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 2"); - exp.printIndexedBuffer("Expect 2"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1046,8 +1035,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 3"); - exp.printIndexedBuffer("Expect 3"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1179,7 +1167,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1206,7 +1194,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2_reverse"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1812,7 +1800,7 @@ TEST_F(DeclarableOpsTests10, LinSpace_Test1) { auto result = op.execute({&start, &finish, &num}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); - res->printIndexedBuffer("from 1 to 24"); + ASSERT_TRUE(expect.equalsTo(res)); delete result; } @@ -2084,7 +2072,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2108,7 +2096,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2156,7 +2144,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = 
results->at(0); - result->printShapeInfo("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); //ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index 988e5d583..d077f886d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -916,7 +916,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test1) { auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); - result->at(0)->printBuffer("Output"); delete result; } @@ -928,7 +927,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test2) { nd4j::ops::squaredsubtract op; auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -941,7 +939,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { nd4j::ops::squaredsubtract_bp op; auto result = op.execute({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -1372,7 +1369,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1394,7 +1390,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1416,7 +1411,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1869,7 +1863,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1891,7 +1884,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1913,7 +1905,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract2:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -2189,7 +2180,6 @@ TEST_F(DeclarableOpsTests11, SafeDivideMixed_Test1) { NDArray numOfNonZero(sumDiff.getShapeInfo(), nd4j::DataType::INT64, false); numOfNonZero.assign(1); sumDiff.applyPairwiseTransform(pairwise::SafeDivide, &numOfNonZero, &sumDiff, nullptr); - sumDiff.printIndexedBuffer("Output as Is"); } ///////////////////////////////////////////////////////////////// @@ -2393,7 +2383,6 @@ TEST_F(DeclarableOpsTests11, Multiply_BP_Test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdo = results->at(0); - dLdo->printBuffer("Output for multiply_bp op"); ASSERT_TRUE(dLdpExp.isSameShape(dLdo)); ASSERT_TRUE(dLdpExp.equalsTo(dLdo)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index 3f868c45c..59da5edb4 100644 --- 
a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -402,8 +402,6 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("DivideBP X out"); - output2.printIndexedBuffer("DivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -427,8 +425,6 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_2) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2DivideBP X out"); - output2.printIndexedBuffer("2DivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -450,8 +446,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_1) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("RDivideBP X out"); - output2.printIndexedBuffer("RDivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -476,8 +470,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_2) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2RDivideBP X out"); - output2.printIndexedBuffer("2RDivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -501,7 +493,6 @@ TEST_F(DeclarableOpsTests12, TestSliceBP_1) { Nd4jStatus status = op.execute({&x, &eps}, {&output}, {}, {1,1,2,2}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("SLICE_BP out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -526,7 +517,6 @@ TEST_F(DeclarableOpsTests12, TestConfusionZero_1) { Nd4jStatus status = op.execute({&x, &i}, {&output}, {}, {4}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("Confusion out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -545,8 +535,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -554,8 +542,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("X max"); - output2.printIndexedBuffer("Y max"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -574,8 +560,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -583,8 +567,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output2.printIndexedBuffer("X min"); - output1.printIndexedBuffer("Y min"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index 9d460f152..71ee8a04e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -533,7 +533,6 @@ TEST_F(DeclarableOpsTests13, 
adjustSaturation_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 2d8311828..574da8993 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -58,12 +58,7 @@ TEST_F(DeclarableOpsTests14, Test_Reshape_CF_1) { auto x = NDArrayFactory::create('f', {2, 3}, {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}); auto e = NDArrayFactory::create('f', {3, 2}, {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - x.printShapeInfo("x shape"); - x.printBuffer("x buffr"); - x.printIndexedBuffer("x indxd"); - - auto r = x.reshape('c', {3, 2}); - r.printIndexedBuffer("r pre-s"); + auto r = x.reshape('c', {3, 2});; r.streamline('f'); nd4j::ops::reshape op; @@ -92,7 +87,7 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { TEST_F(DeclarableOpsTests14, Multiply_test) { for(int k=2;k<10;k++){ - nd4j_printf("k=%d\n", k); + //nd4j_printf("k=%d\n", k); NDArray x = NDArrayFactory::create('c', {k, 1}); NDArray y = NDArrayFactory::create('c', {k}); NDArray e = NDArrayFactory::create('c', {k, k}); @@ -122,7 +117,6 @@ TEST_F(DeclarableOpsTests14, Test_EvalReductionShape_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Reduced shape"); ASSERT_EQ(e, *z); delete result; @@ -416,8 +410,6 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_1) { auto z = result->at(0); - z->printShapeInfo("Z"); - ASSERT_EQ(e, *z); delete result; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 6eabc964a..97e7d2d91 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -250,7 +250,6 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_2) { auto result = op.execute({&x}, {}, {nd4j::DataType::HALF}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); - out->printIndexedBuffer("Casted result"); ASSERT_TRUE(e.equalsTo(out)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index d95e86b1c..1a459a012 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -149,5 +149,16 @@ TEST_F(DeclarableOpsTests16, test_knn_mindistance_1) { nd4j::ops::knn_mindistance op; auto result = op.execute({&input, &low, &high}, {&output}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); +} +TEST_F(DeclarableOpsTests16, test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 2}); + + nd4j::ops::cast op; + auto result = op.execute({&x}, {}, {10}); + ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(e, *result->at(0)); + + delete result; } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index 62172dbf2..4941e7459 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -3589,8 +3589,6 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *result = results->at(0); - 
result->printIndexedBuffer("SCEL Output"); - expected.printIndexedBuffer("SCEL Expect"); ASSERT_TRUE(expected.isSameShape(result)); ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index 2f56eaf2a..478a31d4a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -479,7 +479,6 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer(); ASSERT_TRUE(exp.equalsTo(z)); @@ -1045,7 +1044,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_3) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Emply shape expected"); ASSERT_TRUE(z->isEmpty()); delete result; @@ -1065,9 +1063,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_4) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printBuffer("Strided Slice"); - z->printShapeInfo("Vector size 1 shape expected"); - exp.printShapeInfo("Expected shape"); ASSERT_TRUE(z->lengthOf() == 1); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -1482,9 +1477,6 @@ TEST_F(DeclarableOpsTests4, WeightedCrossEntropyWithLogits_2) { auto results = op.execute({&targets, &input, &weights}, {}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = results->at(0); - output->printIndexedBuffer("Result is "); - expected.printIndexedBuffer("Expected is "); - ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index 86acca29c..2e8d96f3c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -304,7 +304,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -321,7 +320,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -384,7 +382,6 @@ TEST_F(DeclarableOpsTests5, histogram_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Histogram4"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -400,7 +397,6 @@ TEST_F(DeclarableOpsTests5, Identity_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Histogram3"); ASSERT_TRUE(matrix.equalsTo(z)); delete result; @@ -416,7 +412,6 @@ TEST_F(DeclarableOpsTests5, Identity_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Identity_BP"); ASSERT_TRUE(z->equalsTo(eps)); delete result; @@ -433,7 +428,6 @@ TEST_F(DeclarableOpsTests5, Log1p_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Log1p"); ASSERT_TRUE(z->equalsTo(y)); delete result; @@ -450,7 +444,6 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -846,9 +839,6 @@ 
TEST_F(DeclarableOpsTests5, reverse_sequense_test1) {
     auto output = results->at(0);
-    exp.printIndexedBuffer("E");
-    output->printIndexedBuffer("O");
-
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -1314,17 +1304,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_3_unsorted) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-    v->printIndexedBuffer("v");
-//    expV.printIndexedBuffer("expV");
-    i->printIndexedBuffer("i");
-//    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -1349,17 +1328,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_4) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-//    v->printIndexedBuffer("v");
-//    expV.printIndexedBuffer("expV");
-//    i->printIndexedBuffer("i");
-//    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -1377,11 +1345,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) {
     nd4j::ops::top_k op;
     auto result = op.execute({&x}, {}, {2, 1});
-    for (Nd4jLong r = 0; r < 2; r++) {
-        for (Nd4jLong c = 0; c < 3; c++)
-            nd4j_printf("%f, ", x.e(r,c));
-        nd4j_printf("\n", "");
-    }
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     ASSERT_EQ(2, result->size());
@@ -1389,18 +1352,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) {
     auto v = result->at(0);
     auto i = result->at(1);
-//    x.printShapeInfo("shape of the source X");
-//    v->printShapeInfo("shape v");
-//    expV.printShapeInfo("shape expV");
-
-//    i->printShapeInfo("shape I");
-//    expI.printShapeInfo("shape expI");
-
-    v->printIndexedBuffer("v");
-    expV.printIndexedBuffer("expV");
-    i->printIndexedBuffer("i");
-    expI.printIndexedBuffer("expI");
-
     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -2025,10 +1976,6 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_2) {
     for (int e = 0; e < result->size(); e++) {
         auto output = result->at(e);
-        nd4j_printf("%i: ", e);
-        output->printShapeInfo("Output shape> ");
-        exp[e].printShapeInfo("Expected shape> ");
-        output->printIndexedBuffer("Output data> ");
         ASSERT_TRUE(exp[e].isSameShape(output));
         ASSERT_TRUE(exp[e].equalsTo(output));
@@ -2126,10 +2073,6 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_1) {
     auto output = result->at(0);
-    // output->printShapeInfo("Output shape> ");
-    // exp.printShapeInfo("Expected shape> ");
-    output->printIndexedBuffer("O data");
-    exp.printIndexedBuffer("E data");
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -2334,8 +2277,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test1) {
     ASSERT_EQ(Status::OK(), results->status());
     auto output = results->at(0);
-    output->printIndexedBuffer("CM output");
-    expected.printIndexedBuffer("CM expected");
     ASSERT_TRUE(expected.isSameShape(output));
     ASSERT_TRUE(expected.equalsTo(output));
@@ -2355,9 +2296,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test2) {
     ASSERT_EQ(Status::OK(), results->status());
     auto output = results->at(0);
-    output->printIndexedBuffer("CM2 output");
-    expected.printIndexedBuffer("CM2 expected");
-
     ASSERT_TRUE(expected.isSameShape(output));
     ASSERT_TRUE(expected.equalsTo(output));
@@ -2376,8 +2314,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test3) {
     nd4j::ops::confusion_matrix op;
     auto results = op.execute({&labels, &predictions, &weights}, {}, {3});
     auto output = results->at(0);
-    output->printIndexedBuffer("CM3");
-
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(expected.isSameShape(output));
@@ -2397,7 +2333,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test4) {
     nd4j::ops::confusion_matrix op;
     auto results = op.execute({&labels, &predictions, &weights}, {}, {3, nd4j::DataType::DOUBLE});
     auto output = results->at(0);
-    output->printIndexedBuffer("CM4");
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(expected.isSameShape(output));
@@ -2470,11 +2405,6 @@ TEST_F(DeclarableOpsTests5, XWPlusB_1) {
     auto output = result->at(0);
-    output->printShapeInfo("Output shape> ");
-    exp.printShapeInfo("Expected shape> ");
-    output->printIndexedBuffer("Output data> ");
-    exp.printIndexedBuffer("Expected res>");
-
     ASSERT_TRUE(exp.isSameShape(output));
     ASSERT_TRUE(exp.equalsTo(output));
@@ -2778,7 +2708,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_1) {
     ASSERT_EQ(Status::OK(), results->status());
     ASSERT_TRUE(output->isScalar());
-    output->printIndexedBuffer("L2_Loss output");
+    ASSERT_EQ(output->e(0), exp);
     delete results;
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index 34b66c61a..79a569e0f 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -118,8 +118,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("SS OS shape");
-    z->printIndexedBuffer("SS OS out");
+
     ASSERT_TRUE(z->equalsTo(exp));
     //ASSERT_EQ(exp, *z);
@@ -127,9 +126,10 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) {
 }
 TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
+    int z = 0;
     auto matrix = NDArrayFactory::create('c', {1}, {10});
     auto b = NDArrayFactory::create_('c', {1}, {1});
-    auto e = NDArrayFactory::create_('c', {1}, {(int)0});
+    auto e = NDArrayFactory::create_('c', {1}, {z});
     auto s = NDArrayFactory::create_('c', {1}, {1});
     nd4j::ops::ones_as opOnes;
     //auto exp = NDArrayFactory::create('c', {2}, {1.0f, 2.0f});
@@ -138,7 +138,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
     ASSERT_EQ(onesRes->status(), Status::OK());
     auto ones = onesRes->at(0);
-    ones->printShapeInfo("Shape ones");
     *ones *= 10;
     auto onesD = ones->dup();
@@ -161,9 +160,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) {
     nd4j::ops::strided_slice op;
     auto result = op.calculateOutputShape(inputShapes, *block); //execute({ones, &b, &e, &s}, {}, {0, 1, 0, 0, 0});
     ASSERT_EQ(result->size(), 1);
-    shape::printShapeInfoLinear(result->at(0));
-    //auto z = result->at(0);
-//    z->printShapeInfo("SS OS shape");
     ASSERT_TRUE(shape::isEmpty(result->at(0)));
     //ASSERT_EQ(exp, *z);
     delete block;
@@ -189,8 +185,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_5) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -211,8 +205,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_6) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -234,8 +226,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_7) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -258,8 +248,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_1) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -282,8 +270,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_2) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -306,8 +292,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_3) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printShapeInfo("Output shape");
-    z->printIndexedBuffer("Output");
     //ASSERT_TRUE(exp.equalsTo(z));
     delete result;
@@ -362,8 +346,6 @@ TEST_F(DeclarableOpsTests6, Test_Order_1) {
     ASSERT_EQ(Status::OK(), result->status());
     auto z = result->at(0);
-    z->printIndexedBuffer("O Output");
-    exp.printIndexedBuffer("O Expect");
     ASSERT_TRUE(exp.equalsTo(z));
     ASSERT_NE(x.ordering(), z->ordering());
@@ -379,7 +361,6 @@ TEST_F(DeclarableOpsTests6, cumSum_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-    // z->printIndexedBuffer("CumSum1");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -910,9 +891,7 @@ TEST_F(DeclarableOpsTests6, TestRank_1) {
     auto ress = op.execute({&x}, {}, {}, {});
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    ress->at(0)->printIndexedBuffer("RANK Result is ");
-    // x.printIndexedBuffer("Input is");
     ASSERT_TRUE(ress->at(0)->equalsTo(exp));
     delete ress;
 }
@@ -926,8 +905,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_2) {
     auto ress = op.execute({&x}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE);
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    //x.printIndexedBuffer("Input is");
-    //ress->at(0)->printIndexedBuffer("Result is ");
     delete ress;
 }
@@ -943,8 +920,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_3) {
     auto ress = op.execute({&x, &shape}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE);
     ASSERT_EQ(ND4J_STATUS_OK, ress->status());
-    //x.printIndexedBuffer("Input is");
-    //ress->at(0)->printIndexedBuffer("Result is ");
     delete ress;
 }
@@ -1556,8 +1531,6 @@ TEST_F(DeclarableOpsTests6, LogMatrixDeterminant_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-    z->printIndexedBuffer("Log ABS Output ");
-    exp.printIndexedBuffer("Log ABS Expected ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1578,8 +1551,6 @@ TEST_F(DeclarableOpsTests6, LogDet_1) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-//    z->printIndexedBuffer("LogDet Output1 ");
-//    exp.printIndexedBuffer("LogDet Expected1 ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1593,16 +1564,12 @@ TEST_F(DeclarableOpsTests6, LogDet_2) {
     auto x = NDArrayFactory::create('c', {1, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98});
     auto exp = NDArrayFactory::create('c', {1}, { 3.5835189});
-    //x.printIndexedBuffer("Input");
     nd4j::ops::logdet op;
     auto result = op.execute({&x}, {}, {});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
     auto z = result->at(0);
-//    z->printIndexedBuffer("LogDet Output2 ");
-//    z->printShapeInfo("Shape");
-//    exp.printIndexedBuffer("LogDet Expected2 ");
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1616,16 +1583,12 @@ TEST_F(DeclarableOpsTests6, LogDet_3) {
     auto x = NDArrayFactory::create('c', {3, 3},
{4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create( 3.5835189); - //x.printIndexedBuffer("Input"); nd4j::ops::logdet op; auto result = op.execute({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("LogDet Output3 "); -// z->printShapeInfo("Shape"); -// exp.printIndexedBuffer("LogDet Expected3 "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1670,8 +1633,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1710,8 +1671,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1731,8 +1690,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index c80d75372..e9fe7264e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -66,7 +66,6 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_LARGE) { auto z = result->at(1); - z->printIndexedBuffer("CHOOSE test"); ASSERT_EQ(148,z->e(0)); //ASSERT_TRUE(exp.isSameShape(z)); @@ -572,8 +571,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -664,8 +662,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_2) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -683,11 +680,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119) { ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(4, result->size()); auto z = result->at(0); -// z->printShapeInfo("Output shape info"); -// z->printIndexedBuffer("Output1"); -// result->at(1)->printIndexedBuffer("Output2"); -// result->at(2)->printIndexedBuffer("Output3"); -// result->at(3)->printIndexedBuffer("Output4"); + ASSERT_TRUE(e.isSameShape(z)); delete result; @@ -1080,7 +1073,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN1"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1097,7 +1090,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_01) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN01"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1113,7 +1106,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - 
out->printIndexedBuffer("Segment mIN02"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1130,8 +1123,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMinBP_1) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - //result->at(0)->printIndexedBuffer("Output1"); - //exp.printIndexedBuffer("Expecte"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -1433,9 +1424,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1451,9 +1439,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1470,9 +1455,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_022) { auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32); ASSERT_EQ(result, Status::OK()); - exp.printIndexedBuffer("Expect Mean"); - z.printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(z)); // delete result; @@ -1491,9 +1473,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 2); -// exp.printIndexedBuffer("Expect"); -// result->at(0)->printIndexedBuffer("Output"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1842,8 +1821,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output Sum"); - exp.printIndexedBuffer("Expect Sum"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2001,8 +1978,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("UnsortedSum1"); - exp.printIndexedBuffer("Unsorted Sum1 Exp"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2019,8 +1994,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_2) { auto result = op.execute({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); -// exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2241,10 +2214,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_04) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2262,10 +2231,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp 
Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2279,15 +2244,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_06) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2301,15 +2261,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_07) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2577,12 +2532,6 @@ auto exp = NDArrayFactory::create('c', {3, 1, 2, 6}, { auto result = op.execute({&x}, {}, {2,1,3,2,2,2,0}); ASSERT_EQ(result->status(), Status::OK()); -// x.printIndexedBuffer("images"); -// nd4j_printf("input params: ksize = [1, 2, 1, 1], strides = [1, 3, 2, 1], rates = [1, 2, 2, 1]\n", ""); - result->at(0)->printBuffer("Output"); - //result->at(0)->printShapeInfo("Out Shape"); - exp.printBuffer("Expect"); - //exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.isSameShape(result->at(0))); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -3142,8 +3091,6 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { auto result = op.execute({&x}, {}, {6}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("z"); - ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -3358,9 +3305,6 @@ auto exp = NDArrayFactory::create('c', {2, 3, 3}, { auto result = op.execute({&x}, {y}, {}, {1, 1}, {}, true, nd4j::DataType::DOUBLE); ASSERT_EQ(result, Status::OK()); - x.printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(&x)); // delete result; @@ -3431,8 +3375,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_12) { auto result = op.execute({&x, &shift, &axis}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); ASSERT_TRUE(exp.equalsTo(out)); @@ -3457,9 +3399,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_13) { ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); -// out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(out)); delete result; @@ -4274,11 +4213,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test4) { ASSERT_EQ(ND4J_STATUS_OK, result32->status()); ASSERT_EQ(ND4J_STATUS_OK, result64->status()); auto out1 = result32->at(0); - out1->printIndexedBuffer("OUT_F"); auto out2 = result64->at(0); - out2->printIndexedBuffer("OUT_D"); -// output->printIndexedBuffer("Toggled"); ASSERT_TRUE(exp32.equalsTo(out1)); ASSERT_TRUE(exp64.equalsTo(out2)); @@ -4369,8 +4305,6 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test5) { nd4j::ops::mirror_pad op; auto result = op.execute({&input, &paddings}, {}, {0}); auto output = result->at(0); - 
output->printBuffer("Output"); - exp.printBuffer("Expected"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6204,8 +6138,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_1) { nd4j::ops::reduce_max_bp op; auto result = op.execute({&x, &eps}, {}, {0, 1}); auto output = result->at(0); - exp.printIndexedBuffer("E"); - output->printIndexedBuffer("O"); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); @@ -6379,8 +6311,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_02) { auto result = op.execute({&x, &eps, &axes}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); - output->printIndexedBuffer("Result is"); - exp.printIndexedBuffer("Expect is"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6397,7 +6327,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_3) { nd4j::ops::reduce_norm1_bp op; auto result = op.execute({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); -// output->printIndexedBuffer("Result is"); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp index 82b3d2db7..9f98ab3a1 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp @@ -55,12 +55,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -73,12 +73,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {1,1,4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -91,12 +91,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {3}, {900.9375f, 969.8594f, 424.1875f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -108,13 +108,13 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { TEST_F(DeclarableOpsTests8, reduceVariance_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, 
{27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -127,12 +127,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create(788.6927f); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(788.6927f); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -163,12 +163,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {788.6927f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -199,12 +199,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -217,12 +217,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), 
result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -235,12 +235,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {3}, {30.01562f, 31.14257f, 20.59581f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -252,13 +252,13 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { TEST_F(DeclarableOpsTests8, reduceStDev_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -271,12 +271,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -289,12 +289,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -307,12 +307,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {28.08367f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.f}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -325,12 +325,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test8) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {26.88246f, 29.53924f, 34.52921f, 30.11755f}); 
- + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {0.f,1.f}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); // output->printBuffer("Reduced STDDEV"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -366,36 +366,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test1) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.45833334f, -0.375f, -0.29166666f, -0.20833333f, -0.125f, -0.041666668f, 0.041666668f, 0.125f, 0.20833333f, 0.29166666f, 0.375f, 0.45833334f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; } @@ -409,36 +409,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-4.000000f, -8.000000f, -12.000000f, -16.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 4.000000f, 8.000000f, 12.000000f, 16.000000f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } @@ -537,15 +537,15 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { auto x = NDArrayFactory::create('c', {3,4}); auto 
gradO1 = NDArrayFactory::create('c', {1,1}, {0.5f}); auto gradO2 = NDArrayFactory::create(0.5f); - auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, -0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); + auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, -0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.06638563f, -0.05431551f, -0.0422454f, -0.030175284f, -0.01810517f, -0.006035057f, 0.006035057f, 0.01810517f, 0.030175284f, 0.0422454f, 0.05431551f, 0.06638563f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printIndexedBuffer(); ASSERT_TRUE(exp12.isSameShape(output)); @@ -553,21 +553,21 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); @@ -584,36 +584,36 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.5f, -1.0f, -1.5f, -2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } 
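Reviewer note (not part of the patch): the exp12/exp34 constants in these reduce_stdev_bp cases can be reproduced by hand from the usual stdev gradient, gradO * (x_i - mean) / (divisor * stdev), where the divisor is N-1 for the bias-corrected estimator (this yields the exp12 values) and N for the biased one (exp34). A minimal standalone reference sketch, with a hypothetical helper name and assuming a full reduction over a flat double input:

#include <cmath>
#include <cstddef>
#include <vector>

// Reference gradient of stdev(x) w.r.t. each x_i, scaled by the upstream gradient gradO.
// divisor = N - 1 when biasCorrected (matches exp12 above), divisor = N otherwise (matches exp34).
static std::vector<double> stdevBpReference(const std::vector<double>& x, double gradO, bool biasCorrected) {
    const double n = static_cast<double>(x.size());

    double mean = 0.0;
    for (double v : x) mean += v;
    mean /= n;

    double sumSq = 0.0;
    for (double v : x) sumSq += (v - mean) * (v - mean);

    const double divisor = biasCorrected ? n - 1.0 : n;
    const double stdev = std::sqrt(sumSq / divisor);

    std::vector<double> grad(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        grad[i] = gradO * (x[i] - mean) / (divisor * stdev);  // d stdev / d x_i = (x_i - mean) / (divisor * stdev)
    return grad;
}

// For x = 1..12 and gradO = 0.5 this gives -0.0693375, -0.0567307, ... when bias-corrected
// and -0.0663856, -0.0543155, ... otherwise, i.e. the exp12 and exp34 arrays used in the tests above.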
//////////////////////////////////////////////////////////////////////////////// @@ -669,44 +669,44 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test3) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.38729835f, -0.12909944f, 0.12909944f, 0.38729835f, -0.7745967f, -0.2581989f, 0.2581989f, 0.7745967f, -1.161895f, -0.38729835f, 0.38729835f, 1.161895f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {1}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(120.f); //************************************// @@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -722,8 +722,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create({15.f, 40.f, 65.f}); //************************************// @@ -731,7 +731,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -757,8 +757,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_03) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { - - 
auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(1307674368000.f); //************************************// @@ -766,7 +766,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -774,8 +774,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create({120.f, 30240.f, 360360.f}); //************************************// @@ -783,7 +783,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -798,9 +798,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_01) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -817,10 +817,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_02) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -837,10 +837,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_3) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -857,10 +857,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_4) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -877,10 +877,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_5) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -894,13 +894,13 @@ 
TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -914,13 +914,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -937,9 +937,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_01) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -956,10 +956,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_02) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -976,10 +976,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_3) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -996,10 +996,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_4) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1034,13 +1034,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_5) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1054,13 +1054,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_6) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + nd4j::ops::reduce_prod op; auto result = 
op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1074,13 +1074,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_7) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {479001600.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1097,9 +1097,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_1) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1116,10 +1116,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_2) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1136,10 +1136,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_3) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1156,10 +1156,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_4) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1194,13 +1194,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1214,13 +1214,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + 
ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1234,13 +1234,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {1.f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1257,10 +1257,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_1) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); // output->printShapeInfo("Output shape"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1277,10 +1277,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_2) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1297,10 +1297,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_3) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1317,10 +1317,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_4) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1355,13 +1355,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1375,13 +1375,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1395,13 +1395,13 @@ 
TEST_F(DeclarableOpsTests8, Test_Reduce_Max_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1419,7 +1419,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1436,10 +1436,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_2) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1456,10 +1456,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_3) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1476,10 +1476,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_4) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1514,13 +1514,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1534,13 +1534,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1554,13 +1554,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// 
x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1578,7 +1578,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1595,10 +1595,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_2) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1615,10 +1615,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_3) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1635,10 +1635,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_4) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1673,13 +1673,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1693,13 +1693,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1713,13 +1713,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {70.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - 
ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1738,7 +1738,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1757,7 +1757,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1776,7 +1776,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1795,7 +1795,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1829,13 +1829,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1849,13 +1849,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1869,13 +1869,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1894,7 +1894,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ 
-1913,7 +1913,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1932,7 +1932,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1951,7 +1951,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1985,13 +1985,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2005,13 +2005,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2025,13 +2025,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {4900.f}); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2041,8 +2041,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create(0.5f); auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2051,7 +2051,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // 
z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2060,11 +2060,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 1}, {0.5f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, + auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2072,7 +2072,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { auto result = op.execute({&input, &eps}, {1.f}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2081,11 +2081,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2093,7 +2093,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { auto result = op.execute({&input, &eps}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2102,11 +2102,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2114,7 +2114,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { auto result = op.execute({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2146,23 +2146,23 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_04) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 
7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); auto eps = NDArrayFactory::create(1307674368000.f); //************************************// // auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 855006083413279451906048.f, 570004067618451974258688.f, - 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, - 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, - 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, - 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 855006083413279451906048.f, 570004067618451974258688.f, + 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, + 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, + 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, + 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); nd4j::ops::reduce_prod_bp op; auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2175,13 +2175,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test1) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2195,13 +2195,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test2) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2215,13 +2215,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test3) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {3}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2235,13 +2235,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test4) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,3,1}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.f}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - 
ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2255,13 +2255,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test5) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2275,12 +2275,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2294,12 +2294,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {12.5f}); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2336,11 +2336,11 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { auto exp = NDArrayFactory::create('c', {3,4}, {1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printShapeInfo("o"); @@ -2350,7 +2350,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { delete result; result = op.execute({&x, &gradO2}, {1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2367,18 +2367,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test2) { auto exp = NDArrayFactory::create('c', {3,4}, {1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2422,18 +2422,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { auto exp = NDArrayFactory::create('c', {3,4}, {0.25f, 0.25f, 0.25f, 0.25f, 0.5f, 0.5f, 0.5f, 0.5f, 0.75f, 0.75f, 0.75f, 0.75f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = 
result->at(0); ASSERT_TRUE(exp.isSameShape(output)); - ASSERT_TRUE(exp.equalsTo(output)); + ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2444,14 +2444,14 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { TEST_F(DeclarableOpsTests8, reduceStDevBP_test4) { auto x = NDArrayFactory::create('c', {3}, {2.f, 3.f, 4.f}); - auto gradO = NDArrayFactory::create(0.5f); - auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); - + auto gradO = NDArrayFactory::create(0.5f); + auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2481,7 +2481,7 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { nd4j::ops::avgpool2d op; auto results = op.execute({&input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}); - auto output = results->at(0); + auto output = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -2489,19 +2489,19 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { //expected.printIndexedBuffer("expected"); ASSERT_TRUE(expected.isSameShape(output)); - ASSERT_TRUE(expected.equalsTo(output)); - + ASSERT_TRUE(expected.equalsTo(output)); + delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,3}, {2.78507, 1.34254, 4.12761, 2.88507, 2.78507, 2.88507}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2509,7 +2509,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2519,11 +2519,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {3,4}, {0.26328, 1.46328, 1.72656, 0. , 0.26328, 0. , 1.46328, 0.26328, 1.72656, 0. 
, 1.72656, 1.46328}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2531,7 +2531,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2541,11 +2541,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,4}, {0.75125, 1.55125, 3.45375, 0.75125, 3.45375, 0. , 2.3025 , 1.15125}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2553,7 +2553,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2563,11 +2563,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {2}, {2.10389, 1.00194}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2585,11 +2585,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {3}, {0., 0.85436, 1.40871}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2607,11 +2607,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {1}, {0.6444}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2629,11 +2629,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2651,11 +2651,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, 
softmax_cross_entropy_loss_with_logits_test8) { - + auto labels = NDArrayFactory::create('c', {2}, {0,1}); auto logits = NDArrayFactory::create('c', {2}); auto expected = NDArrayFactory::create(0.6444); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2663,7 +2663,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2673,11 +2673,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { - + auto labels = NDArrayFactory::create('c', {1}, {0.}); auto logits = NDArrayFactory::create('c', {1}, {0.2}); auto expected = NDArrayFactory::create(0.); - + nd4j::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.execute({&logits, &labels}, {}, {}); @@ -2693,11 +2693,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { - + auto labels = NDArrayFactory::create('c', {1,2}, {0,1}); auto logits = NDArrayFactory::create('c', {1,2}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2715,14 +2715,14 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test4) { - + auto x = NDArrayFactory::create('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694}); - auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); + auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {1.f}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2731,16 +2731,18 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test5) { - + + // auto x = NDArrayFactory::create('c', {3, 5}, {1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5}); auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + // auto exp = NDArrayFactory::create('c', {3, 5}, {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {0}, {}, false, nd4j::DataType::DOUBLE); auto output = 
result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2749,25 +2751,25 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test6) { - + auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {1}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test7) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2782,10 +2784,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test7) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test8) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2800,12 +2802,12 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test8) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test9) { - + auto x = NDArrayFactory::create('c', {2}, {3., 4.}); - auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); + auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {4.}, {}, {}, false, nd4j::DataType::DOUBLE); @@ -2816,10 +2818,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test9) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test10) { - + auto x = NDArrayFactory::create(6.); auto exp = NDArrayFactory::create(5.); @@ -2832,10 +2834,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test10) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test11) { - + auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {1., 2., 3., 4., 4.44787, 5.33745, 6.22702, 7.1166 , 6.33046, 7.03384, 7.73723, 8.44061, 13., 14., 15., 16., 15.12277, 16.01235, 16.90192, 17.7915 ,14.77107, 15.47446, 16.17784, 16.88123}); @@ -2872,19 +2874,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test4) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333}); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), 
result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2898,19 +2900,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test5) { auto gradO1 = NDArrayFactory::create('c', {3}, {1., 2., 3.}); auto gradO2 = NDArrayFactory::create('c', {3, 1}, {1., 2., 3.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.2500,0.2500,0.2500,0.2500, 0.5000,0.5000,0.5000,0.5000, 0.7500,0.7500,0.7500,0.7500}); - + nd4j::ops::reduce_mean_bp op; - + auto result = op.execute({&x, &gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2924,19 +2926,19 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test5) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {-0.408248, -0.816497, -1.224745, -1.632993, 0.000000, 0.000000, 0.000000, 0.000000, 0.408248, 0.816497, 1.224745, 1.632993}); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2948,12 +2950,12 @@ TEST_F(DeclarableOpsTests8, zeros_as_test1) { auto x = NDArrayFactory::create(10.f); auto y = NDArrayFactory::create(100.f); auto exp = NDArrayFactory::create(0.f); - + nd4j::ops::zeros_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); @@ -2987,11 +2989,11 @@ TEST_F(DeclarableOpsTests8, ones_as_test1) { nd4j::ops::ones_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); - + } //////////////////////////////////////////////////////////////////////////////// @@ -3017,7 +3019,7 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { auto data = NDArrayFactory::create('c', {10, 10}); data.linspace(1); - + auto means = data.reduceAlongDimension(reduce::Sum, {0}); auto deviance = NDArrayFactory::create('c', {10}, {825., 825. , 825., 825., 825., 825., 825., 825., 825., 825. 
}); // data.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); // = NDArrayFactory::create('c', {10, 10}); @@ -3040,24 +3042,24 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { ASSERT_EQ(Status::OK(), results->status()); ASSERT_EQ(results->size(), 2); - auto outputMeans = results->at(0); - auto outputDeviance = results->at(1); + auto outputMeans = results->at(0); + auto outputDeviance = results->at(1); // outputMeans->printIndexedBuffer("Means"); // outputDeviance->printIndexedBuffer("Variance"); // deviance.printIndexedBuffer("Expected"); // means->printIndexedBuffer("Expected means"); ASSERT_TRUE(means->isSameShape(outputMeans)); - ASSERT_TRUE(means->equalsTo(outputMeans)); + ASSERT_TRUE(means->equalsTo(outputMeans)); ASSERT_TRUE(deviance.isSameShape(outputDeviance)); ASSERT_TRUE(deviance.equalsTo(outputDeviance)); delete means; //delete deviance; delete ssSquared; // ASSERT_TRUE(expMeans.isSameShape(outputMeans)); -// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); +// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); // ASSERT_TRUE(expMeans.isSameShape(outputDeviance)); -// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); +// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); delete results; } @@ -3073,10 +3075,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_1) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3103,10 +3105,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_2) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3132,10 +3134,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_3) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3161,10 +3163,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_4) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3187,13 +3189,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); - + nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto 
outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3216,13 +3218,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_7) { auto expVariance = NDArrayFactory::create('c', {1,1,1}, {47.916668f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3319,13 +3321,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_1) { nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); // out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3334,75 +3336,75 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_2) { auto x = NDArrayFactory::create('c', {3, 3, 5, 5}); x.linspace(1); - + auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, - 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 
0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, + 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, + 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // 
out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3413,60 +3415,60 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_3) { x.linspace(1); auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, - 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, + 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 
0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, + 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // @@ -3526,13 +3528,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_4_119) { auto ttlTime = std::chrono::duration_cast ((timeEnd - timeStart)).count(); - //ASSERT_EQ(Status::OK(), results); - - nd4j_printf("avg time: %lld ms\n", spanTime); - // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); } @@ -3548,8 +3544,6 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_5) { ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); delete results; @@ -3626,13 +3620,13 @@ auto exp = NDArrayFactory::create('c', {3,3,5,5}, { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? 
nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // out->printBuffer("LRN BP out"); // exp.printBuffer("LRN BP exp"); //ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } @@ -3641,7 +3635,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { auto x = NDArrayFactory::create( 'c', {3, 3, 5, 5}); x.linspace(1); - + auto eps = NDArrayFactory::create('c', {3, 3, 5, 5}, { 0.2581989 ,0.3592106 , 0.40089184, 0.53935987, 0.70014, 0.4898979 ,0.46056613, 0.43971977, 0.5240002 , 0.6375767, 0.5274096 ,0.47771242, 0.4443308 , 0.5163977 , 0.61701745, @@ -3706,13 +3700,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); //out->printBuffer("LRN BP out"); // exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 4871c12e4..f88d6e930 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -924,8 +924,6 @@ TEST_F(DeclarableOpsTests9, tile_test1) { auto reps = NDArrayFactory::create('c', {1, 2}, {2, 1}); auto expOut = NDArrayFactory::create('c', {2, 6,}, {1.,2.,3.,4.,5.,6., 1.,2.,3.,4.,5.,6.}); - expOut.printIndexedBuffer("expOut"); - nd4j::ops::tile op; auto results = op.execute({&input, &reps}, {}, {}); auto out = results->at(0); @@ -1660,8 +1658,6 @@ TEST_F(DeclarableOpsTests9, test_range_int_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); - delete result; } @@ -2901,31 +2897,29 @@ TEST_F(DeclarableOpsTests9, Floormod_BP_Test_4) { delete result; } -//////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.1, 1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); NDArray variance('c', {4}, nd4j::DataType::FLOAT32); NDArray gamma ('c', {4}, nd4j::DataType::FLOAT32); NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.509112, -0.254556, 0., 0.254556,0.509112, 0.763668, 1.018224, 1.272779, - 1.527335, 1.781891, 2.036447, 2.291003,2.545559, 2.800115, 3.054671, 3.309227,3.563783, 3.818338, 4.072894, 4.32745}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {6.448749, 7.212417, 8.230641, 9.50342 }, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,3,4}, {-0.000056, -0.000056, -0.000056, -0.000056, -0.000034, -0.000034, -0.000034, -0.000034, -0.000011, -0.000011, -0.000011, -0.000011, 0.000011, 0.000011, 0.000011, 0.000011, 0.000034, 0.000034, 0.000034, 0.000034, 0.000056, 0.000056, 0.000056, 0.000056}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {6.148104, 6.148104, 6.148105, 6.148105}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {3.6, 4.5, 5.4, 6.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); - mean.assign(1.); - 
variance.assign(0.5); + variance.assign(0.46666667); gamma.assign(1.2); - // beta.assign(1.); // has no effect on gradient calculations + beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2945,20 +2939,22 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { delete results; } + //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { - NDArray input ('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray mean ('c', {3}, {1.05, 1.1, 1.15}); - NDArray variance('c', {3}, {0.5, 0.6, 0.7}); - NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}); - NDArray beta ('c', {3}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {3}, {1.05, 1.1, 1.15}, nd4j::DataType::FLOAT32); + NDArray variance('c', {3}, {0.5, 0.6, 0.7}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {3}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.503484, -0.251742, 0., 0.251742,0.501992, 0.752989, 1.003985, 1.254981, - 1.527335, 1.781891, 2.036447, 2.291003,2.517418, 2.76916 , 3.020902, 3.272644,3.513947, 3.764943, 4.015939, 4.266936}); - NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}); - NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}); + NDArray expdLdI('c', {2,3,4}, {-0.601415, -0.521226, -0.441037, -0.360849, -0.456306, -0.395465, -0.334624, -0.273784, 0.396631, 0.343747, + 0.290863, 0.237978, 0.360849, 0.441037, 0.521226, 0.601415, 0.273784, 0.334625, 0.395465, 0.456306, -0.237978, + -0.290863, -0.343746, -0.396631}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -2966,7 +2962,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2989,17 +2985,18 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { - NDArray input ('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}); - NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}); - NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}); - NDArray beta ('c', {2,1,4}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}, nd4j::DataType::FLOAT32); + NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 
1.4, 1.5, 1.6, 1.7, 1.8, 1.9}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {2,1,4}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.258709, -1.003985, -0.754668,-0.509112, -0.251742, 0., 0.251556,0.509112, 0.755225, 1.003985, 1.25778 , - 1.517885, 1.784991, 2.05947 , 2.341504,2.529808, 2.804986, 3.089205, 3.382173,3.541731, 3.824981, 4.11894 , 4.422841}); - NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }); - NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}); + NDArray expdLdI('c', {2,3,4}, {-0.577002, -0.744041, -0.850999, -0.922373, -0.000000, -0.000000, -0.000000, -0.000000, 0.577002, + 0.744041, 0.850999, 0.922373, -0.386037, -0.350205, -0.312047, -0.271737, -0.000000, -0.000000, + -0.000000, -0.000000, 0.386037, 0.350205, 0.312047, 0.271736}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -3007,7 +3004,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,0,2}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3037,8 +3034,8 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {1.442483, 0.9502 , 0.569207, 0.314641}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4}, {0.162923, -0.289673, 0.354174, -0.386151, -0.162923, 0.289673, -0.354174, 0.386151}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {1.442483, 0.950200, 0.569207, 0.314641}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {-1.2, -0.9, -0.6, -0.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); @@ -3046,7 +3043,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3076,8 +3073,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2}, {1.527335, 1.272779,1.018224, 0.763668,-0.466136, -0.233068,0., 0.233068,-0.442716, -0.664075,-0.885433, -1.106791,1.287169, 1.501697,1.716225, 1.930753, - -2.545559, -2.800115,-3.054671, -3.309227,3.262951, 3.496019,3.729087, 3.962155,-3.984448, -4.205806,-4.427164, -4.648522,4.719618, 4.934146,5.148675, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2}, {-0.737512, -0.659880, -0.582247, -0.504614, 0.561404, 0.502309, 0.443214, 0.384118, -1.168243, + -1.045270, -0.922297, -0.799324, 1.899026, 1.699128, 1.499231, 1.299333, 0.504614, 0.582247, 0.659880, 0.737512, -0.384118, + -0.443214, 
-0.502308, -0.561404, 0.799324, 0.922297, 1.045270, 1.168243, -1.299334, -1.499231, -1.699129, -1.899026}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {11.073181, 12.585667, 17.708657, 24.313186}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, nd4j::DataType::FLOAT32); @@ -3086,7 +3084,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3116,8 +3114,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528, -0.509112, 0.699204, -0.885433, 1.072641, -1.527335, 1.631475, -1.770866, 1.930753, - -2.545559, 2.563747, -2.656298, 2.788865, -3.563783, 3.496019, -3.541731, 3.646978, -4.582006, 4.42829 , -4.427164, 4.50509 , -5.60023 , 5.360562, -5.312597, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,4}, {-4.989124, 2.540357, -1.515022, 0.791769, -3.563660, 1.814540, -1.082159, 0.565549, -2.138196, 1.088724, -0.649295, + 0.339329, -0.712732, 0.362908, -0.216432, 0.113110, 0.712732, -0.362908, 0.216432, -0.113110, 2.138195, -1.088724, 0.649295, + -0.339330, 3.563660,-1.814540, 1.082159, -0.565549, 4.989125, -2.540356, 1.515022, -0.791770}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {20.364472, 17.856588, 16.949714, 15.903684}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {9.6, 10.8, 12. , 13.2}, nd4j::DataType::FLOAT32); @@ -3126,7 +3125,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,3}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3156,20 +3155,21 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test7) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584,0.509112, -0.233068, -0., 0.214528,-0.509112, 0.699204, -0.885433, 1.072641,-1.527335, 1.631475, -1.770866, - 1.930753,-2.545559, 2.563747, -2.656298, 2.788865,-3.563783, 3.496019, -3.541731, 3.646978,-4.582006, 4.42829 , -4.427164, - 4.50509 ,-5.60023 , 5.360562, -5.312597, 5.363203, -6.618453, 6.292834, -6.19803 , 6.221315,-7.636677, 7.225105, -7.083463, - 7.079428,-8.6549 , 8.157377, -7.968895, 7.93754 ,-9.673124, 9.089649, -8.854328, 8.795652, -10.691348, 10.02192 , -9.739761, - 9.653765,-11.709571, 10.954192, -10.625194, 10.511877,-12.727795, 11.886464, -11.510627, 11.36999 ,-13.746018, 12.818735, -12.39606 , 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,2,4}, {-119.435059, 78.159744, -58.732986, 46.630123, -103.510391, 67.738441, -50.901920, 40.412773, -87.585716, 57.317142, + -43.070854, 34.195419, -71.661041, 46.895844, -35.239792, 27.978071, -55.736359, 36.474548, -27.408726, 21.760721, -39.811687, 26.053242, -19.577662, + 15.543370, -23.887009, 15.631950, -11.746595, 9.326023, -7.962326, 5.210644, -3.915531, 3.108671, 7.962341, -5.210655, 3.915535, -3.108677, 23.887032, + -15.631958, 11.746601, -9.326031, 
39.811691, -26.053246, 19.577671, -15.543377, 55.736382, -36.474548, 27.408726, -21.760731, 71.661064, -46.895851, 35.239788, + -27.978077, 87.585732, -57.317154, 43.070866, -34.195431, 103.510384, -67.738464, 50.901920, -40.412777, 119.435097, -78.159744, 58.732998, -46.630131}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {282.38734 , 244.542027, 224.140995, 207.548793}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {57.6, 60. , 62.4, 64.8}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); + nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,4}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,4}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3201,10 +3201,11 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2,2}, {1.527335, 1.272779, 1.018224, 0.763668, 0.509112, 0.254556, -0. , -0.254556, 0.466136, 0.699204, 0.932272, 1.16534 , 1.398407, 1.631475, 1.864543, 2.097611, - -2.213582, -2.43494 , -2.656298, -2.877657, -3.099015, -3.320373, -3.541731, -3.76309 , 3.861506, 4.076034, 4.290562, 4.50509 , 4.719618, 4.934146, 5.148675, 5.363203, - -6.618453, -6.873009, -7.127565, -7.382121, -7.636677, -7.891233, -8.145789, -8.400345, 7.924309, 8.157377, 8.390445, 8.623513, 8.856581, 9.089649, 9.322717, 9.555784, - -9.297045, -9.518403, -9.739761, -9.961119, -10.182477, -10.403836, -10.625194, -10.846552, 10.726405, 10.940933, 11.155462, 11.36999 , 11.584518, 11.799046, 12.013574, 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2,2}, {-34.373802, -32.611046, -30.848286, -29.085529, -27.322769, -25.560009, -23.797251, -22.034491, 36.146996, 34.293301, + 32.439610, 30.585917, 28.732227, 26.878534, 25.024841, 23.171150, -42.876553, -40.677757, -38.478958, -36.280159, -34.081367, -31.882565, -29.683767, + -27.484968, 50.674446, 48.075760, 45.477066, 42.878380, 40.279686, 37.681000, 35.082310, 32.483616, 22.034489, 23.797249, 25.560009, 27.322765, 29.085526, + 30.848286, 32.611046, 34.373802, -23.171146, -25.024837, -26.878536, -28.732231, -30.585918, -32.439613, -34.293297, -36.146996, 27.484982, 29.683773, + 31.882572, 34.081364, 36.280178, 38.478970, 40.677776, 42.876560, -32.483627, -35.082329, -37.681023, -40.279701, -42.878403, -45.477081, -48.075775, -50.674484}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {134.490365, 179.785003, 248.933114, 330.087248}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {32.4, 51.6, 70.8, 90.}, nd4j::DataType::FLOAT32); @@ -3213,7 +3214,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3338,8 +3339,8 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_3) { auto result = op.execute({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); -// res->printIndexedBuffer("Output for Cholesky 3"); - ASSERT_TRUE(exp.equalsTo(res)); + // res->printIndexedBuffer("Output for Cholesky 3"); + ASSERT_TRUE(exp.equalsTo(res, 1e-4)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp 
index baba901bf..8ae123260 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -121,7 +121,6 @@ TEST_F(EmptyTests, Test_Concat_3) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -141,7 +140,6 @@ TEST_F(EmptyTests, Test_Concat_4) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -282,7 +280,6 @@ TEST_F(EmptyTests, test_shaped_empty_3) { TEST_F(EmptyTests, test_shaped_empty_4) { auto shape = ConstantShapeHelper::getInstance()->vectorShapeInfo(0, nd4j::DataType::FLOAT32); - shape::printShapeInfoLinear("shape", shape); NDArray array(shape, true, nd4j::LaunchContext::defaultContext()); std::vector shapeOf({0}); diff --git a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp index 2ed43d08a..1dc2c8e48 100644 --- a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -46,14 +47,14 @@ public: #ifndef __CUDABLAS__ TEST_F(HelpersTests1, test_binary_search_1) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = nd4j::ops::helpers::binarySearch(array.data(), 2, 10); ASSERT_EQ(2, idx); } TEST_F(HelpersTests1, test_binary_search_2) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = nd4j::ops::helpers::binarySearch(array.data(), 18, 10); ASSERT_EQ(-1, idx); diff --git a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp index 8097aab33..96c480fd9 100644 --- a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp @@ -58,7 +58,6 @@ TEST_F(IndexingTests, StridedSlice_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Output"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -379,8 +378,6 @@ TEST_F(IndexingTests, Test_StridedSlice_1) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -424,8 +421,6 @@ TEST_F(IndexingTests, Test_StridedSlice_3) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu index 294e03c12..f442c0bb9 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu @@ -50,7 +50,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_1) { context.setOutputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_1"); execCustomOp2(nullptr, op.getOpHash(), &context); @@ -78,7 +77,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_2) { context.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_2"); execCustomOp2(nullptr, op.getOpHash(), &context); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp 
b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp index 21af8e380..aa75ea1ab 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp @@ -426,6 +426,24 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_2) { ASSERT_NE(Status::OK(), status); } +TEST_F(JavaInteropTests, Test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto z = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 2}); + + Nd4jLong iArgs[] = {10}; + + Context ctx(1); + ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); + ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); + ctx.setIArguments(iArgs, 1); + + nd4j::ops::cast op; + auto result = op.execute(&ctx); + ASSERT_EQ(Status::OK(), result); + ASSERT_EQ(e, z); +} + /* TEST_F(JavaInteropTests, test_avgpooling_edge_1) { int inOutH = 35; @@ -1183,7 +1201,9 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) { RandomGenerator rng(119, 323841120L); bfloat16 args[2] = {(bfloat16) 0.0f, (bfloat16) 1.0f}; execRandom(nullptr, nd4j::random::Ops::UniformDistribution, &rng, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), args); - z.printIndexedBuffer("z"); + + //z.printIndexedBuffer("z"); + ASSERT_TRUE(z.sumNumber().e(0) > 0); } @@ -1192,7 +1212,7 @@ TEST_F(JavaInteropTests, test_ismax_view) { auto v = original.subarray({NDIndex::all(), NDIndex::all(), NDIndex::interval(0, 40, 2)}); v->assign(1.0); - auto e = v->ulike(); + auto e = v->like(); auto t = e.tensorAlongDimension(0, {0, 1}); t->assign(1.0); @@ -1208,7 +1228,6 @@ TEST_F(JavaInteropTests, test_ismax_view) { nd4j::ops::ismax op; op.execute(&ctx); - z.printIndexedBuffer("z"); ASSERT_EQ(e, z); delete v; diff --git a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu index c1dc1acfe..30244b7dc 100644 --- a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu @@ -68,8 +68,6 @@ TEST_F(LambdaTests, test_basic_1) { ASSERT_EQ(0, res); ASSERT_EQ(e, x); - - x.printIndexedBuffer("x"); } void test(NDArray &x) { @@ -127,7 +125,6 @@ TEST_F(LambdaTests, test_basic_2) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -137,7 +134,6 @@ TEST_F(LambdaTests, test_basic_3) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -147,7 +143,6 @@ TEST_F(LambdaTests, test_basic_4) { test2(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -158,7 +153,6 @@ TEST_F(LambdaTests, test_basic_5) { testPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -168,7 +162,6 @@ TEST_F(LambdaTests, test_basic_6) { testIndexed(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -180,7 +173,6 @@ TEST_F(LambdaTests, test_basic_7) { testTriplewise(w, x, y); - w.printIndexedBuffer("w"); ASSERT_EQ(e, w); } @@ -191,7 +183,6 @@ TEST_F(LambdaTests, test_basic_8) { testIndexedPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } diff --git a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp index 5308ee99d..f48ee54f6 100644 --- a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp @@ -442,11 +442,11 @@ TEST_F(LegacyOpsTests, reduce3_1) { //int *tadShapeBuffer = shape::computeResultShape(shapeBuffer,dimension,dimensionLength); auto tadShapeBuffer = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, 
nullptr); - functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength); + functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength, 0, 4); float distancesAssertion[4] = {0.0,8.0,16.0,24.0}; for(int i = 0; i < 4; i++) - ASSERT_EQ(distancesAssertion[i],result[i]); + ASSERT_NEAR(distancesAssertion[i],result[i], 1e-5); delete[] shapeBuffer; delete[] xShapeBuffer; @@ -726,6 +726,26 @@ TEST_F(LegacyOpsTests, test_legacy_reduce_empty_3) { ASSERT_EQ(e, z); } +TEST_F(LegacyOpsTests, test_legacy_reduce_empty_4) { + if (!Environment::getInstance()->isCPU()) + return; + int a = 0; + + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto d = NDArrayFactory::create('c', {1}, {a}); + auto z = NDArrayFactory::create('c', {0, 2}); + auto e = NDArrayFactory::create('c', {0, 2}); + + + + ::execReduceSame2(nullptr, reduce::SameOps::Sum, + x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), + nullptr, + z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), + d.buffer(), d.shapeInfo(), d.specialBuffer(), d.specialShapeInfo()); + +} + TEST_F(LegacyOpsTests, test_legacy_transform_float_1) { auto x = NDArrayFactory::create('c', {1, 0, 4}); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu index 4ab884d28..71ad6929b 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu @@ -152,7 +152,6 @@ TEST_F(NDArrayCudaBasicsTests, Test_Cosine_1) { //ASSERT_TRUE(y->isActualOnDeviceSide()); //ASSERT_TRUE(y->isActualOnHostSide()); //y->syncToHost(); - y->printBuffer("Cosine"); delete x; delete y; } @@ -251,9 +250,6 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) { cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost); res = cudaStreamSynchronize(*stream); ASSERT_EQ(0, res); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -347,11 +343,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_6) { x += y; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("6X = "); - //y.printBuffer("3Y = "); - //z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -381,11 +373,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_7) { x += 2.; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("7X = "); - //y.printBuffer("3Y = "); - //z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -445,9 +433,6 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_2) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -744,8 +729,7 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_2) { cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); z.tickWriteDevice(); - z.printBuffer("Result with Broadcast2 (multiply)"); - exp.printBuffer("Expect with Broadcast2 (multiply)"); + // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -811,7 
+795,6 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { //cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); //z.syncToHost(); - z.printBuffer("Result with Broadcast3 (multiply)"); // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -842,11 +825,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_1) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); //x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - //x.printBuffer("23X = "); - //y.printBuffer("23Y = "); x *= y; //x.syncToHost(); - x.printBuffer("54Result out"); // // cudaFree(devBufferPtrX); @@ -995,7 +975,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { // allocate required amount of global device memory and copy host data to it //cudaResult = allocateDeviceMem(*pLc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); for(size_t i = 0; i < devicePtrs.size(); ++i) { - nd4j_printf("Allocation of %i bytes with device\n", hostData[i].second) cudaResult = cudaMalloc(&devicePtrs[i], hostData[i].second); //if(cudaResult != 0) return cudaResult; ASSERT_EQ(cudaResult, 0); cudaMemcpy(devicePtrs[i], hostData[i].first, hostData[i].second, cudaMemcpyHostToDevice); @@ -1047,7 +1026,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply) { //x.printBuffer("23X = "); //y.printBuffer("23Y = "); x *= y; - x.printBuffer("55Result out"); // // cudaFree(devBufferPtrX); @@ -1082,7 +1060,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_2) { //y.printBuffer("23Y = "); //void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), &y, &exp); - exp.printBuffer("56Result out"); // // cudaFree(devBufferPtrX); @@ -1111,8 +1088,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { ASSERT_EQ(0, res); y.syncToHost(); - x.printBuffer("X = "); - y.printBuffer("Y = "); ASSERT_NEAR(y.e(0), 15, 1e-5); } @@ -1120,7 +1095,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { TEST_F(NDArrayCudaBasicsTests, TestDup1) { NDArray array('c', {2,3}, {1,2,3,4,5,6}); - array.printBuffer("Array at start"); auto arrC = array.dup('c'); auto arrF = array.dup('f'); // arrC->printBuffer("arrC"); @@ -1498,22 +1472,18 @@ TEST_F(NDArrayCudaBasicsTests, EqualityTest1) { arrayA->p(i, k, (float) i); } } - arrayA->printBuffer("arrayA is "); + for (int i = 0; i < arrayB->rows(); i++) { for (int k = 0; k < arrayB->columns(); k++) { arrayB->p(i, k, (float) i); } } - arrayB->printBuffer("arrayB is "); for (int i = 0; i < arrayC->rows(); i++) { for (int k = 0; k < arrayC->columns(); k++) { arrayC->p(i, k, (float) i+1); } } - arrayC->printBuffer("arrayC is "); - - ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1920,8 +1890,6 @@ TEST_F(NDArrayCudaBasicsTests, Tile_Test_2_2) auto y = x.tile({1,2,1}); auto exp = NDArrayFactory::create('f', {2, 2, 2}); exp = 10.; - y.printShapeInfo("Output SHAPE"); - y.printBuffer("Output TILE"); ASSERT_TRUE(exp.equalsTo(y)); } @@ -1945,17 +1913,13 @@ TEST_F(NDArrayCudaBasicsTests, Operator_Plus_Test_2) { double expBuff[] = {2., 3, 3., 4., 4., 5, 5., 6., 6., 7, 7., 8.}; NDArray a('c', {4,4}, {1.,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7.}, nd4j::DataType::FLOAT32); - a.printBuffer(); auto x = NDArrayFactory::create('c', {3, 2, 1}); auto y = NDArrayFactory::create('c', {1, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {3, 2, 2}); x.linspace(1); 
y.linspace(1); - x.printBuffer("X="); - y.printBuffer("Y="); auto result = x + y; - result.printIndexedBuffer("Result"); ASSERT_TRUE(expected.isSameShape(&result)); ASSERT_TRUE(expected.equalsTo(&result)); @@ -2133,7 +2097,7 @@ TEST_F(NDArrayCudaBasicsTests, Test_diagonal_1) { for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { printf("VAL[%ld] = %f\n", e, diag->e(e)); //, exp.e(e), 1.e-5); } - diag->printIndexedBuffer("DIAGONAL"); + for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { ASSERT_NEAR(diag->e(e), exp.e(e), 1.e-5); } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp index 75608f2bc..747ecc183 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp @@ -174,8 +174,6 @@ TEST_F(NDArrayTest, EqualityTest1) { arrayC->p(i, k, (float) i+1); } } - arrayB->printBuffer("B ="); - arrayC->printBuffer("C ="); //nd4j_printf("A B\n",""); ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1699,7 +1697,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension2) { NDArray exp(expBuff, expShapeInfo); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {1}); - result->printIndexedBuffer("VARIANCE2"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1714,7 +1711,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension3) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(825.f); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE3"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1729,7 +1725,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension4) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(1716.); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE4"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index 9f9937368..a497cd9e6 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -184,7 +184,6 @@ TEST_F(NDArrayTest2, SetIdentity_test_8) { auto x = NDArrayFactory::create('c', {3, 3, 3}); auto xExp = NDArrayFactory::create('c', {3, 3, 3}, {1.,0.,0. ,0.,0.,0., 0.,0.,0., 0.,0.,0. ,0.,1.,0., 0.,0.,0., 0.,0.,0. 
,0.,0.,0., 0.,0.,1.}); - xExp.printIndexedBuffer("Identity8"); x.setIdentity(); ASSERT_TRUE(x.equalsTo(&xExp)); @@ -921,8 +920,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_1) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(5, subArr1->ews()); delete subArr1; } @@ -933,8 +930,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_2) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -945,8 +940,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_3) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -957,8 +950,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_4) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->ews()); delete subArr1; } @@ -1074,8 +1065,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_1) { NDArray x('f', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1086,8 +1075,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_2) { NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1098,10 +1085,8 @@ TEST_F(NDArrayTest2, test_subarray_3d_cf) { NDArray c('c', {10, 20, 30}, nd4j::DataType::FLOAT32); auto subarrayF = f({0,0, 0,0, 2,3}, true); - subarrayF.printShapeInfo("F subarray shapeInfo"); auto subarrayC = c({2,3, 0,0, 0,0}, true); - subarrayC.printShapeInfo("C subarray shapeInfo"); } TEST_F(NDArrayTest2, test_broadcast_row_1) { @@ -1133,8 +1118,6 @@ TEST_F(NDArrayTest2, test_broadcast_column_2) { e.assign(1.0f); x.applyTrueBroadcast(BroadcastOpsTuple::Add(), &y, &x, false); - x.printShapeInfo(); - x.printIndexedBuffer(); ASSERT_EQ(e, x); } @@ -1189,8 +1172,6 @@ TEST_F(NDArrayTest2, test_long_sum_1) { auto x = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); auto z = x.reduceAlongDims(reduce::Sum, {0}); - - z.printIndexedBuffer("z long"); } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp index 9aac42ddf..95b3027cc 100644 --- a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp @@ -191,7 +191,8 @@ TEST_F(NativeOpsTests, ExecBroadcast_2) { #ifdef __CUDABLAS__ printf("Unsupported for cuda now.\n"); #else - auto dimension = NDArrayFactory::create('c', {1}, {(int)0}); + int dimd = 0; + auto dimension = NDArrayFactory::create('c', {1}, {dimd}); ::execBroadcastBool(nullptr, broadcast::EqualTo, @@ -525,8 +526,8 @@ TEST_F(NativeOpsTests, Reduce3Test_1) { y.specialBuffer(), y.specialShapeInfo(), exp.buffer(), exp.shapeInfo(), exp.specialBuffer(), exp.specialShapeInfo()); -// x.printIndexedBuffer("Input"); -// exp.printIndexedBuffer("Reduce3 Dot"); + //z.printIndexedBuffer("Z"); + 
//exp.printIndexedBuffer("Reduce3 Dot"); ASSERT_TRUE(exp.equalsTo(z)); } diff --git a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp index d8174f000..0d879748d 100644 --- a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp @@ -81,34 +81,6 @@ TEST_F(OmpLaunchHelperTests, Test_BetterThreads_3) { ASSERT_EQ(1, n); } -////////////////////////////////////////////////////////////////////// -TEST_F(OmpLaunchHelperTests, loop_test1) { - - const Nd4jLong N = 20010; - Nd4jLong desiredNumThreads = 2; - int x[N] = {0}; - - OmpLaunchHelper info(N, desiredNumThreads); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto xi = x + info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - xi[i] = xi[i] + 1; - } - - #ifdef _OPENMP - ASSERT_EQ(desiredNumThreads, info._numThreads); - #else - ASSERT_EQ(1, info._numThreads); - #endif - -} - TEST_F(OmpLaunchHelperTests, test_tad_threads_1) { Nd4jLong numTads = 16; Nd4jLong tadLength = 16; diff --git a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp b/libnd4j/tests_cpu/layers_tests/OpsArena.cpp deleted file mode 100644 index b09a4e043..000000000 --- a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 11.10.2017. -// -// This "set of tests" is special one - we don't check ops results here. 
we just check for memory equality BEFORE op launch and AFTER op launch -// -// -#include "testlayers.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace nd4j; -using namespace nd4j::ops; - -class OpsArena : public testing::Test { -public: - const int numIterations = 0; - std::vector tuples; - - - OpsArena() { - // nd4j_printf("\nStarting memory tests...\n",""); - - - // conv2d_bp - tuples.push_back((new OpTuple("conv2d_bp")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {3, 3, 1, 2})) - //->addInput(new NDArray('c', {2, 1})) - ->addInput(NDArrayFactory::create_('c', {2, 2, 4, 4})) - ->setIArgs({3, 3, 1, 1, 0, 0, 1, 1, 1})); - - - // mergeavg - tuples.emplace_back((new OpTuple("mergeavg")) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - // mergemax - auto mergeMax_X0 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X1 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X2 = NDArrayFactory::create_('c', {100, 100}); - tuples.push_back(new OpTuple("mergemax", {mergeMax_X0, mergeMax_X1, mergeMax_X2}, {}, {})); - - // conv2d - auto conv2d_Input = NDArrayFactory::create_('c', {1, 2, 5, 4}); - auto conv2d_Weights = NDArrayFactory::create_('c', {2, 2, 2, 3}); - auto conv2d_Bias = NDArrayFactory::create_('c', {3, 1}); - tuples.push_back(new OpTuple("conv2d", {conv2d_Input, conv2d_Weights, conv2d_Bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1, 0})); - - // test custom op - tuples.emplace_back((new OpTuple("testcustom")) - ->setIArgs({1, 2}) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - - // deconv2d - tuples.emplace_back((new OpTuple("deconv2d")) - ->addInput(NDArrayFactory::create_('c', {2, 3, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {5, 5, 3, 3})) - ->setIArgs({5, 5, 1, 1, 0, 0, 1, 1, 0, 0})); - - // maxpool2d - tuples.emplace_back((new OpTuple("maxpool2d")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 28, 28})) - ->setIArgs({5, 5, 1, 1, 0, 0, 2, 2, 0})); - } - - - ~OpsArena() { - for (auto v: tuples) - delete v; - } - -}; - - -TEST_F(OpsArena, TestFeedForward) { - nd4j::ops::mergeavg op0; - nd4j::ops::mergemax op1; - -#ifdef _WIN32 - if (1 > 0) - return; -#endif - - for (auto tuple: tuples) { - auto op = OpRegistrator::getInstance()->getOperation(tuple->_opName); - if (op == nullptr) { - // nd4j_printf("Can't find Op by name: [%s]\n", tuple->_opName); - ASSERT_TRUE(false); - } - - // nd4j_printf("Testing op [%s]\n", tuple->_opName); - nd4j::memory::MemoryReport before, after; - - // warmup - auto tmp1 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - auto tmp2 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - delete tmp1; - delete tmp2; - - auto b = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - if (!b) - ASSERT_TRUE(false); - - for (int e = 0; e < numIterations; e++) { - auto result = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - - // we just want to be sure op was executed successfully - ASSERT_TRUE(result->size() > 0); - - delete result; - } - - - auto a = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (!a) - ASSERT_TRUE(false); - - - // this is our main assertion. memory footprint after op run should NOT be higher then before - if (after > before) { - // nd4j_printf("WARNING!!! 
OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", tuple->_opName, before.getRSS(), after.getRSS()) - // ASSERT_TRUE(after <= before); - } - } -} - - - -TEST_F(OpsArena, TestMmulHelper1) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - auto c = NDArrayFactory::create('c', {100, 100}); - - nd4j::MmulHelper::mmul(&a, &b, &c); - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - nd4j::MmulHelper::mmul(&a, &b, &c); - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - //ASSERT_TRUE(after <= before); - } -} - - -TEST_F(OpsArena, TestMmulHelper2) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - - auto c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - ASSERT_TRUE(after <= before); - } -} - diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp index 0254d1877..d5880d689 100644 --- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp @@ -419,9 +419,6 @@ TEST_F(ParityOpsTests, Test_Shape_1) { auto z = result->at(0); - z->printShapeInfo("z shape"); - z->printIndexedBuffer(" z buffr"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1362,7 +1359,8 @@ TEST_F(ParityOpsTests, scatterND_sub_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); + //exp.printIndexedBuffer("e"); + //z->printIndexedBuffer("z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp new file mode 100644 index 000000000..998b8164b --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace nd4j; +using namespace nd4j::graph; + +class PerformanceTests : public testing::Test { +public: + int numIterations = 100; + + PerformanceTests() { + samediff::ThreadPool::getInstance(); + } +}; + +#ifdef RELEASE_BUILD + +TEST_F(PerformanceTests, test_maxpooling2d_1) { + std::vector valuesX; + auto x = NDArrayFactory::create('c', {32, 3, 224, 224}); + auto z = NDArrayFactory::create('c', {32, 3, 224, 224}); + x.linspace(1.0f); + Nd4jLong k = 5; + + + Nd4jLong iArgs[] {k,k, 1,1, 0,0, 1,1, 1}; + Context ctx(1); + ctx.setInputArray(0, &x); + ctx.setOutputArray(0, &z); + ctx.setIArguments(iArgs, 9); + + nd4j::ops::maxpool2d op; + + for (int i = 0; i < numIterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute(&ctx); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + + if ((i + 1) % 1000 == 0) + nd4j_printf("Iteration %i finished...\n", i + 1); + } + + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("Execution time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); +} + +#endif \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index e95c6eca6..dfb685e22 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -57,12 +57,201 @@ public: fflush(stdout); } }; - +/* TEST_F(PlaygroundTests, test_s_1) { auto t = ::runLightBenchmarkSuit(true); delete[] t; } +TEST_F(PlaygroundTests, test_s_2) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + std::vector values; + + for (int e = 0; e < 100000; e++) { + s = 0; + + auto timeStart = std::chrono::system_clock::now(); + //samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + PRAGMA_OMP_PARALLEL_THREADS(4) { + s++; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); + values.emplace_back(outerTime); + }; + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld;\n", values[values.size() / 2]); +} + */ +/* +TEST_F(PlaygroundTests, test_s_4) { + std::atomic f; + std::atomic s; + std::vector valuesX, valuesY; + int iterations = 1000; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + + //////// + + auto x = NDArrayFactory::create('c', {32, 3, 256, 256}); + auto z = NDArrayFactory::create('c', {32, 3, 256, 256}); + x.linspace(1.0); + + auto xs0 = x.sizeAt(0); + auto xs1 = x.sizeAt(1); + auto xs2 = x.sizeAt(2); + auto xs3 = x.sizeAt(3); + + auto buffer = x.bufferAsT(); + auto zbuffer = z.bufferAsT(); + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) + for (int i = 0; i < xs0; i++) { + for (int j = 0; j < xs1; j++) { + auto thread_id = 
omp_get_thread_num(); + for (int k = 0; k < xs2; k++) { + for (int l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + (k*l)] * 2.5f; + } + } + } + } + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + } + + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + auto f2d = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i++) { + for (auto j = start_y; j < stop_y; j++) { + + for (auto k = 0; k < xs2; k++) { + for (auto l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + (k * l)] * 2.5f; + } + } + } + } + }; + samediff::Threads::parallel_for(f2d, 0, xs0, 1, 0, xs1, 1); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesY.emplace_back(outerTime); + } + + if (valuesX.size() > 0) { + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("OpenMP time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); + } + + if (valuesY.size() > 0) { + std::sort(valuesY.begin(), valuesY.end()); + nd4j_printf("Threads time: %lld; Min: %lld; Max: %lld;\n", valuesY[valuesY.size() / 2], valuesY[0], valuesY[valuesY.size() - 1]); + } + + nd4j_printf("Sum: %f\n", z.sumNumber().e(0)); +} + + +TEST_F(PlaygroundTests, test_s_5) { + auto x = NDArrayFactory::create('c', {32, 1, 28, 28}); + + std::vector values; + auto iterations = 100; + + auto startX = 0; + auto stopX = x.sizeAt(0); + auto incX = 1; + auto startY = 0; + auto stopY = x.sizeAt(1); + auto incY = 1; + auto numThreads = 4; + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + auto splitLoop = samediff::ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + auto span = samediff::Span2::build(splitLoop, 0, numThreads, startX, stopX, incX, startY, stopY, incY); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_6) { + auto x = NDArrayFactory::create('c', {1024 * 1024 * 64}); + auto buffer = x.bufferAsT(); + auto len = x.lengthOf(); + std::vector values; + auto iterations = 1000; + + for (int i = 0; i < iterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + for (int e = 0; e < len; e++) { + buffer[e] = (buffer[e] + 1.72f) * 3.17f - 0.0012f; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_3) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + for (int e = 0; e < 10000; e++) { + + samediff::Threads::parallel_for(func, 
0, 8192, 1, 4); + } +} + */ + /* TEST_F(PlaygroundTests, test_relubp_1) { auto x = NDArrayFactory::create('c', {128, 64, 224, 224}); diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp index bc4db6e63..5c3ca340b 100644 --- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp @@ -868,7 +868,6 @@ TEST_F(RNGTests, Test_UniformDistribution_04) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Uniform int distribution"); ASSERT_TRUE(exp0.isSameShape(z)); ASSERT_FALSE(exp0.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp index 4df0f3dc8..8bf12f58b 100644 --- a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp @@ -77,7 +77,7 @@ TEST_F(EuclideanDistanceTest,Test1) { result, tadShapeBuffer, dimension, - dimensionLength); + dimensionLength, 0, 2); ASSERT_EQ(result[1],result[0]); } @@ -107,7 +107,7 @@ TEST_F(StdTest,MultiDimTest) { dimensionsForStd, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, shape::length(resultShapeInfo)); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); @@ -145,7 +145,7 @@ TEST_F(ReduceTest,MatrixTest) { dimension, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, tad->numTads); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index ecc91779e..a8f430fe3 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -234,7 +234,6 @@ TEST_F(NormalThreeFourFive,DimensionTest) { tad->init(inputShapeBuffer,dimension,dimensionLength); tad->createTadOnlyShapeInfo(); tad->createOffsets(); - shape::printShapeInfoLinear(tad->tadOnlyShapeInfo); ASSERT_TRUE(arrsEquals(8,assertionBuffer,tad->tadOnlyShapeInfo)); delete tad; diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp index aabef927f..b4a631a8c 100644 --- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp @@ -206,8 +206,6 @@ TEST_F(TadTests, test_TAD_empty_dims_1) { xTad.init(xShape, reinterpret_cast(112L), 0); xTad.createTadOnlyShapeInfo(); xTad.createOffsets(); - nd4j_printf("numTads: %i\n", (int) xTad.numTads); - shape::printShapeInfoLinear("TAD shape", xTad.tadOnlyShapeInfo); } TEST_F(TadTests, test_tad_order_1) { @@ -218,7 +216,6 @@ TEST_F(TadTests, test_tad_order_1) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -230,7 +227,6 @@ TEST_F(TadTests, test_tad_order_2) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -243,7 +239,6 @@ TEST_F(TadTests, test_tad_order_3) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -256,7 +251,6 @@ TEST_F(TadTests, test_tad_order_4) { xTad.init(xShape, dim, 2); xTad.createTadOnlyShapeInfo(); - 
shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -264,7 +258,6 @@ TEST_F(TadTests, test_column_1) { auto x = NDArrayFactory::create('c', {5, 2}); auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), 0); - shape::printShapeInfoLinear("column view", tadPack.primaryShapeInfo()); ASSERT_EQ(1, shape::rank(tadPack.primaryShapeInfo())); ASSERT_EQ(5, shape::length(tadPack.primaryShapeInfo())); ASSERT_TRUE(shape::isVector(tadPack.primaryShapeInfo())); diff --git a/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp new file mode 100644 index 000000000..1139d6076 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include + +using namespace samediff; +using namespace nd4j; +using namespace nd4j::ops; +using namespace nd4j::graph; + +class ThreadsTests : public testing::Test { +public: + +}; + +TEST_F(ThreadsTests, th_test_1) { + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1023)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1024)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1026)); + + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 2043)); + ASSERT_EQ(2, ThreadsHelper::numberOfThreads(6, 2048)); +} + + +TEST_F(ThreadsTests, th_test_2) { + // in this case we'll get better split over second loop - exactly 32 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(32, 48, 1024)); + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(6, 4, 16384)); + + // in this case we'll get better split over first loop - 2 loops/2048 elements per thread + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(32, 64, 1024)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 6, 16384)); + + // in this case none of loops are good enough, but second loop is too small for split + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 64, 32)); + + // all loops are good enough, but we go with bigger one, since small + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(2, 64, 32)); + + // obviously split goes into second loop, to give 1024 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(2, 1, 2048)); +} + +TEST_F(ThreadsTests, th_test_3) { + // typical conv cases + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(4, 32, 3, 128)); + ASSERT_EQ(2, ThreadsHelper::pickLoop3d(4, 1, 128, 64)); + ASSERT_EQ(3, ThreadsHelper::pickLoop3d(4, 1, 3, 128)); + + // checking for optimal threads for conv inference + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 1, 3, 128)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads3d(4, 1, 3, 128)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads3d(8, 1, 3, 128)); + + // 
checking for optimal threads for conv training + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 16, 3, 128)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 128)); + + + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 64)); + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 8, 3, 64)); +} + +TEST_F(ThreadsTests, th_test_4) { + // typical conv cases + ASSERT_EQ(2, ThreadsHelper::numberOfThreads2d(2, 32, 3)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads2d(4, 32, 3)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 32, 1)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 16, 64)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(4, 32, 1)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + // primes edge cases + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 19, 17)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 19, 17)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 19, 1, 0, 17, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } + + nd4j_printf("-----------------------\n",""); + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 32, 1, 0, 3, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } +} + + +TEST_F(ThreadsTests, test_span_converage_1) { + for (int b = 1; b <= 128; b++) { + for (int c = 1; c <= 64; c++) { + for (int t = 1; t <= 64; t++) { + + auto threads = ThreadsHelper::numberOfThreads2d(t, b, c); + auto loop = ThreadsHelper::pickLoop2d(threads, b, c); + + if (t > 1 && threads == 1 && (b > 1 && c > 1)) { + nd4j_printf("Got 1 thread for [%i, %i] loop; initial max threads: %i\n", b, c, t) + } + + auto sum = 0; + for (auto a = 0; a < threads; a++) { + auto span = Span2::build(loop, a,threads, 0, b, 1, 0, c, 1); + + if (loop == 1) + sum += span.stopX() - span.startX(); + else if (loop == 2) + sum += span.stopY() - span.startY(); + else + throw std::runtime_error("Bad loop!"); + } + + if (loop == 1) + ASSERT_EQ(b, sum); + else + ASSERT_EQ(c, sum); + } + } + } +} + +TEST_F(ThreadsTests, validation_test_2d_1) { + if (1 > 0) + return; + + std::vector threads({1, 2, 4, 6, 8, 12, 16, 20, 32, 48, 64}); + + for (int e = 1; e < 1024; e++) { + for (int i = 1; i <= 1024; i++ ) { + for (auto t:threads) { + std::atomic sum; + sum.store(0); + + auto func = PRAGMA_THREADS_FOR_2D { + for (auto x = start_x; x < stop_x; x += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + sum++; + } + } + }; + + samediff::Threads::parallel_for(func, 0, e, 1, 0, i, 1, t, true); + + ASSERT_EQ(e * i, sum.load()); + } + } + + nd4j_printf("Finished iteration %i\n", e); + } +} + +TEST_F(ThreadsTests, reduction_test_1) { + + auto func = PRAGMA_REDUCE_LONG { + int64_t sum = 0; + + for (auto e = start; e < stop; e++) { + sum++; + }; + + return sum; + }; + + auto sum = samediff::Threads::parallel_long(func, LAMBDA_AL {return _old + _new;}, 0, 8192, 1, 4); + ASSERT_EQ(8192, sum); +} + +/* +TEST_F(ThreadsTests, basic_test_1) { + if (!Environment::getInstance()->isCPU()) + return; + + auto instance = samediff::ThreadPool::getInstance(); + + auto array = NDArrayFactory::create('c', {512, 768}); + auto like = array.like(); + auto buffer = array.bufferAsT(); + auto lbuffer = like.bufferAsT(); + + auto func = PRAGMA_THREADS_FOR { + PRAGMA_OMP_SIMD + for (uint64_t e = start; e < stop; e += increment) { + buffer[e] += 1.0f; + } + }; + + auto timeStartThreads = std::chrono::system_clock::now(); + 
samediff::Threads::parallel_for(func, 0, array.lengthOf()); + auto timeEndThreads = std::chrono::system_clock::now(); + auto outerTimeThreads = std::chrono::duration_cast (timeEndThreads - timeStartThreads).count(); + + auto timeStartOmp = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (uint64_t e = 0; e < array.lengthOf(); e ++) { + lbuffer[e] += 1.0f; + } + auto timeEndOmp = std::chrono::system_clock::now(); + auto outerTimeOmp = std::chrono::duration_cast (timeEndOmp - timeStartOmp).count(); + + ASSERT_NEAR((float) array.lengthOf(), array.sumNumber().e(0), 1e-5f); + + nd4j_printf("Threads time: %lld us; OMP time: %lld us; %p\n", outerTimeThreads, outerTimeOmp, instance) +} + */ \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp index 72ca854f8..fd277b971 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp @@ -55,7 +55,6 @@ TEST_F(WorkspaceTests, BasicInitialization2) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, f, 1e-5); @@ -77,7 +76,6 @@ TEST_F(WorkspaceTests, BasicInitialization3) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, array.reduceNumber(reduce::Sum).e(0), 1e-5); diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 218035421..315839dba 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -109,15 +109,17 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true") + set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true -DAPPLE_BUILD=true") elseif(WIN32) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CMAKE_CXX_FLAGS " -g -fPIC -std=c++11 -Wa,-mbig-obj") endif() else() + set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -DLINUX_BUILD=true") + if ("${_RELEASE}" OR CMAKE_BUILD_TYPE STREQUAL "Release") message("Release build for tests") - set(CMAKE_CXX_FLAGS "-O3 -fPIC -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -std=c++11 -D_RELEASE=true") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() diff --git a/libnd4j/tests_cpu/run_tests.sh b/libnd4j/tests_cpu/run_tests.sh index e5cbd4106..2932827d4 100755 --- a/libnd4j/tests_cpu/run_tests.sh +++ b/libnd4j/tests_cpu/run_tests.sh @@ -16,9 +16,30 @@ # SPDX-License-Identifier: Apache-2.0 ################################################################################ - set -exo pipefail +while [[ $# > 0 ]] +do + key="$1" + value="${2:-}" + + case $key in + -c|--chip) + CHIP="${value}" + shift # past argument + ;; + *) + # unknown option + ;; + esac + + if [[ $# > 0 ]]; then + shift # past argument or value + fi +done + +CHIP="${CHIP:-cpu}" + # On Mac, make sure it can find libraries for GCC export DYLD_LIBRARY_PATH=/usr/local/lib/gcc/8/:/usr/local/lib/gcc/7/:/usr/local/lib/gcc/6/:/usr/local/lib/gcc/5/ @@ -30,4 +51,4 @@ if [ -n "$BUILD_PATH" ]; then export PATH="$PATH:$BUILD_PATH" fi -../blasbuild/cpu/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-results.xml" +../blasbuild/${CHIP}/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-${CHIP}-results.xml" diff --git 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java index 32df3e69d..8c80e3bb4 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java @@ -509,7 +509,7 @@ public abstract class DifferentialFunction { * @return the arguments for a given function */ public SDVariable[] args() { - return sameDiff.getInputVariablesForOp(this); + return sameDiff == null ? null : sameDiff.getInputVariablesForOp(this); } /** diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java new file mode 100644 index 000000000..103b0f960 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java @@ -0,0 +1,189 @@ +package org.nd4j.autodiff.listeners.debugging; + +import lombok.*; +import org.nd4j.autodiff.listeners.At; +import org.nd4j.autodiff.listeners.BaseListener; +import org.nd4j.autodiff.listeners.Operation; +import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.autodiff.samediff.internal.SameDiffOp; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.api.ops.Op; +import org.nd4j.linalg.dataset.api.MultiDataSet; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.text.DecimalFormat; +import java.util.*; + +/** + * A simple listener for benchmarking single operations in SameDiff
+ * Supports 2 modes:
+ * - SINGLE_ITER_PRINT: Print the runtime of the first iteration
+ * - AGGREGATE: Collect statistics for multiple runs, that can be accessed (by op name) via {@link #getAggregateModeMap()} + * + * @author Alex Black + */ +@Getter +public class OpBenchmarkListener extends BaseListener { + + public enum Mode {SINGLE_ITER_PRINT, AGGREGATE} + + private final Operation operation; + private final Mode mode; + private final long minRuntime; + private Map aggregateModeMap; + + @Getter(AccessLevel.PRIVATE) + private long start; + @Getter(AccessLevel.PRIVATE) + private boolean printActive; + private boolean printDone; + + public OpBenchmarkListener(Operation operation, @NonNull Mode mode) { + this(operation, mode, 0); + } + + /** + * @param operation Operation to collect stats for + * @param mode Mode - see {@link OpBenchmarkListener} + * @param minRuntime Minimum runtime - only applies to Mode.SINGLE_ITER_PRINT. If op runtime below this: don't print + */ + public OpBenchmarkListener(Operation operation, @NonNull Mode mode, long minRuntime) { + this.operation = operation; + this.mode = mode; + this.minRuntime = minRuntime; + } + + @Override + public boolean isActive(Operation operation) { + return this.operation == null || this.operation == operation; + } + + @Override + public void operationStart(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) + printActive = true; + } + + @Override + public void operationEnd(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) { + printActive = false; + printDone = true; + } + } + + @Override + public void preOpExecution(SameDiff sd, At at, SameDiffOp op) { + start = System.currentTimeMillis(); + } + + @Override + public void opExecution(SameDiff sd, At at, MultiDataSet batch, SameDiffOp op, INDArray[] outputs) { + long now = System.currentTimeMillis(); + + if (mode == Mode.SINGLE_ITER_PRINT && printActive && (now-start) > this.minRuntime) { + System.out.println(getOpString(op, now)); + } else if (mode == Mode.AGGREGATE) { + if(aggregateModeMap == null) + aggregateModeMap = new LinkedHashMap<>(); + + if(!aggregateModeMap.containsKey(op.getName())){ + String s = getOpString(op, null); + OpExec oe = new OpExec(op.getName(), op.getOp().opName(), op.getOp().getClass(), + new ArrayList(), s); + aggregateModeMap.put(op.getName(), oe); + } + + aggregateModeMap.get(op.getName()).getRuntimeMs().add(now-start); + } + } + + private String getOpString(SameDiffOp op, Long now){ + StringBuilder sb = new StringBuilder(); + sb.append(op.getName()).append(" - ").append(op.getOp().getClass().getSimpleName()) + .append("(").append(op.getOp().opName()).append(") - "); + if(now != null) { + sb.append(now - start).append(" ms\n"); + } + + if (op.getOp() instanceof DynamicCustomOp) { + DynamicCustomOp dco = (DynamicCustomOp) op.getOp(); + int x = 0; + + for (INDArray i : dco.inputArguments()) { + sb.append(" in ").append(x++).append(": ").append(i.shapeInfoToString()).append("\n"); + } + x = 0; + for (INDArray o : dco.outputArguments()) { + sb.append(" out ").append(x++).append(": ").append(o.shapeInfoToString()).append("\n"); + } + long[] iargs = dco.iArgs(); + boolean[] bargs = dco.bArgs(); + double[] targs = dco.tArgs(); + if (iargs != null && iargs.length > 0) { + sb.append(" iargs: ").append(Arrays.toString(iargs)).append("\n"); + } + if (bargs != null && bargs.length > 0) { + sb.append(" bargs: ").append(Arrays.toString(bargs)).append("\n"); + } + if (targs != null && targs.length > 0) { + sb.append(" targs: 
").append(Arrays.toString(targs)).append("\n"); + } + } else { + Op o = (Op) op.getOp(); + if (o.x() != null) + sb.append(" x: ").append(o.x().shapeInfoToString()); + if (o.y() != null) + sb.append(" y: ").append(o.y().shapeInfoToString()); + if (o.z() != null) + sb.append(" z: ").append(o.z().shapeInfoToString()); + } + return sb.toString(); + } + + + @AllArgsConstructor + @Data + public static class OpExec { + private final String opOwnName; + private final String opName; + private final Class opClass; + private List runtimeMs; + private String firstIter; + + @Override + public String toString(){ + DecimalFormat df = new DecimalFormat("0.000"); + + return opOwnName + " - op class: " + opClass.getSimpleName() + " (op name: " + opName + ")\n" + + "count: " + runtimeMs.size() + ", mean: " + df.format(avgMs()) + "ms, std: " + df.format(stdMs()) + "ms, min: " + minMs() + "ms, max: " + maxMs() + "ms\n" + + firstIter; + } + + public double avgMs() { + long sum = 0; + for (Long l : runtimeMs) { + sum += l; + } + return sum / (double) runtimeMs.size(); + } + + public double stdMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).stdNumber().doubleValue(); + } + + public long minMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).minNumber().longValue(); + } + + public long maxMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).maxNumber().longValue(); + } + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java index 55165b530..32a1cc362 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java @@ -24,7 +24,7 @@ import org.nd4j.autodiff.listeners.Listener; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; import org.nd4j.autodiff.samediff.VariableType; -import org.nd4j.autodiff.samediff.internal.memory.ArrayCloseMemoryMgr; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; import org.nd4j.base.Preconditions; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; @@ -84,8 +84,7 @@ public class InferenceSession extends AbstractSession { public InferenceSession(@NonNull SameDiff sameDiff) { super(sameDiff); - - mmgr = new ArrayCloseMemoryMgr(); //TODO replace this with new (planned) array reuse memory manager + mmgr = new ArrayCacheMemoryMgr(); } @Override @@ -215,7 +214,6 @@ public class InferenceSession extends AbstractSession { } INDArray[] out = doExec(op.getOp(), outputFrameIter, opInputs, allIterInputs, constAndPhInputs); - op.getOp().clearArrays(); if (log.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); @@ -254,6 +252,7 @@ public class InferenceSession extends AbstractSession { } } } + op.getOp().clearArrays(); //Record array uses for memory management/deallocation @@ -842,11 +841,10 @@ public class InferenceSession extends AbstractSession { reqShape = reqShape.asDataType(dt); } - if (currOutput == null || currOutput.wasClosed() || !currOutput.shapeDescriptor().equals(reqShape) || currOutput.isEmpty() != reqShape.isEmpty() || isLoop) { - boolean isOutput = allReqVariables.contains(outNames[i]); - INDArray out = mmgr.allocate(isOutput, 
reqShape); - customOp.setOutputArgument(i, out); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + boolean isOutput = allReqVariables.contains(outNames[i]); + INDArray out = mmgr.allocate(isOutput, reqShape); + customOp.setOutputArgument(i, out); } } else if (df instanceof Op) { @@ -893,29 +891,17 @@ public class InferenceSession extends AbstractSession { //Check output shape; allocate a new Z if required //For example, if minibatch size has changed since last op execution + boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); if (emptyReduce) { - INDArray z = op.z(); - if (z == null || !op.x().equalShapes(z) || isLoop) { - //Note: edge case: [x,y].sum(empty) = [x,y] for TF import compatibility. - z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); - op.setZ(z); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + INDArray z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); + op.setZ(z); } else { List outputShape = ((BaseOp) op).calculateOutputShape(); Preconditions.checkState(outputShape != null && outputShape.size() == 1, "Could not calculate output shape for op: %s", op.getClass()); - INDArray z = op.z(); - if (z == null || z.wasClosed() || !outputShape.get(0).equals(z.shapeDescriptor()) || isLoop) { - if (log.isTraceEnabled()) { - log.trace("Existing op result (z) array shape for op {} was {}, allocating new array of shape {}", - op.getClass().getSimpleName(), (z == null ? null : Arrays.toString(z.shape())), outputShape.get(0).toString()); - } - - LongShapeDescriptor lsd = outputShape.get(0); - - boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); - z = mmgr.allocate(isOutput, lsd); - op.setZ(z); - } + LongShapeDescriptor lsd = outputShape.get(0); + INDArray z = mmgr.allocate(isOutput, lsd); + op.setZ(z); } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java new file mode 100644 index 000000000..c802dd4e2 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java @@ -0,0 +1,292 @@ +package org.nd4j.autodiff.samediff.internal.memory; + +import lombok.*; +import org.bytedeco.javacpp.Pointer; +import org.nd4j.base.Preconditions; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.shape.LongShapeDescriptor; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.util.*; + +/** + * ArrayCacheMemoryMgr reuses arrays to reduce the number of memory allocations and deallocations.
+ * Memory allocations and deallocations can be quite expensive, especially on GPUs.
+ * Note that when arrays are reused, they are reused for the same datatype only.
+ * If caching a released array would result in the maximum cache size being exceeded, the oldest arrays will
+ * be deallocated first, until the new array can fit in the cache.
+ *
+ * By default, the following parameters are used for the cache:
+ * <ul>
+ *     <li>Maximum cache size: 0.25 x max memory, where:</li>
+ *     <ul>
+ *         <li>CPU: max memory is determined using {@link Pointer#maxBytes()}</li>
+ *         <li>GPU: max memory is determined using GPU 0 total memory</li>
+ *     </ul>
+ *     <li>Larger array max multiple: 2.0</li>
+ *     <ul>
+ *         <li>This means: if an exact array size can't be provided from the cache, use the next smallest array with a buffer up to 2.0x larger than requested</li>
+ *         <li>If no cached arrays of size < 2x requested exist, allocate a new array</li>
+ *     </ul>
+ *     <li>Small array threshold: 1024 elements</li>
+ *     <ul>
+ *         <li>This means: the "larger array max multiple" doesn't apply below this level. For example, we might return a size 1 array backed by a size 1023 buffer</li>
+ *     </ul>
+ * </ul>
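+ * <p>
+ * A minimal usage sketch (illustration only, not part of this change; it uses only the default constructor and the
+ * {@code allocate}/{@code release}/{@code close} methods declared in this class - the dtype and shape values are arbitrary):
+ * <pre>{@code
+ * ArrayCacheMemoryMgr mmgr = new ArrayCacheMemoryMgr();          //Defaults: 0.25 x max memory, 1024 element threshold, 2.0 max multiple
+ * INDArray out = mmgr.allocate(false, DataType.FLOAT, 32, 128);  //Reuses a cached buffer if a suitable one exists, otherwise allocates
+ * // ... use "out" as an op output array ...
+ * mmgr.release(out);                                             //Cached for reuse, or closed if it cannot fit within the cache budget
+ * mmgr.close();                                                  //Closes any arrays still held in the cache
+ * }</pre>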
+ * + * @author Alex Black + */ +@Getter +public class ArrayCacheMemoryMgr extends AbstractMemoryMgr { + + private final double maxMemFrac; + private final long smallArrayThreshold; + private final double largerArrayMaxMultiple; + + private final long maxCacheBytes; + private final long totalMemBytes; + + private long currentCacheSize = 0; + private Map arrayStores = new HashMap<>(); + + private LinkedHashSet lruCache = new LinkedHashSet<>(); + private Map lruCacheValues = new HashMap<>(); + + /** + * Create an ArrayCacheMemoryMgr with default settings as per {@link ArrayCacheMemoryMgr} + */ + public ArrayCacheMemoryMgr() { + this(0.25, 1024, 2.0); + } + + /** + * @param maxMemFrac Maximum memory fraciton to use as cache + * @param smallArrayThreshold Below this size (elements), don't apply the "largerArrayMaxMultiple" rule + * @param largerArrayMaxMultiple Maximum multiple of the requested size to return from the cache. If an array of size + * 1024 is requested, and largerArrayMaxMultiple is 2.0, then we'll return from the cache + * the array with the smallest data buffer up to 2.0*1024 elements; otherwise we'll return + * a new array + */ + public ArrayCacheMemoryMgr(double maxMemFrac, long smallArrayThreshold, double largerArrayMaxMultiple) { + Preconditions.checkArgument(maxMemFrac > 0 && maxMemFrac < 1, "Maximum memory fraction for cache must be between 0.0 and 1.0, got %s", maxMemFrac); + Preconditions.checkArgument(smallArrayThreshold >= 0, "Small array threshould must be >= 0, got %s", smallArrayThreshold); + Preconditions.checkArgument(largerArrayMaxMultiple >= 1.0, "Larger array max multiple must be >= 1.0, got %s", largerArrayMaxMultiple); + this.maxMemFrac = maxMemFrac; + this.smallArrayThreshold = smallArrayThreshold; + this.largerArrayMaxMultiple = largerArrayMaxMultiple; + + if(isCpu()){ + totalMemBytes = Pointer.maxBytes(); + } else { + Properties p = Nd4j.getExecutioner().getEnvironmentInformation(); + List devList = (List) p.get("cuda.devicesInformation"); + Map m = (Map) devList.get(0); + totalMemBytes = (Long)m.get("cuda.totalMemory"); + } + maxCacheBytes = (long)(maxMemFrac * totalMemBytes); + } + + private boolean isCpu(){ + String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend"); + return !"CUDA".equalsIgnoreCase(backend); + } + + @Override + public INDArray allocate(boolean detached, DataType dataType, long... 
shape) { + if (arrayStores.containsKey(dataType)) { + INDArray arr = arrayStores.get(dataType).get(shape); + if (arr != null) { + //Decrement cache size + currentCacheSize -= dataType.width() * arr.data().length(); + + return arr; //Allocated from cache + } + } + + //Allocation failed, allocate new array + return Nd4j.createUninitializedDetached(dataType, shape); + } + + @Override + public INDArray allocate(boolean detached, LongShapeDescriptor descriptor) { + return allocate(detached, descriptor.dataType(), descriptor.getShape()); + } + + @Override + public void release(@NonNull INDArray array) { + //Check for multiple releases of the array + long id = array.getId(); + Preconditions.checkState(!lruCache.contains(id), "Array was released multiple times: id=%s, shape=%ndShape", id, array); + + + DataType dt = array.dataType(); + long thisBytes = array.data().length() * dt.width(); + if(array.dataType() == DataType.UTF8) { + //Don't cache string arrays due to variable length buffers + if(array.closeable()) + array.close(); + } else if (currentCacheSize + thisBytes > maxCacheBytes) { + if(thisBytes > maxCacheBytes){ + //Can't store even if we clear everything - too large + if(array.closeable()) + array.close(); + return; + } + + //Need to deallocate some arrays to stay under limit - do in "oldest first" order + Iterator iter = lruCache.iterator(); + while(currentCacheSize + thisBytes > maxCacheBytes){ + long next = iter.next(); + iter.remove(); + INDArray nextOldest = lruCacheValues.remove(next); + DataType ndt = nextOldest.dataType(); + long nextBytes = ndt.width() * nextOldest.data().length(); + arrayStores.get(ndt).removeObject(nextOldest); + currentCacheSize -= nextBytes; + + if(nextOldest.closeable()) + nextOldest.close(); + } + + //After clearing space - can now cache + cacheArray(array); + } else { + //OK to cache + cacheArray(array); + } + + //Store in LRU cache for "last used" removal if we exceed cache size + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + private void cacheArray(INDArray array){ + DataType dt = array.dataType(); + if (!arrayStores.containsKey(dt)) + arrayStores.put(dt, new ArrayStore()); + arrayStores.get(dt).add(array); + currentCacheSize += array.data().length() * dt.width(); + + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + @Override + public void close() { + for (ArrayStore as : arrayStores.values()) { + as.close(); + } + } + + + @Getter + public class ArrayStore { + private INDArray[] sorted = new INDArray[1000]; //TODO resizing, don't hardcode + private long[] lengths = new long[1000]; + private long lengthSum; + private long bytesSum; + private int size; + + private void add(@NonNull INDArray array) { + //Resize arrays + if(size == sorted.length){ + sorted = Arrays.copyOf(sorted, 2*sorted.length); + lengths = Arrays.copyOf(lengths, 2*lengths.length); + } + + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; //See binarySearch javadoc + } + for (int i = size - 1; i >= idx; i--) { + sorted[i + 1] = sorted[i]; + lengths[i + 1] = lengths[i]; + } + sorted[idx] = array; + lengths[idx] = length; + size++; + lengthSum += length; + bytesSum += length * array.dataType().width(); + } + + private INDArray get(long[] shape) { + if (size == 0) + return null; + + long length = shape.length == 0 ? 
1 : ArrayUtil.prod(shape); + + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; + if (idx >= size) { + //Largest array is smaller than required -> can't return from cache + return null; + } + INDArray nextSmallest = sorted[idx]; + long nextSmallestLength = nextSmallest.data().length(); + long nextSmallestLengthBytes = nextSmallestLength * nextSmallest.dataType().width(); + + boolean tooLarge = (length > (long) (nextSmallestLength * largerArrayMaxMultiple)); + + if (nextSmallestLengthBytes > smallArrayThreshold && tooLarge) { + return null; + } // If less than smallArrayThreshold, ok, return as is + } + + //Remove + INDArray arr = removeIdx(idx); + + lruCache.remove(arr.getId()); + lruCacheValues.remove(arr.getId()); + + //Create a new array with the specified buffer. This is for 2 reasons: + //(a) the cached array and requested array sizes may differ (though this is easy to check for) + //(b) Some SameDiff array use tracking uses *object identity* - so we want different objects when reusing arrays + // to avoid issues there + return Nd4j.create(arr.data(), shape); + } + + private void removeObject(INDArray array){ + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + Preconditions.checkState(idx > 0, "Cannot remove array from ArrayStore: no array with this length exists in the cache"); + boolean found = false; + int i = 0; + while(!found && i <= size && lengths[i] == length){ + found = sorted[i++] == array; //Object equality + } + Preconditions.checkState(found, "Cannot remove array: not found in ArrayCache"); + removeIdx(i - 1); + } + + private INDArray removeIdx(int idx){ + INDArray arr = sorted[idx]; + for (int i = idx; i < size; i++) { + sorted[i] = sorted[i + 1]; + lengths[i] = lengths[i + 1]; + } + sorted[size] = null; + lengths[size] = 0; + size--; + + bytesSum -= (arr.data().length() * arr.dataType().width()); + lengthSum -= arr.data().length(); + + return arr; + } + + private void close() { + for (int i = 0; i < size; i++) { + if (sorted[i].closeable()) + sorted[i].close(); + lengths[i] = 0; + } + lengthSum = 0; + bytesSum = 0; + size = 0; + } + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java index 3bf3105f8..0f8f48d86 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java @@ -49,7 +49,7 @@ public class EqualsWithEps extends BaseReduce3Op { public EqualsWithEps(INDArray x, INDArray y, INDArray z, double eps, int... dimensions) { super(x, y, z, false, dimensions); - this.extraArgs = new Object[] {eps}; + this.extraArgs = new Object[] {0.0, 0.0, eps}; } public EqualsWithEps(INDArray x, INDArray y, double eps, int... 
dimensions) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index efa70d691..fecb64012 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -731,7 +731,6 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java index 8af56286d..58ad965a6 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java @@ -67,7 +67,7 @@ public class CpuMemoryManager extends BasicMemoryManager { */ @Override public void release(@NonNull Pointer pointer, MemoryKind kind) { - Pointer.free(pointer); + NativeOpsHolder.getInstance().getDeviceNativeOps().freeHost(pointer); pointer.setNull(); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index f915c8152..06c061fad 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -1,4 +1,4 @@ -// Targeted by JavaCPP version 1.5.1-1: DO NOT EDIT THIS FILE +// Targeted by JavaCPP version 1.5.2: DO NOT EDIT THIS FILE package org.nd4j.nativeblas; @@ -731,7 +731,6 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include @@ -5454,6 +5454,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { + + + + @@ -21232,6 +21236,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -21290,6 +21295,18 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif + /* + * random_uniform distribution for types int32,int64, float16, float and double + * by default dtype is float32 + * + * input: + * 0 - shape of output (1D int tensor) + * 1 - min val (0D of output type) - optional (0 as default) + * 2 - max val (0D of output type) - optional (inf as default) + * + * output: + * 0 - uniformly distributed values of given type (between min and max) + */ // #if NOT_EXCLUDED(OP_randomuniform) @Namespace("nd4j::ops") public static class randomuniform extends DeclarableCustomOp { static { Loader.load(); } @@ -21362,6 +21379,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif +// #if NOT_EXCLUDED(OP_random_crop) @Namespace("nd4j::ops") public static class random_crop extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -21377,6 +21395,50 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } +// #endif + + /** + * random_gamma op. + */ +// #if NOT_EXCLUDED(OP_random_gamma) + @Namespace("nd4j::ops") public static class random_gamma extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_gamma(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public random_gamma(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_gamma position(long position) { + return (random_gamma)super.position(position); + } + + public random_gamma() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + + /** + * random_poisson op. + */ +// #if NOT_EXCLUDED(OP_random_poisson) + @Namespace("nd4j::ops") public static class random_poisson extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_poisson(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public random_poisson(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_poisson position(long position) { + return (random_poisson)super.position(position); + } + + public random_poisson() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java new file mode 100644 index 000000000..6505bee20 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java @@ -0,0 +1,119 @@ +package org.nd4j.autodiff.samediff; + +import org.junit.Test; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; +import org.nd4j.linalg.BaseNd4jTest; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.factory.Nd4jBackend; + +import java.lang.reflect.Field; + +import static org.junit.Assert.*; + +public class MemoryMgrTest extends BaseNd4jTest { + + public MemoryMgrTest(Nd4jBackend b){ + super(b); + } + + @Override + public char ordering(){ + return 'c'; + } + + @Test + public void testArrayReuseTooLarge() throws Exception { + + ArrayCacheMemoryMgr mmgr = new ArrayCacheMemoryMgr(); + Field f = ArrayCacheMemoryMgr.class.getDeclaredField("maxCacheBytes"); + f.setAccessible(true); + f.set(mmgr, 1000); + + assertEquals(1000, mmgr.getMaxCacheBytes()); + + INDArray[] arrays = new INDArray[100]; + for( int i=0; i