From b4575d11e97dd847dfd7cb8b2255ee3386faaa93 Mon Sep 17 00:00:00 2001
From: Oleh
Date: Wed, 26 Feb 2020 20:12:19 +0200
Subject: [PATCH] Loops auto-vectorization problem fix (#274)

* libnd4j cast loop types

Signed-off-by: Oleg

* libnd4j more type casting added to loops

Signed-off-by: Oleg

* libnd4j sync casting types of iterated variables in loops

Signed-off-by: Oleg

* libnd4j more loops reviewed for vectorization problem fix

Signed-off-by: Oleg

* libnd4j fixed several typos

Signed-off-by: Oleg

* libnd4j several more files reviewed to fix auto-vectorization problem in loops

Signed-off-by: Oleg

* libnd4j merged master and reviewed more files to fix auto-vectorization problem in loops

Signed-off-by: Oleg

* libnd4j several type casts added in broadcasting that were missed, fixed Mac builds

Signed-off-by: Oleg

* libnd4j double-checked all files and fixed several more places in loops

Signed-off-by: Oleg

* libnd4j fixed builds

Signed-off-by: Oleg

* libnd4j reverted changes for lup.cpp

Signed-off-by: Oleg
---
 libnd4j/include/helpers/Loops.h               | 1687 +++++++++--------
 .../include/loops/cpu/TrueBroadcastHelper.hpp |   16 +-
 libnd4j/include/loops/cpu/broadcasting.hpp    |   48 +-
 .../include/loops/cpu/broadcasting_bool.cpp   |   20 +-
 .../include/loops/cpu/broadcasting_int.cpp    |   20 +-
 libnd4j/include/loops/cpu/indexreduce.hpp     |    2 +-
 libnd4j/include/loops/cpu/random.hpp          |   14 +-
 .../include/loops/cpu/reduce/reduce_bool.cpp  |    8 +-
 .../include/loops/cpu/reduce/reduce_float.hpp |    6 +-
 .../include/loops/cpu/reduce/reduce_long.cpp  |    6 +-
 .../include/loops/cpu/reduce/reduce_same.cpp  |    6 +-
 libnd4j/include/loops/cpu/reduce3.hpp         |    2 +-
 libnd4j/include/loops/cpu/scalar.hpp          |    4 +-
 libnd4j/include/loops/cpu/scalar_bool.cpp     |    4 +-
 libnd4j/include/loops/cpu/scalar_int.cpp      |    4 +-
 .../include/loops/cpu/summarystatsreduce.cpp  |   10 +-
 .../ops/declarable/generic/nn/batchnorm.cpp   |    8 +-
 .../declarable/helpers/cpu/BarnesHutTsne.cpp  |    6 +-
 .../declarable/helpers/cpu/activations.cpp    |   12 +-
 .../ops/declarable/helpers/cpu/addBias.cpp    |    4 +-
 .../ops/declarable/helpers/cpu/axis.cpp       |    4 +-
 .../ops/declarable/helpers/cpu/batchnorm.cpp  |    4 +-
 .../ops/declarable/helpers/cpu/col2im.cpp     |    6 +-
 .../ops/declarable/helpers/cpu/dilation2d.cpp |    8 +-
 .../ops/declarable/helpers/cpu/dropout.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/dynamic.cpp    |   18 +-
 .../ops/declarable/helpers/cpu/flatten.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/hashcode.cpp   |    9 +-
 .../ops/declarable/helpers/cpu/histogram.cpp  |    2 +-
 .../ops/declarable/helpers/cpu/im2col.cpp     |    8 +-
 .../declarable/helpers/cpu/image_resize.cpp   |   20 +-
 .../ops/declarable/helpers/cpu/lrn.cpp        |   20 +-
 .../ops/declarable/helpers/cpu/lstm.cpp       |    2 +-
 .../declarable/helpers/cpu/matrix_band.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/prefix.cpp     |    4 +-
 .../include/ops/declarable/helpers/cpu/qr.cpp |    8 +-
 .../ops/declarable/helpers/cpu/random.cpp     |   12 +-
 .../declarable/helpers/cpu/random_crop.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/roll.cpp       |    2 +-
 .../ops/declarable/helpers/cpu/s_t_b.cpp      |   16 +-
 .../ops/declarable/helpers/cpu/segment.cpp    |   58 +-
 .../ops/declarable/helpers/cpu/sg_cb.cpp      |    6 +-
 .../ops/declarable/helpers/cpu/solve.cpp      |    8 +-
 .../ops/declarable/helpers/cpu/split.cpp      |    6 +-
 .../ops/declarable/helpers/cpu/top_k.cpp      |   14 +-
 .../ops/declarable/helpers/cpu/transforms.cpp |   16 +-
 .../helpers/cpu/triangular_solve.cpp          |   20 +-
 .../ops/declarable/helpers/cpu/weights.cpp    |    2 +-
 48 files changed, 1084 insertions(+), 1084 deletions(-)

diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h
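The pattern applied throughout this patch: loop counters that were declared as 32-bit `uint` while their bounds are 64-bit `Nd4jLong` are retyped to `Nd4jLong`. Because unsigned overflow is well-defined, a 32-bit counter compared against a 64-bit bound forces the compiler to account for the counter wrapping before it reaches the bound, which commonly prevents auto-vectorization of an otherwise simple counted loop. Below is a minimal sketch of the before/after shape of the fix — it is not code from the patch; it assumes `Nd4jLong` is an alias for `int64_t`, and the function names are illustrative only:

#include <cstdint>

using Nd4jLong = int64_t;  // assumption: libnd4j's 64-bit index type

// Before: 32-bit counter, 64-bit trip count. `i` is promoted to 64 bits
// for the comparison, but `i++` wraps at 2^32, so the compiler cannot
// prove a finite trip count and typically refuses to vectorize.
void scaleBefore(const float* x, float* z, Nd4jLong len) {
    for (unsigned int i = 0; i < len; i++)
        z[i] = x[i] * 2.0f;
}

// After (the fix this PR applies across the loops): counter and bound
// share one 64-bit type, giving a provably finite counted loop that the
// auto-vectorizer handles cleanly.
void scaleAfter(const float* x, float* z, Nd4jLong len) {
    for (Nd4jLong i = 0; i < len; i++)
        z[i] = x[i] * 2.0f;
}

In the hunks below this shows up as `for (uint i0 = ...)` becoming `for (Nd4jLong i0 = ...)` in the RANK1 through RANK5 cases and the TAD reduction loops.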
index fb1582056..680a5f0aa 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -14,9 +14,9 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -// -// @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019 -// + // + // @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019 + // #ifndef LIBND4J_LOOPS_H #define LIBND4J_LOOPS_H @@ -45,7 +45,7 @@ namespace nd4j { }; template - class ReductionFloatLoops : public ReductionLoops { + class ReductionFloatLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); @@ -54,7 +54,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { + class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -63,7 +63,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { + class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -72,7 +72,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { + class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -125,158 +125,158 @@ namespace nd4j { -/* -////////////////////////////////////////////////////////////////////////////// -template -void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, - const Y* y, const Nd4jLong* yShapeInfo, - Z* z, const Nd4jLong* zShapeInfo, - Z* extraParams, - std::function op) { + /* + ////////////////////////////////////////////////////////////////////////////// + template + void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, + const Y* y, const Nd4jLong* yShapeInfo, + Z* z, const Nd4jLong* zShapeInfo, + Z* extraParams, + std::function op) { - const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); + const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); - const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); - const Nd4jLong* xStride = shape::stride(xShapeInfo); - const Nd4jLong* yStride = shape::stride(yShapeInfo); - const Nd4jLong* zStride = shape::stride(zShapeInfo); + const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); + const Nd4jLong* xStride = shape::stride(xShapeInfo); + const Nd4jLong* yStride = shape::stride(yShapeInfo); + const Nd4jLong* zStride = shape::stride(zShapeInfo); - const Nd4jLong len = shape::length(xShapeInfo); + const Nd4jLong len = shape::length(xShapeInfo); - OmpLaunchHelper threadsInfo(len); + OmpLaunchHelper threadsInfo(len); - switch (kindOfLoop) { + switch (kindOfLoop) { - case LoopKind::EWS1: { - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = 
threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + case LoopKind::EWS1: { + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + const auto threadNum = omp_get_thread_num(); + const auto threadOffset = threadsInfo.getThreadOffset(threadNum); + const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - const auto xi = x + threadOffset; - const auto yi = y + threadOffset; - auto zi = z + threadOffset; + const auto xi = x + threadOffset; + const auto yi = y + threadOffset; + auto zi = z + threadOffset; - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i] = op(xi[i], yi[i], extraParams); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) + zi[i] = op(xi[i], yi[i], extraParams); + } } - } - break; + break; - case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint yEws = shape::elementWiseStride(yShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + case LoopKind::EWSNONZERO: { + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint yEws = shape::elementWiseStride(yShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - const auto xi = x + threadOffset * xEws; - const auto yi = y + threadOffset * yEws; - auto zi = z + threadOffset * zEws; + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + const auto threadNum = omp_get_thread_num(); + const auto threadOffset = threadsInfo.getThreadOffset(threadNum); + const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + const auto xi = x + threadOffset * xEws; + const auto yi = y + threadOffset * yEws; + auto zi = z + threadOffset * zEws; - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i*zEws] = op(xi[i*xEws], yi[i*yEws], extraParams); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) + zi[i*zEws] = op(xi[i*xEws], yi[i*yEws], extraParams); + } } - } - break; + break; - case LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR - for (uint i0 = 0; i0 < len; ++i0) - z[i0 * zStride[0]] = op(x[i0 * xStride[0]], y[i0 * yStride[0]], extraParams); - } - break; + case LoopKind::RANK1: { + PRAGMA_OMP_PARALLEL_FOR + for (uint i0 = 0; i0 < len; ++i0) + z[i0 * zStride[0]] = op(x[i0 * xStride[0]], y[i0 * yStride[0]], extraParams); + } + break; - case LoopKind::RANK2: { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - z[i0 * zStride[0] + i1 * zStride[1]] = op(x[i0 * xStride[0] + i1 * xStride[1]], y[i0 * yStride[0] + i1 * yStride[1]], extraParams); - } - break; + case LoopKind::RANK2: { + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + z[i0 * zStride[0] + i1 * zStride[1]] = op(x[i0 * xStride[0] + i1 * xStride[1]], y[i0 * yStride[0] + i1 * yStride[1]], extraParams); + } + break; - case LoopKind::RANK3: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]], 
extraParams); - } - break; + case LoopKind::RANK3: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]], extraParams); + } + break; - case LoopKind::RANK4: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(3) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - for (uint i3 = 0; i3 < xShape[3]; ++i3) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]], extraParams); - } - break; + case LoopKind::RANK4: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(3) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + for (uint i3 = 0; i3 < xShape[3]; ++i3) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]], extraParams); + } + break; - case LoopKind::RANK5: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(4) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - for (uint i3 = 0; i3 < xShape[3]; ++i3) - for (uint i4 = 0; i4 < xShape[4]; ++i4) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]+i4*zStride[4]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]+i4*xStride[4]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]+i4*yStride[4]], extraParams); - } - break; + case LoopKind::RANK5: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(4) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + for (uint i3 = 0; i3 < xShape[3]; ++i3) + for (uint i4 = 0; i4 < xShape[4]; ++i4) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]+i4*zStride[4]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]+i4*xStride[4]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]+i4*yStride[4]], extraParams); + } + break; - default: { - uint xShapeInfoCast[MAX_RANK]; - uint yShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; + default: { + uint xShapeInfoCast[MAX_RANK]; + uint yShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = threadsInfo.getThreadOffset(threadNum); - auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = 
shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = op(x[xOffset], y[yOffset], extraParams); + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + auto threadNum = omp_get_thread_num(); + auto threadOffset = threadsInfo.getThreadOffset(threadNum); + auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) { + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = op(x[xOffset], y[yOffset], extraParams); + } } } } } -} -*/ + */ -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, - E* extraParams, int64_t start, int64_t stop) { + Z* z, Nd4jLong* zShapeInfo, + Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, + E* extraParams, int64_t start, int64_t stop) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); - const Nd4jLong zLen = shape::length(zShapeInfo); + const Nd4jLong zLen = shape::length(zShapeInfo); const Nd4jLong tadLen = shape::length(tadShapeInfo); const uint tadEws = shape::elementWiseStride(tadShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - const Nd4jLong* tadShape = shape::shapeOf(tadShapeInfo); + const Nd4jLong* tadShape = shape::shapeOf(tadShapeInfo); const Nd4jLong* tadStride = shape::stride(tadShapeInfo); int numThreads = OmpLaunchHelper::tadThreads(tadLen, zLen); @@ -298,192 +298,192 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, // printf("%u - %lld\n", i, zOffset); // } // } - case LoopKind::SMALLARR2DX: { - const auto uTadLen = static_cast(tadLen); - const auto uZLenMinusOne = static_cast(zLen - 1); - const auto xLen = static_cast(zLen * uTadLen); - const auto sv = static_cast(OpType::startingValue(x)); + case LoopKind::SMALLARR2DX: { + const auto uTadLen = static_cast(tadLen); + const auto uZLenMinusOne = static_cast(zLen - 1); + const auto xLen = static_cast(zLen * uTadLen); + const auto sv = static_cast(OpType::startingValue(x)); - for (uint i = 0; i <= uZLenMinusOne; i++) - z[i] = OpType::startingValue(x); + for (uint i = 0; i <= uZLenMinusOne; i++) + z[i] = OpType::startingValue(x); - uint zOffset = 0; - for (uint i = 0; i < xLen; ++i) { - z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams); - zOffset = zOffset == uZLenMinusOne ? 0 : zOffset + 1; + uint zOffset = 0; + for (uint i = 0; i < xLen; ++i) { + z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams); + zOffset = zOffset == uZLenMinusOne ? 
0 : zOffset + 1; + } + + for (uint i = 0; i <= uZLenMinusOne; i++) + z[i] = OpType::postProcess(z[i], tadLen, extraParams); + } + break; + + //*********************************************// + case LoopKind::EWS1: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK1: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK2: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK3: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK4: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK5: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); + + z[i] = 
OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::X_EWSNONZERO: { + uint castZShapeInfo[MAX_RANK]; + const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::Z_EWSNONZERO: { + uint castTadShapeInfo[MAX_RANK]; + const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); + + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); } - for (uint i = 0; i <= uZLenMinusOne; i++) - z[i] = OpType::postProcess(z[i], tadLen, extraParams); - } - break; + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::EWS1: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); + //*********************************************// + default: { + auto innertadOffsets = new Nd4jLong[tadLen]; + shape::calcOffsets(tadShapeInfo, innertadOffsets); - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); + uint castZShapeInfo[MAX_RANK]; + const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); - //*********************************************// - case LoopKind::EWSNONZERO: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK1: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadLen; ++i0) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK2: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + 
i1 * tadStride[1]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK3: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK4: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - for (uint i3 = 0; i3 < tadShape[3]; ++i3) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK5: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - for (uint i3 = 0; i3 < tadShape[3]; ++i3) - for (uint i4 = 0; i4 < tadShape[4]; ++i4) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::X_EWSNONZERO: { - uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); - - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::Z_EWSNONZERO: { - uint castTadShapeInfo[MAX_RANK]; - const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); - } - - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - default: { - auto innertadOffsets = new Nd4jLong[tadLen]; - shape::calcOffsets(tadShapeInfo, innertadOffsets); - - uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) - s = 
OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(s, tadLen, extraParams); - }; - - delete[] innertadOffsets; - } + delete[] innertadOffsets; + } } } @@ -492,13 +492,13 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - E* extraParams, uint64_t threadId, uint64_t numThreads) { + void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + E* extraParams, uint64_t threadId, uint64_t numThreads) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); - const Nd4jLong* xShape = shape::shapeOf(const_cast(xShapeInfo)); + const Nd4jLong* xShape = shape::shapeOf(const_cast(xShapeInfo)); const Nd4jLong* xStride = shape::stride(const_cast(xShapeInfo)); const Nd4jLong* zStride = shape::stride(const_cast(zShapeInfo)); @@ -510,182 +510,183 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, switch (kindOfLoop) { //*********************************************// - case LoopKind::EWS1: { - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); + case LoopKind::EWS1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - for (auto i = start; i < stop; i++) - z[i] = OpType::op(x[i], extraParams); + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], extraParams); + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); + + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); + + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], extraParams); + } + break; + + //*********************************************// + case LoopKind::Z_EWSNONZERO: { + const uint zEws = shape::elementWiseStride(zShapeInfo); + uint castXShapeInfo[MAX_RANK]; + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); + + if (zEws > 1) { + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i * zEws] = OpType::op(x[xOffset], extraParams); } - break; - - //*********************************************// - case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); - - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); - - for (auto i = start; i < stop; i++) - z[i*zEws] = OpType::op(x[i*xEws], extraParams); + } + else { + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i] = OpType::op(x[xOffset], extraParams); } - break; + } + } + break; - //*********************************************// - case LoopKind::Z_EWSNONZERO: { - const uint zEws = 
shape::elementWiseStride(zShapeInfo); - uint castXShapeInfo[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + //*********************************************// + case LoopKind::RANK1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); + } + break; + + //*********************************************// + case LoopKind::RANK2: { + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { + auto z0 = i0 * zStride[0]; + auto x0 = i0 * xStride[0]; + + for (auto i1 = span.startY(); i1 < span.stopY(); ++i1) + z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK3: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1]; + auto x0 = i0 * xStride[0] + i1 * xStride[1]; + + for (Nd4jLong i2 = 0; i2 < uXShape2; ++i2) + z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK4: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + auto uXShape3 = xShape[3]; + + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + + for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) + z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK5: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + auto uXShape3 = xShape[3]; + auto uXShape4 = xShape[4]; + + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); + + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + + for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) { + + auto z1 = z0 
+ i3 * zStride[3]; + auto x1 = x0 + i3 * xStride[3]; + + for (Nd4jLong i4 = 0; i4 < uXShape4; ++i4) + z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); - if (zEws > 1) { - for (auto i = start; i < stop; i++) { - const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); - z[i * zEws] = OpType::op(x[xOffset], extraParams); - } - } else { - for (auto i = start; i < stop; i++) { - const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); - z[i] = OpType::op(x[xOffset], extraParams); } } - } - break; - //*********************************************// - case LoopKind::RANK1: { - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + } + break; - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); - } - break; + //*********************************************// + default: { + uint xShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - //*********************************************// - case LoopKind::RANK2: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); - auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { - auto z0 = i0 * zStride[0]; - auto x0 = i0 * xStride[0]; - - for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) - z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); - } - } - break; - - //*********************************************// - case LoopKind::RANK3: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - - auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); - auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { - auto z0 = i0 * zStride[0] + i1 * zStride[1]; - auto x0 = i0 * xStride[0] + i1 * xStride[1]; - - for (uint i2 = 0; i2 < uXShape2; ++i2) - z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); - } - } - break; - - //*********************************************// - case LoopKind::RANK4: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - - auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); - auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) - for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - - for (uint i3 = 0; i3 < uXShape3; ++i3) - z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); - } - } - break; - - //*********************************************// - case 
LoopKind::RANK5: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - auto uXShape4 = static_cast(xShape[4]); - - auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); - auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) - for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - - for (uint i3 = 0; i3 < uXShape3; ++i3) { - - auto z1 = z0 + i3 * zStride[3]; - auto x1 = x0 + i3 * xStride[3]; - - for (uint i4 = 0; i4 < uXShape4; ++i4) - z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); - - } - } - - } - break; - - //*********************************************// - default: { - uint xShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; - - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - - for (auto i = span.startX(); i < span.stopX(); i++) { - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], extraParams); - } - } + for (auto i = span.startX(); i < span.stopX(); i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], extraParams); + } + } } } -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::Reduction3Loops::loopReduce3(X* x, Nd4jLong* xShapeInfo, - X* y, Nd4jLong* yShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - int* dims, int dimsLen, - Z* extraParameters, int64_t start, int64_t stop) { + X* y, Nd4jLong* yShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + int* dims, int dimsLen, + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ @@ -694,29 +695,29 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const Nd4jLong xLen = shape::length(xShapeInfo); const Nd4jLong yLen = shape::length(yShapeInfo); - Nd4jLong *xTadShapeInfo = nullptr, *yTadShapeInfo = nullptr, *xTadOffsets = nullptr, *yTadOffsets = nullptr; + Nd4jLong* xTadShapeInfo = nullptr, * yTadShapeInfo = nullptr, * xTadOffsets = nullptr, * yTadOffsets = nullptr; TadPack tadPackX, tadPackY; std::vector zeroOffsets; - if(xLen == yLen) { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + if (xLen == yLen) { + tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = tadPackX.primaryShapeInfo(); yTadShapeInfo = tadPackY.primaryShapeInfo(); - xTadOffsets = tadPackX.primaryOffsets(); - yTadOffsets = 
tadPackY.primaryOffsets(); + xTadOffsets = tadPackX.primaryOffsets(); + yTadOffsets = tadPackY.primaryOffsets(); } - else if(yLen > xLen) { - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + else if (yLen > xLen) { + tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = xShapeInfo; yTadShapeInfo = tadPackY.primaryShapeInfo(); - yTadOffsets = tadPackY.primaryOffsets(); + yTadOffsets = tadPackY.primaryOffsets(); } else { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); yTadShapeInfo = yShapeInfo; xTadShapeInfo = tadPackX.primaryShapeInfo(); - xTadOffsets = tadPackX.primaryOffsets(); + xTadOffsets = tadPackX.primaryOffsets(); } @@ -724,162 +725,196 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto xTadEws = shape::elementWiseStride(xTadShapeInfo); const auto yTadEws = shape::elementWiseStride(yTadShapeInfo); - const auto zEws = shape::elementWiseStride(zShapeInfo); + const auto zEws = shape::elementWiseStride(zShapeInfo); - const auto zLen = shape::length(zShapeInfo); + const auto zLen = shape::length(zShapeInfo); const auto tadLen = shape::length(xTadShapeInfo); - const auto tadShape = shape::shapeOf(xTadShapeInfo); - const auto xTadStride = shape::stride(xTadShapeInfo); - const auto yTadStride = shape::stride(xTadShapeInfo); + const auto tadShape = shape::shapeOf(xTadShapeInfo); + const auto xTadStride = shape::stride(xTadShapeInfo); + const auto yTadStride = shape::stride(xTadShapeInfo); int numThreads = OmpLaunchHelper::tadThreads(tadLen, zLen); switch (kindOfLoop) { - //*********************************************// - case LoopKind::EWS1: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWS1: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::EWSNONZERO: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWSNONZERO: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? 
x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK1: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK1: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadLen; ++i0) { - const auto xTadOffset = i0 * xTadStride[0]; - const auto yTadOffset = i0 * yTadStride[0]; + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) { + const auto xTadOffset = i0 * xTadStride[0]; + const auto yTadOffset = i0 * yTadStride[0]; + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK2: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + //*********************************************// + case LoopKind::RANK3: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - //*********************************************// - case LoopKind::RANK2: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK3: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK4: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK4: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK5: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK5: { + //*********************************************// + default: { + uint castXTadShapeInfo[MAX_RANK]; + const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + + if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; for (auto i = start; i < stop; i++) { extraParams[0] = param0; @@ -890,83 +925,49 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - } - } - } + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + else { + uint castYTadShapeInfo[MAX_RANK]; + const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); + + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); + + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); }; } - break; - - //*********************************************// - default: { - uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); - - if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); - } - - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - else { - uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - } + } } } -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::Reduction3Loops::loopReduce3All(X* x, Nd4jLong* xShapeInfo, - X* y, Nd4jLong* yShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, - Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, - Z* extraParameters, int64_t start, int64_t stop) { + X* y, Nd4jLong* yShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, + Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ @@ -976,186 +977,223 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto xTadEws = shape::elementWiseStride(xTadShapeInfo); const auto yTadEws = shape::elementWiseStride(yTadShapeInfo); - const auto zEws = shape::elementWiseStride(zShapeInfo); + const auto zEws = shape::elementWiseStride(zShapeInfo); - const auto zLen = shape::length(zShapeInfo); + const auto zLen = shape::length(zShapeInfo); const auto tadLen = shape::length(xTadShapeInfo); const auto numXTads = shape::length(xShapeInfo) / tadLen; const auto numYTads = shape::length(yShapeInfo) / tadLen; - const auto 
tadShape = shape::shapeOf(xTadShapeInfo); - const auto xTadStride = shape::stride(xTadShapeInfo); - const auto yTadStride = shape::stride(yTadShapeInfo); + const auto tadShape = shape::shapeOf(xTadShapeInfo); + const auto xTadStride = shape::stride(xTadShapeInfo); + const auto yTadStride = shape::stride(yTadShapeInfo); const auto startVal = OpType::startingValue(x); - int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads); + int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads * numYTads); switch (kindOfLoop) { - //*********************************************// - case LoopKind::EWS1: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWS1: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[zInd] = OpType::postProcess(s, tadLen, extraParams); + z[zInd] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; + + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; + + //*********************************************// + case LoopKind::RANK1: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; + + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) { + const auto xTadOffset = i0 * xTadStride[0]; + const auto yTadOffset = i0 * yTadStride[0]; + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::EWSNONZERO: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK2: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix 
< numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); - } - }; - } - break; - - //*********************************************// - case LoopKind::RANK1: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint i0 = 0; i0 < tadLen; ++i0) { - const auto xTadOffset = i0 * xTadStride[0]; - const auto yTadOffset = i0 * yTadStride[0]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK2: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK3: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK3: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - 
extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK4: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK4: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK5: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; 
- } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK5: { + //*********************************************// + default: { + uint castXTadShapeInfo[MAX_RANK]; + const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + + if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; @@ -1165,79 +1203,42 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto zInd = ix * numYTads + iy; auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - } - } - } + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } }; } - break; + else { + uint castYTadShapeInfo[MAX_RANK]; + const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - //*********************************************// - default: { - uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); - } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), 
extraParams); } - }; - } - else { - uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); - } - }; - } + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; } } + } } diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index f047d1136..f18f0c788 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -50,12 +50,12 @@ namespace nd4j { 1 == zArr.ews() && 'c' == zArr.ordering()); if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) { - auto yLen = (uint32_t)yArr.lengthOf(); + auto yLen = yArr.lengthOf(); auto func = PRAGMA_THREADS_FOR{ - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto rZ = z + (i * yLen); auto v = x[i]; - for (uint32_t j = 0; j < yLen; j++) { + for (Nd4jLong j = 0; j < yLen; j++) { rZ[j] = OpType::op(v, y[j]); } } @@ -74,13 +74,13 @@ namespace nd4j { if (bSpecialCase && bSpecialCase2) { - int zDim1 = zArr.sizeAt(-2); - int zDim2 = zArr.sizeAt(-1); + uint32_t zDim1 = zArr.sizeAt(-2); + uint32_t zDim2 = zArr.sizeAt(-1); - int nLen = zArr.lengthOf() / yArr.sizeAt(-1); + uint32_t nLen = zArr.lengthOf() / yArr.sizeAt(-1); auto func = PRAGMA_THREADS_FOR{ - for (uint32_t total = start; total < stop; total++) { + for (auto total = start; total < stop; total++) { uint32_t i = total / zDim1; uint32_t j = total % zDim1; @@ -127,7 +127,7 @@ namespace nd4j { yCoords[iy--] = 0; } } - } + } const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 62058bd20..7226d00b3 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -184,7 +184,7 @@ namespace functions { const auto oX = x[i]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < loopLength; f++) + for (Nd4jLong f = 0; f < loopLength; f++) oZ[f] = OpType::op(oX, oY[f]); } } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ @@ -198,7 +198,7 @@ namespace functions { const auto oY = y[i]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < loopLength; f++) + for (Nd4jLong f = 0; f < loopLength; f++) oZ[f] = OpType::op(oX[f], oY); } } @@ -213,14 +213,14 @@ namespace functions { Nd4jLong yStrides[3] = { 0,0,0 }; nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); - uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); - uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); - for 
(uint32_t index0 = start; index0 < stop; index0++) { + for (auto index0 = start; index0 < stop; index0++) { PRAGMA_OMP_SIMD - for (uint32_t index1 = 0; index1 < nSize1; index1++) { - for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for (uint64_t index1 = 0; index1 < nSize1; index1++) { + for (uint64_t index2 = 0; index2 < nSize2; index2++) { auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2); @@ -242,18 +242,18 @@ namespace functions { Nd4jLong yStrides[4] = { 0,0,0,0 }; nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); - uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); - uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); - uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint64_t nSize3 = shape::sizeAt(zShapeInfo, 3); - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { - uint32_t index0 = i / nSize1; - uint32_t index1 = i % nSize1; + uint64_t index0 = i / nSize1; + uint64_t index1 = i % nSize1; PRAGMA_OMP_SIMD - for (uint32_t index2 = 0; index2 < nSize2; index2++) { - for (uint32_t index3 = 0; index3 < nSize3; index3++) { + for (uint64_t index2 = 0; index2 < nSize2; index2++) { + for (uint64_t index3 = 0; index3 < nSize3; index3++) { auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3); @@ -279,7 +279,7 @@ namespace functions { uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4); - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { uint32_t index0 = i / nSize1; uint32_t index1 = i % nSize1; @@ -326,7 +326,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); @@ -344,7 +344,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); @@ -362,7 +362,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); @@ -382,7 +382,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, 
xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -497,7 +497,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); @@ -515,7 +515,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); @@ -533,7 +533,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); @@ -553,7 +553,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index 8d62b9506..faf6fdff6 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -183,7 +183,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset], extraParams); } @@ -200,7 +200,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams); @@ -218,7 +218,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams); @@ -237,7 +237,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); 
oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams); @@ -257,7 +257,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -357,7 +357,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset], extraParams); } @@ -375,7 +375,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams); @@ -394,7 +394,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams); @@ -413,7 +413,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams); @@ -434,7 +434,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 9dcce7545..9737cb4bb 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -177,7 +177,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } @@ -194,7 +194,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); @@ -212,7 +212,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for 
(unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); @@ -230,7 +230,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); @@ -250,7 +250,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -347,7 +347,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } @@ -364,7 +364,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); @@ -382,7 +382,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); @@ -400,7 +400,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); @@ -420,7 +420,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp index 8d3af7eb4..f875e170b 100644 --- a/libnd4j/include/loops/cpu/indexreduce.hpp +++ b/libnd4j/include/loops/cpu/indexreduce.hpp @@ -124,7 +124,7 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo, return; const auto indexValue = OpType::startingIndexValue(x); - for (uint i = 0; i < zLen; i++) + for (Nd4jLong i = 0; i < zLen; i++) z[i] = (Z) indexValue.index; return; diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp index 
ab9793694..3b9b3c515 100644 --- a/libnd4j/include/loops/cpu/random.hpp +++ b/libnd4j/include/loops/cpu/random.hpp @@ -93,7 +93,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); @@ -111,7 +111,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); @@ -129,7 +129,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); @@ -149,7 +149,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); @@ -197,7 +197,7 @@ namespace functions { else{ auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } @@ -213,7 +213,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); @@ -255,7 +255,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i, length, rng, extraArguments); } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 1ee820853..79eb9b209 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -55,7 +55,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -68,7 +68,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, 
xShapeInfoCast, canCastX)], extraParams), extraParams); z[0] = OpType::postProcess(startingValue, length, extraParams); @@ -94,7 +94,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -156,7 +156,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp index d0a80a3f5..4437f52c0 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp @@ -59,7 +59,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; @@ -113,7 +113,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -184,7 +184,7 @@ namespace functions { return; const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? 
nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x)); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index e53c9ac8e..08664fcab 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -55,7 +55,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -110,7 +110,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -173,7 +173,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 929d9c4ff..e546f71ee 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -57,7 +57,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -111,7 +111,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, 
tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index d6dce445b..83e14ae66 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -74,7 +74,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); }; } @@ -84,7 +84,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index e9f96ff70..5fa51f765 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -74,7 +74,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); }; } @@ -84,7 +84,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index 2e36b8085..ec3c847ec 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -91,7 +91,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (uint64_t i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; @@ -116,7 +116,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - int resultLength = shape::length(zShapeInfo); + auto resultLength = shape::length(zShapeInfo); if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) @@ -124,7 +124,7 @@ namespace functions { SummaryStatsData comp; comp.initWithValue(x[0]); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = OpType::getValue(biasCorrected, comp); return; } @@ -166,14 +166,14 @@ namespace functions { comp.initWithValue(tx[0]); if (tadEWS == 1 && tadOrder == 'c') { - for (int i = 1; i < tadLength; i++) { + for (Nd4jLong i = 1; i < tadLength; i++) { SummaryStatsData indexVal2; indexVal2.initWithValue(tx[i]); comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); } } else { - for (int i = 1; i < tadLength; i++) { + for (Nd4jLong i = 1; i < tadLength; i++) { auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); SummaryStatsData indexVal2; diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index 3cf088ae9..ca9622af9 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ 
-61,7 +61,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { else axes.push_back(inRank-1); // default dimension to reduce along is last dimension - const int numOfAxes = axes.size(); + const uint numOfAxes = axes.size(); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes @@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str()); // types of all input arrays should be the same - for(int i = 1; i < block.width(); ++i) + for(unsigned long i = 1; i < block.width(); ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !"); nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); @@ -167,7 +167,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { else axes.push_back(inRank-1); // default dimension to reduce along is last dimension - const int numOfAxes = axes.size(); + const uint numOfAxes = axes.size(); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes @@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); // types of all input arrays should be the same (except dLdO) - for(int i = 1; i < block.width() - 2; ++i) + for(unsigned long i = 1; i < block.width() - 2; ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); // ***** calculations ***** // diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index baf19de10..ee45d46a7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -30,7 +30,7 @@ namespace helpers { int* pRowCounts = reinterpret_cast(rowCounts.buffer()); int const* pRows = reinterpret_cast(rowP->getBuffer()); int const* pCols = reinterpret_cast(colP->getBuffer()); - for (int n = 0; n < N; n++) { + for (Nd4jLong n = 0; n < N; n++) { int begin = pRows[n];//->e(n); int end = pRows[n + 1];//rowP->e(n + 1); for (int i = begin; i < end; i++) { @@ -72,7 +72,7 @@ namespace helpers { int const* pRows = reinterpret_cast(rowP->getBuffer()); int* symRowP = reinterpret_cast(outputRows->buffer()); symRowP[0] = 0; - for (int n = 0; n < N; n++) + for (Nd4jLong n = 0; n < N; n++) symRowP[n + 1] = symRowP[n] + rowCounts->e(n); // outputRows->printBuffer("output rows"); @@ -86,7 +86,7 @@ namespace helpers { std::vector offset(N);// = NDArrayFactory::create('c', {N}); //PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) 
shared(offset)) - for (int n = 0; n < N; n++) { + for (Nd4jLong n = 0; n < N; n++) { int begin = pRows[n]; int bound = pRows[n + 1]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 2e63c9d5e..738da9bc5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -146,17 +146,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr auto length = shape::length(inShapeInfo); if (inEWS == 1) { - for (int i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); PRAGMA_OMP_SIMD_SUM(sum) - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; } PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i] /= sum; outBuff[i] = nd4j::math::nd4j_log(outBuff[i]); } @@ -164,17 +164,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr else if (inEWS > 1) { PRAGMA_OMP_SIMD_MAX(max) - for (int i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); PRAGMA_OMP_SIMD_SUM(sum) - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i * inEWS] = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); sum += outBuff[i * inEWS]; } PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i * inEWS] /= sum; outBuff[i * inEWS] = nd4j::math::nd4j_log(outBuff[i * inEWS]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 39e51f6d7..bfa1d5a32 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -443,7 +443,7 @@ namespace nd4j { const X* bias_new; X* bias_extra = nullptr; size_t total_num = 1; - for (size_t i = 0; i < rank; i++) { + for (Nd4jLong i = 0; i < rank; i++) { total_num *= bases[i]; } Nd4jLong inc; @@ -574,7 +574,7 @@ namespace nd4j { for (size_t i = 0; i < 2; i++) { numNC *= bases[i]; } - for (size_t i = 2; i < rank; i++) { + for (Nd4jLong i = 2; i < rank; i++) { numHW *= bases[i]; } Nd4jLong total_num = numNC * numHW; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp index eb56acb9c..f082cd248 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp @@ -27,7 +27,7 @@ namespace helpers { void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector& output) { output.resize(axisVector->lengthOf()); - for (int e = 0; e < axisVector->lengthOf(); e++) { + for (Nd4jLong e = 0; e < axisVector->lengthOf(); e++) { auto ca = axisVector->e(e); if (ca < 0) ca += rank; @@ -37,7 +37,7 @@ namespace helpers { } void adjustAxis(Nd4jLong rank, std::vector &axisVector) { - for (int e = 0; e < axisVector.size(); e++) { + for (size_t e = 0; e < axisVector.size(); e++) { auto a = axisVector[e]; if (a < 0) axisVector[e] = a + rank; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index aa9624600..32824684f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -66,7 +66,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps]; Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()]; - for (int j = 0; j < lenSmall; ++j) { + for (Nd4jLong j = 0; j < lenSmall; ++j) { const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads); @@ -96,7 +96,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data()); PRAGMA_OMP_SIMD - for (uint i = 0; i < steps; ++i) + for (Nd4jLong i = 0; i < steps; ++i) z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index 26f82bdd9..5573bb8f6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -65,8 +65,8 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp T *col, *im; int imRow, imCol; - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint c = start_y; c < stop_y; c += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto c = start_y; c < stop_y; c += inc_y) { for (int kRow = 0; kRow < kH; ++kRow) { for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { @@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp auto func = PRAGMA_THREADS_FOR { T *col, *im; - for (uint b = start; b < stop; b++) { + for (auto b = start; b < stop; b++) { T *im0 = imBuff + b * imStride0; T *col4 = colBuff + b * colStride0; for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index 6e801b1fa..b17d33db3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -55,8 +55,8 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const auto func = PRAGMA_THREADS_FOR_2D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint oh = start_y; oh < stop_y; oh += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto oh = start_y; oh < stop_y; oh += inc_y) { for (uint ow = 0; ow < oW; ++ow) { for (uint c = 0; c < iC; ++c) { @@ -70,7 +70,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const const int iw = ow * sW - pW + kw * dW; if (iw < 0 || iw >= iW) continue; - uint xCoords[4] = {b, (uint)ih, (uint)iw, c}; + uint xCoords[4] = { static_cast<uint>(b), static_cast<uint>(ih), static_cast<uint>(iw), c}; uint yCoords[3] = {kh, kw, c}; const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)]; @@ -79,7 +79,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const } } - uint zCoords[4] = {b, oh, ow, c}; + uint zCoords[4] = { static_cast<uint>(b), static_cast<uint>(oh), ow, c}; z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index a470f140a..e529ab84f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -63,7 +63,7 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - for( int i = 0; i < dims.size(); i++ ) { + for(auto i = 0; i < dims.size(); i++ ) { if (fit) { dims[i] = reduceShape->e(i); for (int e = 0; e < input->rankOf(); ++e) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 0673a6f2b..3030b1255 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -53,7 +53,7 @@ namespace nd4j { outputs[i].second = 0; //PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if ((*indices).e(e) == i) listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e)); } @@ -65,7 +65,7 @@ namespace nd4j { for (auto i = start; i < stop; i++) { outputs[i].first = outputList[i]; outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) outputs[i].first->p(outputs[i].second++, input->e(e)); } @@ -83,7 +83,7 @@ namespace nd4j { for (int e = 0; e < numOfData; e++) { auto data = inputs[e]; auto index = indices[e]; - for (int i = 0; i < index->lengthOf(); i++) { + for (Nd4jLong i = 0; i < index->lengthOf(); i++) { Nd4jLong pos = index->e(i); if (pos < 0) { nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos); @@ -100,7 +100,7 @@ namespace nd4j { } else { std::vector restDims(output->rankOf() - 1); - for (int i = restDims.size(); i > 0; i--) + for (auto i = restDims.size(); i > 0; i--) restDims[restDims.size() - i] = output->rankOf() - i; ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); @@ -109,12 +109,12 @@ namespace nd4j { auto data = inputs[e]; auto index = indices[e]; std::vector sourceDims(data->rankOf() - index->rankOf()); - for (int i = sourceDims.size(); i > 0; i--) + for (auto i = sourceDims.size(); i > 0; i--) sourceDims[sourceDims.size() - i] = data->rankOf() - i; ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims) ; - for (int i = 0; i < index->lengthOf(); i++) { + for (Nd4jLong i = 0; i < index->lengthOf(); i++) { auto pos = index->e(i); if (pos < 0) { nd4j_printf("dynamic_stitch: Index value should be non-negative. 
But %i was given", pos); @@ -146,7 +146,7 @@ namespace nd4j { ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims); - for (unsigned int i = 0; i < inputGradientList.size(); i++) { + for (auto i = 0; i < inputGradientList.size(); i++) { outputs[i].first = inputGradientList[i]; if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs std::vector outDims(outputs[i].first->rankOf() - 1); @@ -158,7 +158,7 @@ namespace nd4j { outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++)); } @@ -171,7 +171,7 @@ namespace nd4j { for (auto i = start; i < stop; i++) { outputs[i].first = inputGradientList[i]; outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) output->p(e, outputs[i].first->e(outputs[i].second++)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp index 30d4d3f7a..d43cd716f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp @@ -45,7 +45,7 @@ namespace nd4j { auto xShapeInfo = inputs[e]->shapeInfo(); auto xLength = inputs[e]->lengthOf(); - for (uint i = 0; i < xLength; i++) + for (Nd4jLong i = 0; i < xLength; i++) z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index beb48e382..6ece88ae6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -26,7 +26,7 @@ namespace nd4j { namespace helpers { template static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) { - auto blockSize = 32; + Nd4jLong blockSize = 32; auto length = array.lengthOf(); int numBlocks = length / blockSize + ((length % blockSize == 0) ? 
0 : 1); auto tempA = NDArrayFactory::create('c', {numBlocks}, context); @@ -42,11 +42,11 @@ namespace nd4j { // we divide array into 32 element chunks, and store intermediate results once auto func = PRAGMA_THREADS_FOR { - for (auto b = 0; b < stop; b++) { + for (auto b = start; b < stop; b++) { auto blockBuffer = buffer + b * numBlocks; Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { + for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { auto v = longBytes(blockBuffer[e]); r = 31 * r + v; } @@ -68,7 +68,7 @@ namespace nd4j { auto blockBuffer = tempBuffer + b * numBlocks; Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { + for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { auto v = longBytes(blockBuffer[e]); r = 31 * r + v; } @@ -103,4 +103,3 @@ namespace nd4j { } } } - diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp index 911230367..8720b53d9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp @@ -49,7 +49,7 @@ namespace nd4j { } PRAGMA_OMP_SIMD - for (int x = 0; x < numBins; x++) { + for (Nd4jLong x = 0; x < numBins; x++) { result[x] += bins[x]; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 7be34e6ca..43fa52d34 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -64,8 +64,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { auto func = PRAGMA_THREADS_FOR_2D { - for (int b = start_x; b < stop_x; b++) { - for (int c = start_y; c < stop_y; c++) { + for (auto b = start_x; b < stop_x; b++) { + for (auto c = start_y; c < stop_y; c++) { for (int kRow = 0; kRow < kH; ++kRow) { for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { @@ -98,8 +98,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra T *col, *im; int imRow, imCol; - for (int b = start_x; b < stop_x; b += inc_x) { - for (int colH = start_y; colH < stop_y; colH += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto colH = start_y; colH < stop_y; colH += inc_y) { for (int colW = 0; colW < oW; ++colW) { for (int c = 0; c < iC; ++c) { for (int kRow = 0; kRow < kH; ++kRow) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 23acab375..18b52925a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -219,16 +219,16 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; ++batch) { auto pInput = pInputBuf + batch * inBatchNumValues; - for (auto y = 0; y < outHeight; ++y) { + for (Nd4jLong y = 0; y < outHeight; ++y) { auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize; const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize; const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize; double yVal = ys[y]._interpolarValue; - for (auto x = 0; x < outWidth; ++x) { 
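// The change at this point -- `for (auto x = 0; ...)` becoming `for (Nd4jLong x = 0; ...)` --
// is the core pattern of this whole patch: the loop counter is widened so it has the same
// 64-bit type as its bound. A 32-bit (and especially unsigned) counter compared against a
// 64-bit bound forces the compiler to honour possible wrap-around, so it often cannot prove
// the trip count and declines to auto-vectorize. A minimal sketch of both variants, with
// hypothetical buffers (Nd4jLong is libnd4j's 64-bit signed index type):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     void scaleSlow(float* z, const float* x, Nd4jLong n) {
//         for (uint32_t i = 0; i < n; ++i)   // may wrap before reaching n > 2^32:
//             z[i] = x[i] * 0.5f;            // trip count unprovable, rarely vectorized
//     }
//
//     void scaleFast(float* z, const float* x, Nd4jLong n) {
//         for (Nd4jLong i = 0; i < n; ++i)   // counter and bound share one type:
//             z[i] = x[i] * 0.5f;            // trip count is exactly n, vectorizable
//     }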
+ for (Nd4jLong x = 0; x < outWidth; ++x) { auto xsBottom = xsPtr[x]._bottomIndex; auto xsTop = xsPtr[x]._topIndex; auto xVal = xsPtr[x]._interpolarValue; - for (auto c = 0; c < channels; ++c) { + for (Nd4jLong c = 0; c < channels; ++c) { double topLeft(ysInputLowerPtr[xsBottom + c]); double topRight(ysInputLowerPtr[xsTop + c]); double bottomLeft(ysInputUpperPtr[xsBottom + c]); @@ -310,14 +310,14 @@ namespace helpers { if (halfPixelCenter) { inY = nd4j::math::nd4j_max(0LL, inY); } - for (auto x = 0; x < outWidth; ++x) { + for (Nd4jLong x = 0; x < outWidth; ++x) { auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor(scaler(x, st.widthScale))); Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1); if (halfPixelCenter) { inX = nd4j::math::nd4j_max(0LL, inX); } // copy pixel over all channels - for (auto e = 0; e < channels; e++) + for (Nd4jLong e = 0; e < channels; e++) output->t(b, y, x, e) = images->t(b, inY, inX, e); } } @@ -613,7 +613,7 @@ namespace helpers { for (auto b = start; b < stop; ++b) { auto pInput = inputPtr + b * inBatchWidth; - for (auto y = 0; y < outHeight; ++y) { + for (Nd4jLong y = 0; y < outHeight; ++y) { auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels]; WeightsAndIndices yWai; @@ -635,7 +635,7 @@ namespace helpers { F cached_value_0[4] = {0}; F cached_value_1[4] = {0}; F cached_value_2[4] = {0}; - for (auto x = 0; x < resizerState.outWidth; ++x) { + for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) { const WeightsAndIndices &xWai = xWais[x]; // Shift values in cached_value_* to fill first '_advance' values. switch (xWai._advance) { @@ -712,7 +712,7 @@ namespace helpers { xWai._weight2, xWai._weight3); } } else { - for (auto x = 0; x < resizerState.outWidth; ++x) { + for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) { const WeightsAndIndices &xWai = xWais[x]; // Shift values in cachedValue to fill first '_advance' values.
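// Loops that run between the start/stop arguments of a PRAGMA_THREADS_FOR lambda are
// instead rewritten with `auto`, so the counter deduces the 64-bit type of those
// parameters and cannot narrow it. A minimal sketch of why that deduction is safe
// (parallel_for below is a simplified stand-in for the library's threading helper,
// not its real API):
//
//     #include <cstdint>
//
//     template <typename F>
//     void parallel_for(int64_t start, int64_t stop, F&& body) {
//         body(start, stop);                      // single-threaded stand-in
//     }
//
//     void fill(float* z, int64_t n) {
//         parallel_for(0, n, [=](int64_t start, int64_t stop) {
//             for (auto i = start; i < stop; ++i) // auto deduces int64_t here,
//                 z[i] = 1.0f;                    // so no uint truncation is possible
//         });
//     }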
switch (xWai._advance) { @@ -828,7 +828,7 @@ namespace helpers { float sum_0 = 0; float sum_1 = 0; float sum_2 = 0; - for (int i = 0; i < yPtrs.size(); ++i) { + for (size_t i = 0; i < yPtrs.size(); ++i) { const T* ptr = yPtrs[i].yPtr; float scaleX = xCache.startScale; Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth); @@ -879,7 +879,7 @@ namespace helpers { const auto numChannels = st.channels; for (Nd4jLong c = 0; c < numChannels; ++c) { float sum = 0; - for (int i = 0; i < yPtrs.size(); ++i) { + for (size_t i = 0; i < yPtrs.size(); ++i) { T const* ptr = yPtrs[i].yPtr; float scaleX = xCache.startScale; float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index aeb9e38b0..226e3ceed 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -70,7 +70,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -100,7 +100,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out } else { auto func = PRAGMA_THREADS_FOR { - for (uint i = 0; i < numOfTads; ++i) { + for (Nd4jLong i = 0; i < numOfTads; ++i) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -108,7 +108,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -179,13 +179,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -208,7 +208,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j <
tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -247,13 +247,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -280,7 +280,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 634d875d2..47c5c2a22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp index fbab49e80..53531dd17 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp @@ -32,7 +32,7 @@ namespace helpers { Nd4jLong preLastDim = input->rankOf() - 2; ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); - for (Nd4jLong e = 0; e < listOut.size(); ++e) { + for (Nd4jLong e = 0; e < static_cast<Nd4jLong>(listOut.size()); ++e) { NDArray* inputMatrix = listDiag.at(e); NDArray* outputMatrix = listOut.at(e); if (outputMatrix != inputMatrix) // if not inplace diff --git a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp index 43c65f14b..233d7d972 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp @@ -68,7 +68,7 @@ namespace nd4j { if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 && shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') { - for (int e = 0; e < length; e++) { + for (Nd4jLong e = 0; e < length; e++) { sum = op == scalar::Add ?
simdOps::Add::op(sum, x[e]) : simdOps::Multiply::op(sum, x[e]); if (!exclusive) @@ -81,7 +81,7 @@ } else { - for (int e = 0; e < length; e++) { + for (Nd4jLong e = 0; e < length; e++) { auto xOffset = shape::getIndexOffset(e, xShapeInfo); auto zOffset = shape::getIndexOffset(e, zShapeInfo); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 9e1980e54..8daccaeac 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -43,8 +43,8 @@ namespace helpers { T const* vBuf = v.getDataBuffer()->primaryAsT(); T* resBuf = res.dataBuffer()->primaryAsT(); auto interloop = PRAGMA_THREADS_FOR_2D { - for (int i = start_x; i < n; i += inc_x) - for (int j = start_y; j < n; j += inc_y) + for (auto i = start_x; i < n; i += inc_x) + for (auto j = start_y; j < n; j += inc_y) resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0)); }; @@ -63,7 +63,7 @@ namespace helpers { NDArray z = *matrix; NDArray e('c', {M}, DataTypeUtils::fromT()); // two internal buffers and scalar for squared norm - for (auto k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further than row number + for (Nd4jLong k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further than row number e.nullify(); z = matrixMinor(z, k); // minor computing for current column with given matrix z (initially is an input matrix) // z.printIndexedBuffer("Minor!!!"); @@ -87,7 +87,7 @@ namespace helpers { } resQ.assign(q[0]); // // MmulHelper::matmul(&q[0], matrix, &resR, false, false); - for (int i = 1; i < N && i < M - 1; i++) { + for (Nd4jLong i = 1; i < N && i < M - 1; i++) { auto tempResQ = resQ; MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM? resQ = std::move(tempResQ); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp index ad04db307..e8f37f31c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp @@ -57,10 +57,10 @@ namespace helpers { T* outputBuf = output->dataBuffer()->primaryAsT(); PRAGMA_OMP_PARALLEL_FOR - for (auto k = 0; k < shift; k++) { + for (Nd4jLong k = 0; k < shift; k++) { auto pos = k * step; auto u = rng.relativeT(k, 0., 1.); - for (auto e = 0; e < step; e++) + for (Nd4jLong e = 0; e < step; e++) if (directOutput) { outputBuf[pos + e] = math::nd4j_igamma(copyAlpha->t(e), beta != nullptr ?
copyBeta->t(e) * u : u); @@ -104,10 +104,10 @@ namespace helpers { bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c'; bool directOut = output->ews() == 1 && output->ordering() == 'c'; PRAGMA_OMP_PARALLEL_FOR - for (auto k = 0; k < shift; k++) { + for (Nd4jLong k = 0; k < shift; k++) { auto pos = k * step; auto u = rng.relativeT(k, 0., 1.); - for (auto e = 0; e < step; e++) { + for (Nd4jLong e = 0; e < step; e++) { auto p = math::nd4j_exp(-lambda->t(e)); auto s = p; auto x = T(0.f); @@ -143,7 +143,7 @@ namespace helpers { RandomLauncher::fillUniform(context, rng, output, minVal, maxVal); else { PRAGMA_OMP_PARALLEL_FOR - for (auto i = 0; i < output->lengthOf(); i++) { + for (Nd4jLong i = 0; i < output->lengthOf(); i++) { output->t(i) = rng.relativeT(i, minVal, maxVal); } } @@ -184,7 +184,7 @@ namespace helpers { auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples; auto nClassesPerSample = nSampleIndexInBatch * numOfClassX; - for (auto nClass = 0; nClass < numOfClassX; nClass += 1) { + for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) { auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass; auto unifornLog = nd4j::math::nd4j_log(-nd4j::math::nd4j_log(rng.relativeT(nIndex, minVal, maxVal))); Tx tValue = (xTad[nClass * xDimAstride] - unifornLog); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp index c39e28928..9fb2281b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp @@ -50,7 +50,7 @@ namespace helpers { width = lastDim; } - for (int i = 0; i < input->lengthOf(); i += lastDim) { + for (Nd4jLong i = 0; i < input->lengthOf(); i += lastDim) { for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) { output->p(pos++, input->e(i + k)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp index 8bfc1ca1a..f61f1a1cf 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp @@ -110,7 +110,7 @@ namespace helpers { } else { std::vector dims(source->rankOf() - axe - 1); - for (int i = 0; i < dims.size(); ++i) + for (size_t i = 0; i < dims.size(); ++i) dims[i] = axe + 1 + i; ResultSet listOfTensors = source->allTensorsAlongDimension({dims}); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 09a628b84..3f2c5d02f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -55,9 +55,9 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop // loop through output array auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint h = start_y; h < stop_y; h += inc_y) { - for (uint w = start_z; w < stop_z; w += inc_z) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto h = start_y; h < stop_y; h += inc_y) { + for (auto w = start_z; w < stop_z; w += inc_z) { for (uint c = 0; c < iC; ++c) { const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; @@ -146,11 +146,11 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND std::vector 
temp(numOfSpatialDims + rank); - int i; + uint i; for(i = 0; i < numOfSpatialDims; ++i) temp[i] = blockShape.e(i); temp[i++] = output.sizeAt(0); - for(int j = 1; j < rank; ++i, ++j) + for(uint j = 1; j < rank; ++i, ++j) temp[i] = input.sizeAt(j); NDArray inputRearranged0 = input.reshape(input.ordering(), temp); @@ -163,7 +163,7 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND temp[2*i - 1] = numOfSpatialDims + i; temp[2*i] = i - 1; } - for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + for(i = 2 * numOfSpatialDims + 1; i < static_cast<uint>(temp.size()); ++i) temp[i] = i; inputRearranged0.permutei(temp); @@ -216,8 +216,8 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB // loop through output array auto func = PRAGMA_THREADS_FOR_2D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint h = start_y; h < stop_y; h += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto h = start_y; h < stop_y; h += inc_y) { for (uint w = 0; w < oW; ++w) { for (uint c = 0; c < iC; ++c) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index 1679557af..06833d6b3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -87,7 +87,7 @@ namespace helpers { if (input->isVector()) { T val = input->e(0); - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // min val = nd4j::math::nd4j_min(val, input->t(e)); @@ -115,7 +115,7 @@ namespace helpers { for (Nd4jLong i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - for (int e = 0; e < minT->lengthOf(); e++) { + for (Nd4jLong e = 0; e < minT->lengthOf(); e++) { minT->p(e, nd4j::math::nd4j_min(minT->e(e), listOfTensors.at(i)->e(e))); } } @@ -138,7 +138,7 @@ namespace helpers { T val = T(0.f); int count = 0; - for (int e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // mean val += input->e(e); @@ -166,7 +166,7 @@ namespace helpers { auto meanV = meanT->dup(); meanV.assign(listOfTensors.at(0)); - for (int i = 1; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -198,7 +198,7 @@ namespace helpers { if (input->isVector()) { T val = T(0.f); int count = 0; - for (int e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // sum val += input->t(e); @@ -220,7 +220,7 @@ namespace helpers { std::vector> outputs(numOfClasses); auto sumT = listOfOutTensors.at(idx); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -248,7 +248,7 @@ namespace helpers { T val = input->e(0); int count = 0; - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // sum val *= input->e(e); @@ -269,7 +269,7 @@ namespace helpers { int numOfClasses = output->sizeAt(0); // number of classes auto sumT = listOfOutTensors.at(idx); sumT->assign(listOfTensors.at(0)); - for (int i = 1; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
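// indices->lengthOf() returns Nd4jLong, so the int counters replaced throughout this
// file were not only a vectorization obstacle but a latent truncation bug: past
// 2^31 - 1 elements a 32-bit counter overflows before the bound is reached. A small
// self-contained illustration with a hypothetical length (not taken from the library):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     Nd4jLong lengthOf() { return 3000000000LL; }      // > INT32_MAX
//
//     void walk() {
//         // for (int e = 0; e < lengthOf(); ++e) {}    // undefined: e overflows int
//         for (Nd4jLong e = 0; e < lengthOf(); ++e) {}  // well-defined for any length
//     }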
if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -313,7 +313,7 @@ namespace helpers { bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) { auto val = indices->e(0); - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { output = indices->e(e); if (val.e(0) > output.e(0)) return false; @@ -362,7 +362,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { T val = input->e(fi->second.at(0)); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) { val = nd4j::math::nd4j_max(val, input->e(fi->second.at(idx))); } output->p(fi->first, val); @@ -380,7 +380,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) { auto maxT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { T val = nd4j::math::nd4j_max(maxT->e(e), outputT->e(e)); @@ -432,7 +432,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto minT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { @@ -560,7 +560,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto current = listOfTensors.at(fi->second.at(idx)); *outputT *= *current; @@ -584,7 +584,7 @@ namespace helpers { if (input->isVector()) { // 1D case for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { sumValue += input->e(fi->second.at(idx)); } output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt(fi->second.size())); @@ -599,7 +599,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto current = listOfTensors.at(fi->second.at(idx)); *outputT += *current; } @@ -651,7 +651,7 @@ namespace helpers { auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (uint64_t e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) currentOut->p(e, currentGradOut->e(e)); } @@ -703,7 +703,7 @@ namespace helpers { auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if
(nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < 1.e-5) currentOut->p(e, currentGradOut->e(e)); @@ -746,13 +746,13 @@ namespace helpers { int pos = 0; //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { currentOut->p(e, currentGradOut->e(e) / classCount.at(classNum)); } } @@ -781,7 +781,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -817,7 +817,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -860,7 +860,7 @@ namespace helpers { ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { Nd4jLong classNum = indices->e(i); NDArray* current = listOfTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i); @@ -905,13 +905,13 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t(e) - current->t(e)) < 1.e-6) currentOut->t(e) = currentGradOut->t(e); } @@ -955,7 +955,7 @@ namespace helpers { ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { Nd4jLong classNum = indices->e(i); NDArray* current = listOfTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i); @@ -984,7 +984,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); @@ -1021,7 +1021,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = 
listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -1053,7 +1053,7 @@ namespace helpers { // if input is a vector: (as if in doc sample) if (input->isVector()) { //auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); } @@ -1069,7 +1069,7 @@ namespace helpers { ResultSet listOfOutTensors =output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 05353bf5e..3c3db8139 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -378,7 +378,7 @@ namespace nd4j { int irow = 0; auto cShift = t * idxShift; - for (int e = 0; e < hsRounds; e++) { + for (Nd4jLong e = 0; e < hsRounds; e++) { irow = bIndices[e + cShift]; if (irow < 0 || irow >= vocabSize) continue; @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; @@ -500,7 +500,7 @@ namespace nd4j { // hierarchic softmax step if (!indices.isEmpty()) { - for (int i = 0; i < numIndices; i++) { + for (Nd4jLong i = 0; i < numIndices; i++) { const int cIndex = bIndices[(e * numIndices) + i]; const int cCode = bCodes[(e * numIndices) + i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index c8774f028..63c7758dc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -41,8 +41,8 @@ namespace helpers { auto batchLoop = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; batch++) { - for (auto r = 0; r < rows; r++) { - for (auto c = 0; c < r; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); } } @@ -66,7 +66,7 @@ namespace helpers { auto permutationsPart = permutations.allTensorsAlongDimension({-1}); for (auto batch = 0; batch < permutationsPart.size(); ++batch) { - for (auto row = 0; row < PPart[batch]->rows(); ++row) { + for (Nd4jLong row = 0; row < PPart[batch]->rows(); ++row) { PPart[batch]->t(row, permutationsPart[batch]->t(row)) = T(1.f); } } @@ -77,7 +77,7 @@ namespace helpers { MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0); ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1}); for (auto i = 0; i < leftLowerPart.size(); i++) { - for (auto r = 0; r < leftLowerPart[i]->rows(); r++) + for (Nd4jLong r = 0; r < leftLowerPart[i]->rows(); r++) leftLowerPart[i]->t(r,r) = (T)1.f; } // stage 2: triangularSolveFunctor for Lower with given b diff --git a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp index 5c9c2bbf7..b648c2b82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp @@ -29,7 +29,7 @@ 
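// The hunks below take the complementary route: where a loop runs against an
// inherently unsigned extent such as outArrs.size(), the counter is switched to an
// unsigned type (uint or size_t) instead of being widened. The goal is the same in
// both directions -- the two sides of the loop condition end up the same type. A
// compact sketch of the convention (illustrative only):
//
//     #include <cstddef>
//     #include <vector>
//
//     void bump(std::vector<float>& v) {
//         for (size_t i = 0; i < v.size(); ++i) // size_t matches size():
//             v[i] += 1.0f;                     // no sign-compare warning and the
//     }                                         // trip count stays exactly v.size()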
namespace helpers { ////////////////////////////////////////////////////////////////////////// template <typename T> static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) { - int numSplits = outArrs.size(); + uint numSplits = outArrs.size(); const auto sizeofT = input.sizeOfT(); @@ -73,9 +73,9 @@ namespace helpers { if (luckCase2) { - const uint xDim = input.sizeAt(axis); + const auto xDim = input.sizeAt(axis); - for (uint i = 0; i < input.lengthOf() / xDim; ++i) { + for (Nd4jLong i = 0; i < input.lengthOf() / xDim; ++i) { T* x = xBuff + xDim * i; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index c4b45b398..e50b18cd6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -39,7 +39,7 @@ namespace helpers { // } // ----------------------------------------------------------------------------------------------- // std::vector dimsToExclude(input->rankOf() - 1); - for (int d = 0; d < dimsToExclude.size(); ++d) + for (size_t d = 0; d < dimsToExclude.size(); ++d) dimsToExclude[d] = d; const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); @@ -72,7 +72,7 @@ namespace helpers { NDArray topValues = NDArrayFactory::create('c', {k}); NDArray sortedVals = NDArrayFactory::create('c', {k}); NDArray topIndices = NDArrayFactory::create('c', {k}); - for (Nd4jLong pos = 0; pos < k; ++pos) { + for (uint pos = 0; pos < k; ++pos) { topIndices.t(pos) = pos; topValues.t(pos) = trial.t(pos); } @@ -80,7 +80,7 @@ namespace helpers { sortedVals.assign(topValues);// = NDArrayFactory::create('c', {k}); //std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order SpecialMethods::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false); - for (int i = k; i < width; ++i) { + for (Nd4jLong i = static_cast<Nd4jLong>(k); i < width; ++i) { T val = trial.e(i); T minTopVal = sortedVals.t(0); if (minTopVal < val) { // value should be inserted to top k @@ -104,15 +104,15 @@ namespace helpers { if (needSort) { SpecialMethods::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true); - for (int j = 0; j < width; j++) - for (int pos = 0; pos < k; ++pos) + for (Nd4jLong j = 0; j < width; j++) + for (uint pos = 0; pos < k; ++pos) if (topValues.t(pos) == trial.t(j)) topIndices.t(pos) = j; } else { // else sort by indices std::map sortValsMap; //std::vector> data(topValues.lengthOf()); - for (size_t e = 0; e < topValues.lengthOf(); ++e) { + for (Nd4jLong e = 0; e < topValues.lengthOf(); ++e) { sortValsMap[topIndices.t(e)] = topValues.t(e); } @@ -152,7 +152,7 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { bool found = false; - for (int j = 0; j < k; j++) { + for (uint j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { found = true; break; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 1f630e8e0..f0b3a3a25 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -597,7 +597,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { zCoordStart[yRank - 1] = coordToRestore; // construct coordinates for x - for (uint j = 0; j < yLastDim; ++j) + for (int j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride const
auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); @@ -628,7 +628,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con if (indices != nullptr) { - for(int i = 0; i < indices->lengthOf(); ++i) + for(Nd4jLong i = 0; i < indices->lengthOf(); ++i) if(indices->e(i) >= input->sizeAt(axis)) throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !"); @@ -733,7 +733,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat // increasing counter to skip numIndices e++; std::vector indices; - for (; e < intArgs->size(); e++) + for (; e < static_cast(intArgs->size()); e++) indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { @@ -813,7 +813,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) T max = -DataTypeUtils::max(); Nd4jLong idx = 0; - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); if (v > max) { max = v; @@ -841,7 +841,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); if (v > max) max = v; @@ -867,7 +867,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T sum = 0.; - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); sum += v; } @@ -893,7 +893,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T sum = (T) 0.f; - for (int i = 0; i < numArgs; i++) + for (Nd4jLong i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); output.p(e, sum); @@ -1242,7 +1242,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c memset(gradIBuff, 0, gradILen * sizeof(T)); else { //PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int i = 0; i < gradILen * gradIEWS; i += gradIEWS) + for (Nd4jLong i = 0; i < gradILen * gradIEWS; i += gradIEWS) gradIBuff[i] = static_cast(0.f); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index c825a8fee..04508dcf8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -43,10 +43,10 @@ namespace helpers { auto rows = leftInput->rows(); auto cols = rightInput->columns(); //output->t(0,0) = rightInput->t(0,0) / leftInput->t(0,0); - for (auto r = 0; r < rows; r++) { - for (auto j = 0; j < cols; j++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong j = 0; j < cols; j++) { auto sum = rightInput->t(r, j); - for (auto c = 0; c < r; c++) { + for (Nd4jLong c = 0; c < r; c++) { sum -= leftInput->t(r, c) * output->t(c, j); } output->t(r, j) = sum / leftInput->t(r, r); @@ -72,10 +72,10 @@ namespace helpers { static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { auto rows = leftInput->rows(); auto cols = rightInput->columns(); - for (auto r = rows; r > 0; r--) { - for (auto j = 0; j < cols; j++) { + for (Nd4jLong r = rows; r > 0; r--) { + for (Nd4jLong j = 0; j < cols; j++) { 
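// The downward row loop above ("r > 0", then indexing with r - 1) also shows why the
// patch standardizes on signed Nd4jLong rather than uint for these solvers: the
// natural countdown form of such a loop is an infinite-loop trap with an unsigned
// counter. A minimal sketch (backward() is illustrative, not the library's API):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     void backward(const double* a, double* x, Nd4jLong n) {
//         for (Nd4jLong r = n - 1; r >= 0; --r) // signed: stops once r reaches -1
//             x[r] = a[r];                      // (an unsigned r would wrap past zero
//     }                                         //  and never terminate)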
auto sum = rightInput->t(r - 1, j); - for (auto c = r; c < rows; c++) { + for (Nd4jLong c = r; c < rows; c++) { sum -= leftInput->t(r - 1, c) * output->t(c, j); } output->t(r - 1, j) = sum / leftInput->t(r - 1, r - 1); @@ -114,14 +114,14 @@ namespace helpers { auto batchLoop = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; batch++) { if (!lower) { - for (auto r = 0; r < rows; r++) { - for (auto c = 0; c <= r; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = 0; c <= r; c++) { outputPart[batch]->t(r, c) = inputPart[batch]->t(c, r); } } } else { - for (auto r = 0; r < rows; r++) { - for (auto c = r; c < cols; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = r; c < cols; c++) { outputPart[batch]->t(r, c) = inputPart[batch]->t(c, r); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp index 2d4bfee21..2dd936b09 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp @@ -26,7 +26,7 @@ namespace helpers { template static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { - for (int e = 0; e < input->lengthOf(); e++) { + for (Nd4jLong e = 0; e < input->lengthOf(); e++) { int val = input->e(e); if (val < maxLength) { if (weights != nullptr)