diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp
index 8abee8d82..6c5f6a8c8 100644
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e], cdata, std::char_traits::length(string[e]));
@@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e].data(), cdata, string[e].size());
@@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t));
@@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector::length(string[e]) * sizeof(uint16_t));
@@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e].data(), cdata, string[e].size());
@@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e], cdata, std::char_traits::length(string[e]));
@@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const {
 const auto inData = bufferAsT() + offsetsLength;
 auto func = PRAGMA_THREADS_FOR{
- for (int e = start; e < stop; e += increment) {
+ for (int e = start; e < stop; e++) {
 auto cdata = outData + offsets[e];
 auto end = nInputoffsets[e + 1];
 auto idata = inData + nInputoffsets[e];
@@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
@@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
@@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp
index 9bdf41a16..58d4b3c34 100644
--- a/libnd4j/blas/cpu/NDArray.cpp
+++ b/libnd4j/blas/cpu/NDArray.cpp
@@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
 auto func = PRAGMA_THREADS_FOR {
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, target.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
@@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) {
 auto y = reinterpret_cast(yBuffer);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto temp = x[i];
 x[i] = y[i];
 y[i] = temp;
@@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector& reps) const {
 if(result.ordering() == 'c') { // ews == 1 always here
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES);
 }
@@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector& reps) const {
 else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto xOffset = result.getOffset(i);
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES);
@@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vectorordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e], t[e]);
 };
@@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e]);
 };
@@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
@@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e]);
 };
@@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(f[xOffset]);
@@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(e, f[e]);
 };
@@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(e, f[xOffset]);
@@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func((Nd4jLong) e, f[e], s[e]);
 };
@@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
@@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);
diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp
index b945c5bcf..e82f2224e 100644
--- a/libnd4j/blas/cpu/NativeOps.cpp
+++ b/libnd4j/blas/cpu/NativeOps.cpp
@@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx,
 _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads());
 auto func = PRAGMA_THREADS_FOR {
- for (auto idx = start; idx < stop; idx += increment) {
+ for (auto idx = start; idx < stop; idx++) {
 auto xTadOffsetForBlock = tadOffsets[indexes[idx]];
 auto zTadOffsetForBlock = zTadOffsets[idx];
@@ -1356,7 +1356,7 @@ void tearGeneric(void *vx,
 auto numTads = shape::length(hXShapeInfo) / tadLength;
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto hZ = reinterpret_cast(targets[i]);
 auto s = hX + tadOffsets[i];
@@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
 auto dZ = reinterpret_cast(dz);
 auto func = PRAGMA_THREADS_FOR {
- for (auto f = start; f < stop; f += increment) {
+ for (auto f = start; f < stop; f++) {
 auto hX = reinterpret_cast(dX[f]);
 //auto hZ = reinterpret_cast(dZ[f]);
diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h
index 3af77ca39..abc804f5e 100644
--- a/libnd4j/include/array/DataTypeConversions.h
+++ b/libnd4j/include/array/DataTypeConversions.h
@@ -52,7 +52,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -110,7 +110,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -138,7 +138,7 @@ namespace nd4j {
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -164,7 +164,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
index 1aaaaebc7..b661d02e7 100644
--- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
+++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
@@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWS1: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWSNONZERO: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::RANK1: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(2, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(3, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(4, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(5, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
index 95fe19109..f047d1136 100644
--- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
+++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
@@ -80,7 +80,7 @@ namespace nd4j {
 int nLen = zArr.lengthOf() / yArr.sizeAt(-1);
 auto func = PRAGMA_THREADS_FOR{
- for (uint32_t total = start; total < stop; total += increment) {
+ for (uint32_t total = start; total < stop; total++) {
 uint32_t i = total / zDim1;
 uint32_t j = total % zDim1;
diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp
index 829f60a18..8d3af7eb4 100644
--- a/libnd4j/include/loops/cpu/indexreduce.hpp
+++ b/libnd4j/include/loops/cpu/indexreduce.hpp
@@ -73,7 +73,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 IndexValue curr(x[i], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
 }
@@ -88,7 +88,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 IndexValue curr(x[offset], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp
index 35674de36..ab9793694 100644
--- a/libnd4j/include/loops/cpu/random.hpp
+++ b/libnd4j/include/loops/cpu/random.hpp
@@ -75,7 +75,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
 }
@@ -93,7 +93,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@@ -111,7 +111,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@@ -129,7 +129,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@@ -149,7 +149,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@@ -197,7 +197,7 @@ namespace functions {
 else{
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
 }
@@ -213,7 +213,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@@ -255,7 +255,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[offset] = OpClass::op(i, length, rng, extraArguments);
 }
diff --git a/libnd4j/include/loops/cpu/reduce3.hpp b/libnd4j/include/loops/cpu/reduce3.hpp
index 8d50aedbc..c24a3d474 100644
--- a/libnd4j/include/loops/cpu/reduce3.hpp
+++ b/libnd4j/include/loops/cpu/reduce3.hpp
@@ -88,7 +88,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 if (kindOfLoop == nd4j::LoopKind::EWS1) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
 };
@@ -98,7 +98,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
@@ -110,7 +110,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
index a8f766f6a..2e36b8085 100644
--- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp
+++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
@@ -158,7 +158,7 @@ namespace functions {
 const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
- for (auto r = start; r < stop; r += increment) {
+ for (auto r = start; r < stop; r++) {
 auto tadOffsetForBlock = tadPack.primaryOffsets()[r];
 auto tx = x + tadOffsetForBlock;
diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp
index b12ff5796..36c95e731 100644
--- a/libnd4j/include/loops/impl/type_conversions.cpp
+++ b/libnd4j/include/loops/impl/type_conversions.cpp
@@ -81,7 +81,7 @@ namespace nd4j {
 // now we actually apply quantization
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 rz[e] = static_cast(nd4j::math::nd4j_round( 1.0f * static_cast(x[e]) / nd4j::math::nd4j_max(amax, amin) * max_byte));
 }
 };
@@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 int flimit = limit + 4;
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 int el = x[e];
 int ael = nd4j::math::nd4j_abs(el) - 1;
 z[ael] += el > 0 ? static_cast(threshold) : static_cast(-threshold);
@@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 auto z = reinterpret_cast(dz);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 z[i] = static_cast(static_cast(x[i]));
 }
 };
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
index f8704d7b0..baf19de10 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
@@ -153,7 +153,7 @@ namespace helpers {
 auto rowSize = sizeof(T) * colCount;
 auto func = PRAGMA_THREADS_FOR {
- for (auto n = start; n < stop; n += increment) {
+ for (auto n = start; n < stop; n++) {
 int s = rowP->e(n);
 int end = rowP->e(n + 1);
 int shift = n * colCount;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
index 56c93b611..2e63c9d5e 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
@@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra
 shape::calcOffsets(tadShapeInfo, offsets);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto inBuff = input.bufferAsT() + tadOffsets[i];
 auto outBuff = output.bufferAsT() + tadOffsets[i];
@@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a
 const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 // FIXME: double!
 double x = input.e(i);
 if (x < 0.0) {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
index 978c037fa..5a22b02eb 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
@@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
index d4b0de398..594280ebe 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
@@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
index b408da720..c63dc3c1c 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
@@ -94,7 +94,7 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st
 int vaSize = vA.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto p = start; p < stop; p += increment) {
+ for (auto p = start; p < stop; p++) {
 auto A = reinterpret_cast(vA.at(p)->buffer());
 auto B = reinterpret_cast(vB.at(p)->buffer());
 auto C = reinterpret_cast(vC.at(p)->buffer());
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
index ad2e29a97..aa9624600 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, input->getShapeInfo(), coords);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
index 83cc966ba..5e80d12fb 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
@@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
 int xLen = x.lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i));
 };
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
index 5aad38da8..26f82bdd9 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
@@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
 auto func = PRAGMA_THREADS_FOR {
 T *col, *im;
- for (uint b = start; b < stop; b += increment) {
+ for (uint b = start; b < stop; b++) {
 T *im0 = imBuff + b * imStride0;
 T *col4 = colBuff + b * colStride0;
 for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
new file mode 100644
index 000000000..1bdf0a6ad
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+//
+
+
+#include
+#include
+
+namespace nd4j {
+ namespace ops {
+ namespace helpers {
+ //////////////////////////////////////////////////////////////////////////
+ template
+ static void concat_(const std::vector& inArrs, NDArray& output, const int axis) {
+ nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis);
+ }
+
+ void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) {
+ BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
+ }
+
+ BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
index 4f8989caf..39449c7f8 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
@@ -32,7 +32,7 @@ namespace helpers {
 int lLen = labels->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (int j = start; j < stop; j += increment) {
+ for (int j = start; j < stop; j++) {
 auto label = labels->e(j);
 auto pred = predictions->e(j);
 T value = (weights == nullptr ? (T) 1.0f : weights->e(j));
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
index ca30d73bd..1f55378c0 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
@@ -50,7 +50,7 @@ namespace nd4j {
 T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0);
 auto func = PRAGMA_THREADS_FOR {
- for (auto y = start; y < stop; y += increment) {
+ for (auto y = start; y < stop; y++) {
 const float inY = (cropHeight > 1) ? y1 * (imageHeight - 1) + y * heightScale : 0.5 * (y1 + y2) * (imageHeight - 1);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
index c12b1ce4f..6a8523925 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
@@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray
 int tads = tadsA.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto a_ = tadsA.at(e);
 auto b_ = tadsB.at(e);
 auto o_ = tadsO.at(e);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
index f041452ab..d3e524ff4 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
@@ -46,7 +46,7 @@ namespace helpers {
 if (isNHWC) {
 const int total_count = batch_size * output_height * output_width * output_depth;
 auto func = PRAGMA_THREADS_FOR {
- for (auto out_idx = start; out_idx < stop; out_idx += increment) {
+ for (auto out_idx = start; out_idx < stop; out_idx++) {
 const int d = out_idx % output_depth;
 const int out_idx2 = out_idx / output_depth;
 const int w = out_idx2 % output_width;
@@ -70,7 +70,7 @@ namespace helpers {
 const int total_count = batch_size * input_depth_by_input_area;
 auto func = PRAGMA_THREADS_FOR {
- for (int input_idx = start; input_idx < stop; input_idx += increment) {
+ for (int input_idx = start; input_idx < stop; input_idx++) {
 const int n_bY_bX_oC_iY = input_idx / input_width;
 const int iX = input_idx - n_bY_bX_oC_iY * input_width;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
index 8035f8216..2a51b92a6 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
@@ -32,7 +32,7 @@ template
 static void diGamma_(const NDArray& x, NDArray& z) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 z.p(i, diGammaScalar(x.e(i)));
 };
 samediff::Threads::parallel_for(func, 0, x.lengthOf());
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
index 9db974b36..a470f140a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
@@ -35,7 +35,7 @@ namespace helpers {
 int inLen = input->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 float val = nodeRng.relativeT(e, T(0.f), T(1.f));
 if (val < probValue)
@@ -130,7 +130,7 @@ namespace helpers {
 nd4j::graph::RandomGenerator nodeRng(3019L, seed);
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 float randVal = nodeRng.relativeT(e, T(0.f), T(1.f));
 float xVal = input->e(e);
 output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
index 281e6c809..0673a6f2b 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
@@ -62,7 +62,7 @@ namespace nd4j {
 unsigned int outSize = outputList.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 outputs[i].first = outputList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)
@@ -168,7 +168,7 @@ namespace nd4j {
 unsigned int gradsSize = inputGradientList.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 outputs[i].first = inputGradientList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
index 0a46c995e..b2707ea5c 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
@@ -50,7 +50,7 @@ namespace helpers {
 colCast = 0;
 auto func = PRAGMA_THREADS_FOR {
- for (auto batch = 0; batch < stop; batch += increment) {
+ for (auto batch = 0; batch < stop; batch++) {
 auto patch = listOfMatricies.at(batch);
 auto outMatrix = listOfOutputs.at(batch);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
index 09c8c09ea..ed844e84f 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
@@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 if(input->rankOf() == 1 && output->rankOf() == 1) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 output->p(i, input->e(indices->e(i)));
 };
@@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 } else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
index 9e3bdf885..fc6fc768b 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
@@ -56,7 +56,7 @@ namespace nd4j {
 if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(xBuffer[e]);
 auto _y = static_cast(yBuffer[e]);
@@ -67,7 +67,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(xBuffer[e * xEws]);
 auto _y = static_cast(yBuffer[e * yEws]);
@@ -78,7 +78,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(x.e(e));
 auto _y = static_cast(y.e(e));
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
index 04df86c36..beb48e382 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
@@ -42,7 +42,7 @@ namespace nd4j {
 // we divide array into 32 element chunks, and store intermediate results once
 auto func = PRAGMA_THREADS_FOR {
- for (auto b = 0; b < stop; b += increment) {
+ for (auto b = 0; b < stop; b++) {
 auto blockBuffer = buffer + b * numBlocks;
 Nd4jLong r = 1;
@@ -64,7 +64,7 @@ namespace nd4j {
 auto func2 = PRAGMA_THREADS_FOR {
- for (auto b = start; b < stop; b += increment) {
+ for (auto b = start; b < stop; b++) {
 auto blockBuffer = tempBuffer + b * numBlocks;
 Nd4jLong r = 1;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
index 9d30ddcf7..23acab375 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
@@ -280,7 +280,7 @@ namespace helpers {
 int xsSize = xs.size();
 // Scale x interpolation weights to avoid a multiplication during iteration.
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 xs[i]._bottomIndex *= channels;
 xs[i]._topIndex *= channels;
 }
@@ -906,7 +906,7 @@ namespace helpers {
 auto outputPtr = output->bufferAsT();
 // output is always float. TO DO: provide another float types also with template declaration
 auto batchProcess = PRAGMA_THREADS_FOR {
- for (auto batch = start; batch < stop; batch += increment) {
+ for (auto batch = start; batch < stop; batch++) {
 for (auto y = 0; y < st.outHeight; ++y) {
 const float inY = y * st.heightScale;
 const float inY1 = (y + 1) * st.heightScale;
@@ -961,7 +961,7 @@ namespace helpers {
 if (Status::OK() == res) {
 std::vector xCached(st.outWidth);
 auto cachingProcedure = PRAGMA_THREADS_FOR {
- for (auto x = start; x < stop; x += increment) {
+ for (auto x = start; x < stop; x++) {
 auto &xCache = xCached[x];
 const float inX = x * st.widthScale;
 const float inX1 = (x + 1) * st.widthScale;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
index e065174d5..b98e7f026 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
 'c' == output.ordering() && 1 == output.ews()){
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const auto xStep = i*3;
 z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2];
 }
@@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
 auto func = PRAGMA_THREADS_FOR{
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, output.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
@@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
 const Nd4jLong zDimCstride = output.stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 //simple M*v //tr.T*v
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
index 4bc9d3304..1fea8e4fe 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
@@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
 int span = (tads / num_threads) + 8;
auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index c9b833cf5..aeb9e38b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; @@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 683a82392..634d875d2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e += increment) { + for (uint e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index 2856e73b9..7d2eb5051 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -45,7 +45,7 @@ namespace helpers { auto n = shape::sizeAt(matrixShape, -1); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong theFirstPos[] = {theFirst, i}; Nd4jLong theSecondPos[] = {theSecond, i}; auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); @@ -203,7 +203,7 @@ namespace helpers { auto result = -1; //auto loop = PRAGMA_THREADS_FOR { auto start = column, stop = rowNum, increment = 1; - for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { + for (auto rowCounter = start; rowCounter < stop; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { @@ -221,7 +221,7 @@ namespace helpers { Nd4jLong xDiag[] = {currentRow, currentRow}; auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); auto loop = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (auto j = start; j < stop; j++) { Nd4jLong xRow[] = {j, currentRow}; auto rowIndex = shape::getOffset(compoundShape, xRow, 0); compoundBuf[rowIndex] /= 
compoundBuf[diagIndex]; //output->t(i, i); @@ -310,7 +310,7 @@ namespace helpers { permutations = permutationVectors->allTensorsAlongDimension({-1}); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { luNN_(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index cc43c1866..8a2048263 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lO = listOut.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) for (int j = 0; j < lastDimension; ++j) listOut.at(i)->p(j, listDiag.at(i)->e(j, j)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index dcca5075e..20d8bd34f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -55,7 +55,7 @@ namespace helpers { Nd4jLong oL = output->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto row = rows.at(e); output->p(e, row->e(n)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index 3e18d6d14..71beed7f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -49,7 +49,7 @@ namespace nd4j { if (tadEws >= 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < stop; e += increment) { + for (auto e = 0; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); @@ -70,7 +70,7 @@ namespace nd4j { samediff::Threads::parallel_tad(func, 0, numTads); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index fc572677e..df80636ee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -70,7 +70,7 @@ template static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T order = n.e(i); if(order != static_cast(order)) // if order has fractional part then do not perform calculations and return NAN output.p(i, std::numeric_limits::quiet_NaN()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 90b69ca6f..9e1980e54 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -113,7 +113,7 @@ namespace helpers { ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet 
listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); auto batching = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { //qr here qrSingle(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index bb0e7e24e..a14fb89f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto d = delta.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) buff[i] = s + i * d; }; samediff::Threads::parallel_for(func, 0, len); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 9ee906bd5..4c80e3bf2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inArr == outArr) { if (inEWS == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx = sLength - e; swap(inArr, e, idx); } @@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * } else if (inEWS > 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx1 = (sLength - e) * inEWS; Nd4jLong idx2 = e * inEWS; swap(inArr, idx1, idx2); @@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); swap(outArr, inOffset, outOffset); @@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) + for (Nd4jLong e = start; e < stop; e++) outArr[sLength - e] = inArr[e]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e] = inArr[e]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -125,7 +125,7 @@ static 
void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; @@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 01e346136..09a628b84 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& // loop through input array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); @@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra // loop through output array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index fd285ed9c..557d63fd3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -48,7 +48,7 @@ namespace helpers { const int total_count = batch_size * input_height * input_width * input_depth; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) const int d = inp_idx % input_depth; const int inp_idx2 = inp_idx / input_depth; @@ -74,7 +74,7 @@ namespace helpers { const int total_count = batch_size * output_depth_by_output_area; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { const int n_iC_oY_bY_oX = inp_idx / block_size; const int bX = inp_idx - n_iC_oY_bY_oX * block_size; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index a3f0c01be..2de2b2d22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int Nd4jLong xCoords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, xShapeInfo, xCoords); @@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind if(outRank == 1) { auto func = 
PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray outSubArr = output(indices.e(i), std::vector({0})); NDArray updSubArr = updates(i, dimsToExcludeUpd); @@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i if(outRank == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i auto func = PRAGMA_THREADS_FOR { std::vector idxRangeOut(2*outRank, 0); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray indSubArr = indices(i, dimsToExcludeInd); for (Nd4jLong j = 0; j < indLastDim; ++j) { @@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr if(!calcGrad) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); output.p(i, subArr.e(indices.e(i))); } @@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr samediff::Threads::parallel_for(func, 0, indicesLen); } else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); auto ind = indices.e(i); subArr.p(ind, subArr.e(ind) - 1.); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e20145735..08aafc98c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -169,7 +169,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { meanV.p(e, meanV.e(e) + listOfTensors.at(i)->e(e)); } }; @@ -223,7 +223,7 @@ namespace helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) + listOfTensors.at(i)->e(e)); } }; @@ -272,7 +272,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) * listOfTensors.at(i)->e(e)); } }; @@ -625,7 +625,7 @@ namespace helpers { Nd4jLong loop_size = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) output->p(e, gradOut->e(classNum)); @@ -645,7 +645,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); auto func 
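In the scatter.cpp hunks above, the rank-1 path boils down to indexed accumulation over one-element sub-arrays (output({idx, idx + 1})); checkIndices_ is what validates idx beforehand. A flat-buffer sketch, assuming op is pairwise add (the other pairwise ops substitute their own combine step):

    // Rank-1 scatter with op = add (sketch).
    #include <cstdint>
    #include <vector>

    template <typename T>
    void scatterRank1Add(const std::vector<int64_t> &indices,
                         const std::vector<T> &updates,
                         std::vector<T> &output) {
        for (size_t i = 0; i < indices.size(); i++)
            output[(size_t)indices[i]] += updates[i];  // indices pre-validated
    }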
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -675,7 +675,7 @@ namespace helpers { segmentMinFunctor(context, input, indices, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); @@ -697,7 +697,7 @@ namespace helpers { int pos = 0; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -887,7 +887,7 @@ namespace helpers { if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) output->t(e) = gradOut->t(classNum); @@ -1004,7 +1004,7 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) * tempRes.e(classNum) / input->e(e)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 59c257c28..05353bf5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -364,7 +364,7 @@ namespace nd4j { auto func = PRAGMA_THREADS_FOR { T sneu1e[600]; - for (auto t = start; t < stop; t += increment) { + for (auto t = start; t < stop; t++) { T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; memset(neu1e, 0, vectorLength * sizeof(T)); @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? 
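The segment.cpp gradient hunks above share one gating rule: the incoming gradient is routed only to elements that attained the segment's reduced value (min or max), compared within a small tolerance, while segment-prod instead scales by gradOut * prod / input. The gate as a scalar sketch (tolerance mirrors the 1e-5 / 1e-6 literals in the hunks; T is a floating-point type):

    #include <cmath>

    // Gradient gate for segment-min/max backprop (sketch).
    template <typename T>
    T segmentExtremumGrad(T inputElem, T segExtremum, T gradForClass) {
        // gradient passes only through the element(s) equal to the extremum
        return std::abs(segExtremum - inputElem) < T(1.e-5) ? gradForClass
                                                            : T(0);
    }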
sneu1e : new T[vectorLength]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index 48f7f0d9a..c8774f028 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -40,7 +40,7 @@ namespace helpers { output->assign(input); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index 642dd37da..d2dd3bf30 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pCt = ct->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { const auto colNum = col % d2; bool flip = colNum >= K; T maskVal = mask ? *(pMask + col) : T(1); @@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradInit = gradC0->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { T gbF = 0.f; T gbR = 0.f; const auto colNum = col % d2; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index db9b6afff..a3d27702d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -37,7 +37,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int inSize = inArrs.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) outArr->p(i, inArrs[i]->t(0)); }; @@ -50,7 +50,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int listSize = list.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) list.at(i)->assign(inArrs[i]); }; samediff::Threads::parallel_tad(func, 0, listSize); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index e38232928..c4b45b398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -150,7 +150,7 @@ namespace helpers { result->assign(0); if (status == ND4J_STATUS_OK) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { bool found = false; for (int j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index ea5e90cd8..1f630e8e0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N int dLen = dOdI.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + 
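The sg_cb.cpp hunks above also show the per-thread scratch pattern this file uses: a fixed stack array for the common case and a heap fallback for long vectors. In isolation (sketch; the limit mirrors the sneu1e[600] literal, and T is assumed trivially copyable, as the memset implies):

    #include <cstring>

    template <typename T>
    void withScratchSketch(int vectorLength) {
        constexpr int kStackLimit = 600;   // mirrors sneu1e[600]
        T stackBuf[kStackLimit];
        T *neu1e = vectorLength <= kStackLimit ? stackBuf
                                               : new T[vectorLength];
        std::memset(neu1e, 0, vectorLength * sizeof(T));
        // ... accumulate into neu1e ...
        if (neu1e != stackBuf)
            delete[] neu1e;                // heap path must be released
    }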
for (auto i = start; i < stop; i++) { if (dOdI.t(i) != static_cast(0.f)) dOdI.t(i) = static_cast(1.f); } @@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) { auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.p(i, setOfSubArrs.at(i)->getTrace()); }; samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); @@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK * 3]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong *zCoordStart, *xCoordStart; if (yLastDim == xRank) { @@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con else if (input->rankOf() == 1 && indices->isVector()) { // special case auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) output->p(e, input->e(indices->e(e))); }; @@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); subArrOut.assign(subArrIn); @@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); subArrOut.assign(subArrIn); @@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) arrs.at(i)->setIdentity(); }; @@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(indices[i], dimsToExclude, true); auto updSubArr = updates(i, dimsToExclude, true); @@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input case 6: { // copy auto func 
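Among the transforms.cpp hunks above, the 1-D gather special case (output->p(e, input->e(indices->e(e)))) is worth restating on flat buffers, since it is the shape the other gather paths generalize (sketch):

    #include <cstdint>

    // Rank-1 gather: output[e] = input[indices[e]] (sketch; indices are
    // assumed already range-checked, as in the helper).
    template <typename T>
    void gatherVectorSketch(const T *input, const int64_t *indices,
                            T *output, int64_t len) {
        for (int64_t e = 0; e < len; e++)
            output[e] = input[indices[e]];
    }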
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(i, dimensions); inSubArr.p(indices.t(i), updates.e(i)); } @@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); Nd4jLong idx = 0; @@ -839,7 +839,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = 0.; for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = (T) 0.f; for (int i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); @@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T iNormActual = norm2.e(i); if (iNormActual > normClip) *listOfInSubArrs.at(i) *= normClip / iNormActual; @@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inputSubArr = listOfInSubArrs.at(i); auto outputSubArr = listOfOutSubArrs.at(i); outputSubArr->assign(inputSubArr); @@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { T N = norm2.e(i); auto gradOSubArr = gradOSubArrs.at(i); @@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o auto func = PRAGMA_THREADS_FOR { Nd4jLong inIdx[MAX_RANK]; Nd4jLong outIdx[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), outIdx); for (int j = 0; j < rank; ++j) { @@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); -////////////////////////////////////////////////////////////////////////// -template -static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { - nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); -} - - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, 
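The clipByNorm_ hunks above rescale a sub-array by normClip / ||x|| only when its L2 norm exceeds the threshold, leaving its direction unchanged. Per sub-array (sketch):

    #include <cmath>
    #include <cstdint>

    // In-place clip-by-norm for one sub-array (sketch).
    template <typename T>
    void clipSubArrayByNorm(T *x, int64_t len, T normClip) {
        T sumSq = T(0);
        for (int64_t i = 0; i < len; i++)
            sumSq += x[i] * x[i];
        const T norm = std::sqrt(sumSq);
        if (norm > normClip) {
            const T scale = normClip / norm;
            for (int64_t i = 0; i < len; i++)
                x[i] *= scale;
        }
    }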
output, axis), LIBND4J_TYPES); - } - - BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index ceb228439..c825a8fee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -90,7 +90,7 @@ namespace helpers { auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { if (lower) { lowerTriangularSolve(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); } else { @@ -112,7 +112,7 @@ namespace helpers { auto rows = input->sizeAt(-2); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { if (!lower) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c <= r; c++) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index 5d4ed9f2e..90ef634c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, zetaScalar(x.e(i), q.e(i))); }; diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 02b7e8467..3ea80966b 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 8ef63101e..3bcdea865 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -69,7 +69,7 @@ namespace helpers { } auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { values->p(e, static_cast(valuesVector[e])); if (counts != nullptr) counts->p(e, countsMap[valuesVector[e]]); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp index 1a35ecd47..8ef8032bb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp @@ -19,8 +19,10 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); + + BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), 
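Two notes on the hunks above. First, the standalone concat_ helper is deleted because concat now calls nd4j::SpecialMethods::concatCpuGeneric directly (the implementation lands in specials_single.hpp further down). Second, triangular_solve.cpp dispatches each batch entry to lowerTriangularSolve or upperTriangularSolve; for the lower case that is ordinary forward substitution, sketched here for a single row-major right-hand side without adjoint (the real helper operates on NDArray views, so names and layout here are illustrative assumptions):

    // Forward substitution: solve L x = b for lower-triangular L (sketch).
    template <typename T>
    void forwardSubstitution(const T *L, const T *b, T *x, int n) {
        for (int r = 0; r < n; r++) {
            T sum = b[r];
            for (int c = 0; c < r; c++)
                sum -= L[r * n + c] * x[c];
            x[r] = sum / L[r * n + r];   // assumes a non-zero diagonal
        }
    }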
LIBND4J_TYPES, LIBND4J_TYPES); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp index be8edad04..5bb518d76 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp index 915983bb0..27b68e732 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp index d2f59137d..80e2258c7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp index 29caeae84..e34b0c528 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp index 489d1fc6a..96797cc98 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp index 6f50c4682..70c7f3990 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp index 03a31221f..e2d1df0e9 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp +++ 
b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp index 074f09238..25e14d39f 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp index 8de7c663b..f3b4cbcb6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp index 3e841dfae..4d1575123 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp index 59a215c20..b50c487b7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp index 77617173d..972b936dd 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp index 2c19c3bc6..9eb99b238 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp 
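The per-chunk compilation units above all change identically: each now includes specials_double.hpp or specials_single.hpp instead of the monolithic specials.hpp, then explicitly instantiates one slice (LIBND4J_TYPES_N) of the type matrix. Splitting instantiations across translation units bounds per-TU compile time and memory. Conceptually, with hypothetical names (an assumption about what the BUILD_*_TEMPLATE macros expand to, not their literal output):

    // impl.hpp: template definition shared by every chunk (hypothetical).
    template <typename X, typename Y>
    struct DoubleMethodsDemo {
        static X convert(Y v) { return static_cast<X>(v); }
    };

    // chunk_3.cpp: instantiates only its slice of the (X, Y) matrix.
    template struct DoubleMethodsDemo<float, double>;
    template struct DoubleMethodsDemo<float, int>;
    // ... the next chunk instantiates the next slice, and so on.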
b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp index cd6babb61..6558d7284 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp index b54028b42..d89652899 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp index 4ca54e7b1..40c9598ee 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp index 3d843ca4c..e49ace221 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp index d8dc34f1c..973b25edc 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp index 2c12f2803..b3bf0beeb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index 2779bdadf..efd57a7c5 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -34,7 +34,7 @@ namespace nd4j { // handle transpose in parallel auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { for (int c = 0; c < cols; c++) { int zIdx = orderTarget == CblasRowMajor ? 
linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); @@ -73,7 +73,7 @@ namespace nd4j { C[r] = z; } else { auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) + for (auto r = start; r < stop; r++) C[r] = z; }; samediff::Threads::parallel_for(func, 0, length); @@ -130,7 +130,7 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { int aIdx = linearIndexC(M, N, r, 0); auto aX = aT + aIdx; diff --git a/libnd4j/include/ops/impl/specials_double.hpp b/libnd4j/include/ops/impl/specials_double.hpp new file mode 100644 index 000000000..73f50c772 --- /dev/null +++ b/libnd4j/include/ops/impl/specials_double.hpp @@ -0,0 +1,270 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com, created on 07.10.2017. 
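The gemm.cpp transpose loop above maps between layouts through linearIndexC / linearIndexF. Their presumed semantics, consistent with the CblasRowMajor / CblasColMajor usage (an assumption, not the project's actual definitions):

    // Presumed row-major (C) and column-major (F) linear indexing.
    inline int linearIndexC(int rows, int cols, int r, int c) {
        return r * cols + c;   // row-major: a row is contiguous
    }
    inline int linearIndexF(int rows, int cols, int r, int c) {
        return c * rows + r;   // column-major: a column is contiguous
    }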
+// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { + + + template + void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { + auto x = reinterpret_cast(dx); + auto z = reinterpret_cast(dz); + + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + z[i] = static_cast(x[i]); + } + }; + + samediff::Threads::parallel_for(func, 0, N); + }; + + + template + void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } + } + + } + + // + + if ( ((right-left) + void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + 
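quickSort_parallel_internal_key above partitions the keys Hoare-style and mirrors every key swap on the values array so the two stay aligned; quickSort_parallel_internal_value is the same routine with the roles reversed. Stripped of the shape-info indirection and the OpenMP task recursion, the algorithm is (sketch, ascending; the descending branch just flips the comparisons):

    #include <utility>

    // Key/value quicksort on flat buffers (sketch).
    template <typename X, typename Y>
    void kvQuickSortSketch(X *key, Y *val, int left, int right) {
        int i = left, j = right;
        const X pivot = key[(left + right) / 2];
        while (i <= j) {                        // partition
            while (key[i] < pivot) i++;
            while (key[j] > pivot) j--;
            if (i <= j) {
                std::swap(key[i], key[j]);
                std::swap(val[i], val[j]);      // values follow their keys
                i++; j--;
            }
        }
        // the real code spawns OpenMP tasks above a cutoff of 1000 elements
        if (left < j)  kvQuickSortSketch(key, val, left, j);
        if (i < right) kvQuickSortSketch(key, val, i, right);
    }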
j--; + } + } + } + + } + + // + + if ( ((right-left) + static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } + + template + void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } +} + diff --git a/libnd4j/include/ops/impl/specials.hpp b/libnd4j/include/ops/impl/specials_single.hpp similarity index 56% rename from 
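sortTadByKey / sortTadByValue above parallelise across TADs (the sub-array views produced by ConstantTadHelper) and run a single-threaded key/value sort inside each slice, which is why quickSort_parallel_key is invoked with numThreads = 1 there. The dispatch shape, reusing the sketch from the previous note (the offset arrays stand in for packX/packY.primaryOffsets()):

    #include <cstdint>

    // Sort each of numTads slices independently (sketch).
    template <typename X, typename Y>
    void sortTadsSketch(X *x, Y *y,
                        const int64_t *xOffsets, const int64_t *yOffsets,
                        int64_t numTads, int64_t tadLength) {
        // this outer loop is what parallel_tad distributes across threads
        for (int64_t r = 0; r < numTads; r++)
            kvQuickSortSketch(x + xOffsets[r], y + yOffsets[r],
                              0, (int)tadLength - 1);
    }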
libnd4j/include/ops/impl/specials.hpp rename to libnd4j/include/ops/impl/specials_single.hpp index 207ca5964..030e9c6d7 100644 --- a/libnd4j/include/ops/impl/specials.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -64,7 +64,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { const Nd4jLong arrLen = inArrs[r]->lengthOf(); const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; @@ -99,7 +99,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto temp = output(indices[i], true); nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); } @@ -143,7 +143,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto x = reinterpret_cast(vx); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (auto ar = 0L; ar < n; ar++) { z[i] += x[ar][i]; } @@ -179,7 +179,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 1; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -199,7 +199,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint // aggregation step auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 0; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) int numTads = xLength / xTadLength; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { T *dx = x + tadOffsets[r]; quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); @@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { for (int bitId = 0; bitId < 16; bitId++) { bool hasBit = (x[e] & 1 << (bitId)) != 0; bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; @@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) samediff::Threads::parallel_for(func, 4, lim); } - template - void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { - auto x = reinterpret_cast(dx); - auto z = reinterpret_cast(dz); - - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - z[i] = static_cast(x[i]); - } - }; - - samediff::Threads::parallel_for(func, 0, N); - }; - BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); - template Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); @@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) }; return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } - - template - void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, 
int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, 
Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - template - void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); - //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index a25aa36ec..354f8e328 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -167,7 +167,7 @@ namespace randomOps { if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T prob = rng->relativeT(e); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { @@ -330,7 +330,7 @@ namespace randomOps { const T 
epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values @@ -440,7 +440,7 @@ namespace randomOps { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -549,7 +549,7 @@ namespace randomOps { //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -690,7 +690,7 @@ namespace randomOps { const T epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { if (z[e] > mean + ds || z[e] < mean - ds) { z[e] = step(rng, mean, stddev, e, middle, z[e]); @@ -818,7 +818,7 @@ namespace randomOps { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values
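Finally, the first special_random_ops.h hunk above is the categorical-sampling kernel: each output element draws one uniform value (rng->relativeT(e)) and scans the cumulative probability mass until it is passed; the binomial kernels below it likewise count successes across trials per element. The scan in isolation (sketch):

    #include <cstdint>

    // Inverse-CDF draw: first index whose cumulative probability
    // reaches u (sketch; u is uniform in [0, 1)).
    template <typename T>
    int64_t sampleIndexSketch(const T *probs, int64_t yLength, T u) {
        T cumProb = T(0);
        for (int64_t f = 0; f < yLength; f++) {
            cumProb += probs[f];
            if (u <= cumProb)
                return f;
        }
        return yLength - 1;   // guard against floating-point shortfall
    }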