Minor improvements (#255)

* static increments in loops

Signed-off-by: raver119 <raver119@gmail.com>

* specials and concat split into separate units

Signed-off-by: raver119 <raver119@gmail.com>
Branch: master
Author: raver119, 2020-02-20 11:43:26 +03:00 (committed by GitHub)
Parent: d9058b469a
Commit: 215641ea9e
83 changed files with 529 additions and 464 deletions
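
The first change replaces `e += increment` with `e++` in the bodies of PRAGMA_THREADS_FOR loops across the CPU code. Below is a minimal stand-alone sketch of that pattern, not code from this commit: `parallel_for_stub` and its chunking are invented stand-ins for samediff::Threads::parallel_for, and the sketch only assumes that each worker receives a contiguous [start, stop) range, so the per-element step is a constant 1 that the compiler can treat as a unit stride.

```cpp
// Hedged illustration of the "static increment" loop pattern; names here are
// invented for the example and are not libnd4j APIs.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

// Splits [0, n) into contiguous per-"thread" ranges and invokes func(start, stop, increment).
static void parallel_for_stub(const std::function<void(int64_t, int64_t, int64_t)>& func,
                              int64_t n, int64_t numThreads = 4) {
    const int64_t chunk = (n + numThreads - 1) / numThreads;
    for (int64_t t = 0; t < numThreads; t++) {
        const int64_t start = t * chunk;
        const int64_t stop  = std::min(n, start + chunk);
        if (start < stop)
            func(start, stop, /*increment=*/1);   // contiguous ranges -> unit stride
    }
}

int main() {
    std::vector<float> z(1024, 0.0f);

    auto func = [&](int64_t start, int64_t stop, int64_t increment) {
        // Before this commit the body iterated with `e += increment`.
        // If the ranges handed out are contiguous, increment is always 1 here,
        // so a static `e++` expresses the same loop and is easier to vectorize.
        for (auto e = start; e < stop; e++)
            z[e] = static_cast<float>(e);
    };

    parallel_for_stub(func, static_cast<int64_t>(z.size()));
    return 0;
}
```
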

View File

@@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
 auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e], cdata, std::char_traits<char>::length(string[e]));
@@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::stri
 auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e].data(), cdata, string[e].size());
@@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u16s
 auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t));
@@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 memcpy(cdata, string[e], std::char_traits<char16_t>::length(string[e]) * sizeof(uint16_t));
@@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u32s
 auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e].data(), cdata, string[e].size());
@@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
 auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e], cdata, std::char_traits<char32_t>::length(string[e]));
@@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const {
 const auto inData = bufferAsT<int8_t>() + offsetsLength;
 auto func = PRAGMA_THREADS_FOR{
-for (int e = start; e < stop; e += increment) {
+for (int e = start; e < stop; e++) {
 auto cdata = outData + offsets[e];
 auto end = nInputoffsets[e + 1];
 auto idata = inData + nInputoffsets[e];
@@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector<std::string> strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e<std::string>(i));
 }
 };
@@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector<std::u16string> strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e<std::u16string>(i));
 }
 };
@@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector<std::u32string> strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e<std::u32string>(i));
 }
 };

View File

@@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
 auto func = PRAGMA_THREADS_FOR {
 Nd4jLong coords[MAX_RANK];
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 shape::index2coords(i, target.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
@@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) {
 auto y = reinterpret_cast<T *>(yBuffer);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto temp = x[i];
 x[i] = y[i];
 y[i] = temp;
@@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 if(result.ordering() == 'c') { // ews == 1 always here
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES);
 }
@@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 else {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto xOffset = result.getOffset(i);
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES);
@@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
 // loop through input array
 auto func = PRAGMA_THREADS_FOR {
 Nd4jLong coords[MAX_RANK];
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 shape::index2coords(i, output.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);

View File

@@ -22,7 +22,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 if (this->ordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e], t[e]);
 };
@@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 } else {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
 if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e]);
 };
@@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto yOffset = other.getOffset(e);
@@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
 } else {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 z[e] = func(f[e]);
 };
@@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(f[xOffset]);
@@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
 } else {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 z[e] = func(e, f[e]);
 };
@@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(e, f[xOffset]);
@@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
 } else {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
 if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 z[e] = func((Nd4jLong) e, f[e], s[e]);
 };
@@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto yOffset = other.getOffset(e);
@@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
 } else {
 auto loop = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);

View File

@@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx,
 _threads = nd4j::math::nd4j_min<int>(_threads, nd4j::Environment::getInstance()->maxThreads());
 auto func = PRAGMA_THREADS_FOR {
-for (auto idx = start; idx < stop; idx += increment) {
+for (auto idx = start; idx < stop; idx++) {
 auto xTadOffsetForBlock = tadOffsets[indexes[idx]];
 auto zTadOffsetForBlock = zTadOffsets[idx];
@@ -1356,7 +1356,7 @@ void tearGeneric(void *vx,
 auto numTads = shape::length(hXShapeInfo) / tadLength;
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto hZ = reinterpret_cast<T *>(targets[i]);
 auto s = hX + tadOffsets[i];
@@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
 auto dZ = reinterpret_cast<T **>(dz);
 auto func = PRAGMA_THREADS_FOR {
-for (auto f = start; f < stop; f += increment) {
+for (auto f = start; f < stop; f++) {
 auto hX = reinterpret_cast<T *>(dX[f]);
 //auto hZ = reinterpret_cast<T *>(dZ[f]);

View File

@@ -52,7 +52,7 @@ namespace nd4j {
 TypeCast::convertGeneric<T2, T>(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
 };
@@ -110,7 +110,7 @@ namespace nd4j {
 TypeCast::convertGeneric<float, T>(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
 };
@@ -138,7 +138,7 @@ namespace nd4j {
 #else
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
 };
@@ -164,7 +164,7 @@ namespace nd4j {
 TypeCast::convertGeneric<float16, T>(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment)
+for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
 };

View File

@@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWS1: {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWSNONZERO: {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::RANK1: {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(2, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(3, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(4, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(5, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto tad = const_cast<X *>(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);

View File

@@ -80,7 +80,7 @@ namespace nd4j {
 int nLen = zArr.lengthOf() / yArr.sizeAt(-1);
 auto func = PRAGMA_THREADS_FOR{
-for (uint32_t total = start; total < stop; total += increment) {
+for (uint32_t total = start; total < stop; total++) {
 uint32_t i = total / zDim1;
 uint32_t j = total % zDim1;

View File

@@ -73,7 +73,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 IndexValue<X> curr(x[i], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
 }
@@ -88,7 +88,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 IndexValue<X> curr(x[offset], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);

View File

@@ -75,7 +75,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
 }
@@ -93,7 +93,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@@ -111,7 +111,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@@ -129,7 +129,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@@ -149,7 +149,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@@ -197,7 +197,7 @@ namespace functions {
 else{
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
 }
@@ -213,7 +213,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@@ -255,7 +255,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
-for (uint64_t i = start; i < stop; i += increment) {
+for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[offset] = OpClass::op(i, length, rng, extraArguments);
 }

View File

@@ -88,7 +88,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
 if (kindOfLoop == nd4j::LoopKind::EWS1) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
 };
@@ -98,7 +98,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
 } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
@@ -110,7 +110,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
 const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);

View File

@@ -158,7 +158,7 @@ namespace functions {
 const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeShapeInfo, tadShapeShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
-for (auto r = start; r < stop; r += increment) {
+for (auto r = start; r < stop; r++) {
 auto tadOffsetForBlock = tadPack.primaryOffsets()[r];
 auto tx = x + tadOffsetForBlock;

View File

@@ -81,7 +81,7 @@ namespace nd4j {
 // now we actually apply quantization
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte));
 }
 };
@@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 int flimit = limit + 4;
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 int el = x[e];
 int ael = nd4j::math::nd4j_abs<int>(el) - 1;
 z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold);
@@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 auto z = reinterpret_cast<T *>(dz);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 z[i] = static_cast<T>(static_cast<float>(x[i]));
 }
 };

View File

@@ -153,7 +153,7 @@ namespace helpers {
 auto rowSize = sizeof(T) * colCount;
 auto func = PRAGMA_THREADS_FOR {
-for (auto n = start; n < stop; n += increment) {
+for (auto n = start; n < stop; n++) {
 int s = rowP->e<int>(n);
 int end = rowP->e<int>(n + 1);
 int shift = n * colCount;

View File

@@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra
 shape::calcOffsets(tadShapeInfo, offsets);
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 auto inBuff = input.bufferAsT<T>() + tadOffsets[i];
 auto outBuff = output.bufferAsT<T>() + tadOffsets[i];
@@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a
 const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo();
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 // FIXME: double!
 double x = input.e<double>(i);
 if (x < 0.0) {

View File

@@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];

View File

@@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];

View File

@@ -94,7 +94,7 @@ void bgemm_(const std::vector<NDArray*>& vA, const std::vector<NDArray*>& vB, st
 int vaSize = vA.size();
 auto func = PRAGMA_THREADS_FOR {
-for (auto p = start; p < stop; p += increment) {
+for (auto p = start; p < stop; p++) {
 auto A = reinterpret_cast<T *>(vA.at(p)->buffer());
 auto B = reinterpret_cast<T *>(vB.at(p)->buffer());
 auto C = reinterpret_cast<T *>(vC.at(p)->buffer());

View File

@@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
 Nd4jLong coords[MAX_RANK];
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 shape::index2coords(i, input->getShapeInfo(), coords);

View File

@@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
 int xLen = x.lengthOf();
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment)
+for (auto i = start; i < stop; i++)
 output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i));
 };

View File

@@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
 auto func = PRAGMA_THREADS_FOR {
 T *col, *im;
-for (uint b = start; b < stop; b += increment) {
+for (uint b = start; b < stop; b++) {
 T *im0 = imBuff + b * imStride0;
 T *col4 = colBuff + b * colStride0;
 for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {

View File

@@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
//
#include <ops/declarable/helpers/transforms.h>
#include <ops/specials.h>
namespace nd4j {
namespace ops {
namespace helpers {
//////////////////////////////////////////////////////////////////////////
template<typename T>
static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
}
void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
}
}
}
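
The new translation unit above only forwards to nd4j::SpecialMethods<T>::concatCpuGeneric, with BUILD_SINGLE_SELECTOR picking the template instantiation that matches the output's data type. A self-contained toy model of that dispatch pattern follows; the enum and function names in it are invented for the example and are not libnd4j APIs.

```cpp
// Illustrative only: a stand-alone model of a BUILD_SINGLE_SELECTOR-style dispatch.
// Dtype, doConcat and dispatchConcat are hypothetical names for this sketch.
#include <cstdio>
#include <stdexcept>

enum class Dtype { FLOAT32, DOUBLE, INT32 };

template <typename T>
static void doConcat() {                 // stands in for SpecialMethods<T>::concatCpuGeneric
    std::printf("concat instantiated for a %zu-byte element type\n", sizeof(T));
}

// One runtime switch selects the right template instantiation at the call site.
static void dispatchConcat(Dtype dt) {
    switch (dt) {
        case Dtype::FLOAT32: doConcat<float>();  break;
        case Dtype::DOUBLE:  doConcat<double>(); break;
        case Dtype::INT32:   doConcat<int>();    break;
        default: throw std::runtime_error("unsupported dtype");
    }
}

int main() {
    dispatchConcat(Dtype::FLOAT32);
    return 0;
}
```

Keeping this dispatch (and the explicit BUILD_SINGLE_TEMPLATE instantiations) in its own unit also means the concat helper is compiled once, separately from the other transform helpers, which is the point of splitting specials and concat apart.
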

View File

@@ -32,7 +32,7 @@ namespace helpers {
 int lLen = labels->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
-for (int j = start; j < stop; j += increment) {
+for (int j = start; j < stop; j++) {
 auto label = labels->e<Nd4jLong>(j);
 auto pred = predictions->e<Nd4jLong>(j);
 T value = (weights == nullptr ? (T) 1.0f : weights->e<T>(j));

View File

@@ -50,7 +50,7 @@ namespace nd4j {
 T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0);
 auto func = PRAGMA_THREADS_FOR {
-for (auto y = start; y < stop; y += increment) {
+for (auto y = start; y < stop; y++) {
 const float inY = (cropHeight > 1)
 ? y1 * (imageHeight - 1) + y * heightScale
 : 0.5 * (y1 + y2) * (imageHeight - 1);

View File

@@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray
 int tads = tadsA.size();
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto a_ = tadsA.at(e);
 auto b_ = tadsB.at(e);
 auto o_ = tadsO.at(e);

View File

@@ -46,7 +46,7 @@ namespace helpers {
 if (isNHWC) {
 const int total_count = batch_size * output_height * output_width * output_depth;
 auto func = PRAGMA_THREADS_FOR {
-for (auto out_idx = start; out_idx < stop; out_idx += increment) {
+for (auto out_idx = start; out_idx < stop; out_idx++) {
 const int d = out_idx % output_depth;
 const int out_idx2 = out_idx / output_depth;
 const int w = out_idx2 % output_width;
@@ -70,7 +70,7 @@ namespace helpers {
 const int total_count = batch_size * input_depth_by_input_area;
 auto func = PRAGMA_THREADS_FOR {
-for (int input_idx = start; input_idx < stop; input_idx += increment) {
+for (int input_idx = start; input_idx < stop; input_idx++) {
 const int n_bY_bX_oC_iY = input_idx / input_width;
 const int iX = input_idx - n_bY_bX_oC_iY * input_width;

View File

@@ -32,7 +32,7 @@ template <typename T>
 static void diGamma_(const NDArray& x, NDArray& z) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment)
+for (auto i = start; i < stop; i++)
 z.p(i, diGammaScalar<T>(x.e<T>(i)));
 };
 samediff::Threads::parallel_for(func, 0, x.lengthOf());

View File

@@ -35,7 +35,7 @@ namespace helpers {
 int inLen = input->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 float val = nodeRng.relativeT<T>(e, T(0.f), T(1.f));
 if (val < probValue)
@@ -130,7 +130,7 @@ namespace helpers {
 nd4j::graph::RandomGenerator nodeRng(3019L, seed);
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 float randVal = nodeRng.relativeT(e, T(0.f), T(1.f));
 float xVal = input->e<float>(e);
 output->p<float>(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1);

View File

@@ -62,7 +62,7 @@ namespace nd4j {
 unsigned int outSize = outputList.size();
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 outputs[i].first = outputList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)
@@ -168,7 +168,7 @@ namespace nd4j {
 unsigned int gradsSize = inputGradientList.size();
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 outputs[i].first = inputGradientList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)

View File

@@ -50,7 +50,7 @@ namespace helpers {
 colCast = 0;
 auto func = PRAGMA_THREADS_FOR {
-for (auto batch = 0; batch < stop; batch += increment) {
+for (auto batch = 0; batch < stop; batch++) {
 auto patch = listOfMatricies.at(batch);
 auto outMatrix = listOfOutputs.at(batch);

View File

@@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 if(input->rankOf() == 1 && output->rankOf() == 1) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment)
+for (auto i = start; i < stop; i++)
 output->p(i, input->e(indices->e<Nd4jLong>(i)));
 };
@@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 }
 else {
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);

View File

@@ -56,7 +56,7 @@ namespace nd4j {
 if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto _x = static_cast<unsigned long long>(xBuffer[e]);
 auto _y = static_cast<unsigned long long>(yBuffer[e]);
@@ -67,7 +67,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto _x = static_cast<unsigned long long>(xBuffer[e * xEws]);
 auto _y = static_cast<unsigned long long>(yBuffer[e * yEws]);
@@ -78,7 +78,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else {
 auto func = PRAGMA_THREADS_FOR {
-for (auto e = start; e < stop; e += increment) {
+for (auto e = start; e < stop; e++) {
 auto _x = static_cast<unsigned long long>(x.e<Nd4jLong>(e));
 auto _y = static_cast<unsigned long long>(y.e<Nd4jLong>(e));

View File

@@ -42,7 +42,7 @@ namespace nd4j {
 // we divide array into 32 element chunks, and store intermediate results once
 auto func = PRAGMA_THREADS_FOR {
-for (auto b = 0; b < stop; b += increment) {
+for (auto b = 0; b < stop; b++) {
 auto blockBuffer = buffer + b * numBlocks;
 Nd4jLong r = 1;
@@ -64,7 +64,7 @@ namespace nd4j {
 auto func2 = PRAGMA_THREADS_FOR {
-for (auto b = start; b < stop; b += increment) {
+for (auto b = start; b < stop; b++) {
 auto blockBuffer = tempBuffer + b * numBlocks;
 Nd4jLong r = 1;

View File

@@ -280,7 +280,7 @@ namespace helpers {
 int xsSize = xs.size();
 // Scale x interpolation weights to avoid a multiplication during iteration.
 auto func = PRAGMA_THREADS_FOR {
-for (auto i = start; i < stop; i += increment) {
+for (auto i = start; i < stop; i++) {
 xs[i]._bottomIndex *= channels;
 xs[i]._topIndex *= channels;
 }
@@ -906,7 +906,7 @@ namespace helpers {
 auto outputPtr = output->bufferAsT<float>(); // output is always float. TO DO: provide another float types also with template <typename X, typename Z> declaration
 auto batchProcess = PRAGMA_THREADS_FOR {
-for (auto batch = start; batch < stop; batch += increment) {
+for (auto batch = start; batch < stop; batch++) {
 for (auto y = 0; y < st.outHeight; ++y) {
 const float inY = y * st.heightScale;
 const float inY1 = (y + 1) * st.heightScale;
@@ -961,7 +961,7 @@ namespace helpers {
 if (Status::OK() == res) {
 std::vector<CachedInterpolation> xCached(st.outWidth);
 auto cachingProcedure = PRAGMA_THREADS_FOR {
-for (auto x = start; x < stop; x += increment) {
+for (auto x = start; x < stop; x++) {
 auto &xCache = xCached[x];
 const float inX = x * st.widthScale;
 const float inX1 = (x + 1) * st.widthScale;

View File

@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
'c' == output.ordering() && 1 == output.ews()){ 'c' == output.ordering() && 1 == output.ews()){
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const auto xStep = i*3; const auto xStep = i*3;
z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2]; z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2];
} }
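The 0.2989/0.5870/0.1140 weights are the ITU-R BT.601 luma coefficients, so the ews==1 branch is just a weighted sum over interleaved RGB triples. A self-contained sketch of the same conversion (buffer names are illustrative):
// Sketch only: BT.601 grayscale over a packed, 'c'-ordered RGB buffer, mirroring the fast branch above.
static void rgbToGray(const float* rgb, float* gray, Nd4jLong numPixels) {
    for (Nd4jLong i = 0; i < numPixels; i++) {
        const auto p = i * 3;
        gray[i] = 0.2989f * rgb[p] + 0.5870f * rgb[p + 1] + 0.1140f * rgb[p + 2];
    }
}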
@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
Nd4jLong coords[MAX_RANK]; Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords); shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords); const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
const Nd4jLong zDimCstride = output.stridesOf()[dimC]; const Nd4jLong zDimCstride = output.stridesOf()[dimC];
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const T* xTad = x + packX.platformOffsets()[i]; const T* xTad = x + packX.platformOffsets()[i];
T* zTad = z + packZ.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i];
op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
const Nd4jLong zDimCstride = output->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC];
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const T* xTad = x + packX.platformOffsets()[i]; const T* xTad = x + packX.platformOffsets()[i];
T* zTad = z + packZ.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i];
op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
const Nd4jLong zDimCstride = output->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC];
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const T* xTad = x + packX.platformOffsets()[i]; const T* xTad = x + packX.platformOffsets()[i];
T* zTad = z + packZ.platformOffsets()[i]; T* zTad = z + packZ.platformOffsets()[i];
//simple M*v //tr.T*v //simple M*v //tr.T*v

View File

@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int>
int span = (tads / num_threads) + 8; int span = (tads / num_threads) + 8;
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) { for (auto r = start; r < stop; r++) {
auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r]; auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r];
auto rZ = output->bufferAsT<Z>() + zOfsets[r]; auto rZ = output->bufferAsT<Z>() + zOfsets[r];

View File

@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
if(inTadEws == 1 && outTadEws == 1) { if(inTadEws == 1 && outTadEws == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i += increment) { for (uint i = start; i < stop; i++) {
const T *x = inBuff + inTadOffsets[i]; const T *x = inBuff + inTadOffsets[i];
T *y = outBuff + outTadOffsets[i]; T *y = outBuff + outTadOffsets[i];
@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
if(inTadEws == 1 && gradITadEws == 1) { if(inTadEws == 1 && gradITadEws == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i += increment) { for (uint i = start; i < stop; i++) {
const X *x = inBuff + inTadOffsets[i]; const X *x = inBuff + inTadOffsets[i];
Y *y = gradIBuff + gradITadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i];
@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
else { else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i += increment) { for (uint i = start; i < stop; i++) {
const X *x = inBuff + inTadOffsets[i]; const X *x = inBuff + inTadOffsets[i];
Y *y = gradIBuff + gradITadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i];

View File

@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
auto h_ = h->bufferAsT<T>(); auto h_ = h->bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint e = start; e < stop; e += increment) { for (uint e = start; e < stop; e++) {
c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]); h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
} }

View File

@ -45,7 +45,7 @@ namespace helpers {
auto n = shape::sizeAt(matrixShape, -1); auto n = shape::sizeAt(matrixShape, -1);
auto loop = PRAGMA_THREADS_FOR { auto loop = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
Nd4jLong theFirstPos[] = {theFirst, i}; Nd4jLong theFirstPos[] = {theFirst, i};
Nd4jLong theSecondPos[] = {theSecond, i}; Nd4jLong theSecondPos[] = {theSecond, i};
auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0);
@ -203,7 +203,7 @@ namespace helpers {
auto result = -1; auto result = -1;
//auto loop = PRAGMA_THREADS_FOR { //auto loop = PRAGMA_THREADS_FOR {
auto start = column, stop = rowNum, increment = 1; auto start = column, stop = rowNum, increment = 1;
for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { for (auto rowCounter = start; rowCounter < stop; rowCounter++) {
Nd4jLong xPos[] = {rowCounter, column}; Nd4jLong xPos[] = {rowCounter, column};
auto xIndex = shape::getOffset(compoundShape, xPos, 0); auto xIndex = shape::getOffset(compoundShape, xPos, 0);
if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) {
@ -221,7 +221,7 @@ namespace helpers {
Nd4jLong xDiag[] = {currentRow, currentRow}; Nd4jLong xDiag[] = {currentRow, currentRow};
auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); auto diagIndex = shape::getOffset(compoundShape, xDiag, 0);
auto loop = PRAGMA_THREADS_FOR { auto loop = PRAGMA_THREADS_FOR {
for (int j = start; j < stop; j += increment) { for (auto j = start; j < stop; j++) {
Nd4jLong xRow[] = {j, currentRow}; Nd4jLong xRow[] = {j, currentRow};
auto rowIndex = shape::getOffset(compoundShape, xRow, 0); auto rowIndex = shape::getOffset(compoundShape, xRow, 0);
compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i); compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i);
@ -310,7 +310,7 @@ namespace helpers {
permutations = permutationVectors->allTensorsAlongDimension({-1}); permutations = permutationVectors->allTensorsAlongDimension({-1});
auto loop = PRAGMA_THREADS_FOR { auto loop = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n);
} }
}; };

View File

@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) {
int lO = listOut.size(); int lO = listOut.size();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
for (int j = 0; j < lastDimension; ++j) for (int j = 0; j < lastDimension; ++j)
listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j)); listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j));
}; };

View File

@ -55,7 +55,7 @@ namespace helpers {
Nd4jLong oL = output->lengthOf(); Nd4jLong oL = output->lengthOf();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto row = rows.at(e); auto row = rows.at(e);
output->p(e, row->e<T>(n)); output->p(e, row->e<T>(n));
} }

View File

@ -49,7 +49,7 @@ namespace nd4j {
if (tadEws >= 1) { if (tadEws >= 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = 0; e < stop; e += increment) { for (auto e = 0; e < stop; e++) {
auto cO = output + tadPack.primaryOffsets()[e]; auto cO = output + tadPack.primaryOffsets()[e];
auto idx = static_cast<int>(indices[e]); auto idx = static_cast<int>(indices[e]);
@ -70,7 +70,7 @@ namespace nd4j {
samediff::Threads::parallel_tad(func, 0, numTads); samediff::Threads::parallel_tad(func, 0, numTads);
} else { } else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto cO = output + tadPack.primaryOffsets()[e]; auto cO = output + tadPack.primaryOffsets()[e];
auto idx = static_cast<int>(indices[e]); auto idx = static_cast<int>(indices[e]);

View File

@ -70,7 +70,7 @@ template <typename T>
static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const T order = n.e<T>(i); const T order = n.e<T>(i);
if(order != static_cast<int>(order)) // if order has fractional part then do not perform calculations and return NAN if(order != static_cast<int>(order)) // if order has fractional part then do not perform calculations and return NAN
output.p(i, std::numeric_limits<T>::quiet_NaN()); output.p(i, std::numeric_limits<T>::quiet_NaN());

View File

@ -113,7 +113,7 @@ namespace helpers {
ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
auto batching = PRAGMA_THREADS_FOR { auto batching = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; batch += increment) { for (auto batch = start; batch < stop; batch++) {
//qr here //qr here
qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies);
} }

View File

@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto
auto d = delta.e<T>(0); auto d = delta.e<T>(0);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
buff[i] = s + i * d; buff[i] = s + i * d;
}; };
samediff::Threads::parallel_for(func, 0, len); samediff::Threads::parallel_for(func, 0, len);

View File

@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
if (inArr == outArr) { if (inArr == outArr) {
if (inEWS == 1) { if (inEWS == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto idx = sLength - e; auto idx = sLength - e;
swap(inArr, e, idx); swap(inArr, e, idx);
} }
@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
} }
else if (inEWS > 1) { else if (inEWS > 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto idx1 = (sLength - e) * inEWS; auto idx1 = (sLength - e) * inEWS;
Nd4jLong idx2 = e * inEWS; Nd4jLong idx2 = e * inEWS;
swap(inArr, idx1, idx2); swap(inArr, idx1, idx2);
@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
else { else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer);
swap(outArr, inOffset, outOffset); swap(outArr, inOffset, outOffset);
@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (Nd4jLong e = start; e < stop; e += increment) for (Nd4jLong e = start; e < stop; e++)
outArr[sLength - e] = inArr[e]; outArr[sLength - e] = inArr[e];
}; };
samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);
if(inLength != numOfElemsToReverse) { if(inLength != numOfElemsToReverse) {
auto f2 = PRAGMA_THREADS_FOR { auto f2 = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) for (auto e = start; e < stop; e++)
outArr[e] = inArr[e]; outArr[e] = inArr[e];
}; };
samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) for (auto e = start; e < stop; e++)
outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; outArr[(sLength - e) * outEWS] = inArr[e * inEWS];
}; };
samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);
if(inLength != numOfElemsToReverse) { if(inLength != numOfElemsToReverse) {
auto f2 = PRAGMA_THREADS_FOR { auto f2 = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) for (auto e = start; e < stop; e++)
outArr[e * outEWS] = inArr[e * inEWS]; outArr[e * outEWS] = inArr[e * inEWS];
}; };
samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
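Both EWS branches share one contract: reverse the first numOfElemsToReverse elements into the output and copy the remaining tail through untouched, with sLength presumably equal to numOfElemsToReverse - 1. A plain-buffer sketch of that contract:
// Sketch only: reverse the first n elements of a length-L buffer, pass the
// tail through. Assumes sLength == n - 1, matching the indexing above.
template <typename T>
static void reverseFirstN(const T* in, T* out, Nd4jLong n, Nd4jLong L) {
    for (Nd4jLong e = 0; e < n; e++)
        out[(n - 1) - e] = in[e];
    for (Nd4jLong e = n; e < L; e++)
        out[e] = in[e];
}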
@ -125,7 +125,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
else { else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer);
outArr[outOffset] = inArr[inOffset]; outArr[outOffset] = inArr[inOffset];
@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
if(inLength != numOfElemsToReverse) { if(inLength != numOfElemsToReverse) {
auto f2 = PRAGMA_THREADS_FOR { auto f2 = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
auto outOffset = shape::getIndexOffset(e, outShapeBuffer); auto outOffset = shape::getIndexOffset(e, outShapeBuffer);
outArr[outOffset] = inArr[inOffset]; outArr[outOffset] = inArr[inOffset];

View File

@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
// loop through input array // loop through input array
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK]; Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords); shape::index2coords(i, output.getShapeInfo(), coords);
@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
// loop through output array // loop through output array
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK]; Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords); shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);

View File

@ -48,7 +48,7 @@ namespace helpers {
const int total_count = batch_size * input_height * input_width * input_depth; const int total_count = batch_size * input_height * input_width * input_depth;
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
// inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
const int d = inp_idx % input_depth; const int d = inp_idx % input_depth;
const int inp_idx2 = inp_idx / input_depth; const int inp_idx2 = inp_idx / input_depth;
@ -74,7 +74,7 @@ namespace helpers {
const int total_count = batch_size * output_depth_by_output_area; const int total_count = batch_size * output_depth_by_output_area;
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
const int n_iC_oY_bY_oX = inp_idx / block_size; const int n_iC_oY_bY_oX = inp_idx / block_size;
const int bX = inp_idx - n_iC_oY_bY_oX * block_size; const int bX = inp_idx - n_iC_oY_bY_oX * block_size;

View File

@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
Nd4jLong xCoords[MAX_RANK]; Nd4jLong xCoords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, xShapeInfo, xCoords); shape::index2coords(i, xShapeInfo, xCoords);
@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind
if(outRank == 1) { if(outRank == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
Nd4jLong idx = indices.e<Nd4jLong>(i); Nd4jLong idx = indices.e<Nd4jLong>(i);
NDArray out = output({idx, idx + 1}); NDArray out = output({idx, idx + 1});
@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind
std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0})); NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
NDArray updSubArr = updates(i, dimsToExcludeUpd); NDArray updSubArr = updates(i, dimsToExcludeUpd);
@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i
if(outRank == 1) { if(outRank == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
Nd4jLong idx = indices.e<Nd4jLong>(i); Nd4jLong idx = indices.e<Nd4jLong>(i);
NDArray out = output({idx, idx + 1}); NDArray out = output({idx, idx + 1});
@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
std::vector<Nd4jLong> idxRangeOut(2*outRank, 0); std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
NDArray indSubArr = indices(i, dimsToExcludeInd); NDArray indSubArr = indices(i, dimsToExcludeInd);
for (Nd4jLong j = 0; j < indLastDim; ++j) { for (Nd4jLong j = 0; j < indLastDim; ++j) {
@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr
if(!calcGrad) { if(!calcGrad) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto subArr = updates(i, dimsToExclude); auto subArr = updates(i, dimsToExclude);
output.p(i, subArr.e(indices.e<Nd4jLong>(i))); output.p(i, subArr.e(indices.e<Nd4jLong>(i)));
} }
@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr
samediff::Threads::parallel_for(func, 0, indicesLen); samediff::Threads::parallel_for(func, 0, indicesLen);
} else { } else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto subArr = updates(i, dimsToExclude); auto subArr = updates(i, dimsToExclude);
auto ind = indices.e<Nd4jLong>(i); auto ind = indices.e<Nd4jLong>(i);
subArr.p(ind, subArr.e(ind) - 1.); subArr.p(ind, subArr.e(ind) - 1.);

View File

@ -169,7 +169,7 @@ namespace helpers {
for (int i = 1; i < indices->lengthOf(); i++) { for (int i = 1; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e)); meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e));
} }
}; };
@ -223,7 +223,7 @@ namespace helpers {
for (int i = 0; i < indices->lengthOf(); i++) { for (int i = 0; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e)); sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e));
} }
}; };
@ -272,7 +272,7 @@ namespace helpers {
for (int i = 1; i < indices->lengthOf(); i++) { for (int i = 1; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e)); sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e));
} }
}; };
@ -625,7 +625,7 @@ namespace helpers {
Nd4jLong loop_size = input->lengthOf(); Nd4jLong loop_size = input->lengthOf();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto classNum = indices->e<Nd4jLong>(e); auto classNum = indices->e<Nd4jLong>(e);
if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6)) if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6))
output->p(e, gradOut->e<T>(classNum)); output->p(e, gradOut->e<T>(classNum));
@ -645,7 +645,7 @@ namespace helpers {
//std::vector<std::pair<NDArray*, int>> outputs(numOfClasses); //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
@ -675,7 +675,7 @@ namespace helpers {
segmentMinFunctor(context, input, indices, &tempRes); segmentMinFunctor(context, input, indices, &tempRes);
if (input->isVector()) { if (input->isVector()) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto classNum = indices->e<Nd4jLong>(e); auto classNum = indices->e<Nd4jLong>(e);
if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5) if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5)
output->p(e, gradOut->e<double>(classNum)); output->p(e, gradOut->e<double>(classNum));
@ -697,7 +697,7 @@ namespace helpers {
int pos = 0; int pos = 0;
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
@ -887,7 +887,7 @@ namespace helpers {
if (input->isVector()) { if (input->isVector()) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto classNum = indices->e<Nd4jLong>(e); auto classNum = indices->e<Nd4jLong>(e);
if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6) if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6)
output->t<T>(e) = gradOut->t<T>(classNum); output->t<T>(e) = gradOut->t<T>(classNum);
@ -1004,7 +1004,7 @@ namespace helpers {
unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes);
if (input->isVector()) { if (input->isVector()) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto classNum = indices->e<Nd4jLong>(e); auto classNum = indices->e<Nd4jLong>(e);
output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e)); output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e));
} }

View File

@ -364,7 +364,7 @@ namespace nd4j {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
T sneu1e[600]; T sneu1e[600];
for (auto t = start; t < stop; t += increment) { for (auto t = start; t < stop; t++) {
T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
memset(neu1e, 0, vectorLength * sizeof(T)); memset(neu1e, 0, vectorLength * sizeof(T));
@ -457,7 +457,7 @@ namespace nd4j {
T sneu1[600]; T sneu1[600];
T sneu1e[600]; T sneu1e[600];
for (int e = start; e < stop; e += increment) { for (int e = start; e < stop; e++) {
T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
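The sneu1/sneu1e arrays are a small-buffer optimization: per-iteration scratch space stays on the stack while the vector fits in 600 elements and falls back to the heap otherwise. The pattern in isolation (the matching free is assumed from the pattern; it is not part of the lines shown in this hunk):
// Sketch only: stack-first scratch buffer with heap fallback, per training element.
T sneu1e[600];
T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
memset(neu1e, 0, vectorLength * sizeof(T));
// ... accumulate into neu1e ...
if (vectorLength > 600)
    delete[] neu1e;   // only the heap path needs freeing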

View File

@ -40,7 +40,7 @@ namespace helpers {
output->assign(input); output->assign(input);
auto batchLoop = PRAGMA_THREADS_FOR { auto batchLoop = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; batch += increment) { for (auto batch = start; batch < stop; batch++) {
for (auto r = 0; r < rows; r++) { for (auto r = 0; r < rows; r++) {
for (auto c = 0; c < r; c++) { for (auto c = 0; c < r; c++) {
math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r)); math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));

View File

@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray
T* pCt = ct->bufferAsT<T>(); T* pCt = ct->bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto col = start; col < stop; col += increment) { for (auto col = start; col < stop; col++) {
const auto colNum = col % d2; const auto colNum = col % d2;
bool flip = colNum >= K; bool flip = colNum >= K;
T maskVal = mask ? *(pMask + col) : T(1); T maskVal = mask ? *(pMask + col) : T(1);
@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr
T* pGradInit = gradC0->bufferAsT<T>(); T* pGradInit = gradC0->bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto col = start; col < stop; col += increment) { for (auto col = start; col < stop; col++) {
T gbF = 0.f; T gbF = 0.f;
T gbR = 0.f; T gbR = 0.f;
const auto colNum = col % d2; const auto colNum = col % d2;

View File

@ -37,7 +37,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
int inSize = inArrs.size(); int inSize = inArrs.size();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
outArr->p<T>(i, inArrs[i]->t<T>(0)); outArr->p<T>(i, inArrs[i]->t<T>(0));
}; };
@ -50,7 +50,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
int listSize = list.size(); int listSize = list.size();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
list.at(i)->assign(inArrs[i]); list.at(i)->assign(inArrs[i]);
}; };
samediff::Threads::parallel_tad(func, 0, listSize); samediff::Threads::parallel_tad(func, 0, listSize);

View File

@ -150,7 +150,7 @@ namespace helpers {
result->assign(0); result->assign(0);
if (status == ND4J_STATUS_OK) { if (status == ND4J_STATUS_OK) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
bool found = false; bool found = false;
for (int j = 0; j < k; j++) { for (int j = 0; j < k; j++) {
if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) { if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {

View File

@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N
int dLen = dOdI.lengthOf(); int dLen = dOdI.lengthOf();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
if (dOdI.t<T>(i) != static_cast<T>(0.f)) if (dOdI.t<T>(i) != static_cast<T>(0.f))
dOdI.t<T>(i) = static_cast<T>(1.f); dOdI.t<T>(i) = static_cast<T>(1.f);
} }
@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) {
auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1});
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
output.p(i, setOfSubArrs.at(i)->getTrace()); output.p(i, setOfSubArrs.at(i)->getTrace());
}; };
samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); samediff::Threads::parallel_for(func, 0, setOfSubArrs.size());
@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK]; Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords); shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK]; Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords); shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK * 3]; Nd4jLong coords[MAX_RANK * 3];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
Nd4jLong *zCoordStart, *xCoordStart; Nd4jLong *zCoordStart, *xCoordStart;
if (yLastDim == xRank) { if (yLastDim == xRank) {
@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
else if (input->rankOf() == 1 && indices->isVector()) { else if (input->rankOf() == 1 && indices->isVector()) {
// special case // special case
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) for (auto e = start; e < stop; e++)
output->p(e, input->e<T>(indices->e<Nd4jLong>(e))); output->p(e, input->e<T>(indices->e<Nd4jLong>(e)));
}; };
@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrOut = (*output)(i, dimsOut);
NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis}); NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
subArrOut.assign(subArrIn); subArrOut.assign(subArrIn);
@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis});
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
NDArray subArrOut = (*output)(i, {axis}); NDArray subArrOut = (*output)(i, {axis});
NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); NDArray subArrIn = (*input)(intArgs[i + 1], {axis});
subArrOut.assign(subArrIn); subArrOut.assign(subArrIn);
@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) {
auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); auto arrs = output.allTensorsAlongDimension({rank-2, rank-1});
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
arrs.at(i)->setIdentity(); arrs.at(i)->setIdentity();
}; };
@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
indices.push_back((*intArgs)[e]); indices.push_back((*intArgs)[e]);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto inSubArr = input(indices[i], dimsToExclude, true); auto inSubArr = input(indices[i], dimsToExclude, true);
auto updSubArr = updates(i, dimsToExclude, true); auto updSubArr = updates(i, dimsToExclude, true);
@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input
case 6: { // copy case 6: { // copy
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto inSubArr = input(i, dimensions); auto inSubArr = input(i, dimensions);
inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i)); inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i));
} }
@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
auto x = inArrs[0]; auto x = inArrs[0];
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
T max = -DataTypeUtils::max<T>(); T max = -DataTypeUtils::max<T>();
Nd4jLong idx = 0; Nd4jLong idx = 0;
@ -839,7 +839,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto x = inArrs[0]; auto x = inArrs[0];
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
T max = -DataTypeUtils::max<T>(); T max = -DataTypeUtils::max<T>();
for (int i = 0; i < numArgs; i++) { for (int i = 0; i < numArgs; i++) {
T v = inArrs[i]->e<T>(e); T v = inArrs[i]->e<T>(e);
@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto x = inArrs[0]; auto x = inArrs[0];
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
T sum = 0.; T sum = 0.;
for (int i = 0; i < numArgs; i++) { for (int i = 0; i < numArgs; i++) {
T v = inArrs[i]->e<T>(e); T v = inArrs[i]->e<T>(e);
@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto x = inArrs[0]; auto x = inArrs[0];
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
T sum = (T) 0.f; T sum = (T) 0.f;
for (int i = 0; i < numArgs; i++) for (int i = 0; i < numArgs; i++)
sum += inArrs[i]->e<T>(e); sum += inArrs[i]->e<T>(e);
@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
const T iNormActual = norm2.e<T>(i); const T iNormActual = norm2.e<T>(i);
if (iNormActual > normClip) if (iNormActual > normClip)
*listOfInSubArrs.at(i) *= normClip / iNormActual; *listOfInSubArrs.at(i) *= normClip / iNormActual;
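This branch rescales each sub-array in place when its norm exceeds the clip value, i.e. x <- x * clip / ||x|| whenever ||x|| > clip. A self-contained sketch over a contiguous vector (assuming norm2 above holds the per-sub-array L2 norms):
#include <cmath>

// Sketch only: clip-by-norm on a plain vector, matching the in-place branch above.
static void clipByNormInPlace(float* x, Nd4jLong n, float clip) {
    float sumSq = 0.f;
    for (Nd4jLong i = 0; i < n; i++)
        sumSq += x[i] * x[i];
    const float norm = std::sqrt(sumSq);
    if (norm > clip) {
        const float scale = clip / norm;
        for (Nd4jLong i = 0; i < n; i++)
            x[i] *= scale;
    }
}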
@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
auto inputSubArr = listOfInSubArrs.at(i); auto inputSubArr = listOfInSubArrs.at(i);
auto outputSubArr = listOfOutSubArrs.at(i); auto outputSubArr = listOfOutSubArrs.at(i);
outputSubArr->assign(inputSubArr); outputSubArr->assign(inputSubArr);
@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g
auto cn = clipNorm.e<T>(0); auto cn = clipNorm.e<T>(0);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
T N = norm2.e<T>(i); T N = norm2.e<T>(i);
auto gradOSubArr = gradOSubArrs.at(i); auto gradOSubArr = gradOSubArrs.at(i);
@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
Nd4jLong inIdx[MAX_RANK]; Nd4jLong inIdx[MAX_RANK];
Nd4jLong outIdx[MAX_RANK]; Nd4jLong outIdx[MAX_RANK];
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), outIdx); shape::index2coords(i, output.getShapeInfo(), outIdx);
for (int j = 0; j < rank; ++j) { for (int j = 0; j < rank; ++j) {
@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES);
//////////////////////////////////////////////////////////////////////////
template<typename T>
static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
}
void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>

View File

@ -90,7 +90,7 @@ namespace helpers {
auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto outputPart = output->allTensorsAlongDimension({-2, -1});
auto batchLoop = PRAGMA_THREADS_FOR { auto batchLoop = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) { for (auto i = start; i < stop; i++) {
if (lower) { if (lower) {
lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]);
} else { } else {
@ -112,7 +112,7 @@ namespace helpers {
auto rows = input->sizeAt(-2); auto rows = input->sizeAt(-2);
auto batchLoop = PRAGMA_THREADS_FOR { auto batchLoop = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; batch += increment) { for (auto batch = start; batch < stop; batch++) {
if (!lower) { if (!lower) {
for (auto r = 0; r < rows; r++) { for (auto r = 0; r < rows; r++) {
for (auto c = 0; c <= r; c++) { for (auto c = 0; c <= r; c++) {

View File

@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray
int xLen = x.lengthOf(); int xLen = x.lengthOf();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) for (auto i = start; i < stop; i++)
z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i))); z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i)));
}; };

View File

@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND
int tads = tadsA.size(); int tads = tadsA.size();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
auto a_ = tadsA.at(e); auto a_ = tadsA.at(e);
auto b_ = tadsB.at(e); auto b_ = tadsB.at(e);
auto o_ = tadsO.at(e); auto o_ = tadsO.at(e);
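Each (a_, b_, o_) triple here is presumably a length-3 TAD, so the per-element body computes an ordinary vector cross product. For reference, a sketch of that operation:
// Sketch only: cross product of two length-3 vectors, o = a x b.
static void cross3(const float* a, const float* b, float* o) {
    o[0] = a[1] * b[2] - a[2] * b[1];
    o[1] = a[2] * b[0] - a[0] * b[2];
    o[2] = a[0] * b[1] - a[1] * b[0];
}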

View File

@ -69,7 +69,7 @@ namespace helpers {
} }
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e += increment) { for (auto e = start; e < stop; e++) {
values->p(e, static_cast<T>(valuesVector[e])); values->p(e, static_cast<T>(valuesVector[e]));
if (counts != nullptr) if (counts != nullptr)
counts->p(e, countsMap[valuesVector[e]]); counts->p(e, countsMap[valuesVector[e]]);

View File

@ -19,8 +19,10 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0);
BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
} }

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_double.hpp"
namespace nd4j { namespace nd4j {
BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8);

View File

@ -19,7 +19,7 @@
// @author raver119@gmail.com // @author raver119@gmail.com
// //
#include "../specials.hpp" #include "../specials_single.hpp"
namespace nd4j { namespace nd4j {
BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9);

View File

@ -34,7 +34,7 @@ namespace nd4j {
// handle transpose in parallel // handle transpose in parallel
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) { for (auto r = start; r < stop; r++) {
for (int c = 0; c < cols; c++) { for (int c = 0; c < cols; c++) {
int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c);
int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c);
@ -73,7 +73,7 @@ namespace nd4j {
C[r] = z; C[r] = z;
} else { } else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) for (auto r = start; r < stop; r++)
C[r] = z; C[r] = z;
}; };
samediff::Threads::parallel_for(func, 0, length); samediff::Threads::parallel_for(func, 0, length);
@ -130,7 +130,7 @@ namespace nd4j {
auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x; auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x;
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) { for (auto r = start; r < stop; r++) {
int aIdx = linearIndexC(M, N, r, 0); int aIdx = linearIndexC(M, N, r, 0);
auto aX = aT + aIdx; auto aX = aT + aIdx;

View File

@ -0,0 +1,270 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com, created on 07.10.2017.
// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include <pointercast.h>
#include <helpers/shape.h>
#include <helpers/TAD.h>
#include <specials.h>
#include <dll.h>
#include <NDArray.h>
#include <ops/declarable/CustomOperations.h>
#include <types/types.h>
#include <helpers/Loops.h>
namespace nd4j {
template<typename S, typename T>
void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
auto x = reinterpret_cast<S *>(dx);
auto z = reinterpret_cast<T *>(dz);
auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i++) {
z[i] = static_cast<T>(x[i]);
}
};
samediff::Threads::parallel_for(func, 0, N);
};
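convertGeneric is a parallel element-wise cast from S to T over N elements; the extras pointer is not touched by this CPU path. A usage sketch under the definition above (the <float, double> pair is one of the combinations instantiated by the BUILD_DOUBLE_TEMPLATE line added in this commit):
// Sketch only: cast four floats to doubles with the helper defined above.
float  src[4] = {1.f, 2.f, 3.f, 4.f};
double dst[4];
nd4j::SpecialTypeConverter::convertGeneric<float, double>(nullptr, src, 4, dst);
// dst now holds {1.0, 2.0, 3.0, 4.0}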
template <typename X, typename Y>
void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
int i = left, j = right;
X ktmp;
X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];
Y vtmp;
{
/* PARTITION PART */
while (i <= j) {
if (descending) {
while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
i++;
while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
} else {
while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
i++;
while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
}
}
}
//
if ( ((right-left)<cutoff) ){
if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
}else{
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
}
}
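// Same partition step, but pivoting on the values buffer instead of the keys.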
template <typename X, typename Y>
void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
int i = left, j = right;
X ktmp;
Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];
Y vtmp;
{
/* PARTITION PART */
while (i <= j) {
if (descending) {
while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
i++;
while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
} else {
while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
i++;
while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
}
}
}
//
if ( ((right-left)<cutoff) ){
if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
}else{
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
}
}
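// Wrappers that open the OpenMP parallel region: a single thread seeds the recursion
// (nowait), and the task construct inside the partition step distributes the resulting
// sub-ranges across the remaining threads.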
template <typename X, typename Y>
static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
auto array = reinterpret_cast<X *>(varray);
auto values = reinterpret_cast<Y *>(yarray);
int cutoff = 1000;
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
{
PRAGMA_OMP_SINGLE_ARGS(nowait)
{
quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
}
}
}
template <typename X, typename Y>
static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
auto array = reinterpret_cast<X *>(varray);
auto values = reinterpret_cast<Y *>(yarray);
int cutoff = 1000;
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
{
PRAGMA_OMP_SINGLE_ARGS(nowait)
{
quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
}
}
}
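// Public DoubleMethods entry points: full-array sorts keyed either on the first buffer
// (sortByKey) or on the second (sortByValue), using all available OpenMP threads.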
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
}
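// TAD-wise variants: build the TAD packs for the requested dimensions once, then sort
// each TAD independently, one thread per TAD, with the inner quicksort forced to run
// single-threaded.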
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
auto x = reinterpret_cast<X*>(vx);
auto y = reinterpret_cast<Y*>(vy);
auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
auto xLength = shape::length(xShapeInfo);
auto xTadLength = shape::length(packX.primaryShapeInfo());
auto numTads = packX.numberOfTads();
auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r++) {
auto dx = x + packX.primaryOffsets()[r];
auto dy = y + packY.primaryOffsets()[r];
quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
}
};
samediff::Threads::parallel_tad(func, 0, numTads);
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
auto x = reinterpret_cast<X*>(vx);
auto y = reinterpret_cast<Y*>(vy);
auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
auto xLength = shape::length(xShapeInfo);
auto xTadLength = shape::length(packX.primaryShapeInfo());
auto numTads = packX.numberOfTads();
auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r++) {
auto dx = x + packX.primaryOffsets()[r];
auto dy = y + packY.primaryOffsets()[r];
quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
}
};
samediff::Threads::parallel_tad(func, 0, numTads);
}
}
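For readers unfamiliar with the cutoff pattern used above, here is a compact, self-contained illustration in plain OpenMP on a contiguous int array. It deliberately drops libnd4j's shape-info addressing and key/value pairing, so it is a sketch of the recursion shape only, not of this file's API.

#include <algorithm>
#include <cstdio>

// Serial partition + recursion below the cutoff, OpenMP tasks above it,
// mirroring quickSort_parallel_internal_key/_value in the file above.
// Compile with -fopenmp; without it the pragmas are ignored and the sort runs serially.
static void quicksort_tasks(int *a, int left, int right, int cutoff) {
    int i = left, j = right;
    const int pivot = a[(left + right) / 2];
    while (i <= j) {
        while (a[i] < pivot) i++;
        while (a[j] > pivot) j--;
        if (i <= j) std::swap(a[i++], a[j--]);
    }
    if (right - left < cutoff) {
        if (left < j)  quicksort_tasks(a, left, j, cutoff);
        if (i < right) quicksort_tasks(a, i, right, cutoff);
    } else {
        #pragma omp task
        quicksort_tasks(a, left, j, cutoff);
        #pragma omp task
        quicksort_tasks(a, i, right, cutoff);
    }
}

int main() {
    int data[] = {9, 3, 7, 1, 8, 2, 6, 5, 4, 0};
    #pragma omp parallel
    {
        // One thread seeds the recursion; spawned tasks finish at the region's implicit barrier.
        #pragma omp single nowait
        quicksort_tasks(data, 0, 9, 4);
    }
    for (int v : data) std::printf("%d ", v);
    std::printf("\n");
    return 0;
}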

View File

@ -64,7 +64,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
T* outBuff = output.bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR {
-    for (auto r = start; r < stop; r += increment) {
+    for (auto r = start; r < stop; r++) {
const Nd4jLong arrLen = inArrs[r]->lengthOf();
const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
@ -99,7 +99,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
}
auto func = PRAGMA_THREADS_FOR {
-    for (auto i = start; i < stop; i += increment) {
+    for (auto i = start; i < stop; i++) {
auto temp = output(indices[i], true);
nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
}
@ -143,7 +143,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
auto x = reinterpret_cast<T **>(vx);
auto func = PRAGMA_THREADS_FOR {
-    for (auto i = start; i < stop; i += increment) {
+    for (auto i = start; i < stop; i++) {
for (auto ar = 0L; ar < n; ar++) {
z[i] += x[ar][i];
}
@ -179,7 +179,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
}
auto func = PRAGMA_THREADS_FOR {
-    for (auto i = start; i < stop; i += increment) {
+    for (auto i = start; i < stop; i++) {
for (Nd4jLong ar = 1; ar < n; ar++) {
z[i] += x[ar][i] / static_cast<T>(n);
}
@ -199,7 +199,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
// aggregation step
auto func = PRAGMA_THREADS_FOR {
-    for (auto i = start; i < stop; i += increment) {
+    for (auto i = start; i < stop; i++) {
for (Nd4jLong ar = 0; ar < n; ar++) {
z[i] += x[ar][i] / static_cast<T>(n);
}
@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
int numTads = xLength / xTadLength;
auto func = PRAGMA_THREADS_FOR {
-    for (auto r = start; r < stop; r += increment) {
+    for (auto r = start; r < stop; r++) {
T *dx = x + tadOffsets[r];
quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending);
@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
auto func = PRAGMA_THREADS_FOR {
-    for (auto e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
for (int bitId = 0; bitId < 16; bitId++) {
bool hasBit = (x[e] & 1 << (bitId)) != 0;
bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
samediff::Threads::parallel_for(func, 4, lim);
}
template<typename S, typename T>
void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
auto x = reinterpret_cast<S *>(dx);
auto z = reinterpret_cast<T *>(dz);
auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i += increment) {
z[i] = static_cast<T>(x[i]);
}
};
samediff::Threads::parallel_for(func, 0, N);
};
BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
template<typename T>
Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
auto dx = reinterpret_cast<T *>(vx);
@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
};
return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
}
template <typename X, typename Y>
void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
int i = left, j = right;
X ktmp;
X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];
Y vtmp;
{
/* PARTITION PART */
while (i <= j) {
if (descending) {
while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
i++;
while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
} else {
while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
i++;
while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
}
}
}
//
if ( ((right-left)<cutoff) ){
if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
}else{
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
}
}
template <typename X, typename Y>
void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
int i = left, j = right;
X ktmp;
Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];
Y vtmp;
{
/* PARTITION PART */
while (i <= j) {
if (descending) {
while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
i++;
while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
} else {
while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
i++;
while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
j--;
if (i <= j) {
ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
i++;
j--;
}
}
}
}
//
if ( ((right-left)<cutoff) ){
if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
}else{
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
PRAGMA_OMP_TASK
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
}
}
template <typename X, typename Y>
static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
auto array = reinterpret_cast<X *>(varray);
auto values = reinterpret_cast<Y *>(yarray);
int cutoff = 1000;
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
{
PRAGMA_OMP_SINGLE_ARGS(nowait)
{
quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
}
}
}
template <typename X, typename Y>
static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
auto array = reinterpret_cast<X *>(varray);
auto values = reinterpret_cast<Y *>(yarray);
int cutoff = 1000;
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
{
PRAGMA_OMP_SINGLE_ARGS(nowait)
{
quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
}
}
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
auto x = reinterpret_cast<X*>(vx);
auto y = reinterpret_cast<Y*>(vy);
auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
auto xLength = shape::length(xShapeInfo);
auto xTadLength = shape::length(packX.primaryShapeInfo());
auto numTads = packX.numberOfTads();
auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) {
auto dx = x + packX.primaryOffsets()[r];
auto dy = y + packY.primaryOffsets()[r];
quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
}
};
samediff::Threads::parallel_tad(func, 0, numTads);
}
template <typename X, typename Y>
void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
auto x = reinterpret_cast<X*>(vx);
auto y = reinterpret_cast<Y*>(vy);
auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
auto xLength = shape::length(xShapeInfo);
auto xTadLength = shape::length(packX.primaryShapeInfo());
auto numTads = packX.numberOfTads();
auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r += increment) {
auto dx = x + packX.primaryOffsets()[r];
auto dy = y + packY.primaryOffsets()[r];
quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
}
};
samediff::Threads::parallel_tad(func, 0, numTads);
}
//BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES);
//BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES);
}

View File

@ -167,7 +167,7 @@ namespace randomOps {
if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) {
auto func = PRAGMA_THREADS_FOR {
-    for (uint64_t e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
T prob = rng->relativeT<T>(e);
T cumProb = (T) 0.0f;
for (Nd4jLong f = 0; f < yLength; f++) {
@ -330,7 +330,7 @@ namespace randomOps {
const T epsilon = static_cast<T>(1e-5);
auto func = PRAGMA_THREADS_FOR {
-    for (uint64_t e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
auto epm = e + middle;
// we need to get random values
@ -440,7 +440,7 @@ namespace randomOps {
nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
auto func = PRAGMA_THREADS_FOR {
-    for (Nd4jLong e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
int success = 0;
for (int t = 1; t <= trials; t++) {
@ -549,7 +549,7 @@ namespace randomOps {
//nd4j::random::RandomBuffer *buffer = reinterpret_cast<nd4j::random::RandomBuffer *> (state);
nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
auto func = PRAGMA_THREADS_FOR {
-    for (uint64_t e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
int success = 0;
for (int t = 1; t <= trials; t++) {
@ -690,7 +690,7 @@ namespace randomOps {
const T epsilon = static_cast<T>(1e-5);
auto func = PRAGMA_THREADS_FOR {
-    for (uint64_t e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
if (z[e] > mean + ds || z[e] < mean - ds) {
z[e] = step(rng, mean, stddev, e, middle, z[e]);
@ -818,7 +818,7 @@ namespace randomOps {
auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD
-    for (uint64_t e = start; e < stop; e += increment) {
+    for (auto e = start; e < stop; e++) {
auto epm = e + middle;
// we need to get random values