Minor improvements (#255)

* static increments in loops
* specials and concat split into separate units

Signed-off-by: raver119 <raver119@gmail.com>

branch: master
parent d9058b469a
commit 215641ea9e
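The first half of this commit replaces strided loop increments (`e += increment`) with plain unit increments (`e++`) in every `PRAGMA_THREADS_FOR` body below. That rewrite is only valid if the threading layer hands each worker a contiguous `[start, stop)` chunk rather than an interleaved stride, which appears to be the contract this commit relies on; a unit-stride loop is also easier for the compiler to vectorize. A minimal sketch of that partitioning contract, with illustrative names standing in for `samediff::Threads::parallel_for` (not the actual libnd4j API):

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Hypothetical stand-in for the real parallel_for: [0, n) is split into
// contiguous chunks, one per thread. Because each chunk is contiguous,
// the worker loop can advance with i++ instead of a runtime increment --
// which is exactly the change applied throughout this commit.
static void parallelForSketch(const std::function<void(int64_t, int64_t)>& body,
                              int64_t n, int numThreads) {
    std::vector<std::thread> pool;
    const int64_t chunk = (n + numThreads - 1) / numThreads;
    for (int t = 0; t < numThreads; t++) {
        const int64_t start = t * chunk;
        const int64_t stop = std::min(n, start + chunk);
        if (start >= stop)
            break;
        pool.emplace_back(body, start, stop);
    }
    for (auto& th : pool)
        th.join();
}

int main() {
    std::vector<float> data(1024, 1.0f);
    // Unit-stride body, mirroring the post-change PRAGMA_THREADS_FOR loops.
    parallelForSketch([&](int64_t start, int64_t stop) {
        for (auto i = start; i < stop; i++)
            data[i] *= 2.0f;
    }, static_cast<int64_t>(data.size()), 4);
    return 0;
}
```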
@@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dataType == DataType::UTF16) {
                 unicode::utf8to16(string[e], cdata, std::char_traits<char>::length(string[e]));

@@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::stri
     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dataType == DataType::UTF16) {
                 unicode::utf8to16(string[e].data(), cdata, string[e].size());

@@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u16s
     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t));

@@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha


     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 memcpy(cdata, string[e], std::char_traits<char16_t>::length(string[e]) * sizeof(uint16_t));

@@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u32s
     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 unicode::utf32to16(string[e].data(), cdata, string[e].size());

@@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

     auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 unicode::utf32to16(string[e], cdata, std::char_traits<char32_t>::length(string[e]));

@@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const {
     const auto inData = bufferAsT<int8_t>() + offsetsLength;

     auto func = PRAGMA_THREADS_FOR{
-        for (int e = start; e < stop; e += increment) {
+        for (int e = start; e < stop; e++) {
             auto cdata = outData + offsets[e];
             auto end = nInputoffsets[e + 1];
             auto idata = inData + nInputoffsets[e];

@@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const {
     std::vector<std::string> strings(lengthOf());

     auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             strings[i] = std::move(this->e<std::string>(i));
         }
     };

@@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const {
     std::vector<std::u16string> strings(lengthOf());

     auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             strings[i] = std::move(this->e<std::u16string>(i));
         }
     };

@@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const {

     std::vector<std::u32string> strings(lengthOf());
     auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             strings[i] = std::move(this->e<std::u32string>(i));
         }
     };
@@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t

     auto func = PRAGMA_THREADS_FOR {
         Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             shape::index2coords(i, target.getShapeInfo(), coords);
             const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);


@@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) {
     auto y = reinterpret_cast<T *>(yBuffer);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto temp = x[i];
             x[i] = y[i];
             y[i] = temp;

@@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     if(result.ordering() == 'c') {           // ews == 1 always here

         auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES);
             }

@@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     else {

         auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                 auto xOffset = result.getOffset(i);
                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES);

@@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
     // loop through input array
     auto func = PRAGMA_THREADS_FOR {
         Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             shape::index2coords(i, output.getShapeInfo(), coords);

             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
@@ -22,7 +22,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
     if (this->ordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                 z[e] = func(f[e], s[e], t[e]);
         };


@@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
     if (f == z) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto tOffset = this->getOffset(e);
                 auto uOffset = second.getOffset(e);
                 auto vOffset = third.getOffset(e);

@@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
     } else {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto tOffset = this->getOffset(e);
                 auto uOffset = second.getOffset(e);
                 auto vOffset = third.getOffset(e);

@@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                 z[e] = func(f[e], s[e]);
         };


@@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
     if (f == z) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto yOffset = other.getOffset(e);


@@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
     } else {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto yOffset = other.getOffset(e);
                 auto zOffset = target.getOffset(e);

@@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                 z[e] = func(f[e]);
         };


@@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
     if (f == z) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);

                 f[xOffset] = func(f[xOffset]);

@@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
     } else {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto zOffset = target.getOffset(e);


@@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                 z[e] = func(e, f[e]);
         };


@@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
     if (f == z) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);

                 f[xOffset] = func(e, f[xOffset]);

@@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
     } else {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto zOffset = target.getOffset(e);


@@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                 z[e] = func((Nd4jLong) e, f[e], s[e]);
         };


@@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
     if (f == z) {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto yOffset = other.getOffset(e);


@@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
     } else {

         auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 auto xOffset = this->getOffset(e);
                 auto yOffset = other.getOffset(e);
                 auto zOffset = target.getOffset(e);
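Every `apply*Lambda` hunk above follows the same two-path shape: a dense fast path (`z[e] = func(...)`) when orderings match and all operands have element-wise stride (ews) 1, and a slower path that resolves a per-element offset via `getOffset(e)` otherwise. A self-contained sketch of that dispatch, with a plain constant stride standing in for the real offset resolution (names are illustrative):

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Sketch of the fast-path/slow-path split used by the apply*Lambda helpers.
// ews == 1 means the buffer is dense, so logical indices map 1:1 to offsets;
// otherwise each index has to be translated (here, trivially, by a stride).
template <typename T>
void applyLambdaSketch(std::vector<T>& buf, int64_t ews,
                       const std::function<T(T)>& func) {
    const int64_t n = static_cast<int64_t>(buf.size()) / (ews > 0 ? ews : 1);
    if (ews == 1) {
        for (int64_t e = 0; e < n; e++)      // dense: plain unit-stride walk
            buf[e] = func(buf[e]);
    } else {
        for (int64_t e = 0; e < n; e++) {    // strided: resolve offset per element
            const int64_t off = e * ews;     // stands in for getOffset(e)
            buf[off] = func(buf[off]);
        }
    }
}
```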
@@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx,
     _threads = nd4j::math::nd4j_min<int>(_threads, nd4j::Environment::getInstance()->maxThreads());

     auto func = PRAGMA_THREADS_FOR {
-        for (auto idx = start; idx < stop; idx += increment) {
+        for (auto idx = start; idx < stop; idx++) {
             auto xTadOffsetForBlock = tadOffsets[indexes[idx]];
             auto zTadOffsetForBlock = zTadOffsets[idx];


@@ -1356,7 +1356,7 @@ void tearGeneric(void *vx,
     auto numTads = shape::length(hXShapeInfo) / tadLength;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto hZ = reinterpret_cast<T *>(targets[i]);
             auto s = hX + tadOffsets[i];


@@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
     auto dZ = reinterpret_cast<T **>(dz);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto f = start; f < stop; f += increment) {
+        for (auto f = start; f < stop; f++) {
             auto hX = reinterpret_cast<T *>(dX[f]);
             //auto hZ = reinterpret_cast<T *>(dZ[f]);

@@ -52,7 +52,7 @@ namespace nd4j {
     TypeCast::convertGeneric<T2, T>(nullptr, tmp, length, buffer);
 #else
     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment)
+        for (auto e = start; e < stop; e++)
             buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
     };


@@ -110,7 +110,7 @@ namespace nd4j {
     TypeCast::convertGeneric<float, T>(nullptr, tmp, length, buffer);
 #else
     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment)
+        for (auto e = start; e < stop; e++)
             buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
     };


@@ -138,7 +138,7 @@ namespace nd4j {

 #else
     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment)
+        for (auto e = start; e < stop; e++)
             buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
     };


@@ -164,7 +164,7 @@ namespace nd4j {
     TypeCast::convertGeneric<float16, T>(nullptr, tmp, length, buffer);
 #else
     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment)
+        for (auto e = start; e < stop; e++)
             buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
     };

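The repeated `canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(...)` expression in these hunks is an endianness fix-up applied during deserialization: values are kept as-is when the stored byte order matches the host, and byte-reversed otherwise. A hedged sketch of what a `swap_bytes`-style helper does (assumed semantics, not the actual libnd4j implementation):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Reverse the bytes of a scalar, e.g. to convert a big-endian value read
// from disk into the host's little-endian representation (or vice versa).
template <typename T>
T swapBytesSketch(T value) {
    uint8_t raw[sizeof(T)];
    std::memcpy(raw, &value, sizeof(T));
    for (std::size_t i = 0; i < sizeof(T) / 2; i++) {
        const uint8_t tmp = raw[i];
        raw[i] = raw[sizeof(T) - 1 - i];
        raw[sizeof(T) - 1 - i] = tmp;
    }
    T out;
    std::memcpy(&out, raw, sizeof(T));
    return out;
}
```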
@@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
         case nd4j::LoopKind::EWS1: {

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
         case nd4j::LoopKind::EWSNONZERO: {

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
         case nd4j::LoopKind::RANK1: {

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             shape::updateStrides(2, tadShape, newStride, 'c');

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             shape::updateStrides(3, tadShape, newStride, 'c');

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             shape::updateStrides(4, tadShape, newStride, 'c');

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             shape::updateStrides(5, tadShape, newStride, 'c');

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo);

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);


@@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
             const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo, castZShapeInfo);

             auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                     auto tad = const_cast<X *>(x) + tadOffsets[i];
                     auto indexValue = OpType::startingIndexValue(tad);

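All ten hunks above sit inside `loopIndexReduce`'s switch over `nd4j::LoopKind` (EWS1, EWSNONZERO, fixed ranks, and shape-info-casting fallbacks): each case picks the cheapest index-to-offset rule the TAD layout allows and then runs the same reduction body. A compressed sketch of that dispatch idea, with hypothetical names and only the two simplest cases spelled out:

```cpp
#include <cstdint>

// Illustrative reduction of the LoopKind dispatch: choose the cheapest
// index->offset rule, then run an identical reduction body. The real code
// also handles fixed ranks 1..5 and generic casted shape-info walks.
enum class LoopKindSketch { EWS1, EWSNONZERO, GENERIC };

template <typename X, typename Acc>
Acc indexReduceSketch(const X* tad, int64_t len, int64_t ews, LoopKindSketch kind,
                      Acc acc, Acc (*update)(Acc, X, int64_t)) {
    switch (kind) {
        case LoopKindSketch::EWS1:            // dense: offset == index
            for (int64_t i = 0; i < len; i++)
                acc = update(acc, tad[i], i);
            break;
        case LoopKindSketch::EWSNONZERO:      // constant stride: one multiply
            for (int64_t i = 0; i < len; i++)
                acc = update(acc, tad[i * ews], i);
            break;
        default:                              // full coordinate walk, elided here
            break;
    }
    return acc;
}
```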
@@ -80,7 +80,7 @@ namespace nd4j {
     int nLen = zArr.lengthOf() / yArr.sizeAt(-1);

     auto func = PRAGMA_THREADS_FOR{
-        for (uint32_t total = start; total < stop; total += increment) {
+        for (uint32_t total = start; total < stop; total++) {

             uint32_t i = total / zDim1;
             uint32_t j = total % zDim1;
@@ -73,7 +73,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
     auto func = PRAGMA_THREADS_FOR {
         intermediatery[thread_id] = OpType::startingIndexValue(x);

-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             IndexValue<X> curr(x[i], i);
             intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
         }

@@ -88,7 +88,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
     auto func = PRAGMA_THREADS_FOR {
         intermediatery[thread_id] = OpType::startingIndexValue(x);

-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             IndexValue<X> curr(x[offset], i);
             intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
@@ -75,7 +75,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
         }

@@ -93,7 +93,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
             z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);

@@ -111,7 +111,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
             z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);

@@ -129,7 +129,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
             z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);

@@ -149,7 +149,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
             auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);

@@ -197,7 +197,7 @@ namespace functions {
     else{
         auto func = PRAGMA_THREADS_FOR {
             PRAGMA_OMP_SIMD
-            for (uint64_t i = start; i < stop; i += increment) {
+            for (uint64_t i = start; i < stop; i++) {
                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                 z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
             }

@@ -213,7 +213,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
             z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);

@@ -255,7 +255,7 @@ namespace functions {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t i = start; i < stop; i += increment) {
+        for (uint64_t i = start; i < stop; i++) {
             auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
             z[offset] = OpClass::op(i, length, rng, extraArguments);
         }
@@ -88,7 +88,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,

     if (kindOfLoop == nd4j::LoopKind::EWS1) {
         auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
             }
         };

@@ -98,7 +98,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
     } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {

         auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
             }

@@ -110,7 +110,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
     const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
             auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
             intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
@@ -158,7 +158,7 @@ namespace functions {
     const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeShapeInfo, tadShapeShapeInfoCast);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {

             auto tadOffsetForBlock = tadPack.primaryOffsets()[r];
             auto tx = x + tadOffsetForBlock;
@@ -81,7 +81,7 @@ namespace nd4j {

     // now we actually apply quantization
     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte));
         }
     };

@@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
     int flimit = limit + 4;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             int el = x[e];
             int ael = nd4j::math::nd4j_abs<int>(el) - 1;
             z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold);

@@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
     auto z = reinterpret_cast<T *>(dz);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             z[i] = static_cast<T>(static_cast<float>(x[i]));
         }
     };
@@ -153,7 +153,7 @@ namespace helpers {
     auto rowSize = sizeof(T) * colCount;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto n = start; n < stop; n += increment) {
+        for (auto n = start; n < stop; n++) {
             int s = rowP->e<int>(n);
             int end = rowP->e<int>(n + 1);
             int shift = n * colCount;
@@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra
     shape::calcOffsets(tadShapeInfo, offsets);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto inBuff = input.bufferAsT<T>() + tadOffsets[i];
             auto outBuff = output.bufferAsT<T>() + tadOffsets[i];


@@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a
     const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             // FIXME: double!
             double x = input.e<double>(i);
             if (x < 0.0) {
@@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr
     const Nd4jLong zDimCstride = output->stridesOf()[dimC];

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {

             const T *xTad = x + packX.platformOffsets()[i];
             T *zTad = z + packZ.platformOffsets()[i];

@@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA
     const Nd4jLong zDimCstride = output->stridesOf()[dimC];

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             const T *xTad = x + packX.platformOffsets()[i];
             T *zTad = z + packZ.platformOffsets()[i];

@@ -94,7 +94,7 @@ void bgemm_(const std::vector<NDArray*>& vA, const std::vector<NDArray*>& vB, st
     int vaSize = vA.size();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto p = start; p < stop; p += increment) {
+        for (auto p = start; p < stop; p++) {
             auto A = reinterpret_cast<T *>(vA.at(p)->buffer());
             auto B = reinterpret_cast<T *>(vB.at(p)->buffer());
             auto C = reinterpret_cast<T *>(vC.at(p)->buffer());
@@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray

         Nd4jLong coords[MAX_RANK];

-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {

             shape::index2coords(i, input->getShapeInfo(), coords);

@@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
     int xLen = x.lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
             output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i));
     };

@@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
     auto func = PRAGMA_THREADS_FOR {
         T *col, *im;

-        for (uint b = start; b < stop; b += increment) {
+        for (uint b = start; b < stop; b++) {
             T *im0 = imBuff + b * imStride0;
             T *col4 = colBuff + b * colStride0;
             for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+//
+
+
+#include <ops/declarable/helpers/transforms.h>
+#include <ops/specials.h>
+
+namespace nd4j {
+namespace ops {
+namespace helpers {
+
+//////////////////////////////////////////////////////////////////////////
+template<typename T>
+static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
+    nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
+}
+
+void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
+    BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
+}
+
+BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
+}
+}
+}
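This new translation unit is the "specials and concat split into separate units" half of the commit: the helper now simply forwards to `SpecialMethods<T>::concatCpuGeneric`, with `BUILD_SINGLE_SELECTOR` expanding into a runtime dtype dispatch over `LIBND4J_TYPES`. Roughly what that macro buys, sketched by hand with hypothetical names and just two types:

```cpp
#include <cstdint>

// Hand-written stand-in for the BUILD_SINGLE_SELECTOR pattern: a runtime
// dtype tag selects the matching template instantiation. The real macro
// enumerates every type in LIBND4J_TYPES; two cases show the shape.
enum class DTypeSketch { FLOAT32, INT64 };

template <typename T>
static void concatSketch(/* const std::vector<NDArray*>& inArrs, ... */) {
    // would forward to SpecialMethods<T>::concatCpuGeneric(...)
}

static void concatDispatchSketch(DTypeSketch dt) {
    switch (dt) {
        case DTypeSketch::FLOAT32: concatSketch<float>();   break;
        case DTypeSketch::INT64:   concatSketch<int64_t>(); break;
    }
}
```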
@@ -32,7 +32,7 @@ namespace helpers {
     int lLen = labels->lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        for (int j = start; j < stop; j += increment) {
+        for (int j = start; j < stop; j++) {
             auto label = labels->e<Nd4jLong>(j);
             auto pred = predictions->e<Nd4jLong>(j);
             T value = (weights == nullptr ? (T) 1.0f : weights->e<T>(j));
@@ -50,7 +50,7 @@ namespace nd4j {
     T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto y = start; y < stop; y += increment) {
+        for (auto y = start; y < stop; y++) {
             const float inY = (cropHeight > 1)
                               ? y1 * (imageHeight - 1) + y * heightScale
                               : 0.5 * (y1 + y2) * (imageHeight - 1);
@@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray
     int tads = tadsA.size();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto a_ = tadsA.at(e);
             auto b_ = tadsB.at(e);
             auto o_ = tadsO.at(e);
@@ -46,7 +46,7 @@ namespace helpers {
     if (isNHWC) {
         const int total_count = batch_size * output_height * output_width * output_depth;
         auto func = PRAGMA_THREADS_FOR {
-            for (auto out_idx = start; out_idx < stop; out_idx += increment) {
+            for (auto out_idx = start; out_idx < stop; out_idx++) {
                 const int d = out_idx % output_depth;
                 const int out_idx2 = out_idx / output_depth;
                 const int w = out_idx2 % output_width;

@@ -70,7 +70,7 @@ namespace helpers {
     const int total_count = batch_size * input_depth_by_input_area;

     auto func = PRAGMA_THREADS_FOR {
-        for (int input_idx = start; input_idx < stop; input_idx += increment) {
+        for (int input_idx = start; input_idx < stop; input_idx++) {
             const int n_bY_bX_oC_iY = input_idx / input_width;
             const int iX = input_idx - n_bY_bX_oC_iY * input_width;

@@ -32,7 +32,7 @@ template <typename T>
 static void diGamma_(const NDArray& x, NDArray& z) {

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
             z.p(i, diGammaScalar<T>(x.e<T>(i)));
     };
     samediff::Threads::parallel_for(func, 0, x.lengthOf());
@@ -35,7 +35,7 @@ namespace helpers {
     int inLen = input->lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             float val = nodeRng.relativeT<T>(e, T(0.f), T(1.f));

             if (val < probValue)

@@ -130,7 +130,7 @@ namespace helpers {
     nd4j::graph::RandomGenerator nodeRng(3019L, seed);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             float randVal = nodeRng.relativeT(e, T(0.f), T(1.f));
             float xVal = input->e<float>(e);
             output->p<float>(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1);
@@ -62,7 +62,7 @@ namespace nd4j {
     unsigned int outSize = outputList.size();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             outputs[i].first = outputList[i];
             outputs[i].second = 0;
             for (int e = 0; e < indices->lengthOf(); ++e)

@@ -168,7 +168,7 @@ namespace nd4j {
     unsigned int gradsSize = inputGradientList.size();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             outputs[i].first = inputGradientList[i];
             outputs[i].second = 0;
             for (int e = 0; e < indices->lengthOf(); ++e)
@@ -50,7 +50,7 @@ namespace helpers {
     colCast = 0;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto batch = 0; batch < stop; batch += increment) {
+        for (auto batch = 0; batch < stop; batch++) {
             auto patch = listOfMatricies.at(batch);
             auto outMatrix = listOfOutputs.at(batch);

@@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
    if(input->rankOf() == 1 && output->rankOf() == 1) {
 
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment)
+            for (auto i = start; i < stop; i++)
                output->p(i, input->e(indices->e<Nd4jLong>(i)));
        };
 
@@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 
        auto func = PRAGMA_THREADS_FOR {
 
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
 
                void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
                void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
        }
        else {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
 
                    void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
                    void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 
        auto func = PRAGMA_THREADS_FOR {
 
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
 
                void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
                void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 
        auto func = PRAGMA_THREADS_FOR {
 
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
 
                void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
                void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
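
The gather hunks above all share one inner body: copy the element or TAD selected by the i-th index into output position i. A hedged, standalone sketch of the rank-1 case (names are illustrative; bounds checking omitted):

#include <cstdint>
#include <vector>

// out[i] = in[indices[i]] -- the per-element work each thread performs
// in the rank-1 gather branch above.
std::vector<float> gather1d(const std::vector<float>& in,
                            const std::vector<int64_t>& indices) {
    std::vector<float> out(indices.size());
    for (size_t i = 0; i < indices.size(); i++)
        out[i] = in[indices[i]];
    return out;
}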
@@ -56,7 +56,7 @@ namespace nd4j {
 
    if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto _x = static_cast<unsigned long long>(xBuffer[e]);
                auto _y = static_cast<unsigned long long>(yBuffer[e]);
 
@@ -67,7 +67,7 @@ namespace nd4j {
        maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
    } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto _x = static_cast<unsigned long long>(xBuffer[e * xEws]);
                auto _y = static_cast<unsigned long long>(yBuffer[e * yEws]);
 
@@ -78,7 +78,7 @@ namespace nd4j {
        maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
    } else {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto _x = static_cast<unsigned long long>(x.e<Nd4jLong>(e));
                auto _y = static_cast<unsigned long long>(y.e<Nd4jLong>(e));
 
@@ -42,7 +42,7 @@ namespace nd4j {
 
    // we divide array into 32 element chunks, and store intermediate results once
    auto func = PRAGMA_THREADS_FOR {
-        for (auto b = 0; b < stop; b += increment) {
+        for (auto b = 0; b < stop; b++) {
            auto blockBuffer = buffer + b * numBlocks;
 
            Nd4jLong r = 1;
@@ -64,7 +64,7 @@ namespace nd4j {
 
 
    auto func2 = PRAGMA_THREADS_FOR {
-        for (auto b = start; b < stop; b += increment) {
+        for (auto b = start; b < stop; b++) {
            auto blockBuffer = tempBuffer + b * numBlocks;
 
            Nd4jLong r = 1;
@@ -280,7 +280,7 @@ namespace helpers {
    int xsSize = xs.size();
    // Scale x interpolation weights to avoid a multiplication during iteration.
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            xs[i]._bottomIndex *= channels;
            xs[i]._topIndex *= channels;
        }
@@ -906,7 +906,7 @@ namespace helpers {
    auto outputPtr = output->bufferAsT<float>(); // output is always float. TO DO: provide another float types also with template <typename X, typename Z> declaration
 
    auto batchProcess = PRAGMA_THREADS_FOR {
-        for (auto batch = start; batch < stop; batch += increment) {
+        for (auto batch = start; batch < stop; batch++) {
            for (auto y = 0; y < st.outHeight; ++y) {
                const float inY = y * st.heightScale;
                const float inY1 = (y + 1) * st.heightScale;
@@ -961,7 +961,7 @@ namespace helpers {
    if (Status::OK() == res) {
        std::vector<CachedInterpolation> xCached(st.outWidth);
        auto cachingProcedure = PRAGMA_THREADS_FOR {
-            for (auto x = start; x < stop; x += increment) {
+            for (auto x = start; x < stop; x++) {
                auto &xCache = xCached[x];
                const float inX = x * st.widthScale;
                const float inX1 = (x + 1) * st.widthScale;
@@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
        'c' == output.ordering() && 1 == output.ews()){
 
        auto func = PRAGMA_THREADS_FOR{
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                const auto xStep = i*3;
                z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2];
            }
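
The kernel body above applies the standard ITU-R BT.601 luma weights to interleaved, 'c'-ordered RGB data; a standalone sketch of the same arithmetic:

#include <cstdint>

// gray = 0.2989*R + 0.5870*G + 0.1140*B for pixel i of a packed RGB buffer.
float rgbToGray(const float* x, int64_t i) {
    const auto xStep = i * 3;
    return 0.2989f * x[xStep] + 0.5870f * x[xStep + 1] + 0.1140f * x[xStep + 2];
}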
@@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
    auto func = PRAGMA_THREADS_FOR{
 
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
            const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
@@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
    const Nd4jLong zDimCstride = output.stridesOf()[dimC];
 
    auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T* xTad = x + packX.platformOffsets()[i];
            T* zTad = z + packZ.platformOffsets()[i];
            op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
    const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 
    auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T* xTad = x + packX.platformOffsets()[i];
            T* zTad = z + packZ.platformOffsets()[i];
            op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
    const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 
    auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T* xTad = x + packX.platformOffsets()[i];
            T* zTad = z + packZ.platformOffsets()[i];
            //simple M*v //tr.T*v
@@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int>
    int span = (tads / num_threads) + 8;
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {
            auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r];
            auto rZ = output->bufferAsT<Z>() + zOfsets[r];
 
@@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
    if(inTadEws == 1 && outTadEws == 1) {
 
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const T *x = inBuff + inTadOffsets[i];
                T *y = outBuff + outTadOffsets[i];
 
@@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    if(inTadEws == 1 && gradITadEws == 1) {
 
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                Y *y = gradIBuff + gradITadOffsets[i];
 
@@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    else {
 
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                Y *y = gradIBuff + gradITadOffsets[i];
 
@@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
    auto h_ = h->bufferAsT<T>();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (uint e = start; e < stop; e += increment) {
+        for (uint e = start; e < stop; e++) {
            c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
            h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
        }
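
The fused update above computes the new cell state and its tanh activation in a single pass per element; a minimal scalar sketch of that step:

#include <cmath>
#include <cstdint>

// c = z * i + f * cLast;  h = tanh(c) -- one fused pass over the buffers.
void fusedTanhStep(const float* z, const float* i, const float* f,
                   const float* cLast, float* c, float* h, int64_t len) {
    for (int64_t e = 0; e < len; e++) {
        c[e] = z[e] * i[e] + f[e] * cLast[e];
        h[e] = std::tanh(c[e]);
    }
}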
@@ -45,7 +45,7 @@ namespace helpers {
    auto n = shape::sizeAt(matrixShape, -1);
 
    auto loop = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            Nd4jLong theFirstPos[] = {theFirst, i};
            Nd4jLong theSecondPos[] = {theSecond, i};
            auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0);
@@ -203,7 +203,7 @@ namespace helpers {
    auto result = -1;
    //auto loop = PRAGMA_THREADS_FOR {
    auto start = column, stop = rowNum, increment = 1;
-    for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) {
+    for (auto rowCounter = start; rowCounter < stop; rowCounter++) {
        Nd4jLong xPos[] = {rowCounter, column};
        auto xIndex = shape::getOffset(compoundShape, xPos, 0);
        if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) {
@@ -221,7 +221,7 @@ namespace helpers {
    Nd4jLong xDiag[] = {currentRow, currentRow};
    auto diagIndex = shape::getOffset(compoundShape, xDiag, 0);
    auto loop = PRAGMA_THREADS_FOR {
-        for (int j = start; j < stop; j += increment) {
+        for (auto j = start; j < stop; j++) {
            Nd4jLong xRow[] = {j, currentRow};
            auto rowIndex = shape::getOffset(compoundShape, xRow, 0);
            compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i);
@@ -310,7 +310,7 @@ namespace helpers {
    permutations = permutationVectors->allTensorsAlongDimension({-1});
 
    auto loop = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n);
        }
    };
@@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) {
    int lO = listOut.size();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            for (int j = 0; j < lastDimension; ++j)
                listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j));
    };
@@ -55,7 +55,7 @@ namespace helpers {
    Nd4jLong oL = output->lengthOf();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            auto row = rows.at(e);
            output->p(e, row->e<T>(n));
        }
@@ -49,7 +49,7 @@ namespace nd4j {
 
    if (tadEws >= 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = 0; e < stop; e += increment) {
+            for (auto e = 0; e < stop; e++) {
                auto cO = output + tadPack.primaryOffsets()[e];
 
                auto idx = static_cast<int>(indices[e]);
@@ -70,7 +70,7 @@ namespace nd4j {
        samediff::Threads::parallel_tad(func, 0, numTads);
    } else {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto cO = output + tadPack.primaryOffsets()[e];
 
                auto idx = static_cast<int>(indices[e]);
@@ -70,7 +70,7 @@ template <typename T>
 static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) {
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T order = n.e<T>(i);
            if(order != static_cast<int>(order)) // if order has fractional part then do not perform calculations and return NAN
                output.p(i, std::numeric_limits<T>::quiet_NaN());
@@ -113,7 +113,7 @@ namespace helpers {
    ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
    ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
    auto batching = PRAGMA_THREADS_FOR {
-        for (auto batch = start; batch < stop; batch += increment) {
+        for (auto batch = start; batch < stop; batch++) {
            //qr here
            qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies);
        }
@@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto
    auto d = delta.e<T>(0);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            buff[i] = s + i * d;
    };
    samediff::Threads::parallel_for(func, 0, len);
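
The range fill above is a pure affine map of the index, which is exactly why a unit-stride loop is sufficient; a standalone sketch:

#include <cstdint>

// buff[i] = start + i * delta, matching the loop body in the hunk above.
template <typename T>
void rangeFill(T* buff, int64_t len, T s, T d) {
    for (int64_t i = 0; i < len; i++)
        buff[i] = s + i * d;
}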
@@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
    if (inArr == outArr) {
        if (inEWS == 1) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto idx = sLength - e;
                    swap(inArr, e, idx);
                }
@@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
        }
        else if (inEWS > 1) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto idx1 = (sLength - e) * inEWS;
                    Nd4jLong idx2 = e * inEWS;
                    swap(inArr, idx1, idx2);
@@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
        else {
 
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                    auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer);
                    swap(outArr, inOffset, outOffset);
@@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
        if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) {
 
            auto func = PRAGMA_THREADS_FOR {
-                for (Nd4jLong e = start; e < stop; e += increment)
+                for (Nd4jLong e = start; e < stop; e++)
                    outArr[sLength - e] = inArr[e];
            };
            samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);
 
            if(inLength != numOfElemsToReverse) {
                auto f2 = PRAGMA_THREADS_FOR {
-                    for (auto e = start; e < stop; e += increment)
+                    for (auto e = start; e < stop; e++)
                        outArr[e] = inArr[e];
                };
                samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
@@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
        else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) {
 
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment)
+                for (auto e = start; e < stop; e++)
                    outArr[(sLength - e) * outEWS] = inArr[e * inEWS];
            };
            samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);
 
            if(inLength != numOfElemsToReverse) {
                auto f2 = PRAGMA_THREADS_FOR {
-                    for (auto e = start; e < stop; e += increment)
+                    for (auto e = start; e < stop; e++)
                        outArr[e * outEWS] = inArr[e * inEWS];
                };
                samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
@@ -125,7 +125,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
        else {
 
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                    auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer);
                    outArr[outOffset] = inArr[inOffset];
@@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
            if(inLength != numOfElemsToReverse) {
 
                auto f2 = PRAGMA_THREADS_FOR {
-                    for (auto e = start; e < stop; e += increment) {
+                    for (auto e = start; e < stop; e++) {
                        auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                        auto outOffset = shape::getIndexOffset(e, outShapeBuffer);
                        outArr[outOffset] = inArr[inOffset];
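
The reverseArray hunks above mirror the first numOfElemsToReverse elements and copy any remainder through unchanged. A hedged sketch of the out-of-place EWS == 1 path, assuming sLength == numOfElemsToReverse - 1 (inferred from the indexing, not shown in this excerpt):

#include <cstdint>

// Reverse the leading prefix of `in` into `out`, then copy the tail as-is.
void reverseCopy(const float* in, float* out, int64_t numToReverse, int64_t len) {
    const int64_t sLength = numToReverse - 1;
    for (int64_t e = 0; e < numToReverse; e++)
        out[sLength - e] = in[e];
    for (int64_t e = numToReverse; e < len; e++)
        out[e] = in[e];
}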
@@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
    // loop through input array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
 
            shape::index2coords(i, output.getShapeInfo(), coords);
 
@@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
    // loop through output array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
 
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
@@ -48,7 +48,7 @@ namespace helpers {
    const int total_count = batch_size * input_height * input_width * input_depth;
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) {
+        for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
            // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
            const int d = inp_idx % input_depth;
            const int inp_idx2 = inp_idx / input_depth;
@@ -74,7 +74,7 @@ namespace helpers {
    const int total_count = batch_size * output_depth_by_output_area;
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) {
+        for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
            const int n_iC_oY_bY_oX = inp_idx / block_size;
            const int bX = inp_idx - n_iC_oY_bY_oX * block_size;
 
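
The comment in the first space-to-depth hunk spells out the flattening scheme; inverting it with repeated mod/div recovers the NHWC coordinates, as in this sketch (struct and function names are illustrative):

// inp_idx = d + input_depth * (w + input_width * (h + input_height * b)),
// so peeling off one factor at a time recovers each coordinate.
struct Nhwc { int b, h, w, d; };

Nhwc decompose(int inp_idx, int input_depth, int input_width, int input_height) {
    const int d = inp_idx % input_depth;
    const int idx2 = inp_idx / input_depth;
    const int w = idx2 % input_width;
    const int idx3 = idx2 / input_width;
    const int h = idx3 % input_height;
    const int b = idx3 / input_height;
    return {b, h, w, d};
}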
@@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
 
        Nd4jLong xCoords[MAX_RANK];
 
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
 
            shape::index2coords(i, xShapeInfo, xCoords);
 
@@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind
 
    if(outRank == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                Nd4jLong idx = indices.e<Nd4jLong>(i);
                NDArray out = output({idx, idx + 1});
 
@@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind
        std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
 
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
                NDArray updSubArr = updates(i, dimsToExcludeUpd);
 
@@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i
 
    if(outRank == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                Nd4jLong idx = indices.e<Nd4jLong>(i);
                NDArray out = output({idx, idx + 1});
 
@@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i
        auto func = PRAGMA_THREADS_FOR {
            std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);
 
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray indSubArr = indices(i, dimsToExcludeInd);
 
                for (Nd4jLong j = 0; j < indLastDim; ++j) {
@@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr
 
    if(!calcGrad) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto subArr = updates(i, dimsToExclude);
                output.p(i, subArr.e(indices.e<Nd4jLong>(i)));
            }
@@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr
        samediff::Threads::parallel_for(func, 0, indicesLen);
    } else {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto subArr = updates(i, dimsToExclude);
                auto ind = indices.e<Nd4jLong>(i);
                subArr.p(ind, subArr.e(ind) - 1.);
@@ -169,7 +169,7 @@ namespace helpers {
    for (int i = 1; i < indices->lengthOf(); i++) {
        if (indices->e<int>(i) == idx) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e));
                }
            };
@@ -223,7 +223,7 @@ namespace helpers {
    for (int i = 0; i < indices->lengthOf(); i++) {
        if (indices->e<int>(i) == idx) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e));
                }
            };
@@ -272,7 +272,7 @@ namespace helpers {
    for (int i = 1; i < indices->lengthOf(); i++) {
        if (indices->e<int>(i) == idx) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e));
                }
            };
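
The three segment hunks above accumulate matching tensors elementwise with +, + and * respectively. A hedged flat-vector analogue of the additive variant (not the TAD-based code shown, just the reduction it performs per element):

#include <vector>

// out[seg[i]] += in[i] -- unsorted segment sum over a flat vector.
std::vector<float> segmentSum(const std::vector<float>& in,
                              const std::vector<int>& seg, int numClasses) {
    std::vector<float> out(numClasses, 0.0f);
    for (size_t i = 0; i < in.size(); i++)
        out[seg[i]] += in[i];
    return out;
}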
@@ -625,7 +625,7 @@ namespace helpers {
    Nd4jLong loop_size = input->lengthOf();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            auto classNum = indices->e<Nd4jLong>(e);
            if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6))
                output->p(e, gradOut->e<T>(classNum));
@@ -645,7 +645,7 @@ namespace helpers {
    //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto classNum = indices->e<Nd4jLong>(i);
            auto current = listOfTensors.at(i);
            auto currentOut = listOfOutTensors.at(i);
@@ -675,7 +675,7 @@ namespace helpers {
    segmentMinFunctor(context, input, indices, &tempRes);
    if (input->isVector()) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto classNum = indices->e<Nd4jLong>(e);
                if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5)
                    output->p(e, gradOut->e<double>(classNum));
@@ -697,7 +697,7 @@ namespace helpers {
    int pos = 0;
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto classNum = indices->e<Nd4jLong>(i);
            auto current = listOfTensors.at(i);
            auto currentOut = listOfOutTensors.at(i);
@@ -887,7 +887,7 @@ namespace helpers {
    if (input->isVector()) {
 
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto classNum = indices->e<Nd4jLong>(e);
                if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6)
                    output->t<T>(e) = gradOut->t<T>(classNum);
@@ -1004,7 +1004,7 @@ namespace helpers {
    unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes);
    if (input->isVector()) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto classNum = indices->e<Nd4jLong>(e);
                output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e));
            }
@@ -364,7 +364,7 @@ namespace nd4j {
    auto func = PRAGMA_THREADS_FOR {
        T sneu1e[600];
 
-        for (auto t = start; t < stop; t += increment) {
+        for (auto t = start; t < stop; t++) {
            T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
            memset(neu1e, 0, vectorLength * sizeof(T));
 
@@ -457,7 +457,7 @@ namespace nd4j {
        T sneu1[600];
        T sneu1e[600];
 
-        for (int e = start; e < stop; e += increment) {
+        for (int e = start; e < stop; e++) {
            T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
            T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
 
@@ -40,7 +40,7 @@ namespace helpers {
    output->assign(input);
 
    auto batchLoop = PRAGMA_THREADS_FOR {
-        for (auto batch = start; batch < stop; batch += increment) {
+        for (auto batch = start; batch < stop; batch++) {
            for (auto r = 0; r < rows; r++) {
                for (auto c = 0; c < r; c++) {
                    math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));
@@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray
    T* pCt = ct->bufferAsT<T>();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto col = start; col < stop; col += increment) {
+        for (auto col = start; col < stop; col++) {
            const auto colNum = col % d2;
            bool flip = colNum >= K;
            T maskVal = mask ? *(pMask + col) : T(1);
@@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr
    T* pGradInit = gradC0->bufferAsT<T>();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto col = start; col < stop; col += increment) {
+        for (auto col = start; col < stop; col++) {
            T gbF = 0.f;
            T gbR = 0.f;
            const auto colNum = col % d2;
@@ -37,7 +37,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
    int inSize = inArrs.size();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            outArr->p<T>(i, inArrs[i]->t<T>(0));
    };
 
@@ -50,7 +50,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
    int listSize = list.size();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            list.at(i)->assign(inArrs[i]);
    };
    samediff::Threads::parallel_tad(func, 0, listSize);
@@ -150,7 +150,7 @@ namespace helpers {
    result->assign(0);
    if (status == ND4J_STATUS_OK) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                bool found = false;
                for (int j = 0; j < k; j++) {
                    if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {
@@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N
    int dLen = dOdI.lengthOf();
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            if (dOdI.t<T>(i) != static_cast<T>(0.f))
                dOdI.t<T>(i) = static_cast<T>(1.f);
        }
@@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) {
    auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1});
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            output.p(i, setOfSubArrs.at(i)->getTrace());
    };
    samediff::Threads::parallel_for(func, 0, setOfSubArrs.size());
@@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
 
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 
@@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
 
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 
@@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
 
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK * 3];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            Nd4jLong *zCoordStart, *xCoordStart;
 
            if (yLastDim == xRank) {
@@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
    else if (input->rankOf() == 1 && indices->isVector()) {
        // special case
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                output->p(e, input->e<T>(indices->e<Nd4jLong>(e)));
        };
 
@@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
        const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut);
 
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray subArrOut = (*output)(i, dimsOut);
                NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
                subArrOut.assign(subArrIn);
@@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
        const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis});
 
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray subArrOut = (*output)(i, {axis});
                NDArray subArrIn = (*input)(intArgs[i + 1], {axis});
                subArrOut.assign(subArrIn);
@@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) {
    auto arrs = output.allTensorsAlongDimension({rank-2, rank-1});
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            arrs.at(i)->setIdentity();
    };
 
@@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
        indices.push_back((*intArgs)[e]);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto inSubArr = input(indices[i], dimsToExclude, true);
            auto updSubArr = updates(i, dimsToExclude, true);
 
@@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input
 
        case 6: { // copy
            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto inSubArr = input(i, dimensions);
                    inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i));
                }
@@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
    auto x = inArrs[0];
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T max = -DataTypeUtils::max<T>();
            Nd4jLong idx = 0;
 
@@ -839,7 +839,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T max = -DataTypeUtils::max<T>();
            for (int i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
@@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T sum = 0.;
            for (int i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
@@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T sum = (T) 0.f;
            for (int i = 0; i < numArgs; i++)
                sum += inArrs[i]->e<T>(e);
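
mergeMaxIndex, mergeMax, mergeAvg and mergeAdd all reduce across the inputs at a fixed element position e, which is why a single flat loop over e suffices. A hedged sketch of the additive case:

#include <cstdint>
#include <vector>

// Fold the e-th element of every input buffer; mergeAvg would divide the
// result by ins.size(), mergeMax would keep a running maximum instead.
float mergeAddAt(const std::vector<const float*>& ins, int64_t e) {
    float sum = 0.f;
    for (const float* in : ins)
        sum += in[e];
    return sum;
}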
@@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
    auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T iNormActual = norm2.e<T>(i);
            if (iNormActual > normClip)
                *listOfInSubArrs.at(i) *= normClip / iNormActual;
@@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
    auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto inputSubArr = listOfInSubArrs.at(i);
            auto outputSubArr = listOfOutSubArrs.at(i);
            outputSubArr->assign(inputSubArr);
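
Both clipByNorm branches rescale any sub-array whose L2 norm exceeds the clip value by clipNorm / norm, leaving smaller sub-arrays untouched; a flat-buffer sketch of that rule:

#include <cmath>
#include <cstdint>

// If ||v|| > clipNorm, rescale v so its L2 norm equals clipNorm exactly.
void clipByNorm(float* v, int64_t n, float clipNorm) {
    float sq = 0.f;
    for (int64_t i = 0; i < n; i++)
        sq += v[i] * v[i];
    const float norm = std::sqrt(sq);
    if (norm > clipNorm) {
        const float scale = clipNorm / norm;
        for (int64_t i = 0; i < n; i++)
            v[i] *= scale;
    }
}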
@@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g
    auto cn = clipNorm.e<T>(0);
 
    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            T N = norm2.e<T>(i);
 
            auto gradOSubArr = gradOSubArrs.at(i);
@@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong inIdx[MAX_RANK];
        Nd4jLong outIdx[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), outIdx);
 
            for (int j = 0; j < rank; ++j) {
@@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o

     BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES);

-//////////////////////////////////////////////////////////////////////////
-template<typename T>
-static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-    nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
-}
-
-void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-    BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
-}
-
-BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);

 //////////////////////////////////////////////////////////////////////////
 template <typename T>
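The deleted block above shows the dispatch shape that survives the move: a thin, type-selected wrapper that forwards to SpecialMethods<T>::concatCpuGeneric, now compiled in its own concat unit. A simplified, hedged sketch of that pattern (the enum, names, and hand-written switch are illustrative; the library generates the dispatch from a type list via BUILD_SINGLE_SELECTOR):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

enum class DType { FLOAT32, INT32 };

// Templated worker: in the real code SpecialMethods<T>::concatCpuGeneric
// plays this role, copying each input buffer into its slice of the output.
template <typename T>
static void concatGeneric(const std::vector<std::pair<const void*, size_t>>& ins, void* vout) {
    auto out = static_cast<T*>(vout);
    size_t offset = 0;
    for (const auto& in : ins) {
        auto buf = static_cast<const T*>(in.first);
        for (size_t e = 0; e < in.second; e++)
            out[offset + e] = buf[e];
        offset += in.second;
    }
}

// Thin dispatcher: the real code expands this switch from a type list with
// the BUILD_SINGLE_SELECTOR macro instead of writing it out by hand.
static void concat(DType dt, const std::vector<std::pair<const void*, size_t>>& ins, void* out) {
    switch (dt) {
        case DType::FLOAT32: concatGeneric<float>(ins, out); break;
        case DType::INT32:   concatGeneric<int>(ins, out);   break;
    }
}
```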
@@ -90,7 +90,7 @@ namespace helpers {
     auto outputPart = output->allTensorsAlongDimension({-2, -1});

     auto batchLoop = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             if (lower) {
                 lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]);
             } else {
@@ -112,7 +112,7 @@ namespace helpers {
     auto rows = input->sizeAt(-2);

     auto batchLoop = PRAGMA_THREADS_FOR {
-        for (auto batch = start; batch < stop; batch += increment) {
+        for (auto batch = start; batch < stop; batch++) {
             if (!lower) {
                 for (auto r = 0; r < rows; r++) {
                     for (auto c = 0; c <= r; c++) {
@@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray
     int xLen = x.lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
             z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i)));
     };
@@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND
     int tads = tadsA.size();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto a_ = tadsA.at(e);
             auto b_ = tadsB.at(e);
             auto o_ = tadsO.at(e);
@@ -69,7 +69,7 @@ namespace helpers {
     }

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             values->p(e, static_cast<T>(valuesVector[e]));
             if (counts != nullptr)
                 counts->p(e, countsMap[valuesVector[e]]);
@@ -19,8 +19,10 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0);
+
+    BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
 }
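This hunk and the per-chunk sources that follow each include the shared implementation header and explicitly instantiate just one slice of the type list, so the expensive pairwise instantiations are spread over many small translation units that can compile in parallel. A minimal sketch of the same splitting technique (file names and the Methods class are illustrative, not the library's):

```cpp
// methods.hpp -- shared implementation, included by every chunk source.
template <typename X, typename Y>
struct Methods {
    static void convert(const X* in, Y* out, long n) {
        for (long i = 0; i < n; i++)
            out[i] = static_cast<Y>(in[i]);
    }
};

// methods_chunk0.cpp -- instantiates only the <float, ...> slice of the grid.
// #include "methods.hpp"
template struct Methods<float, float>;
template struct Methods<float, double>;

// methods_chunk1.cpp -- instantiates only the <double, ...> slice.
// #include "methods.hpp"
template struct Methods<double, float>;
template struct Methods<double, double>;
```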
@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9);
@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8);

@@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9);
@@ -34,7 +34,7 @@ namespace nd4j {

     // handle transpose in parallel
     auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {
             for (int c = 0; c < cols; c++) {
                 int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c);
                 int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c);
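The hunk above relies on two linear-index helpers to move between storage orders. Their conventional definitions, sketched here since the actual bodies sit outside this diff, are the standard row-major and column-major formulas:

```cpp
// Row-major (C) layout: element (r, c) of a rows x cols matrix.
static inline int linearIndexC(int rows, int cols, int r, int c) {
    (void) rows;            // not needed for row-major addressing
    return r * cols + c;
}

// Column-major (Fortran) layout: the same element, columns contiguous.
static inline int linearIndexF(int rows, int cols, int r, int c) {
    (void) cols;            // not needed for column-major addressing
    return c * rows + r;
}
```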
@@ -73,7 +73,7 @@ namespace nd4j {
             C[r] = z;
     } else {
         auto func = PRAGMA_THREADS_FOR {
-            for (auto r = start; r < stop; r += increment)
+            for (auto r = start; r < stop; r++)
                 C[r] = z;
         };
         samediff::Threads::parallel_for(func, 0, length);
@@ -130,7 +130,7 @@ namespace nd4j {
     auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {
             int aIdx = linearIndexC(M, N, r, 0);
             auto aX = aT + aIdx;

@@ -0,0 +1,270 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com, created on 07.10.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <pointercast.h>
+#include <helpers/shape.h>
+#include <helpers/TAD.h>
+#include <specials.h>
+#include <dll.h>
+#include <NDArray.h>
+#include <ops/declarable/CustomOperations.h>
+#include <types/types.h>
+#include <helpers/Loops.h>
+
+namespace nd4j {
+
+    template<typename S, typename T>
+    void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
+        auto x = reinterpret_cast<S *>(dx);
+        auto z = reinterpret_cast<T *>(dz);
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto i = start; i < stop; i++) {
+                z[i] = static_cast<T>(x[i]);
+            }
+        };
+
+        samediff::Threads::parallel_for(func, 0, N);
+    };
+
+    template <typename X, typename Y>
+    void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
+        int i = left, j = right;
+        X ktmp;
+        X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];
+
+        Y vtmp;
+
+        {
+            /* PARTITION PART */
+            while (i <= j) {
+                if (descending) {
+                    while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
+                        i++;
+                    while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
+                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
+                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                } else {
+                    while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
+                        i++;
+                    while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
+                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
+                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                }
+            }
+
+        }
+
+        //
+
+        if ( ((right-left)<cutoff) ){
+            if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
+            if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
+
+        }else{
+            PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
+            PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
+        }
+    }
+
+    template <typename X, typename Y>
+    void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
+        int i = left, j = right;
+        X ktmp;
+        Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];
+
+        Y vtmp;
+
+        {
+            /* PARTITION PART */
+            while (i <= j) {
+                if (descending) {
+                    while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
+                        i++;
+                    while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
+                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
+                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                } else {
+                    while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
+                        i++;
+                    while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
+                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
+                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                }
+            }
+
+        }
+
+        //
+
+        if ( ((right-left)<cutoff) ){
+            if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
+            if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
+
+        }else{
+            PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
+            PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
+        }
+    }
+
+    template <typename X, typename Y>
+    static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
+        auto array = reinterpret_cast<X *>(varray);
+        auto values = reinterpret_cast<Y *>(yarray);
+        int cutoff = 1000;
+
+        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
+        {
+            PRAGMA_OMP_SINGLE_ARGS(nowait)
+            {
+                quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
+            }
+        }
+    }
+
+    template <typename X, typename Y>
+    static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
+        auto array = reinterpret_cast<X *>(varray);
+        auto values = reinterpret_cast<Y *>(yarray);
+        int cutoff = 1000;
+
+        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
+        {
+            PRAGMA_OMP_SINGLE_ARGS(nowait)
+            {
+                quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
+            }
+        }
+    }
+
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
+        quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
+    }
+
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
+        quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
+    }
+
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
+        auto x = reinterpret_cast<X*>(vx);
+        auto y = reinterpret_cast<Y*>(vy);
+
+        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
+        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
+
+        auto xLength = shape::length(xShapeInfo);
+        auto xTadLength = shape::length(packX.primaryShapeInfo());
+        auto numTads = packX.numberOfTads();
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto r = start; r < stop; r++) {
+                auto dx = x + packX.primaryOffsets()[r];
+                auto dy = y + packY.primaryOffsets()[r];
+
+                quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
+            }
+        };
+
+        samediff::Threads::parallel_tad(func, 0, numTads);
+    }
+
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
+        auto x = reinterpret_cast<X*>(vx);
+        auto y = reinterpret_cast<Y*>(vy);
+
+        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
+        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
+
+        auto xLength = shape::length(xShapeInfo);
+        auto xTadLength = shape::length(packX.primaryShapeInfo());
+        auto numTads = packX.numberOfTads();
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto r = start; r < stop; r++) {
+                auto dx = x + packX.primaryOffsets()[r];
+                auto dy = y + packY.primaryOffsets()[r];
+
+                quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
+            }
+        };
+
+        samediff::Threads::parallel_tad(func, 0, numTads);
+    }
+}
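The new unit's sortByKey/sortByValue pair keeps keys and values permuted together, with the comparison driven by whichever array is named first. A compact reference model of that contract for flat, stride-1 buffers, using std::sort over an index permutation instead of the in-place parallel quicksort (a sketch, not the library's implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Reference model of sortByKey for contiguous buffers: sort an index
// permutation by key, then apply it to both arrays so pairs stay aligned.
template <typename X, typename Y>
void sortByKeyReference(std::vector<X>& keys, std::vector<Y>& values, bool descending) {
    std::vector<size_t> perm(keys.size());
    std::iota(perm.begin(), perm.end(), size_t(0));

    std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
        return descending ? keys[b] < keys[a] : keys[a] < keys[b];
    });

    std::vector<X> k(keys.size());
    std::vector<Y> v(values.size());
    for (size_t i = 0; i < perm.size(); i++) {
        k[i] = keys[perm[i]];
        v[i] = values[perm[i]];
    }
    keys.swap(k);
    values.swap(v);
}
```

sortTadByKey and sortTadByValue apply the same contract independently to every sub-array along the chosen dimensions, which is why the outer loop over tads can run in parallel while each inner sort is invoked single-threaded.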
@@ -64,7 +64,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
     T* outBuff = output.bufferAsT<T>();

     auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {
             const Nd4jLong arrLen = inArrs[r]->lengthOf();
             const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];

@@ -99,7 +99,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
     }

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             auto temp = output(indices[i], true);
             nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
         }
@@ -143,7 +143,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
     auto x = reinterpret_cast<T **>(vx);

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             for (auto ar = 0L; ar < n; ar++) {
                 z[i] += x[ar][i];
             }
@@ -179,7 +179,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
     }

     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             for (Nd4jLong ar = 1; ar < n; ar++) {
                 z[i] += x[ar][i] / static_cast<T>(n);
             }
@@ -199,7 +199,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint

     // aggregation step
     auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
             for (Nd4jLong ar = 0; ar < n; ar++) {
                 z[i] += x[ar][i] / static_cast<T>(n);
             }
@@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
     int numTads = xLength / xTadLength;

     auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
+        for (auto r = start; r < stop; r++) {
             T *dx = x + tadOffsets[r];

             quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending);
@@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)

     auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             for (int bitId = 0; bitId < 16; bitId++) {
                 bool hasBit = (x[e] & 1 << (bitId)) != 0;
                 bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
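The decode loop above reads a 32-bit word as 16 magnitude bits paired with 16 sign bits. A minimal round-trip sketch of that bit layout (the update rule applied to decoded values is simplified here; the real encoder's threshold arithmetic is outside this hunk):

```cpp
#include <cmath>
#include <cstdint>

// Pack up to 16 floats against a threshold: bit b marks |v[b]| >= threshold,
// bit b+16 marks a negative value. Mirrors the layout decoded above.
static uint32_t encode16(const float* v, int n, float threshold) {
    uint32_t w = 0;
    for (int b = 0; b < n && b < 16; b++) {
        if (std::fabs(v[b]) >= threshold) {
            w |= 1u << b;                    // magnitude bit
            if (v[b] < 0.0f)
                w |= 1u << (b + 16);         // matching sign bit
        }
    }
    return w;
}

// Decode one slot exactly as the loop above tests it; the value returned
// here (+/- threshold or 0) is a simplification of the real update rule.
static float decodeBit(uint32_t w, int bitId, float threshold) {
    bool hasBit  = (w & (1u << bitId)) != 0;
    bool hasSign = (w & (1u << (bitId + 16))) != 0;
    return hasBit ? (hasSign ? -threshold : threshold) : 0.0f;
}
```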
@@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
     samediff::Threads::parallel_for(func, 4, lim);
 }

-template<typename S, typename T>
-void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
-    auto x = reinterpret_cast<S *>(dx);
-    auto z = reinterpret_cast<T *>(dz);
-
-    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
-            z[i] = static_cast<T>(x[i]);
-        }
-    };
-
-    samediff::Threads::parallel_for(func, 0, N);
-};
-BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
-
 template<typename T>
 Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
     auto dx = reinterpret_cast<T *>(vx);
@@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
     };
     return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
 }
-
-template <typename X, typename Y>
-void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
-    int i = left, j = right;
-    X ktmp;
-    X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];
-
-    Y vtmp;
-
-    {
-        /* PARTITION PART */
-        while (i <= j) {
-            if (descending) {
-                while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
-                    i++;
-                while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
-                    j--;
-                if (i <= j) {
-                    ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                    key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                    key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                    vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
-                    values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
-                    values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                    i++;
-                    j--;
-                }
-            } else {
-                while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
-                    i++;
-                while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
-                    j--;
-                if (i <= j) {
-                    ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                    key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                    key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                    vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
-                    values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
-                    values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                    i++;
-                    j--;
-                }
-            }
-        }
-
-    }
-
-    //
-
-    if ( ((right-left)<cutoff) ){
-        if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
-        if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
-
-    }else{
-        PRAGMA_OMP_TASK
-        { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
-        PRAGMA_OMP_TASK
-        { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
-    }
-}
-
-template <typename X, typename Y>
-void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
-    int i = left, j = right;
-    X ktmp;
-    Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];
-
-    Y vtmp;
-
-    {
-        /* PARTITION PART */
-        while (i <= j) {
-            if (descending) {
-                while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
-                    i++;
-                while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
-                    j--;
-                if (i <= j) {
-                    ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                    key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                    key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                    vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
-                    value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
-                    value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                    i++;
-                    j--;
-                }
-            } else {
-                while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
-                    i++;
-                while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
-                    j--;
-                if (i <= j) {
-                    ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                    key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                    key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                    vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
-                    value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
-                    value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                    i++;
-                    j--;
-                }
-            }
-        }
-
-    }
-
-    //
-
-    if ( ((right-left)<cutoff) ){
-        if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
-        if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
-
-    }else{
-        PRAGMA_OMP_TASK
-        { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
-        PRAGMA_OMP_TASK
-        { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
-    }
-}
-
-template <typename X, typename Y>
-static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
-    auto array = reinterpret_cast<X *>(varray);
-    auto values = reinterpret_cast<Y *>(yarray);
-    int cutoff = 1000;
-
-    PRAGMA_OMP_PARALLEL_THREADS(numThreads)
-    {
-        PRAGMA_OMP_SINGLE_ARGS(nowait)
-        {
-            quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
-        }
-    }
-}
-
-template <typename X, typename Y>
-static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
-    auto array = reinterpret_cast<X *>(varray);
-    auto values = reinterpret_cast<Y *>(yarray);
-    int cutoff = 1000;
-
-    PRAGMA_OMP_PARALLEL_THREADS(numThreads)
-    {
-        PRAGMA_OMP_SINGLE_ARGS(nowait)
-        {
-            quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
-        }
-    }
-}
-
-template <typename X, typename Y>
-void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
-    quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
-}
-
-template <typename X, typename Y>
-void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
-    quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
-}
-
-template <typename X, typename Y>
-void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
-    auto x = reinterpret_cast<X*>(vx);
-    auto y = reinterpret_cast<Y*>(vy);
-
-    auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
-    auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
-
-    auto xLength = shape::length(xShapeInfo);
-    auto xTadLength = shape::length(packX.primaryShapeInfo());
-    auto numTads = packX.numberOfTads();
-
-    auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
-            auto dx = x + packX.primaryOffsets()[r];
-            auto dy = y + packY.primaryOffsets()[r];
-
-            quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
-        }
-    };
-
-    samediff::Threads::parallel_tad(func, 0, numTads);
-}
-
-template <typename X, typename Y>
-void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
-    auto x = reinterpret_cast<X*>(vx);
-    auto y = reinterpret_cast<Y*>(vy);
-
-    auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
-    auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
-
-    auto xLength = shape::length(xShapeInfo);
-    auto xTadLength = shape::length(packX.primaryShapeInfo());
-    auto numTads = packX.numberOfTads();
-
-    auto func = PRAGMA_THREADS_FOR {
-        for (auto r = start; r < stop; r += increment) {
-            auto dx = x + packX.primaryOffsets()[r];
-            auto dy = y + packY.primaryOffsets()[r];
-
-            quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
-        }
-    };
-
-    samediff::Threads::parallel_tad(func, 0, numTads);
-}
-
-//BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES);
-//BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES);
 }
@@ -167,7 +167,7 @@ namespace randomOps {

     if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) {
         auto func = PRAGMA_THREADS_FOR {
-            for (uint64_t e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                 T prob = rng->relativeT<T>(e);
                 T cumProb = (T) 0.0f;
                 for (Nd4jLong f = 0; f < yLength; f++) {
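The prob/cumProb scan above is an inverse-CDF draw: a uniform sample is matched against the running cumulative sum of the target distribution. A self-contained sketch of the same scan (a standard-library RNG stands in for nd4j::graph::RandomGenerator, and weights are assumed normalized):

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Inverse-CDF sampling: walk the cumulative distribution until the uniform
// draw falls inside a bucket, mirroring the prob/cumProb scan above.
static size_t sampleIndex(const std::vector<double>& weights, std::mt19937_64& rng) {
    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    const double prob = uniform(rng);

    double cumProb = 0.0;
    for (size_t f = 0; f < weights.size(); f++) {
        cumProb += weights[f];               // weights assumed normalized to 1
        if (prob <= cumProb)
            return f;
    }
    return weights.size() - 1;               // guard against rounding drift
}
```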
@@ -330,7 +330,7 @@ namespace randomOps {
     const T epsilon = static_cast<T>(1e-5);

     auto func = PRAGMA_THREADS_FOR {
-        for (uint64_t e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto epm = e + middle;

             // we need to get random values
@@ -440,7 +440,7 @@ namespace randomOps {

     nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
     auto func = PRAGMA_THREADS_FOR {
-        for (Nd4jLong e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {

             int success = 0;
             for (int t = 1; t <= trials; t++) {
@@ -549,7 +549,7 @@ namespace randomOps {
     //nd4j::random::RandomBuffer *buffer = reinterpret_cast<nd4j::random::RandomBuffer *> (state);
     nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
     auto func = PRAGMA_THREADS_FOR {
-        for (uint64_t e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {

             int success = 0;
             for (int t = 1; t <= trials; t++) {
@@ -690,7 +690,7 @@ namespace randomOps {
     const T epsilon = static_cast<T>(1e-5);

     auto func = PRAGMA_THREADS_FOR {
-        for (uint64_t e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             if (z[e] > mean + ds || z[e] < mean - ds) {
                 z[e] = step(rng, mean, stddev, e, middle, z[e]);

@@ -818,7 +818,7 @@ namespace randomOps {

     auto func = PRAGMA_THREADS_FOR {
         PRAGMA_OMP_SIMD
-        for (uint64_t e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto epm = e + middle;

             // we need to get random values