Minor improvements (#255)
* static increments in loops Signed-off-by: raver119 <raver119@gmail.com> * specials and concat split into separate units Signed-off-by: raver119 <raver119@gmail.com>
This commit is contained in:
		
							parent
							
								
									d9058b469a
								
							
						
					
					
						commit
						215641ea9e
					
				| @ -501,7 +501,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha | |||||||
|     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); |     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|                 auto cdata = data + offsets[e]; |                 auto cdata = data + offsets[e]; | ||||||
|                 if (dataType == DataType::UTF16) { |                 if (dataType == DataType::UTF16) { | ||||||
|                     unicode::utf8to16(string[e], cdata, std::char_traits<char>::length(string[e])); |                     unicode::utf8to16(string[e], cdata, std::char_traits<char>::length(string[e])); | ||||||
| @ -568,7 +568,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::stri | |||||||
|     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); |     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|              auto cdata = data + offsets[e]; |              auto cdata = data + offsets[e]; | ||||||
|              if (dataType == DataType::UTF16) { |              if (dataType == DataType::UTF16) { | ||||||
|                  unicode::utf8to16(string[e].data(), cdata, string[e].size()); |                  unicode::utf8to16(string[e].data(), cdata, string[e].size()); | ||||||
| @ -635,7 +635,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u16s | |||||||
|     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); |     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|              auto cdata = data + offsets[e]; |              auto cdata = data + offsets[e]; | ||||||
|              if (dtype == DataType::UTF16) { |              if (dtype == DataType::UTF16) { | ||||||
|                  memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t)); |                  memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t)); | ||||||
| @ -701,7 +701,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|              auto cdata = data + offsets[e]; |              auto cdata = data + offsets[e]; | ||||||
|              if (dtype == DataType::UTF16) { |              if (dtype == DataType::UTF16) { | ||||||
|                  memcpy(cdata, string[e], std::char_traits<char16_t>::length(string[e]) * sizeof(uint16_t)); |                  memcpy(cdata, string[e], std::char_traits<char16_t>::length(string[e]) * sizeof(uint16_t)); | ||||||
| @ -767,7 +767,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u32s | |||||||
|     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); |     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             auto cdata = data + offsets[e]; |             auto cdata = data + offsets[e]; | ||||||
|             if (dtype == DataType::UTF16) { |             if (dtype == DataType::UTF16) { | ||||||
|                 unicode::utf32to16(string[e].data(), cdata, string[e].size()); |                 unicode::utf32to16(string[e].data(), cdata, string[e].size()); | ||||||
| @ -833,7 +833,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha | |||||||
|     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); |     auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             auto cdata = data + offsets[e]; |             auto cdata = data + offsets[e]; | ||||||
|             if (dtype == DataType::UTF16) { |             if (dtype == DataType::UTF16) { | ||||||
|                 unicode::utf32to16(string[e], cdata, std::char_traits<char32_t>::length(string[e])); |                 unicode::utf32to16(string[e], cdata, std::char_traits<char32_t>::length(string[e])); | ||||||
| @ -2367,7 +2367,7 @@ NDArray NDArray::asS() const { | |||||||
|     const auto inData = bufferAsT<int8_t>() + offsetsLength; |     const auto inData = bufferAsT<int8_t>() + offsetsLength; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (int e = start; e < stop; e += increment) { |         for (int e = start; e < stop; e++) { | ||||||
|            auto cdata = outData + offsets[e]; |            auto cdata = outData + offsets[e]; | ||||||
|            auto end = nInputoffsets[e + 1]; |            auto end = nInputoffsets[e + 1]; | ||||||
|            auto idata = inData + nInputoffsets[e]; |            auto idata = inData + nInputoffsets[e]; | ||||||
| @ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const { | |||||||
|             std::vector<std::string> strings(lengthOf()); |             std::vector<std::string> strings(lengthOf()); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR{ |             auto func = PRAGMA_THREADS_FOR{ | ||||||
|                     for (auto i = start; i < stop; i += increment) { |                     for (auto i = start; i < stop; i++) { | ||||||
|                            strings[i] = std::move(this->e<std::string>(i)); |                            strings[i] = std::move(this->e<std::string>(i)); | ||||||
|                     } |                     } | ||||||
|             }; |             }; | ||||||
| @ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const { | |||||||
|             std::vector<std::u16string> strings(lengthOf()); |             std::vector<std::u16string> strings(lengthOf()); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR{ |             auto func = PRAGMA_THREADS_FOR{ | ||||||
|                     for (auto i = start; i < stop; i += increment) { |                     for (auto i = start; i < stop; i++) { | ||||||
|                            strings[i] = std::move(this->e<std::u16string>(i)); |                            strings[i] = std::move(this->e<std::u16string>(i)); | ||||||
|                     } |                     } | ||||||
|             }; |             }; | ||||||
| @ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const { | |||||||
| 
 | 
 | ||||||
|         std::vector<std::u32string> strings(lengthOf()); |         std::vector<std::u32string> strings(lengthOf()); | ||||||
|         auto func = PRAGMA_THREADS_FOR{ |         auto func = PRAGMA_THREADS_FOR{ | ||||||
|                for (auto i = start; i < stop; i += increment) { |                for (auto i = start; i < stop; i++) { | ||||||
|                       strings[i] = std::move(this->e<std::u32string>(i)); |                       strings[i] = std::move(this->e<std::u32string>(i)); | ||||||
|                } |                } | ||||||
|         }; |         }; | ||||||
|  | |||||||
| @ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t | |||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         Nd4jLong coords[MAX_RANK]; |         Nd4jLong coords[MAX_RANK]; | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             shape::index2coords(i, target.getShapeInfo(), coords); |             shape::index2coords(i, target.getShapeInfo(), coords); | ||||||
|             const auto zOffset = shape::getOffset(target.getShapeInfo(), coords); |             const auto zOffset = shape::getOffset(target.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
| @ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { | |||||||
|     auto y = reinterpret_cast<T *>(yBuffer); |     auto y = reinterpret_cast<T *>(yBuffer); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             auto temp = x[i]; |             auto temp = x[i]; | ||||||
|             x[i] = y[i]; |             x[i] = y[i]; | ||||||
|             y[i] = temp; |             y[i] = temp; | ||||||
| @ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const { | |||||||
|     if(result.ordering() == 'c') {           //  ews == 1 always here
 |     if(result.ordering() == 'c') {           //  ews == 1 always here
 | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); |                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); | ||||||
|                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); |                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); | ||||||
|             } |             } | ||||||
| @ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const { | |||||||
|     else { |     else { | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto xOffset = result.getOffset(i); |                 auto xOffset = result.getOffset(i); | ||||||
|                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); |                 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); | ||||||
|                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); |                 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); | ||||||
| @ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int | |||||||
|     // loop through input array
 |     // loop through input array
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         Nd4jLong coords[MAX_RANK]; |         Nd4jLong coords[MAX_RANK]; | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             shape::index2coords(i, output.getShapeInfo(), coords); |             shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
|             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); |             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); | ||||||
|  | |||||||
| @ -22,7 +22,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: | |||||||
|     if (this->ordering() == second.ordering() && this->ordering() == third.ordering()  && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) { |     if (this->ordering() == second.ordering() && this->ordering() == third.ordering()  && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) { | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) |             for (auto e = start; e < stop; e++) | ||||||
|                 z[e] = func(f[e], s[e], t[e]); |                 z[e] = func(f[e], s[e], t[e]); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: | |||||||
|         if (f == z) { |         if (f == z) { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto tOffset = this->getOffset(e); |                     auto tOffset = this->getOffset(e); | ||||||
|                     auto uOffset = second.getOffset(e); |                     auto uOffset = second.getOffset(e); | ||||||
|                     auto vOffset = third.getOffset(e); |                     auto vOffset = third.getOffset(e); | ||||||
| @ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std:: | |||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto tOffset = this->getOffset(e); |                     auto tOffset = this->getOffset(e); | ||||||
|                     auto uOffset = second.getOffset(e); |                     auto uOffset = second.getOffset(e); | ||||||
|                     auto vOffset = third.getOffset(e); |                     auto vOffset = third.getOffset(e); | ||||||
| @ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T, | |||||||
|     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { |     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) |             for (auto e = start; e < stop; e++) | ||||||
|                 z[e] = func(f[e], s[e]); |                 z[e] = func(f[e], s[e]); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T, | |||||||
|         if (f == z) { |         if (f == z) { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto yOffset = other.getOffset(e); |                     auto yOffset = other.getOffset(e); | ||||||
| 
 | 
 | ||||||
| @ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T, | |||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto yOffset = other.getOffset(e); |                     auto yOffset = other.getOffset(e); | ||||||
|                     auto zOffset = target.getOffset(e); |                     auto zOffset = target.getOffset(e); | ||||||
| @ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) { | |||||||
|     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { |     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) |             for (auto e = start; e < stop; e++) | ||||||
|                 z[e] = func(f[e]); |                 z[e] = func(f[e]); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) { | |||||||
|         if (f == z) { |         if (f == z) { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
| 
 | 
 | ||||||
|                     f[xOffset] = func(f[xOffset]); |                     f[xOffset] = func(f[xOffset]); | ||||||
| @ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) { | |||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto zOffset = target.getOffset(e); |                     auto zOffset = target.getOffset(e); | ||||||
| 
 | 
 | ||||||
| @ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr | |||||||
|     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { |     if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) { | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) |             for (auto e = start; e < stop; e++) | ||||||
|                 z[e] = func(e, f[e]); |                 z[e] = func(e, f[e]); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr | |||||||
|         if (f == z) { |         if (f == z) { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
| 
 | 
 | ||||||
|                     f[xOffset] = func(e, f[xOffset]); |                     f[xOffset] = func(e, f[xOffset]); | ||||||
| @ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr | |||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto zOffset = target.getOffset(e); |                     auto zOffset = target.getOffset(e); | ||||||
| 
 | 
 | ||||||
| @ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N | |||||||
|     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { |     if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) { | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) |             for (auto e = start; e < stop; e++) | ||||||
|                 z[e] = func((Nd4jLong) e, f[e], s[e]); |                 z[e] = func((Nd4jLong) e, f[e], s[e]); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N | |||||||
|         if (f == z) { |         if (f == z) { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto yOffset = other.getOffset(e); |                     auto yOffset = other.getOffset(e); | ||||||
| 
 | 
 | ||||||
| @ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N | |||||||
|         } else { |         } else { | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto xOffset = this->getOffset(e); |                     auto xOffset = this->getOffset(e); | ||||||
|                     auto yOffset = other.getOffset(e); |                     auto yOffset = other.getOffset(e); | ||||||
|                     auto zOffset = target.getOffset(e); |                     auto zOffset = target.getOffset(e); | ||||||
|  | |||||||
| @ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx, | |||||||
|     _threads = nd4j::math::nd4j_min<int>(_threads, nd4j::Environment::getInstance()->maxThreads()); |     _threads = nd4j::math::nd4j_min<int>(_threads, nd4j::Environment::getInstance()->maxThreads()); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto idx = start; idx < stop; idx += increment) { |         for (auto idx = start; idx < stop; idx++) { | ||||||
|             auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; |             auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; | ||||||
|             auto zTadOffsetForBlock = zTadOffsets[idx]; |             auto zTadOffsetForBlock = zTadOffsets[idx]; | ||||||
| 
 | 
 | ||||||
| @ -1356,7 +1356,7 @@ void tearGeneric(void *vx, | |||||||
|     auto numTads = shape::length(hXShapeInfo) / tadLength; |     auto numTads = shape::length(hXShapeInfo) / tadLength; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             auto hZ = reinterpret_cast<T *>(targets[i]); |             auto hZ = reinterpret_cast<T *>(targets[i]); | ||||||
|             auto s = hX + tadOffsets[i]; |             auto s = hX + tadOffsets[i]; | ||||||
| 
 | 
 | ||||||
| @ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS | |||||||
|     auto dZ = reinterpret_cast<T **>(dz); |     auto dZ = reinterpret_cast<T **>(dz); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto f = start; f < stop; f += increment) { |         for (auto f = start; f < stop; f++) { | ||||||
|             auto hX = reinterpret_cast<T *>(dX[f]); |             auto hX = reinterpret_cast<T *>(dX[f]); | ||||||
|             //auto hZ = reinterpret_cast<T *>(dZ[f]);
 |             //auto hZ = reinterpret_cast<T *>(dZ[f]);
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -52,7 +52,7 @@ namespace nd4j { | |||||||
|                                 TypeCast::convertGeneric<T2, T>(nullptr, tmp, length, buffer); |                                 TypeCast::convertGeneric<T2, T>(nullptr, tmp, length, buffer); | ||||||
| #else | #else | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto e = start; e < stop; e += increment) |                     for (auto e = start; e < stop; e++) | ||||||
|                         buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); |                         buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); | ||||||
|                 }; |                 }; | ||||||
| 
 | 
 | ||||||
| @ -110,7 +110,7 @@ namespace nd4j { | |||||||
|                                 TypeCast::convertGeneric<float, T>(nullptr, tmp, length, buffer); |                                 TypeCast::convertGeneric<float, T>(nullptr, tmp, length, buffer); | ||||||
| #else | #else | ||||||
|                             auto func = PRAGMA_THREADS_FOR { |                             auto func = PRAGMA_THREADS_FOR { | ||||||
|                                 for (auto e = start; e < stop; e += increment) |                                 for (auto e = start; e < stop; e++) | ||||||
|                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); |                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); | ||||||
|                             }; |                             }; | ||||||
| 
 | 
 | ||||||
| @ -138,7 +138,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
| #else | #else | ||||||
|                             auto func = PRAGMA_THREADS_FOR { |                             auto func = PRAGMA_THREADS_FOR { | ||||||
|                                 for (auto e = start; e < stop; e += increment) |                                 for (auto e = start; e < stop; e++) | ||||||
|                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); |                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); | ||||||
|                             }; |                             }; | ||||||
| 
 | 
 | ||||||
| @ -164,7 +164,7 @@ namespace nd4j { | |||||||
|                                 TypeCast::convertGeneric<float16, T>(nullptr, tmp, length, buffer); |                                 TypeCast::convertGeneric<float16, T>(nullptr, tmp, length, buffer); | ||||||
| #else | #else | ||||||
|                             auto func = PRAGMA_THREADS_FOR { |                             auto func = PRAGMA_THREADS_FOR { | ||||||
|                                 for (auto e = start; e < stop; e += increment) |                                 for (auto e = start; e < stop; e++) | ||||||
|                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); |                                     buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e])); | ||||||
|                             }; |                             }; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -49,7 +49,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|         case nd4j::LoopKind::EWS1: { |         case nd4j::LoopKind::EWS1: { | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -70,7 +70,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|         case nd4j::LoopKind::EWSNONZERO: { |         case nd4j::LoopKind::EWSNONZERO: { | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -91,7 +91,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|         case nd4j::LoopKind::RANK1: { |         case nd4j::LoopKind::RANK1: { | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -114,7 +114,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             shape::updateStrides(2, tadShape, newStride, 'c'); |             shape::updateStrides(2, tadShape, newStride, 'c'); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -141,7 +141,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             shape::updateStrides(3, tadShape, newStride, 'c'); |             shape::updateStrides(3, tadShape, newStride, 'c'); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -170,7 +170,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             shape::updateStrides(4, tadShape, newStride, 'c'); |             shape::updateStrides(4, tadShape, newStride, 'c'); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -201,7 +201,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             shape::updateStrides(5, tadShape, newStride, 'c'); |             shape::updateStrides(5, tadShape, newStride, 'c'); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -234,7 +234,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo); |             const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -258,7 +258,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo); |             const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
| @ -284,7 +284,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, | |||||||
|             const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo); |             const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto tad = const_cast<X *>(x) + tadOffsets[i]; |                     auto tad = const_cast<X *>(x) + tadOffsets[i]; | ||||||
|                     auto indexValue = OpType::startingIndexValue(tad); |                     auto indexValue = OpType::startingIndexValue(tad); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -80,7 +80,7 @@ namespace nd4j { | |||||||
|                 int nLen = zArr.lengthOf() / yArr.sizeAt(-1); |                 int nLen = zArr.lengthOf() / yArr.sizeAt(-1); | ||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR{ |                 auto func = PRAGMA_THREADS_FOR{ | ||||||
|                      for (uint32_t total = start; total < stop; total += increment) { |                      for (uint32_t total = start; total < stop; total++) { | ||||||
| 
 | 
 | ||||||
|                         uint32_t i = total / zDim1; |                         uint32_t i = total / zDim1; | ||||||
|                         uint32_t j = total % zDim1; |                         uint32_t j = total % zDim1; | ||||||
|  | |||||||
| @ -73,7 +73,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex | |||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             intermediatery[thread_id] = OpType::startingIndexValue(x); |             intermediatery[thread_id] = OpType::startingIndexValue(x); | ||||||
| 
 | 
 | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 IndexValue<X> curr(x[i], i); |                 IndexValue<X> curr(x[i], i); | ||||||
|                 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); |                 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); | ||||||
|             } |             } | ||||||
| @ -88,7 +88,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex | |||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             intermediatery[thread_id] = OpType::startingIndexValue(x); |             intermediatery[thread_id] = OpType::startingIndexValue(x); | ||||||
| 
 | 
 | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                 IndexValue<X> curr(x[offset], i); |                 IndexValue<X> curr(x[offset], i); | ||||||
|                 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); |                 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); | ||||||
|  | |||||||
| @ -75,7 +75,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         PRAGMA_OMP_SIMD |                         PRAGMA_OMP_SIMD | ||||||
|                         for (auto i = start; i < stop; i += increment)  { |                         for (auto i = start; i < stop; i++)  { | ||||||
|                             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                             z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); |                             z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); | ||||||
|                         } |                         } | ||||||
| @ -93,7 +93,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                         auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); |                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); | ||||||
|                         z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); |                         z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); | ||||||
| @ -111,7 +111,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                         auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                         auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); |                         auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); | ||||||
|                         z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); |                         z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); | ||||||
| @ -129,7 +129,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                         auto offset  = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); |                         auto offset  = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); | ||||||
|                         z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); |                         z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); | ||||||
| @ -149,7 +149,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                         auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); |                         auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); | ||||||
|                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); |                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); | ||||||
| @ -197,7 +197,7 @@ namespace functions { | |||||||
|                 else{ |                 else{ | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         PRAGMA_OMP_SIMD |                         PRAGMA_OMP_SIMD | ||||||
|                         for (uint64_t i = start; i < stop; i += increment)  { |                         for (uint64_t i = start; i < stop; i++)  { | ||||||
|                             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                             auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                             z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); |                             z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); | ||||||
|                         } |                         } | ||||||
| @ -213,7 +213,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                         auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); |                         auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); | ||||||
|                         z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); |                         z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); | ||||||
| @ -255,7 +255,7 @@ namespace functions { | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     PRAGMA_OMP_SIMD |                     PRAGMA_OMP_SIMD | ||||||
|                     for (uint64_t i = start; i < stop; i += increment)  { |                     for (uint64_t i = start; i < stop; i++)  { | ||||||
|                         auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); |                         auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); | ||||||
|                         z[offset] = OpClass::op(i, length, rng, extraArguments); |                         z[offset] = OpClass::op(i, length, rng, extraArguments); | ||||||
|                     } |                     } | ||||||
|  | |||||||
| @ -88,7 +88,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo, | |||||||
| 
 | 
 | ||||||
|     if (kindOfLoop == nd4j::LoopKind::EWS1) { |     if (kindOfLoop == nd4j::LoopKind::EWS1) { | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); |                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
| @ -98,7 +98,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo, | |||||||
|     } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { |     } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); |                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); | ||||||
|             } |             } | ||||||
| @ -110,7 +110,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo, | |||||||
|         const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); |         const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); |                 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); | ||||||
|                 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); |                 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); | ||||||
|                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); |                 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); | ||||||
|  | |||||||
| @ -158,7 +158,7 @@ namespace functions { | |||||||
|             const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeShapeInfo, tadShapeShapeInfoCast); |             const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeShapeInfo, tadShapeShapeInfoCast); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto r = start; r < stop; r += increment) { |                 for (auto r = start; r < stop; r++) { | ||||||
| 
 | 
 | ||||||
|                     auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; |                     auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; | ||||||
|                     auto tx = x + tadOffsetForBlock; |                     auto tx = x + tadOffsetForBlock; | ||||||
|  | |||||||
| @ -81,7 +81,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
|         // now we actually apply quantization
 |         // now we actually apply quantization
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte)); |                 rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte)); | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
| @ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) | |||||||
|         int flimit = limit + 4; |         int flimit = limit + 4; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 int el = x[e]; |                 int el = x[e]; | ||||||
|                 int ael = nd4j::math::nd4j_abs<int>(el) - 1; |                 int ael = nd4j::math::nd4j_abs<int>(el) - 1; | ||||||
|                 z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold); |                 z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold); | ||||||
| @ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) | |||||||
|         auto z = reinterpret_cast<T *>(dz); |         auto z = reinterpret_cast<T *>(dz); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 z[i] = static_cast<T>(static_cast<float>(x[i])); |                 z[i] = static_cast<T>(static_cast<float>(x[i])); | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
|  | |||||||
| @ -153,7 +153,7 @@ namespace helpers { | |||||||
|         auto rowSize = sizeof(T) * colCount; |         auto rowSize = sizeof(T) * colCount; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto n = start; n < stop; n += increment) { |             for (auto n = start; n < stop; n++) { | ||||||
|                 int s = rowP->e<int>(n); |                 int s = rowP->e<int>(n); | ||||||
|                 int end = rowP->e<int>(n + 1); |                 int end = rowP->e<int>(n + 1); | ||||||
|                 int shift = n * colCount; |                 int shift = n * colCount; | ||||||
|  | |||||||
| @ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra | |||||||
|             shape::calcOffsets(tadShapeInfo, offsets); |             shape::calcOffsets(tadShapeInfo, offsets); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto inBuff = input.bufferAsT<T>() + tadOffsets[i]; |                     auto inBuff = input.bufferAsT<T>() + tadOffsets[i]; | ||||||
|                     auto outBuff = output.bufferAsT<T>() + tadOffsets[i]; |                     auto outBuff = output.bufferAsT<T>() + tadOffsets[i]; | ||||||
| 
 | 
 | ||||||
| @ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a | |||||||
|     const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); |     const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             // FIXME: double!
 |             // FIXME: double!
 | ||||||
|             double x = input.e<double>(i); |             double x = input.e<double>(i); | ||||||
|             if (x < 0.0) { |             if (x < 0.0) { | ||||||
|  | |||||||
| @ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr | |||||||
|         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; |         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|                 const T *xTad = x + packX.platformOffsets()[i]; |                 const T *xTad = x + packX.platformOffsets()[i]; | ||||||
|                 T *zTad = z + packZ.platformOffsets()[i]; |                 T *zTad = z + packZ.platformOffsets()[i]; | ||||||
|  | |||||||
| @ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA | |||||||
|         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; |         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 const T *xTad = x + packX.platformOffsets()[i]; |                 const T *xTad = x + packX.platformOffsets()[i]; | ||||||
|                 T *zTad = z + packZ.platformOffsets()[i]; |                 T *zTad = z + packZ.platformOffsets()[i]; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -94,7 +94,7 @@ void bgemm_(const std::vector<NDArray*>& vA, const std::vector<NDArray*>& vB, st | |||||||
|         int vaSize = vA.size(); |         int vaSize = vA.size(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto p = start; p < stop; p += increment) { |             for (auto p = start; p < stop; p++) { | ||||||
|                 auto A = reinterpret_cast<T *>(vA.at(p)->buffer()); |                 auto A = reinterpret_cast<T *>(vA.at(p)->buffer()); | ||||||
|                 auto B = reinterpret_cast<T *>(vB.at(p)->buffer()); |                 auto B = reinterpret_cast<T *>(vB.at(p)->buffer()); | ||||||
|                 auto C = reinterpret_cast<T *>(vC.at(p)->buffer()); |                 auto C = reinterpret_cast<T *>(vC.at(p)->buffer()); | ||||||
|  | |||||||
| @ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray | |||||||
| 
 | 
 | ||||||
|         Nd4jLong coords[MAX_RANK]; |         Nd4jLong coords[MAX_RANK]; | ||||||
| 
 | 
 | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|             shape::index2coords(i, input->getShapeInfo(), coords); |             shape::index2coords(i, input->getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con | |||||||
| 	int xLen = x.lengthOf(); | 	int xLen = x.lengthOf(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i)); |             output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i)); | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp | |||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             T *col, *im; |             T *col, *im; | ||||||
| 
 | 
 | ||||||
|             for (uint b = start; b < stop; b += increment) { |             for (uint b = start; b < stop; b++) { | ||||||
|                 T *im0 = imBuff + b * imStride0; |                 T *im0 = imBuff + b * imStride0; | ||||||
|                 T *col4 = colBuff + b * colStride0; |                 T *col4 = colBuff + b * colStride0; | ||||||
|                 for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { |                 for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { | ||||||
|  | |||||||
							
								
								
									
										41
									
								
								libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,41 @@ | |||||||
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
//


#include <ops/declarable/helpers/transforms.h>
#include <ops/specials.h>

namespace nd4j {
    namespace ops {
        namespace helpers {
            //////////////////////////////////////////////////////////////////////////
            // Typed implementation of concat: delegates the actual work of joining
            // inArrs along `axis` into `output` to the shared CPU kernel in
            // SpecialMethods<T>. Kept as a thin wrapper so the type dispatch below
            // has a single target per element type T.
            template<typename T>
            static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
                nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
            }

            // Public CPU entry point for the concat helper.
            // Dispatches on output.dataType() to the matching concat_<T>
            // instantiation via BUILD_SINGLE_SELECTOR.
            // NOTE: `context` is not referenced in this CPU implementation; it is
            // presumably kept for signature parity with the CUDA helper — confirm
            // against the cuda/ counterpart.
            void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
                BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
            }

            // Explicitly instantiate concat_ for every type in LIBND4J_TYPES so the
            // selector above always finds a definition in this translation unit.
            BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
        }
    }
}
| @ -32,7 +32,7 @@ namespace helpers { | |||||||
|         int lLen = labels->lengthOf(); |         int lLen = labels->lengthOf(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (int j = start; j < stop; j += increment) { |             for (int j = start; j < stop; j++) { | ||||||
|                 auto label = labels->e<Nd4jLong>(j); |                 auto label = labels->e<Nd4jLong>(j); | ||||||
|                 auto pred = predictions->e<Nd4jLong>(j); |                 auto pred = predictions->e<Nd4jLong>(j); | ||||||
|                 T value = (weights == nullptr ? (T) 1.0f : weights->e<T>(j)); |                 T value = (weights == nullptr ? (T) 1.0f : weights->e<T>(j)); | ||||||
|  | |||||||
| @ -50,7 +50,7 @@ namespace nd4j { | |||||||
|                     T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); |                     T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto y = start; y < stop; y += increment) { |                         for (auto y = start; y < stop; y++) { | ||||||
|                             const float inY = (cropHeight > 1) |                             const float inY = (cropHeight > 1) | ||||||
|                                               ? y1 * (imageHeight - 1) + y * heightScale |                                               ? y1 * (imageHeight - 1) + y * heightScale | ||||||
|                                               : 0.5 * (y1 + y2) * (imageHeight - 1); |                                               : 0.5 * (y1 + y2) * (imageHeight - 1); | ||||||
|  | |||||||
| @ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray | |||||||
|     int tads = tadsA.size(); |     int tads = tadsA.size(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             auto a_ = tadsA.at(e); |             auto a_ = tadsA.at(e); | ||||||
|             auto b_ = tadsB.at(e); |             auto b_ = tadsB.at(e); | ||||||
|             auto o_ = tadsO.at(e); |             auto o_ = tadsO.at(e); | ||||||
|  | |||||||
| @ -46,7 +46,7 @@ namespace helpers { | |||||||
|         if (isNHWC) { |         if (isNHWC) { | ||||||
|             const int total_count = batch_size * output_height * output_width * output_depth; |             const int total_count = batch_size * output_height * output_width * output_depth; | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto out_idx = start; out_idx < stop; out_idx += increment) { |                 for (auto out_idx = start; out_idx < stop; out_idx++) { | ||||||
|                     const int d = out_idx % output_depth; |                     const int d = out_idx % output_depth; | ||||||
|                     const int out_idx2 = out_idx / output_depth; |                     const int out_idx2 = out_idx / output_depth; | ||||||
|                     const int w = out_idx2 % output_width; |                     const int w = out_idx2 % output_width; | ||||||
| @ -70,7 +70,7 @@ namespace helpers { | |||||||
|             const int total_count = batch_size * input_depth_by_input_area; |             const int total_count = batch_size * input_depth_by_input_area; | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (int input_idx = start; input_idx < stop; input_idx += increment) { |                 for (int input_idx = start; input_idx < stop; input_idx++) { | ||||||
|                     const int n_bY_bX_oC_iY = input_idx / input_width; |                     const int n_bY_bX_oC_iY = input_idx / input_width; | ||||||
|                     const int iX = input_idx - n_bY_bX_oC_iY * input_width; |                     const int iX = input_idx - n_bY_bX_oC_iY * input_width; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -32,7 +32,7 @@ template <typename T> | |||||||
| static void diGamma_(const NDArray& x, NDArray& z) { | static void diGamma_(const NDArray& x, NDArray& z) { | ||||||
| 
 | 
 | ||||||
| 	auto func = PRAGMA_THREADS_FOR { | 	auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             z.p(i, diGammaScalar<T>(x.e<T>(i))); |             z.p(i, diGammaScalar<T>(x.e<T>(i))); | ||||||
|     }; |     }; | ||||||
| 	samediff::Threads::parallel_for(func, 0, x.lengthOf()); | 	samediff::Threads::parallel_for(func, 0, x.lengthOf()); | ||||||
|  | |||||||
| @ -35,7 +35,7 @@ namespace helpers { | |||||||
|         int inLen = input->lengthOf(); |         int inLen = input->lengthOf(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 float val = nodeRng.relativeT<T>(e, T(0.f), T(1.f)); |                 float val = nodeRng.relativeT<T>(e, T(0.f), T(1.f)); | ||||||
| 
 | 
 | ||||||
|                 if (val < probValue) |                 if (val < probValue) | ||||||
| @ -130,7 +130,7 @@ namespace helpers { | |||||||
|         nd4j::graph::RandomGenerator nodeRng(3019L, seed); |         nd4j::graph::RandomGenerator nodeRng(3019L, seed); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); |                 float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); | ||||||
|                 float xVal = input->e<float>(e); |                 float xVal = input->e<float>(e); | ||||||
|                 output->p<float>(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); |                 output->p<float>(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); | ||||||
|  | |||||||
| @ -62,7 +62,7 @@ namespace nd4j { | |||||||
|                     unsigned int outSize = outputList.size(); |                     unsigned int outSize = outputList.size(); | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto i = start; i < stop; i += increment) { |                         for (auto i = start; i < stop; i++) { | ||||||
|                             outputs[i].first = outputList[i]; |                             outputs[i].first = outputList[i]; | ||||||
|                             outputs[i].second = 0; |                             outputs[i].second = 0; | ||||||
|                             for (int e = 0; e < indices->lengthOf(); ++e) |                             for (int e = 0; e < indices->lengthOf(); ++e) | ||||||
| @ -168,7 +168,7 @@ namespace nd4j { | |||||||
|                     unsigned int gradsSize = inputGradientList.size(); |                     unsigned int gradsSize = inputGradientList.size(); | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto i = start; i < stop; i += increment) { |                         for (auto i = start; i < stop; i++) { | ||||||
|                             outputs[i].first = inputGradientList[i]; |                             outputs[i].first = inputGradientList[i]; | ||||||
|                             outputs[i].second = 0; |                             outputs[i].second = 0; | ||||||
|                             for (int e = 0; e < indices->lengthOf(); ++e) |                             for (int e = 0; e < indices->lengthOf(); ++e) | ||||||
|  | |||||||
| @ -50,7 +50,7 @@ namespace helpers { | |||||||
|             colCast = 0; |             colCast = 0; | ||||||
| 
 | 
 | ||||||
|        auto func = PRAGMA_THREADS_FOR { |        auto func = PRAGMA_THREADS_FOR { | ||||||
|            for (auto batch = 0; batch < stop; batch += increment) { |            for (auto batch = 0; batch < stop; batch++) { | ||||||
|                auto patch = listOfMatricies.at(batch); |                auto patch = listOfMatricies.at(batch); | ||||||
|                auto outMatrix = listOfOutputs.at(batch); |                auto outMatrix = listOfOutputs.at(batch); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* | |||||||
|             if(input->rankOf() == 1 && output->rankOf() == 1) { |             if(input->rankOf() == 1 && output->rankOf() == 1) { | ||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto i = start; i < stop; i += increment) |                     for (auto i = start; i < stop; i++) | ||||||
|                         output->p(i, input->e(indices->e<Nd4jLong>(i))); |                         output->p(i, input->e(indices->e<Nd4jLong>(i))); | ||||||
|                 }; |                 }; | ||||||
| 
 | 
 | ||||||
| @ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* | |||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
| 
 | 
 | ||||||
|                         for (auto i = start; i < stop; i += increment) { |                         for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|                             void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]); |                             void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]); | ||||||
|                             void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); |                             void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); | ||||||
| @ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* | |||||||
|                 } |                 } | ||||||
|                 else { |                 else { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto i = start; i < stop; i += increment) { |                         for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|                             void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]); |                             void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]); | ||||||
|                             void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); |                             void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); | ||||||
| @ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
| 
 | 
 | ||||||
|                     for (auto i = start; i < stop; i += increment) { |                     for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|                         void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); |                         void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); | ||||||
|                         void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); |                         void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); | ||||||
| @ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* | |||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
| 
 | 
 | ||||||
|                     for (auto i = start; i < stop; i += increment) { |                     for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|                         void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); |                         void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]); | ||||||
|                         void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); |                         void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]); | ||||||
|  | |||||||
| @ -56,7 +56,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
|                 if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { |                 if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto _x = static_cast<unsigned long long>(xBuffer[e]); |                             auto _x = static_cast<unsigned long long>(xBuffer[e]); | ||||||
|                             auto _y = static_cast<unsigned long long>(yBuffer[e]); |                             auto _y = static_cast<unsigned long long>(yBuffer[e]); | ||||||
| 
 | 
 | ||||||
| @ -67,7 +67,7 @@ namespace nd4j { | |||||||
|                     maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); |                     maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); | ||||||
|                 } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { |                 } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto _x = static_cast<unsigned long long>(xBuffer[e * xEws]); |                             auto _x = static_cast<unsigned long long>(xBuffer[e * xEws]); | ||||||
|                             auto _y = static_cast<unsigned long long>(yBuffer[e * yEws]); |                             auto _y = static_cast<unsigned long long>(yBuffer[e * yEws]); | ||||||
| 
 | 
 | ||||||
| @ -78,7 +78,7 @@ namespace nd4j { | |||||||
|                     maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); |                     maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); | ||||||
|                 } else { |                 } else { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto _x = static_cast<unsigned long long>(x.e<Nd4jLong>(e)); |                             auto _x = static_cast<unsigned long long>(x.e<Nd4jLong>(e)); | ||||||
|                             auto _y = static_cast<unsigned long long>(y.e<Nd4jLong>(e)); |                             auto _y = static_cast<unsigned long long>(y.e<Nd4jLong>(e)); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -42,7 +42,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
|                 // we divide array into 32 element chunks, and store intermediate results once
 |                 // we divide array into 32 element chunks, and store intermediate results once
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto b = 0; b < stop; b += increment) { |                     for (auto b = 0; b < stop; b++) { | ||||||
|                         auto blockBuffer = buffer + b * numBlocks; |                         auto blockBuffer = buffer + b * numBlocks; | ||||||
| 
 | 
 | ||||||
|                         Nd4jLong r = 1; |                         Nd4jLong r = 1; | ||||||
| @ -64,7 +64,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|                     auto func2 = PRAGMA_THREADS_FOR { |                     auto func2 = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto b = start; b < stop; b += increment) { |                         for (auto b = start; b < stop; b++) { | ||||||
|                             auto blockBuffer = tempBuffer + b * numBlocks; |                             auto blockBuffer = tempBuffer + b * numBlocks; | ||||||
| 
 | 
 | ||||||
|                             Nd4jLong r = 1; |                             Nd4jLong r = 1; | ||||||
|  | |||||||
| @ -280,7 +280,7 @@ namespace helpers { | |||||||
|         int xsSize = xs.size(); |         int xsSize = xs.size(); | ||||||
|         // Scale x interpolation weights to avoid a multiplication during iteration.
 |         // Scale x interpolation weights to avoid a multiplication during iteration.
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 xs[i]._bottomIndex *= channels; |                 xs[i]._bottomIndex *= channels; | ||||||
|                 xs[i]._topIndex *= channels; |                 xs[i]._topIndex *= channels; | ||||||
|             } |             } | ||||||
| @ -906,7 +906,7 @@ namespace helpers { | |||||||
|         auto outputPtr = output->bufferAsT<float>(); // output is always float. TO DO: provide another float types also with  template <typename X, typename Z> declaration
 |         auto outputPtr = output->bufferAsT<float>(); // output is always float. TO DO: provide another float types also with  template <typename X, typename Z> declaration
 | ||||||
| 
 | 
 | ||||||
|         auto batchProcess = PRAGMA_THREADS_FOR { |         auto batchProcess = PRAGMA_THREADS_FOR { | ||||||
|             for (auto batch = start; batch < stop; batch += increment) { |             for (auto batch = start; batch < stop; batch++) { | ||||||
|                 for (auto y = 0; y < st.outHeight; ++y) { |                 for (auto y = 0; y < st.outHeight; ++y) { | ||||||
|                     const float inY = y * st.heightScale; |                     const float inY = y * st.heightScale; | ||||||
|                     const float inY1 = (y + 1) * st.heightScale; |                     const float inY1 = (y + 1) * st.heightScale; | ||||||
| @ -961,7 +961,7 @@ namespace helpers { | |||||||
|             if (Status::OK() == res) { |             if (Status::OK() == res) { | ||||||
|                 std::vector<CachedInterpolation> xCached(st.outWidth); |                 std::vector<CachedInterpolation> xCached(st.outWidth); | ||||||
|                 auto cachingProcedure = PRAGMA_THREADS_FOR { |                 auto cachingProcedure = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto x = start; x < stop; x += increment) { |                     for (auto x = start; x < stop; x++) { | ||||||
|                         auto &xCache = xCached[x]; |                         auto &xCache = xCached[x]; | ||||||
|                         const float inX = x * st.widthScale; |                         const float inX = x * st.widthScale; | ||||||
|                         const float inX1 = (x + 1) * st.widthScale; |                         const float inX1 = (x + 1) * st.widthScale; | ||||||
|  | |||||||
| @ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { | |||||||
|         'c' == output.ordering() && 1 == output.ews()){ |         'c' == output.ordering() && 1 == output.ews()){ | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR{ |         auto func = PRAGMA_THREADS_FOR{ | ||||||
|              for (auto i = start; i < stop; i += increment) { |              for (auto i = start; i < stop; i++) { | ||||||
|                  const auto xStep = i*3; |                  const auto xStep = i*3; | ||||||
|                  z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2]; |                  z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2]; | ||||||
|              } |              } | ||||||
| @ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { | |||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
| 
 | 
 | ||||||
|          Nd4jLong coords[MAX_RANK]; |          Nd4jLong coords[MAX_RANK]; | ||||||
|          for (auto i = start; i < stop; i += increment) { |          for (auto i = start; i < stop; i++) { | ||||||
|              shape::index2coords(i, output.getShapeInfo(), coords); |              shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
|              const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); |              const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); | ||||||
|              const auto xOffset0 =  shape::getOffset(input.getShapeInfo(), coords); |              const auto xOffset0 =  shape::getOffset(input.getShapeInfo(), coords); | ||||||
| @ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con | |||||||
|     const Nd4jLong zDimCstride = output.stridesOf()[dimC]; |     const Nd4jLong zDimCstride = output.stridesOf()[dimC]; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR{ |     auto func = PRAGMA_THREADS_FOR{ | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             const T* xTad = x + packX.platformOffsets()[i]; |             const T* xTad = x + packX.platformOffsets()[i]; | ||||||
|             T* zTad = z + packZ.platformOffsets()[i]; |             T* zTad = z + packZ.platformOffsets()[i]; | ||||||
|             op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); |             op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); | ||||||
| @ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, | |||||||
|         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; |         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR{ |         auto func = PRAGMA_THREADS_FOR{ | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 const T* xTad = x + packX.platformOffsets()[i]; |                 const T* xTad = x + packX.platformOffsets()[i]; | ||||||
|                 T* zTad = z + packZ.platformOffsets()[i]; |                 T* zTad = z + packZ.platformOffsets()[i]; | ||||||
|                 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); |                 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); | ||||||
| @ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, | |||||||
|         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; |         const Nd4jLong zDimCstride = output->stridesOf()[dimC]; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR{ |         auto func = PRAGMA_THREADS_FOR{ | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 const T* xTad = x + packX.platformOffsets()[i]; |                 const T* xTad = x + packX.platformOffsets()[i]; | ||||||
|                 T* zTad = z + packZ.platformOffsets()[i]; |                 T* zTad = z + packZ.platformOffsets()[i]; | ||||||
|                 //simple M*v //tr.T*v
 |                 //simple M*v //tr.T*v
 | ||||||
|  | |||||||
| @ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int> | |||||||
|         int span = (tads / num_threads) + 8; |         int span = (tads / num_threads) + 8; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto r = start; r < stop; r += increment) { |             for (auto r = start; r < stop; r++) { | ||||||
|                     auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r]; |                     auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r]; | ||||||
|                     auto rZ = output->bufferAsT<Z>() + zOfsets[r]; |                     auto rZ = output->bufferAsT<Z>() + zOfsets[r]; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out | |||||||
|     if(inTadEws == 1 && outTadEws == 1) { |     if(inTadEws == 1 && outTadEws == 1) { | ||||||
|          |          | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (uint i = start; i < stop; i += increment) { |             for (uint i = start; i < stop; i++) { | ||||||
|                 const T *x = inBuff + inTadOffsets[i]; |                 const T *x = inBuff + inTadOffsets[i]; | ||||||
|                 T *y = outBuff + outTadOffsets[i]; |                 T *y = outBuff + outTadOffsets[i]; | ||||||
| 
 | 
 | ||||||
| @ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c | |||||||
|     if(inTadEws == 1 && gradITadEws == 1) { |     if(inTadEws == 1 && gradITadEws == 1) { | ||||||
|          |          | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (uint i = start; i < stop; i += increment) { |             for (uint i = start; i < stop; i++) { | ||||||
|                 const X *x = inBuff + inTadOffsets[i]; |                 const X *x = inBuff + inTadOffsets[i]; | ||||||
|                       Y *y = gradIBuff + gradITadOffsets[i]; |                       Y *y = gradIBuff + gradITadOffsets[i]; | ||||||
| 
 | 
 | ||||||
| @ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c | |||||||
|     else { |     else { | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (uint i = start; i < stop; i += increment) { |             for (uint i = start; i < stop; i++) { | ||||||
|                 const X *x = inBuff + inTadOffsets[i]; |                 const X *x = inBuff + inTadOffsets[i]; | ||||||
|                       Y *y = gradIBuff + gradITadOffsets[i]; |                       Y *y = gradIBuff + gradITadOffsets[i]; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, | |||||||
|     auto h_ = h->bufferAsT<T>(); |     auto h_ = h->bufferAsT<T>(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (uint e = start; e < stop; e += increment) { |         for (uint e = start; e < stop; e++) { | ||||||
|             c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); |             c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); | ||||||
|             h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]); |             h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]); | ||||||
|         } |         } | ||||||
|  | |||||||
| @ -45,7 +45,7 @@ namespace helpers { | |||||||
|             auto n = shape::sizeAt(matrixShape, -1); |             auto n = shape::sizeAt(matrixShape, -1); | ||||||
| 
 | 
 | ||||||
|             auto loop = PRAGMA_THREADS_FOR { |             auto loop = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     Nd4jLong theFirstPos[] = {theFirst, i}; |                     Nd4jLong theFirstPos[] = {theFirst, i}; | ||||||
|                     Nd4jLong theSecondPos[] = {theSecond, i}; |                     Nd4jLong theSecondPos[] = {theSecond, i}; | ||||||
|                     auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); |                     auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); | ||||||
| @ -203,7 +203,7 @@ namespace helpers { | |||||||
|         auto result = -1; |         auto result = -1; | ||||||
|         //auto loop = PRAGMA_THREADS_FOR {
 |         //auto loop = PRAGMA_THREADS_FOR {
 | ||||||
|             auto start = column, stop = rowNum, increment = 1; |             auto start = column, stop = rowNum, increment = 1; | ||||||
|             for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { |             for (auto rowCounter = start; rowCounter < stop; rowCounter++) { | ||||||
|                 Nd4jLong xPos[] = {rowCounter, column}; |                 Nd4jLong xPos[] = {rowCounter, column}; | ||||||
|                 auto xIndex = shape::getOffset(compoundShape, xPos, 0); |                 auto xIndex = shape::getOffset(compoundShape, xPos, 0); | ||||||
|                 if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { |                 if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { | ||||||
| @ -221,7 +221,7 @@ namespace helpers { | |||||||
|         Nd4jLong xDiag[] = {currentRow, currentRow}; |         Nd4jLong xDiag[] = {currentRow, currentRow}; | ||||||
|         auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); |         auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (int j = start; j < stop; j += increment) { |             for (auto j = start; j < stop; j++) { | ||||||
|                 Nd4jLong xRow[] = {j, currentRow}; |                 Nd4jLong xRow[] = {j, currentRow}; | ||||||
|                 auto rowIndex = shape::getOffset(compoundShape, xRow, 0); |                 auto rowIndex = shape::getOffset(compoundShape, xRow, 0); | ||||||
|                 compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i);
 |                 compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i);
 | ||||||
| @ -310,7 +310,7 @@ namespace helpers { | |||||||
|             permutations = permutationVectors->allTensorsAlongDimension({-1}); |             permutations = permutationVectors->allTensorsAlongDimension({-1}); | ||||||
| 
 | 
 | ||||||
|         auto loop = PRAGMA_THREADS_FOR { |         auto loop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); |                 luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
|  | |||||||
| @ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { | |||||||
|     int lO = listOut.size(); |     int lO = listOut.size(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             for (int j = 0; j < lastDimension; ++j) |             for (int j = 0; j < lastDimension; ++j) | ||||||
|                 listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j)); |                 listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j)); | ||||||
|     }; |     }; | ||||||
|  | |||||||
| @ -55,7 +55,7 @@ namespace helpers { | |||||||
|             Nd4jLong oL = output->lengthOf(); |             Nd4jLong oL = output->lengthOf(); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto row = rows.at(e); |                     auto row = rows.at(e); | ||||||
|                     output->p(e, row->e<T>(n)); |                     output->p(e, row->e<T>(n)); | ||||||
|                 } |                 } | ||||||
|  | |||||||
| @ -49,7 +49,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
|                 if (tadEws >= 1) { |                 if (tadEws >= 1) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = 0; e < stop; e += increment) { |                         for (auto e = 0; e < stop; e++) { | ||||||
|                             auto cO = output + tadPack.primaryOffsets()[e]; |                             auto cO = output + tadPack.primaryOffsets()[e]; | ||||||
| 
 | 
 | ||||||
|                             auto idx = static_cast<int>(indices[e]); |                             auto idx = static_cast<int>(indices[e]); | ||||||
| @ -70,7 +70,7 @@ namespace nd4j { | |||||||
|                     samediff::Threads::parallel_tad(func, 0, numTads); |                     samediff::Threads::parallel_tad(func, 0, numTads); | ||||||
|                 } else { |                 } else { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto cO = output + tadPack.primaryOffsets()[e]; |                             auto cO = output + tadPack.primaryOffsets()[e]; | ||||||
| 
 | 
 | ||||||
|                             auto idx = static_cast<int>(indices[e]); |                             auto idx = static_cast<int>(indices[e]); | ||||||
|  | |||||||
| @ -70,7 +70,7 @@ template <typename T> | |||||||
| static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { | static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { | ||||||
| 
 | 
 | ||||||
| 	auto func = PRAGMA_THREADS_FOR { | 	auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|         	const T order = n.e<T>(i); |         	const T order = n.e<T>(i); | ||||||
|         	if(order != static_cast<int>(order))						// if order has fractional part then do not perform calculations and return NAN
 |         	if(order != static_cast<int>(order))						// if order has fractional part then do not perform calculations and return NAN
 | ||||||
|         		output.p(i, std::numeric_limits<T>::quiet_NaN()); |         		output.p(i, std::numeric_limits<T>::quiet_NaN()); | ||||||
|  | |||||||
| @ -113,7 +113,7 @@ namespace helpers { | |||||||
|         ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); |         ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); | ||||||
|         ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); |         ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); | ||||||
|         auto batching = PRAGMA_THREADS_FOR { |         auto batching = PRAGMA_THREADS_FOR { | ||||||
|             for (auto batch = start; batch < stop; batch += increment) { |             for (auto batch = start; batch < stop; batch++) { | ||||||
|                 //qr here
 |                 //qr here
 | ||||||
|                 qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); |                 qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); | ||||||
|             } |             } | ||||||
|  | |||||||
| @ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto | |||||||
|     auto d = delta.e<T>(0); |     auto d = delta.e<T>(0); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             buff[i] = s + i * d; |             buff[i] = s + i * d; | ||||||
|     }; |     }; | ||||||
|     samediff::Threads::parallel_for(func, 0, len); |     samediff::Threads::parallel_for(func, 0, len); | ||||||
|  | |||||||
| @ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|             if (inArr == outArr) { |             if (inArr == outArr) { | ||||||
|                 if (inEWS == 1) { |                 if (inEWS == 1) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto idx = sLength - e; |                             auto idx = sLength - e; | ||||||
|                             swap(inArr, e, idx); |                             swap(inArr, e, idx); | ||||||
|                         } |                         } | ||||||
| @ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                 } |                 } | ||||||
|                 else if (inEWS > 1) { |                 else if (inEWS > 1) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto idx1 = (sLength - e) * inEWS; |                             auto idx1 = (sLength - e) * inEWS; | ||||||
|                             Nd4jLong idx2 = e * inEWS; |                             Nd4jLong idx2 = e * inEWS; | ||||||
|                             swap(inArr, idx1, idx2); |                             swap(inArr, idx1, idx2); | ||||||
| @ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                 else { |                 else { | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto inOffset = shape::getIndexOffset(e, inShapeBuffer); |                             auto inOffset = shape::getIndexOffset(e, inShapeBuffer); | ||||||
|                             auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); |                             auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); | ||||||
|                             swap(outArr, inOffset, outOffset); |                             swap(outArr, inOffset, outOffset); | ||||||
| @ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                 if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { |                 if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (Nd4jLong e = start; e < stop; e += increment) |                         for (Nd4jLong e = start; e < stop; e++) | ||||||
|                             outArr[sLength - e] = inArr[e]; |                             outArr[sLength - e] = inArr[e]; | ||||||
|                     }; |                     }; | ||||||
|                     samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); |                     samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); | ||||||
| 
 | 
 | ||||||
|                     if(inLength != numOfElemsToReverse) { |                     if(inLength != numOfElemsToReverse) { | ||||||
|                         auto f2 = PRAGMA_THREADS_FOR { |                         auto f2 = PRAGMA_THREADS_FOR { | ||||||
|                             for (auto e = start; e < stop; e += increment) |                             for (auto e = start; e < stop; e++) | ||||||
|                                 outArr[e] = inArr[e]; |                                 outArr[e] = inArr[e]; | ||||||
|                         }; |                         }; | ||||||
|                         samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); |                         samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); | ||||||
| @ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                 else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { |                 else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) |                         for (auto e = start; e < stop; e++) | ||||||
|                             outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; |                             outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; | ||||||
|                     }; |                     }; | ||||||
|                     samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); |                     samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); | ||||||
| 
 | 
 | ||||||
|                     if(inLength != numOfElemsToReverse) { |                     if(inLength != numOfElemsToReverse) { | ||||||
|                         auto f2 = PRAGMA_THREADS_FOR { |                         auto f2 = PRAGMA_THREADS_FOR { | ||||||
|                             for (auto e = start; e < stop; e += increment) |                             for (auto e = start; e < stop; e++) | ||||||
|                                 outArr[e * outEWS] = inArr[e * inEWS]; |                                 outArr[e * outEWS] = inArr[e * inEWS]; | ||||||
|                         }; |                         }; | ||||||
|                         samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); |                         samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); | ||||||
| @ -125,7 +125,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                 else { |                 else { | ||||||
| 
 | 
 | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             auto inOffset = shape::getIndexOffset(e, inShapeBuffer); |                             auto inOffset = shape::getIndexOffset(e, inShapeBuffer); | ||||||
|                             auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); |                             auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); | ||||||
|                             outArr[outOffset] = inArr[inOffset]; |                             outArr[outOffset] = inArr[inOffset]; | ||||||
| @ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * | |||||||
|                     if(inLength != numOfElemsToReverse) { |                     if(inLength != numOfElemsToReverse) { | ||||||
| 
 | 
 | ||||||
|                         auto f2 = PRAGMA_THREADS_FOR { |                         auto f2 = PRAGMA_THREADS_FOR { | ||||||
|                             for (auto e = start; e < stop; e += increment) { |                             for (auto e = start; e < stop; e++) { | ||||||
|                                 auto inOffset = shape::getIndexOffset(e, inShapeBuffer); |                                 auto inOffset = shape::getIndexOffset(e, inShapeBuffer); | ||||||
|                                 auto outOffset = shape::getIndexOffset(e, outShapeBuffer); |                                 auto outOffset = shape::getIndexOffset(e, outShapeBuffer); | ||||||
|                                 outArr[outOffset] = inArr[inOffset]; |                                 outArr[outOffset] = inArr[inOffset]; | ||||||
|  | |||||||
| @ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& | |||||||
|     // loop through input array
 |     // loop through input array
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         Nd4jLong coords[MAX_RANK]; |         Nd4jLong coords[MAX_RANK]; | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|             shape::index2coords(i, output.getShapeInfo(), coords); |             shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
| @ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra | |||||||
|     // loop through output array
 |     // loop through output array
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         Nd4jLong coords[MAX_RANK]; |         Nd4jLong coords[MAX_RANK]; | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             shape::index2coords(i, output.getShapeInfo(), coords); |             shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
|             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); |             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); | ||||||
|  | |||||||
| @ -48,7 +48,7 @@ namespace helpers { | |||||||
|             const int total_count = batch_size * input_height * input_width * input_depth; |             const int total_count = batch_size * input_height * input_width * input_depth; | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { |                 for (auto inp_idx = start; inp_idx < stop; inp_idx++) { | ||||||
|                     // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
 |                     // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
 | ||||||
|                     const int d = inp_idx % input_depth; |                     const int d = inp_idx % input_depth; | ||||||
|                     const int inp_idx2 = inp_idx / input_depth; |                     const int inp_idx2 = inp_idx / input_depth; | ||||||
| @ -74,7 +74,7 @@ namespace helpers { | |||||||
|             const int total_count = batch_size * output_depth_by_output_area; |             const int total_count = batch_size * output_depth_by_output_area; | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { |                 for (auto inp_idx = start; inp_idx < stop; inp_idx++) { | ||||||
|                     const int n_iC_oY_bY_oX = inp_idx / block_size; |                     const int n_iC_oY_bY_oX = inp_idx / block_size; | ||||||
|                     const int bX = inp_idx - n_iC_oY_bY_oX * block_size; |                     const int bX = inp_idx - n_iC_oY_bY_oX * block_size; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int | |||||||
| 
 | 
 | ||||||
|         Nd4jLong xCoords[MAX_RANK]; |         Nd4jLong xCoords[MAX_RANK]; | ||||||
| 
 | 
 | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
| 
 | 
 | ||||||
|             shape::index2coords(i, xShapeInfo, xCoords); |             shape::index2coords(i, xShapeInfo, xCoords); | ||||||
| 
 | 
 | ||||||
| @ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& ind | |||||||
| 
 | 
 | ||||||
|     if(outRank == 1) { |     if(outRank == 1) { | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 Nd4jLong idx = indices.e<Nd4jLong>(i); |                 Nd4jLong idx = indices.e<Nd4jLong>(i); | ||||||
|                 NDArray out = output({idx, idx + 1}); |                 NDArray out = output({idx, idx + 1}); | ||||||
| 
 | 
 | ||||||
| @ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& ind | |||||||
|         std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); |         std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0})); |                 NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0})); | ||||||
|                 NDArray updSubArr = updates(i, dimsToExcludeUpd); |                 NDArray updSubArr = updates(i, dimsToExcludeUpd); | ||||||
| 
 | 
 | ||||||
| @ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& i | |||||||
| 
 | 
 | ||||||
|     if(outRank == 1) { |     if(outRank == 1) { | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 Nd4jLong idx = indices.e<Nd4jLong>(i); |                 Nd4jLong idx = indices.e<Nd4jLong>(i); | ||||||
|                 NDArray out = output({idx, idx + 1}); |                 NDArray out = output({idx, idx + 1}); | ||||||
| 
 | 
 | ||||||
| @ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& i | |||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             std::vector<Nd4jLong> idxRangeOut(2*outRank, 0); |             std::vector<Nd4jLong> idxRangeOut(2*outRank, 0); | ||||||
| 
 | 
 | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 NDArray indSubArr = indices(i, dimsToExcludeInd); |                 NDArray indSubArr = indices(i, dimsToExcludeInd); | ||||||
| 
 | 
 | ||||||
|                 for (Nd4jLong j = 0; j < indLastDim; ++j) { |                 for (Nd4jLong j = 0; j < indLastDim; ++j) { | ||||||
| @ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext  *context, const NDArray& indices, NDArr | |||||||
| 
 | 
 | ||||||
|     if(!calcGrad) { |     if(!calcGrad) { | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto subArr = updates(i, dimsToExclude); |                 auto subArr = updates(i, dimsToExclude); | ||||||
|                 output.p(i, subArr.e(indices.e<Nd4jLong>(i))); |                 output.p(i, subArr.e(indices.e<Nd4jLong>(i))); | ||||||
|             } |             } | ||||||
| @ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext  *context, const NDArray& indices, NDArr | |||||||
|         samediff::Threads::parallel_for(func, 0, indicesLen); |         samediff::Threads::parallel_for(func, 0, indicesLen); | ||||||
|     } else { |     } else { | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto subArr = updates(i, dimsToExclude); |                 auto subArr = updates(i, dimsToExclude); | ||||||
|                 auto ind = indices.e<Nd4jLong>(i); |                 auto ind = indices.e<Nd4jLong>(i); | ||||||
|                 subArr.p(ind, subArr.e(ind) - 1.); |                 subArr.p(ind, subArr.e(ind) - 1.); | ||||||
|  | |||||||
| @ -169,7 +169,7 @@ namespace helpers { | |||||||
|             for (int i = 1; i < indices->lengthOf(); i++) { |             for (int i = 1; i < indices->lengthOf(); i++) { | ||||||
|                 if (indices->e<int>(i) == idx) { |                 if (indices->e<int>(i) == idx) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e)); |                             meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e)); | ||||||
|                         } |                         } | ||||||
|                     }; |                     }; | ||||||
| @ -223,7 +223,7 @@ namespace helpers { | |||||||
|             for (int i = 0; i < indices->lengthOf(); i++) { |             for (int i = 0; i < indices->lengthOf(); i++) { | ||||||
|                 if (indices->e<int>(i) == idx) { |                 if (indices->e<int>(i) == idx) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e)); |                             sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e)); | ||||||
|                         } |                         } | ||||||
|                     }; |                     }; | ||||||
| @ -272,7 +272,7 @@ namespace helpers { | |||||||
|             for (int i = 1; i < indices->lengthOf(); i++) { |             for (int i = 1; i < indices->lengthOf(); i++) { | ||||||
|                 if (indices->e<int>(i)  == idx) { |                 if (indices->e<int>(i)  == idx) { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto e = start; e < stop; e += increment) { |                         for (auto e = start; e < stop; e++) { | ||||||
|                             sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e)); |                             sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e)); | ||||||
|                         } |                         } | ||||||
|                     }; |                     }; | ||||||
| @ -625,7 +625,7 @@ namespace helpers { | |||||||
|             Nd4jLong loop_size = input->lengthOf(); |             Nd4jLong loop_size = input->lengthOf(); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(e); |                     auto classNum = indices->e<Nd4jLong>(e); | ||||||
|                     if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6)) |                     if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6)) | ||||||
|                         output->p(e, gradOut->e<T>(classNum)); |                         output->p(e, gradOut->e<T>(classNum)); | ||||||
| @ -645,7 +645,7 @@ namespace helpers { | |||||||
|             //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
 |             //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
 | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(i); |                     auto classNum = indices->e<Nd4jLong>(i); | ||||||
|                     auto current = listOfTensors.at(i); |                     auto current = listOfTensors.at(i); | ||||||
|                     auto currentOut = listOfOutTensors.at(i); |                     auto currentOut = listOfOutTensors.at(i); | ||||||
| @ -675,7 +675,7 @@ namespace helpers { | |||||||
|         segmentMinFunctor(context, input, indices, &tempRes); |         segmentMinFunctor(context, input, indices, &tempRes); | ||||||
|         if (input->isVector()) { |         if (input->isVector()) { | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(e); |                     auto classNum = indices->e<Nd4jLong>(e); | ||||||
|                     if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5) |                     if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5) | ||||||
|                         output->p(e, gradOut->e<double>(classNum)); |                         output->p(e, gradOut->e<double>(classNum)); | ||||||
| @ -697,7 +697,7 @@ namespace helpers { | |||||||
|             int pos = 0; |             int pos = 0; | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(i); |                     auto classNum = indices->e<Nd4jLong>(i); | ||||||
|                     auto current = listOfTensors.at(i); |                     auto current = listOfTensors.at(i); | ||||||
|                     auto currentOut = listOfOutTensors.at(i); |                     auto currentOut = listOfOutTensors.at(i); | ||||||
| @ -887,7 +887,7 @@ namespace helpers { | |||||||
|         if (input->isVector()) { |         if (input->isVector()) { | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(e); |                     auto classNum = indices->e<Nd4jLong>(e); | ||||||
|                     if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6) |                     if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6) | ||||||
|                         output->t<T>(e) = gradOut->t<T>(classNum); |                         output->t<T>(e) = gradOut->t<T>(classNum); | ||||||
| @ -1004,7 +1004,7 @@ namespace helpers { | |||||||
|         unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); |         unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); | ||||||
|         if (input->isVector()) { |         if (input->isVector()) { | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto classNum = indices->e<Nd4jLong>(e); |                     auto classNum = indices->e<Nd4jLong>(e); | ||||||
|                     output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e)); |                     output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e)); | ||||||
|                 } |                 } | ||||||
|  | |||||||
| @ -364,7 +364,7 @@ namespace nd4j { | |||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         T sneu1e[600]; |                         T sneu1e[600]; | ||||||
| 
 | 
 | ||||||
|                         for (auto t = start; t < stop; t += increment) { |                         for (auto t = start; t < stop; t++) { | ||||||
|                             T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; |                             T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; | ||||||
|                             memset(neu1e, 0, vectorLength * sizeof(T)); |                             memset(neu1e, 0, vectorLength * sizeof(T)); | ||||||
| 
 | 
 | ||||||
| @ -457,7 +457,7 @@ namespace nd4j { | |||||||
|                     T sneu1[600]; |                     T sneu1[600]; | ||||||
|                     T sneu1e[600]; |                     T sneu1e[600]; | ||||||
| 
 | 
 | ||||||
|                     for (int e = start; e < stop; e += increment) { |                     for (int e = start; e < stop; e++) { | ||||||
|                         T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; |                         T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; | ||||||
|                         T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; |                         T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -40,7 +40,7 @@ namespace helpers { | |||||||
|         output->assign(input); |         output->assign(input); | ||||||
| 
 | 
 | ||||||
|         auto batchLoop = PRAGMA_THREADS_FOR { |         auto batchLoop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto batch = start; batch < stop; batch += increment) { |             for (auto batch = start; batch < stop; batch++) { | ||||||
|                 for (auto r = 0; r < rows; r++) { |                 for (auto r = 0; r < rows; r++) { | ||||||
|                     for (auto c = 0; c < r; c++) { |                     for (auto c = 0; c < r; c++) { | ||||||
|                         math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r)); |                         math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r)); | ||||||
|  | |||||||
| @ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray | |||||||
|     T* pCt   = ct->bufferAsT<T>(); |     T* pCt   = ct->bufferAsT<T>(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto col = start; col < stop; col += increment) { |         for (auto col = start; col < stop; col++) { | ||||||
|             const auto colNum = col % d2; |             const auto colNum = col % d2; | ||||||
|             bool flip = colNum >= K; |             bool flip = colNum >= K; | ||||||
|             T maskVal = mask ? *(pMask + col) : T(1); |             T maskVal = mask ? *(pMask + col) : T(1); | ||||||
| @ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr | |||||||
|     T* pGradInit  = gradC0->bufferAsT<T>(); |     T* pGradInit  = gradC0->bufferAsT<T>(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto col = start; col < stop; col += increment) { |         for (auto col = start; col < stop; col++) { | ||||||
|             T gbF = 0.f; |             T gbF = 0.f; | ||||||
|             T gbR = 0.f; |             T gbR = 0.f; | ||||||
|             const auto colNum = col % d2; |             const auto colNum = col % d2; | ||||||
|  | |||||||
| @ -37,7 +37,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c | |||||||
| 	    int inSize = inArrs.size(); | 	    int inSize = inArrs.size(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) |             for (auto i = start; i < stop; i++) | ||||||
|                 outArr->p<T>(i, inArrs[i]->t<T>(0)); |                 outArr->p<T>(i, inArrs[i]->t<T>(0)); | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
| @ -50,7 +50,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c | |||||||
|         int listSize = list.size(); |         int listSize = list.size(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) |             for (auto i = start; i < stop; i++) | ||||||
|                 list.at(i)->assign(inArrs[i]); |                 list.at(i)->assign(inArrs[i]); | ||||||
|         }; |         }; | ||||||
|         samediff::Threads::parallel_tad(func, 0, listSize); |         samediff::Threads::parallel_tad(func, 0, listSize); | ||||||
|  | |||||||
| @ -150,7 +150,7 @@ namespace helpers { | |||||||
|             result->assign(0); |             result->assign(0); | ||||||
|             if (status == ND4J_STATUS_OK) { |             if (status == ND4J_STATUS_OK) { | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto e = start; e < stop; e += increment) { |                     for (auto e = start; e < stop; e++) { | ||||||
|                         bool found = false; |                         bool found = false; | ||||||
|                         for (int j = 0; j < k; j++) { |                         for (int j = 0; j < k; j++) { | ||||||
|                             if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) { |                             if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) { | ||||||
|  | |||||||
| @ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N | |||||||
|     int dLen = dOdI.lengthOf(); |     int dLen = dOdI.lengthOf(); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             if (dOdI.t<T>(i) != static_cast<T>(0.f)) |             if (dOdI.t<T>(i) != static_cast<T>(0.f)) | ||||||
|                 dOdI.t<T>(i) = static_cast<T>(1.f); |                 dOdI.t<T>(i) = static_cast<T>(1.f); | ||||||
|         } |         } | ||||||
| @ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) { | |||||||
|     auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); |     auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             output.p(i, setOfSubArrs.at(i)->getTrace()); |             output.p(i, setOfSubArrs.at(i)->getTrace()); | ||||||
|     }; |     }; | ||||||
|     samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); |     samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); | ||||||
| @ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray | |||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             Nd4jLong coords[MAX_RANK]; |             Nd4jLong coords[MAX_RANK]; | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 shape::index2coords(i, output.getShapeInfo(), coords); |                 shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
|                 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); |                 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
| @ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray | |||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             Nd4jLong coords[MAX_RANK]; |             Nd4jLong coords[MAX_RANK]; | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 shape::index2coords(i, output.getShapeInfo(), coords); |                 shape::index2coords(i, output.getShapeInfo(), coords); | ||||||
|                 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); |                 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); | ||||||
| 
 | 
 | ||||||
| @ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { | |||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         Nd4jLong coords[MAX_RANK * 3]; |         Nd4jLong coords[MAX_RANK * 3]; | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             Nd4jLong *zCoordStart, *xCoordStart; |             Nd4jLong *zCoordStart, *xCoordStart; | ||||||
| 
 | 
 | ||||||
|             if (yLastDim == xRank) { |             if (yLastDim == xRank) { | ||||||
| @ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con | |||||||
|         else if (input->rankOf() == 1 && indices->isVector()) { |         else if (input->rankOf() == 1 && indices->isVector()) { | ||||||
|             // special case
 |             // special case
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto e = start; e < stop; e += increment) |                 for (auto e = start; e < stop; e++) | ||||||
|                     output->p(e, input->e<T>(indices->e<Nd4jLong>(e))); |                     output->p(e, input->e<T>(indices->e<Nd4jLong>(e))); | ||||||
|             }; |             }; | ||||||
| 
 | 
 | ||||||
| @ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con | |||||||
|             const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); |             const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     NDArray subArrOut = (*output)(i, dimsOut); |                     NDArray subArrOut = (*output)(i, dimsOut); | ||||||
|                     NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis}); |                     NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis}); | ||||||
|                     subArrOut.assign(subArrIn); |                     subArrOut.assign(subArrIn); | ||||||
| @ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con | |||||||
|             const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); |             const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     NDArray subArrOut = (*output)(i, {axis}); |                     NDArray subArrOut = (*output)(i, {axis}); | ||||||
|                     NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); |                     NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); | ||||||
|                     subArrOut.assign(subArrIn); |                     subArrOut.assign(subArrIn); | ||||||
| @ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { | |||||||
|     auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); |     auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             arrs.at(i)->setIdentity(); |             arrs.at(i)->setIdentity(); | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
| @ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat | |||||||
|         indices.push_back((*intArgs)[e]); |         indices.push_back((*intArgs)[e]); | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) { |         for (auto i = start; i < stop; i++) { | ||||||
|             auto inSubArr = input(indices[i], dimsToExclude, true); |             auto inSubArr = input(indices[i], dimsToExclude, true); | ||||||
|             auto updSubArr = updates(i, dimsToExclude, true); |             auto updSubArr = updates(i, dimsToExclude, true); | ||||||
| 
 | 
 | ||||||
| @ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input | |||||||
| 
 | 
 | ||||||
|         case 6: {   // copy
 |         case 6: {   // copy
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto inSubArr = input(i, dimensions); |                     auto inSubArr = input(i, dimensions); | ||||||
|                     inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i)); |                     inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i)); | ||||||
|                 } |                 } | ||||||
| @ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output) | |||||||
|     auto x = inArrs[0]; |     auto x = inArrs[0]; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             T max = -DataTypeUtils::max<T>(); |             T max = -DataTypeUtils::max<T>(); | ||||||
|             Nd4jLong idx = 0; |             Nd4jLong idx = 0; | ||||||
| 
 | 
 | ||||||
| @ -839,7 +839,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) { | |||||||
|     auto x = inArrs[0]; |     auto x = inArrs[0]; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             T max = -DataTypeUtils::max<T>(); |             T max = -DataTypeUtils::max<T>(); | ||||||
|             for (int i = 0; i < numArgs; i++) { |             for (int i = 0; i < numArgs; i++) { | ||||||
|                 T v = inArrs[i]->e<T>(e); |                 T v = inArrs[i]->e<T>(e); | ||||||
| @ -865,7 +865,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) { | |||||||
|     auto x = inArrs[0]; |     auto x = inArrs[0]; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             T sum = 0.; |             T sum = 0.; | ||||||
|             for (int i = 0; i < numArgs; i++) { |             for (int i = 0; i < numArgs; i++) { | ||||||
|                 T v = inArrs[i]->e<T>(e); |                 T v = inArrs[i]->e<T>(e); | ||||||
| @ -891,7 +891,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) { | |||||||
|     auto x = inArrs[0]; |     auto x = inArrs[0]; | ||||||
| 
 | 
 | ||||||
|     auto func = PRAGMA_THREADS_FOR { |     auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto e = start; e < stop; e += increment) { |         for (auto e = start; e < stop; e++) { | ||||||
|             T sum = (T) 0.f; |             T sum = (T) 0.f; | ||||||
|             for (int i = 0; i < numArgs; i++) |             for (int i = 0; i < numArgs; i++) | ||||||
|                 sum += inArrs[i]->e<T>(e); |                 sum += inArrs[i]->e<T>(e); | ||||||
| @ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>& | |||||||
|             auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); |             auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     const T iNormActual = norm2.e<T>(i); |                     const T iNormActual = norm2.e<T>(i); | ||||||
|                     if (iNormActual > normClip) |                     if (iNormActual > normClip) | ||||||
|                         *listOfInSubArrs.at(i) *= normClip / iNormActual; |                         *listOfInSubArrs.at(i) *= normClip / iNormActual; | ||||||
| @ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>& | |||||||
|             auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); |             auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     auto inputSubArr = listOfInSubArrs.at(i); |                     auto inputSubArr = listOfInSubArrs.at(i); | ||||||
|                     auto outputSubArr = listOfOutSubArrs.at(i); |                     auto outputSubArr = listOfOutSubArrs.at(i); | ||||||
|                     outputSubArr->assign(inputSubArr); |                     outputSubArr->assign(inputSubArr); | ||||||
| @ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g | |||||||
|         auto cn = clipNorm.e<T>(0); |         auto cn = clipNorm.e<T>(0); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 T N = norm2.e<T>(i); |                 T N = norm2.e<T>(i); | ||||||
| 
 | 
 | ||||||
|                 auto gradOSubArr = gradOSubArrs.at(i); |                 auto gradOSubArr = gradOSubArrs.at(i); | ||||||
| @ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o | |||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             Nd4jLong inIdx[MAX_RANK]; |             Nd4jLong inIdx[MAX_RANK]; | ||||||
|             Nd4jLong outIdx[MAX_RANK]; |             Nd4jLong outIdx[MAX_RANK]; | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 shape::index2coords(i, output.getShapeInfo(), outIdx); |                 shape::index2coords(i, output.getShapeInfo(), outIdx); | ||||||
| 
 | 
 | ||||||
|                 for (int j = 0; j < rank; ++j) { |                 for (int j = 0; j < rank; ++j) { | ||||||
| @ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o | |||||||
| 
 | 
 | ||||||
|     BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); |     BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); | ||||||
| 
 | 
 | ||||||
| //////////////////////////////////////////////////////////////////////////
 |  | ||||||
| template<typename T> |  | ||||||
| static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) { |  | ||||||
|     nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
|     void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) { |  | ||||||
|         BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); |  | ||||||
| 
 | 
 | ||||||
| //////////////////////////////////////////////////////////////////////////
 | //////////////////////////////////////////////////////////////////////////
 | ||||||
| template <typename T> | template <typename T> | ||||||
|  | |||||||
| @ -90,7 +90,7 @@ namespace helpers { | |||||||
|         auto outputPart = output->allTensorsAlongDimension({-2, -1}); |         auto outputPart = output->allTensorsAlongDimension({-2, -1}); | ||||||
| 
 | 
 | ||||||
|         auto batchLoop = PRAGMA_THREADS_FOR { |         auto batchLoop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 if (lower) { |                 if (lower) { | ||||||
|                     lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); |                     lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); | ||||||
|                 } else { |                 } else { | ||||||
| @ -112,7 +112,7 @@ namespace helpers { | |||||||
|         auto rows = input->sizeAt(-2); |         auto rows = input->sizeAt(-2); | ||||||
| 
 | 
 | ||||||
|         auto batchLoop = PRAGMA_THREADS_FOR { |         auto batchLoop = PRAGMA_THREADS_FOR { | ||||||
|             for (auto batch = start; batch < stop; batch += increment) { |             for (auto batch = start; batch < stop; batch++) { | ||||||
|                 if (!lower) { |                 if (!lower) { | ||||||
|                     for (auto r = 0; r < rows; r++) { |                     for (auto r = 0; r < rows; r++) { | ||||||
|                         for (auto c = 0; c <= r; c++) { |                         for (auto c = 0; c <= r; c++) { | ||||||
|  | |||||||
| @ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray | |||||||
| 	int xLen = x.lengthOf(); | 	int xLen = x.lengthOf(); | ||||||
| 
 | 
 | ||||||
| 	auto func = PRAGMA_THREADS_FOR { | 	auto func = PRAGMA_THREADS_FOR { | ||||||
|         for (auto i = start; i < stop; i += increment) |         for (auto i = start; i < stop; i++) | ||||||
|             z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i))); |             z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i))); | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND | |||||||
|         int tads = tadsA.size(); |         int tads = tadsA.size(); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 auto a_ = tadsA.at(e); |                 auto a_ = tadsA.at(e); | ||||||
|                 auto b_ = tadsB.at(e); |                 auto b_ = tadsB.at(e); | ||||||
|                 auto o_ = tadsO.at(e); |                 auto o_ = tadsO.at(e); | ||||||
|  | |||||||
| @ -69,7 +69,7 @@ namespace helpers { | |||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 values->p(e, static_cast<T>(valuesVector[e])); |                 values->p(e, static_cast<T>(valuesVector[e])); | ||||||
|                 if (counts != nullptr) |                 if (counts != nullptr) | ||||||
|                     counts->p(e, countsMap[valuesVector[e]]); |                     counts->p(e, countsMap[valuesVector[e]]); | ||||||
|  | |||||||
| @ -19,8 +19,10 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); | ||||||
|  | 
 | ||||||
|  |     BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); | ||||||
| } | } | ||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_double.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); |     BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ | |||||||
| // @author raver119@gmail.com
 | // @author raver119@gmail.com
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
| #include "../specials.hpp" | #include "../specials_single.hpp" | ||||||
| 
 | 
 | ||||||
| namespace nd4j { | namespace nd4j { | ||||||
|     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); |     BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); | ||||||
|  | |||||||
| @ -34,7 +34,7 @@ namespace nd4j { | |||||||
| 
 | 
 | ||||||
|             // handle transpose in parallel
 |             // handle transpose in parallel
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto r = start; r < stop; r += increment) { |                 for (auto r = start; r < stop; r++) { | ||||||
|                     for (int c = 0; c < cols; c++) { |                     for (int c = 0; c < cols; c++) { | ||||||
|                         int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); |                         int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); | ||||||
|                         int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); |                         int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); | ||||||
| @ -73,7 +73,7 @@ namespace nd4j { | |||||||
|                         C[r] = z; |                         C[r] = z; | ||||||
|                 } else { |                 } else { | ||||||
|                     auto func = PRAGMA_THREADS_FOR { |                     auto func = PRAGMA_THREADS_FOR { | ||||||
|                         for (auto r = start; r < stop; r += increment) |                         for (auto r = start; r < stop; r++) | ||||||
|                             C[r] = z; |                             C[r] = z; | ||||||
|                     }; |                     }; | ||||||
|                     samediff::Threads::parallel_for(func, 0, length); |                     samediff::Threads::parallel_for(func, 0, length); | ||||||
| @ -130,7 +130,7 @@ namespace nd4j { | |||||||
|             auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x; |             auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x; | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto r = start; r < stop; r += increment) { |                 for (auto r = start; r < stop; r++) { | ||||||
|                     int aIdx = linearIndexC(M, N, r, 0); |                     int aIdx = linearIndexC(M, N, r, 0); | ||||||
|                     auto aX = aT + aIdx; |                     auto aX = aT + aIdx; | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										270
									
								
								libnd4j/include/ops/impl/specials_double.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										270
									
								
								libnd4j/include/ops/impl/specials_double.hpp
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,270 @@ | |||||||
|  | /*******************************************************************************
 | ||||||
|  |  * Copyright (c) 2015-2018 Skymind, Inc. | ||||||
|  |  * | ||||||
|  |  * This program and the accompanying materials are made available under the | ||||||
|  |  * terms of the Apache License, Version 2.0 which is available at | ||||||
|  |  * https://www.apache.org/licenses/LICENSE-2.0.
 | ||||||
|  |  * | ||||||
|  |  * Unless required by applicable law or agreed to in writing, software | ||||||
|  |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||||||
|  |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||||||
|  |  * License for the specific language governing permissions and limitations | ||||||
|  |  * under the License. | ||||||
|  |  * | ||||||
|  |  * SPDX-License-Identifier: Apache-2.0 | ||||||
|  |  ******************************************************************************/ | ||||||
|  | 
 | ||||||
|  | //
 | ||||||
|  | // @author raver119@gmail.com, created on 07.10.2017.
 | ||||||
|  | // @author Yurii Shyrma (iuriish@yahoo.com)
 | ||||||
|  | //
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #include <pointercast.h> | ||||||
|  | #include <helpers/shape.h> | ||||||
|  | #include <helpers/TAD.h> | ||||||
|  | #include <specials.h> | ||||||
|  | #include <dll.h> | ||||||
|  | #include <NDArray.h> | ||||||
|  | #include <ops/declarable/CustomOperations.h> | ||||||
|  | #include <types/types.h> | ||||||
|  | #include <helpers/Loops.h> | ||||||
|  | 
 | ||||||
|  | namespace nd4j { | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     template<typename S, typename T> | ||||||
|  |     void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { | ||||||
|  |         auto x = reinterpret_cast<S *>(dx); | ||||||
|  |         auto z = reinterpret_cast<T *>(dz); | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |         auto func = PRAGMA_THREADS_FOR { | ||||||
|  |             for (auto i = start; i < stop; i++) { | ||||||
|  |                 z[i] = static_cast<T>(x[i]); | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         samediff::Threads::parallel_for(func, 0, N); | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { | ||||||
|  |         int i = left, j = right; | ||||||
|  |         X ktmp; | ||||||
|  |         X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; | ||||||
|  | 
 | ||||||
|  |         Y vtmp; | ||||||
|  | 
 | ||||||
|  |         { | ||||||
|  |             /* PARTITION PART */ | ||||||
|  |             while (i <= j) { | ||||||
|  |                 if (descending) { | ||||||
|  |                     while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) | ||||||
|  |                         i++; | ||||||
|  |                     while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) | ||||||
|  |                         j--; | ||||||
|  |                     if (i <= j) { | ||||||
|  |                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; | ||||||
|  | 
 | ||||||
|  |                         vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; | ||||||
|  |                         values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; | ||||||
|  |                         values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; | ||||||
|  | 
 | ||||||
|  |                         i++; | ||||||
|  |                         j--; | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) | ||||||
|  |                         i++; | ||||||
|  |                     while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) | ||||||
|  |                         j--; | ||||||
|  |                     if (i <= j) { | ||||||
|  |                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; | ||||||
|  | 
 | ||||||
|  |                         vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; | ||||||
|  |                         values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; | ||||||
|  |                         values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; | ||||||
|  | 
 | ||||||
|  |                         i++; | ||||||
|  |                         j--; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         //
 | ||||||
|  | 
 | ||||||
|  |         if ( ((right-left)<cutoff) ){ | ||||||
|  |             if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); } | ||||||
|  |             if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } | ||||||
|  | 
 | ||||||
|  |         }else{ | ||||||
|  | PRAGMA_OMP_TASK | ||||||
|  |             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); } | ||||||
|  | PRAGMA_OMP_TASK | ||||||
|  |             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { | ||||||
|  |         int i = left, j = right; | ||||||
|  |         X ktmp; | ||||||
|  |         Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; | ||||||
|  | 
 | ||||||
|  |         Y vtmp; | ||||||
|  | 
 | ||||||
|  |         { | ||||||
|  |             /* PARTITION PART */ | ||||||
|  |             while (i <= j) { | ||||||
|  |                 if (descending) { | ||||||
|  |                     while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) | ||||||
|  |                         i++; | ||||||
|  |                     while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) | ||||||
|  |                         j--; | ||||||
|  |                     if (i <= j) { | ||||||
|  |                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; | ||||||
|  | 
 | ||||||
|  |                         vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; | ||||||
|  |                         value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; | ||||||
|  |                         value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; | ||||||
|  | 
 | ||||||
|  |                         i++; | ||||||
|  |                         j--; | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) | ||||||
|  |                         i++; | ||||||
|  |                     while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) | ||||||
|  |                         j--; | ||||||
|  |                     if (i <= j) { | ||||||
|  |                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; | ||||||
|  |                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; | ||||||
|  | 
 | ||||||
|  |                         vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; | ||||||
|  |                         value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; | ||||||
|  |                         value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; | ||||||
|  | 
 | ||||||
|  |                         i++; | ||||||
|  |                         j--; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         //
 | ||||||
|  | 
 | ||||||
|  |         if ( ((right-left)<cutoff) ){ | ||||||
|  |             if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); } | ||||||
|  |             if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); } | ||||||
|  | 
 | ||||||
|  |         }else{ | ||||||
|  | PRAGMA_OMP_TASK | ||||||
|  |             { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); } | ||||||
|  | PRAGMA_OMP_TASK | ||||||
|  |             { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ | ||||||
|  |         auto array = reinterpret_cast<X *>(varray); | ||||||
|  |         auto values = reinterpret_cast<Y *>(yarray); | ||||||
|  |         int cutoff = 1000; | ||||||
|  | 
 | ||||||
|  |         PRAGMA_OMP_PARALLEL_THREADS(numThreads) | ||||||
|  |         { | ||||||
|  | PRAGMA_OMP_SINGLE_ARGS(nowait) | ||||||
|  |             { | ||||||
|  |                 quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ | ||||||
|  |         auto array = reinterpret_cast<X *>(varray); | ||||||
|  |         auto values = reinterpret_cast<Y *>(yarray); | ||||||
|  |         int cutoff = 1000; | ||||||
|  | 
 | ||||||
|  |         PRAGMA_OMP_PARALLEL_THREADS(numThreads) | ||||||
|  |         { | ||||||
|  | PRAGMA_OMP_SINGLE_ARGS(nowait) | ||||||
|  |             { | ||||||
|  |                 quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { | ||||||
|  |         quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { | ||||||
|  |         quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { | ||||||
|  |         auto x = reinterpret_cast<X*>(vx); | ||||||
|  |         auto y = reinterpret_cast<Y*>(vy); | ||||||
|  | 
 | ||||||
|  |         auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); | ||||||
|  |         auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); | ||||||
|  | 
 | ||||||
|  |         auto xLength = shape::length(xShapeInfo); | ||||||
|  |         auto xTadLength = shape::length(packX.primaryShapeInfo()); | ||||||
|  |         auto numTads = packX.numberOfTads(); | ||||||
|  | 
 | ||||||
|  |         auto func = PRAGMA_THREADS_FOR { | ||||||
|  |             for (auto r = start; r < stop; r++) { | ||||||
|  |                 auto dx = x + packX.primaryOffsets()[r]; | ||||||
|  |                 auto dy = y + packY.primaryOffsets()[r]; | ||||||
|  | 
 | ||||||
|  |                 quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         samediff::Threads::parallel_tad(func, 0, numTads); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     template <typename X, typename Y> | ||||||
|  |     void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { | ||||||
|  |         auto x = reinterpret_cast<X*>(vx); | ||||||
|  |         auto y = reinterpret_cast<Y*>(vy); | ||||||
|  | 
 | ||||||
|  |         auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); | ||||||
|  |         auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); | ||||||
|  | 
 | ||||||
|  |         auto xLength = shape::length(xShapeInfo); | ||||||
|  |         auto xTadLength = shape::length(packX.primaryShapeInfo()); | ||||||
|  |         auto numTads = packX.numberOfTads(); | ||||||
|  | 
 | ||||||
|  |         auto func = PRAGMA_THREADS_FOR { | ||||||
|  |             for (auto r = start; r < stop; r++) { | ||||||
|  |                 auto dx = x + packX.primaryOffsets()[r]; | ||||||
|  |                 auto dy = y + packY.primaryOffsets()[r]; | ||||||
|  | 
 | ||||||
|  |                 quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         samediff::Threads::parallel_tad(func, 0, numTads); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| @ -64,7 +64,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND | |||||||
|                 T* outBuff = output.bufferAsT<T>(); |                 T* outBuff = output.bufferAsT<T>(); | ||||||
| 
 | 
 | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (auto r = start; r < stop; r += increment) { |                     for (auto r = start; r < stop; r++) { | ||||||
|                         const Nd4jLong arrLen = inArrs[r]->lengthOf(); |                         const Nd4jLong arrLen = inArrs[r]->lengthOf(); | ||||||
|                         const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; |                         const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; | ||||||
| 
 | 
 | ||||||
| @ -99,7 +99,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND | |||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 auto temp = output(indices[i], true); |                 auto temp = output(indices[i], true); | ||||||
|                 nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1); |                 nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1); | ||||||
|             } |             } | ||||||
| @ -143,7 +143,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint | |||||||
|         auto x = reinterpret_cast<T **>(vx); |         auto x = reinterpret_cast<T **>(vx); | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto i = start; i < stop; i += increment) { |             for (auto i = start; i < stop; i++) { | ||||||
|                 for (auto ar = 0L; ar < n; ar++) { |                 for (auto ar = 0L; ar < n; ar++) { | ||||||
|                     z[i] += x[ar][i]; |                     z[i] += x[ar][i]; | ||||||
|                 } |                 } | ||||||
| @ -179,7 +179,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint | |||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     for (Nd4jLong ar = 1; ar < n; ar++) { |                     for (Nd4jLong ar = 1; ar < n; ar++) { | ||||||
|                         z[i] += x[ar][i] / static_cast<T>(n); |                         z[i] += x[ar][i] / static_cast<T>(n); | ||||||
|                     } |                     } | ||||||
| @ -199,7 +199,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint | |||||||
| 
 | 
 | ||||||
|             // aggregation step
 |             // aggregation step
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (auto i = start; i < stop; i += increment) { |                 for (auto i = start; i < stop; i++) { | ||||||
|                     for (Nd4jLong ar = 0; ar < n; ar++) { |                     for (Nd4jLong ar = 0; ar < n; ar++) { | ||||||
|                         z[i] += x[ar][i] / static_cast<T>(n); |                         z[i] += x[ar][i] / static_cast<T>(n); | ||||||
|                     } |                     } | ||||||
| @ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) | |||||||
|         int numTads = xLength / xTadLength; |         int numTads = xLength / xTadLength; | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto r = start; r < stop; r += increment) { |             for (auto r = start; r < stop; r++) { | ||||||
|                 T *dx = x + tadOffsets[r]; |                 T *dx = x + tadOffsets[r]; | ||||||
| 
 | 
 | ||||||
|                 quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); |                 quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); | ||||||
| @ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|         auto func = PRAGMA_THREADS_FOR { |         auto func = PRAGMA_THREADS_FOR { | ||||||
|             for (auto e = start; e < stop; e += increment) { |             for (auto e = start; e < stop; e++) { | ||||||
|                 for (int bitId = 0; bitId < 16; bitId++) { |                 for (int bitId = 0; bitId < 16; bitId++) { | ||||||
|                     bool hasBit = (x[e] & 1 << (bitId)) != 0; |                     bool hasBit = (x[e] & 1 << (bitId)) != 0; | ||||||
|                     bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; |                     bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; | ||||||
| @ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) | |||||||
|         samediff::Threads::parallel_for(func, 4, lim); |         samediff::Threads::parallel_for(func, 4, lim); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     template<typename S, typename T> |  | ||||||
|     void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { |  | ||||||
|         auto x = reinterpret_cast<S *>(dx); |  | ||||||
|         auto z = reinterpret_cast<T *>(dz); |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|         auto func = PRAGMA_THREADS_FOR { |  | ||||||
|             for (auto i = start; i < stop; i += increment) { |  | ||||||
|                 z[i] = static_cast<T>(x[i]); |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         samediff::Threads::parallel_for(func, 0, N); |  | ||||||
|     }; |  | ||||||
|     BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); |  | ||||||
| 
 |  | ||||||
|     template<typename T> |     template<typename T> | ||||||
|     Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { |     Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { | ||||||
|         auto dx = reinterpret_cast<T *>(vx); |         auto dx = reinterpret_cast<T *>(vx); | ||||||
| @ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) | |||||||
|         }; |         }; | ||||||
|         return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); |         return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); | ||||||
|     } |     } | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { |  | ||||||
|         int i = left, j = right; |  | ||||||
|         X ktmp; |  | ||||||
|         X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; |  | ||||||
| 
 |  | ||||||
|         Y vtmp; |  | ||||||
| 
 |  | ||||||
|         { |  | ||||||
|             /* PARTITION PART */ |  | ||||||
|             while (i <= j) { |  | ||||||
|                 if (descending) { |  | ||||||
|                     while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) |  | ||||||
|                         i++; |  | ||||||
|                     while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) |  | ||||||
|                         j--; |  | ||||||
|                     if (i <= j) { |  | ||||||
|                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; |  | ||||||
|                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; |  | ||||||
|                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; |  | ||||||
| 
 |  | ||||||
|                         vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; |  | ||||||
|                         values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; |  | ||||||
|                         values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; |  | ||||||
| 
 |  | ||||||
|                         i++; |  | ||||||
|                         j--; |  | ||||||
|                     } |  | ||||||
|                 } else { |  | ||||||
|                     while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) |  | ||||||
|                         i++; |  | ||||||
|                     while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) |  | ||||||
|                         j--; |  | ||||||
|                     if (i <= j) { |  | ||||||
|                         ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; |  | ||||||
|                         key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; |  | ||||||
|                         key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; |  | ||||||
| 
 |  | ||||||
|                         vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; |  | ||||||
|                         values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; |  | ||||||
|                         values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; |  | ||||||
| 
 |  | ||||||
|                         i++; |  | ||||||
|                         j--; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
| 
 |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         //
 |  | ||||||
| 
 |  | ||||||
|         if ( ((right-left)<cutoff) ){ |  | ||||||
|             if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); } |  | ||||||
|             if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } |  | ||||||
| 
 |  | ||||||
|         }else{ |  | ||||||
| PRAGMA_OMP_TASK |  | ||||||
|             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); } |  | ||||||
| PRAGMA_OMP_TASK |  | ||||||
|             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
    // In-place parallel quicksort over TWO companion buffers, ordered by the VALUE
    // buffer: `value` defines the sort order and each swap is mirrored on `key` so
    // the key/value pairing is preserved. Counterpart of
    // quickSort_parallel_internal_key, which orders by the key buffer instead.
    //
    // key / xShapeInfo   : companion buffer + its shape descriptor (elements are
    //                      addressed via shape::getIndexOffset, so non-contiguous
    //                      layouts are handled)
    // value / yShapeInfo : buffer that defines the ordering + its shape descriptor
    // left, right        : inclusive bounds of the sub-range being sorted
    // cutoff             : sub-ranges smaller than this recurse sequentially
    //                      instead of spawning OMP tasks
    // descending         : true -> high-to-low, false -> low-to-high
    template <typename X, typename Y>
    void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
        int i = left, j = right;
        X ktmp;
        // Pivot value is taken from the middle element of the current sub-range.
        Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];

        Y vtmp;

        {
            /* PARTITION PART */
            // Hoare-style partition: move i up / j down until each finds an element
            // on the wrong side of the pivot, then swap BOTH the value and its
            // paired key. The descending/ascending branches differ only in the
            // comparison direction.
            while (i <= j) {
                if (descending) {
                    while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
                        i++;
                    while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
                        j--;
                    if (i <= j) {
                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;

                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;

                        i++;
                        j--;
                    }
                } else {
                    while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
                        i++;
                    while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
                        j--;
                    if (i <= j) {
                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;

                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;

                        i++;
                        j--;
                    }
                }
            }

        }

        //

        // Recurse on both halves: sequentially for small sub-ranges, via OMP tasks
        // for large ones so the two halves can be sorted concurrently.
        if ( ((right-left)<cutoff) ){
            if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
            if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }

        }else{
PRAGMA_OMP_TASK
            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
PRAGMA_OMP_TASK
            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
        }
    }
| 
 |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ |  | ||||||
|         auto array = reinterpret_cast<X *>(varray); |  | ||||||
|         auto values = reinterpret_cast<Y *>(yarray); |  | ||||||
|         int cutoff = 1000; |  | ||||||
| 
 |  | ||||||
|         PRAGMA_OMP_PARALLEL_THREADS(numThreads) |  | ||||||
|         { |  | ||||||
| PRAGMA_OMP_SINGLE_ARGS(nowait) |  | ||||||
|             { |  | ||||||
|                 quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ |  | ||||||
|         auto array = reinterpret_cast<X *>(varray); |  | ||||||
|         auto values = reinterpret_cast<Y *>(yarray); |  | ||||||
|         int cutoff = 1000; |  | ||||||
| 
 |  | ||||||
|         PRAGMA_OMP_PARALLEL_THREADS(numThreads) |  | ||||||
|         { |  | ||||||
| PRAGMA_OMP_SINGLE_ARGS(nowait) |  | ||||||
|             { |  | ||||||
|                 quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { |  | ||||||
|         quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { |  | ||||||
|         quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { |  | ||||||
|         auto x = reinterpret_cast<X*>(vx); |  | ||||||
|         auto y = reinterpret_cast<Y*>(vy); |  | ||||||
| 
 |  | ||||||
|         auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); |  | ||||||
|         auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); |  | ||||||
| 
 |  | ||||||
|         auto xLength = shape::length(xShapeInfo); |  | ||||||
|         auto xTadLength = shape::length(packX.primaryShapeInfo()); |  | ||||||
|         auto numTads = packX.numberOfTads(); |  | ||||||
| 
 |  | ||||||
|         auto func = PRAGMA_THREADS_FOR { |  | ||||||
|             for (auto r = start; r < stop; r += increment) { |  | ||||||
|                 auto dx = x + packX.primaryOffsets()[r]; |  | ||||||
|                 auto dy = y + packY.primaryOffsets()[r]; |  | ||||||
| 
 |  | ||||||
|                 quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         samediff::Threads::parallel_tad(func, 0, numTads); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     template <typename X, typename Y> |  | ||||||
|     void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { |  | ||||||
|         auto x = reinterpret_cast<X*>(vx); |  | ||||||
|         auto y = reinterpret_cast<Y*>(vy); |  | ||||||
| 
 |  | ||||||
|         auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); |  | ||||||
|         auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); |  | ||||||
| 
 |  | ||||||
|         auto xLength = shape::length(xShapeInfo); |  | ||||||
|         auto xTadLength = shape::length(packX.primaryShapeInfo()); |  | ||||||
|         auto numTads = packX.numberOfTads(); |  | ||||||
| 
 |  | ||||||
|         auto func = PRAGMA_THREADS_FOR { |  | ||||||
|             for (auto r = start; r < stop; r += increment) { |  | ||||||
|                 auto dx = x + packX.primaryOffsets()[r]; |  | ||||||
|                 auto dy = y + packY.primaryOffsets()[r]; |  | ||||||
| 
 |  | ||||||
|                 quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         samediff::Threads::parallel_tad(func, 0, numTads); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES);
 |  | ||||||
|     //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES);
 |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -167,7 +167,7 @@ namespace randomOps { | |||||||
| 
 | 
 | ||||||
|             if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { |             if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { | ||||||
|                 auto func = PRAGMA_THREADS_FOR { |                 auto func = PRAGMA_THREADS_FOR { | ||||||
|                     for (uint64_t e = start; e < stop; e += increment) { |                     for (auto e = start; e < stop; e++) { | ||||||
|                         T prob = rng->relativeT<T>(e); |                         T prob = rng->relativeT<T>(e); | ||||||
|                         T cumProb = (T) 0.0f; |                         T cumProb = (T) 0.0f; | ||||||
|                         for (Nd4jLong f = 0; f < yLength; f++) { |                         for (Nd4jLong f = 0; f < yLength; f++) { | ||||||
| @ -330,7 +330,7 @@ namespace randomOps { | |||||||
|             const T epsilon = static_cast<T>(1e-5); |             const T epsilon = static_cast<T>(1e-5); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (uint64_t e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto epm = e + middle; |                     auto epm = e + middle; | ||||||
| 
 | 
 | ||||||
|                     // we need to get random values
 |                     // we need to get random values
 | ||||||
| @ -440,7 +440,7 @@ namespace randomOps { | |||||||
| 
 | 
 | ||||||
|             nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state); |             nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state); | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (Nd4jLong e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
| 
 | 
 | ||||||
|                     int success = 0; |                     int success = 0; | ||||||
|                     for (int t = 1; t <= trials; t++) { |                     for (int t = 1; t <= trials; t++) { | ||||||
| @ -549,7 +549,7 @@ namespace randomOps { | |||||||
|             //nd4j::random::RandomBuffer *buffer = reinterpret_cast<nd4j::random::RandomBuffer *> (state);
 |             //nd4j::random::RandomBuffer *buffer = reinterpret_cast<nd4j::random::RandomBuffer *> (state);
 | ||||||
|             nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state); |             nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state); | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (uint64_t e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
| 
 | 
 | ||||||
|                     int success = 0; |                     int success = 0; | ||||||
|                     for (int t = 1; t <= trials; t++) { |                     for (int t = 1; t <= trials; t++) { | ||||||
| @ -690,7 +690,7 @@ namespace randomOps { | |||||||
|             const T epsilon = static_cast<T>(1e-5); |             const T epsilon = static_cast<T>(1e-5); | ||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 for (uint64_t e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     if (z[e] > mean + ds || z[e] < mean - ds) { |                     if (z[e] > mean + ds || z[e] < mean - ds) { | ||||||
|                         z[e] = step(rng, mean, stddev, e, middle, z[e]); |                         z[e] = step(rng, mean, stddev, e, middle, z[e]); | ||||||
| 
 | 
 | ||||||
| @ -818,7 +818,7 @@ namespace randomOps { | |||||||
| 
 | 
 | ||||||
|             auto func = PRAGMA_THREADS_FOR { |             auto func = PRAGMA_THREADS_FOR { | ||||||
|                 PRAGMA_OMP_SIMD |                 PRAGMA_OMP_SIMD | ||||||
|                 for (uint64_t e = start; e < stop; e += increment) { |                 for (auto e = start; e < stop; e++) { | ||||||
|                     auto epm = e + middle; |                     auto epm = e + middle; | ||||||
| 
 | 
 | ||||||
|                     // we need to get random values
 |                     // we need to get random values
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user