diff --git a/libnd4j/include/helpers/LoopsCoordsHelper.h b/libnd4j/include/helpers/LoopsCoordsHelper.h index 3a9951ba0..c2d51fbf4 100644 --- a/libnd4j/include/helpers/LoopsCoordsHelper.h +++ b/libnd4j/include/helpers/LoopsCoordsHelper.h @@ -41,7 +41,10 @@ namespace sd { #define unlikely(x) (x) #endif - using zip_size_t = std::pair; + struct zip_size_t{ + Nd4jLong first; + Nd4jLong second; + }; template struct CoordsState :CoordsState { @@ -96,7 +99,7 @@ namespace sd { #define ZIP_OF_ADJUST2(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust2) - FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { + _CUDA_HD FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { for (size_t i = rank - 1; i > 0; --i) { coords[i] = index % bases[i]; index /= bases[i]; @@ -104,7 +107,7 @@ namespace sd { coords[0] = index; // last iteration } - FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { + _CUDA_HD FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { for (size_t i = 0; i < rank - 1; i++) { coords[i] = index % bases[i]; @@ -113,7 +116,7 @@ namespace sd { coords[rank - 1] = index; // last iteration } - FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) { + _CUDA_HD FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) { size_t offset = 0; size_t rank_4 = rank & -4; @@ -131,7 +134,7 @@ namespace sd { } - FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { + _CUDA_HD FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { zip_size_t offset = { 0,0 }; size_t rank_4 = rank & -4; @@ -160,7 +163,7 @@ namespace sd { } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == Index), size_t>::type coord_inc_n(CoordsState& cbs, size_t last_offset) { @@ -178,7 +181,7 @@ namespace sd { } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != Index), size_t >::type coord_inc_n(CoordsState& cbs, size_t last_offset) { @@ -200,13 +203,13 @@ namespace sd { } template - FORCEINLINE size_t inc_coords(CoordsState& cbs, size_t last_offset) { + _CUDA_HD FORCEINLINE size_t inc_coords(CoordsState& cbs, size_t last_offset) { return coord_inc_n(cbs,/* 1,*/ last_offset/*, 0*/); } template - FORCEINLINE size_t inc_coords_ews(CoordsState& cbs, size_t last_offset, size_t ews) { + _CUDA_HD FORCEINLINE size_t inc_coords_ews(CoordsState& cbs, size_t last_offset, size_t ews) { if (ews == 1) { constexpr size_t Ind = StridesOrderInd(); return last_offset + STRIDE(cbs, Ind); @@ -215,7 +218,7 @@ namespace sd { } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { @@ -234,7 +237,7 @@ namespace sd { } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { @@ -259,14 +262,14 @@ namespace sd { } template - FORCEINLINE zip_size_t inc_coords(ZipCoordsState& cbs, zip_size_t last_offset) { + _CUDA_HD FORCEINLINE zip_size_t inc_coords(ZipCoordsState& cbs, zip_size_t last_offset) { return coord_inc_n(cbs, last_offset); } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { constexpr size_t Ind = StridesOrderInd(); @@ -281,7 +284,7 @@ namespace sd { template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { constexpr size_t Ind = StridesOrderInd(); @@ -297,14 +300,14 @@ namespace sd { template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), bool>::type eq_coords(CoordsState& cbs, const Nd4jLong* coords) { return COORDS(cbs, rankIndex) == coords[rankIndex]; } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), bool>::type eq_coords(CoordsState& cbs, const Nd4jLong* coords) { return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords(cbs, coords); @@ -312,21 +315,21 @@ namespace sd { template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), bool>::type eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex]; } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), bool>::type eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords(cbs, coords); } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { constexpr size_t Ind = StridesOrderInd(); @@ -342,7 +345,7 @@ namespace sd { } template - FORCEINLINE + _CUDA_HD FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { constexpr size_t Ind = StridesOrderInd(); @@ -360,7 +363,7 @@ namespace sd { //inc coords for non constant Ranks template - FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { + _CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { Nd4jLong val; for (int i = rank - skip - 1; i >= 0; i--) { @@ -379,7 +382,7 @@ namespace sd { } template<> - FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { + _CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { Nd4jLong val; for (int i = skip; i < rank; i++) { @@ -399,7 +402,7 @@ namespace sd { template - FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { + _CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { Nd4jLong val = 0; for (int i = rank - skip - 1; i >= 0; i--) { @@ -420,7 +423,7 @@ namespace sd { } template<> - FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { + _CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { Nd4jLong val = 0; for (int i = skip; i < rank; i++) { @@ -450,7 +453,7 @@ namespace sd { template - FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) { + _CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) { Nd4jLong val = 0; for (int i = rank - skip - 1; i >= 0; i--) { @@ -473,7 +476,7 @@ namespace sd { } template<> - FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) { + _CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) { Nd4jLong val = 0; for (int i = skip; i < rank; i++) { @@ -496,7 +499,7 @@ namespace sd { return last_offset; } - FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { + _CUDA_HD FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { triple_size_t offset = { 0,0 ,0 }; size_t rank_4 = rank & -4; @@ -527,7 +530,7 @@ namespace sd { template - FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0) + _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0) { if (skip < 0 || skip >= rank) skip = 0; Nd4jLong total = 1; @@ -539,7 +542,7 @@ namespace sd { template<> - FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip) + _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip) { if (skip < 0 || skip >= rank) skip = 0; Nd4jLong total = 1; @@ -552,7 +555,7 @@ namespace sd { template - FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) + _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) { if (skip < 0 || skip >= rank) skip = 0; Nd4jLong total = 1; @@ -573,7 +576,7 @@ namespace sd { template<> - FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) + _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) { if (skip < 0 || skip >= rank) skip = 0; if (skip > 0) { @@ -602,7 +605,7 @@ namespace sd { if squash is True then it will attempt to minimize the output ( for both orders) and the tail */ - FORCEINLINE void rePartition(char order, const std::vector& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) { + _CUDA_HD FORCEINLINE void rePartition(char order, const std::vector& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) { bool indices[MAX_RANK] = {}; int ind = 0;