LoopsCoordsHelper: enable functions for CUDA usage

Signed-off-by: AbdelRauf <rauf@konduit.ai>
master
AbdelRauf 2021-02-28 19:16:56 +01:00
parent 40c2e592ac
commit b66454d593
1 changed file with 35 additions and 32 deletions

View File

@ -41,7 +41,10 @@ namespace sd {
#define unlikely(x) (x) #define unlikely(x) (x)
#endif #endif
// zip_size_t: two-member offset pair used as the parameter/return type of the
// "zip" overloads in this file (offset_from_coords, coord_inc_n, inc_coords, init_coords).
// Diff row: the old side (left) is the std::pair<size_t, size_t> alias; the new side
// (right) replaces it with a plain aggregate whose members are Nd4jLong.
// Member names `first`/`second` match std::pair's, and call sites shown here
// initialise it with `{ 0,0 }` and `{}`.
// NOTE(review): presumably `first` tracks the x offset and `second` the z offset — confirm against the function bodies.
using zip_size_t = std::pair<size_t, size_t>; struct zip_size_t{
Nd4jLong first;
Nd4jLong second;
};
template<size_t Index> template<size_t Index>
struct CoordsState :CoordsState<Index - 1> { struct CoordsState :CoordsState<Index - 1> {
@ -96,7 +99,7 @@ namespace sd {
#define ZIP_OF_ADJUST2(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust2) #define ZIP_OF_ADJUST2(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust2)
FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { _CUDA_HD FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
for (size_t i = rank - 1; i > 0; --i) { for (size_t i = rank - 1; i > 0; --i) {
coords[i] = index % bases[i]; coords[i] = index % bases[i];
index /= bases[i]; index /= bases[i];
@ -104,7 +107,7 @@ namespace sd {
coords[0] = index; // last iteration coords[0] = index; // last iteration
} }
FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { _CUDA_HD FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
for (size_t i = 0; i < rank - 1; i++) { for (size_t i = 0; i < rank - 1; i++) {
coords[i] = index % bases[i]; coords[i] = index % bases[i];
@ -113,7 +116,7 @@ namespace sd {
coords[rank - 1] = index; // last iteration coords[rank - 1] = index; // last iteration
} }
FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) { _CUDA_HD FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) {
size_t offset = 0; size_t offset = 0;
size_t rank_4 = rank & -4; size_t rank_4 = rank & -4;
@ -131,7 +134,7 @@ namespace sd {
} }
FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { _CUDA_HD FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
zip_size_t offset = { 0,0 }; zip_size_t offset = { 0,0 };
size_t rank_4 = rank & -4; size_t rank_4 = rank & -4;
@ -160,7 +163,7 @@ namespace sd {
} }
template<size_t Rank, size_t Index, bool Last_Index_Faster = true> template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == Index), size_t>::type typename std::enable_if<(Rank - 1 == Index), size_t>::type
coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) { coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {
@ -178,7 +181,7 @@ namespace sd {
} }
template<size_t Rank, size_t Index, bool Last_Index_Faster = true> template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != Index), size_t >::type typename std::enable_if<(Rank - 1 != Index), size_t >::type
coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) { coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {
@ -200,13 +203,13 @@ namespace sd {
} }
// inc_coords (compile-time rank, single-array state).
// Thin entry point: forwards `cbs` and `last_offset` unchanged to
// coord_inc_n<Rank, Index, Last_Index_Faster> and returns its size_t result.
// Template defaults: Index = 0, Last_Index_Faster = true.
// Diff row: the new side (right) prefixes the declaration with _CUDA_HD (macro defined outside this view).
template<size_t Rank, size_t Index = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t Index = 0, bool Last_Index_Faster = true>
FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) { _CUDA_HD FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) {
return coord_inc_n<Rank, Index, Last_Index_Faster>(cbs,/* 1,*/ last_offset/*, 0*/); return coord_inc_n<Rank, Index, Last_Index_Faster>(cbs,/* 1,*/ last_offset/*, 0*/);
} }
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) { _CUDA_HD FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) {
if (ews == 1) { if (ews == 1) {
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
return last_offset + STRIDE(cbs, Ind); return last_offset + STRIDE(cbs, Ind);
@ -215,7 +218,7 @@ namespace sd {
} }
template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
@ -234,7 +237,7 @@ namespace sd {
} }
template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type
coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
@ -259,14 +262,14 @@ namespace sd {
} }
// inc_coords (compile-time rank, zip state).
// Thin entry point: forwards `cbs` and `last_offset` unchanged to the ZipCoordsState
// overload of coord_inc_n<Rank, rankIndex, Last_Index_Faster> and returns the
// resulting zip_size_t.
// Template defaults: rankIndex = 0, Last_Index_Faster = true.
// Diff row: the new side (right) prefixes the declaration with _CUDA_HD (macro defined outside this view).
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { _CUDA_HD FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
return coord_inc_n<Rank, rankIndex, Last_Index_Faster>(cbs, last_offset); return coord_inc_n<Rank, rankIndex, Last_Index_Faster>(cbs, last_offset);
} }
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type
init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -281,7 +284,7 @@ namespace sd {
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type
init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -297,14 +300,14 @@ namespace sd {
// eq_coords, terminal overload: selected by enable_if when rankIndex == Rank - 1.
// Returns true when COORDS(cbs, rankIndex) equals coords[rankIndex]; performs no recursion.
// `coords` is indexed at rankIndex, so the caller must supply at least Rank entries.
// Last_Index_Faster is declared as a template parameter but is not referenced in this body.
// Diff row: the new side (right) prefixes the declaration with _CUDA_HD (macro defined outside this view).
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == rankIndex), bool>::type typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
return COORDS(cbs, rankIndex) == coords[rankIndex]; return COORDS(cbs, rankIndex) == coords[rankIndex];
} }
template<size_t Rank, size_t rankIndex = 0> template<size_t Rank, size_t rankIndex = 0>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != rankIndex), bool>::type typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords<Rank, rankIndex + 1>(cbs, coords); return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords<Rank, rankIndex + 1>(cbs, coords);
@ -312,21 +315,21 @@ namespace sd {
// eq_zip_coords, terminal overload: selected by enable_if when rankIndex == Rank - 1.
// Returns true when ZIP_COORDS(cbs, rankIndex) equals coords[rankIndex]; performs no recursion.
// `coords` is indexed at rankIndex, so the caller must supply at least Rank entries.
// Last_Index_Faster is declared as a template parameter but is not referenced in this body.
// Diff row: the new side (right) prefixes the declaration with _CUDA_HD (macro defined outside this view).
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == rankIndex), bool>::type typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex]; return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex];
} }
// eq_zip_coords, recursive overload: selected by enable_if when rankIndex != Rank - 1.
// Compares ZIP_COORDS(cbs, rankIndex) with coords[rankIndex]; only if they match does it
// continue with eq_zip_coords<Rank, rankIndex + 1> (the && short-circuits on the first mismatch).
// The recursion ends at the overload enabled for rankIndex == Rank - 1.
// Diff row: the new side (right) prefixes the declaration with _CUDA_HD (macro defined outside this view).
template<size_t Rank, size_t rankIndex = 0> template<size_t Rank, size_t rankIndex = 0>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != rankIndex), bool>::type typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords<Rank, rankIndex + 1>(cbs, coords); return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords<Rank, rankIndex + 1>(cbs, coords);
} }
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -342,7 +345,7 @@ namespace sd {
} }
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
FORCEINLINE _CUDA_HD FORCEINLINE
typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type
init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -360,7 +363,7 @@ namespace sd {
//inc coords for non constant Ranks //inc coords for non constant Ranks
template<bool Last_Index_Faster = true> template<bool Last_Index_Faster = true>
FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { _CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) {
Nd4jLong val; Nd4jLong val;
for (int i = rank - skip - 1; i >= 0; i--) { for (int i = rank - skip - 1; i >= 0; i--) {
@ -379,7 +382,7 @@ namespace sd {
} }
template<> template<>
FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { _CUDA_HD FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) {
Nd4jLong val; Nd4jLong val;
for (int i = skip; i < rank; i++) { for (int i = skip; i < rank; i++) {
@ -399,7 +402,7 @@ namespace sd {
template<bool Last_Index_Faster = true> template<bool Last_Index_Faster = true>
FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { _CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) {
Nd4jLong val = 0; Nd4jLong val = 0;
for (int i = rank - skip - 1; i >= 0; i--) { for (int i = rank - skip - 1; i >= 0; i--) {
@ -420,7 +423,7 @@ namespace sd {
} }
template<> template<>
FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { _CUDA_HD FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) {
Nd4jLong val = 0; Nd4jLong val = 0;
for (int i = skip; i < rank; i++) { for (int i = skip; i < rank; i++) {
@ -450,7 +453,7 @@ namespace sd {
template<bool Last_Index_Faster = true> template<bool Last_Index_Faster = true>
FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) { _CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) {
Nd4jLong val = 0; Nd4jLong val = 0;
for (int i = rank - skip - 1; i >= 0; i--) { for (int i = rank - skip - 1; i >= 0; i--) {
@ -473,7 +476,7 @@ namespace sd {
} }
template<> template<>
FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) { _CUDA_HD FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) {
Nd4jLong val = 0; Nd4jLong val = 0;
for (int i = skip; i < rank; i++) { for (int i = skip; i < rank; i++) {
@ -496,7 +499,7 @@ namespace sd {
return last_offset; return last_offset;
} }
FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { _CUDA_HD FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
triple_size_t offset = { 0,0 ,0 }; triple_size_t offset = { 0,0 ,0 };
size_t rank_4 = rank & -4; size_t rank_4 = rank & -4;
@ -527,7 +530,7 @@ namespace sd {
template<bool Last_Index_Faster = true> template<bool Last_Index_Faster = true>
FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0) _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0)
{ {
if (skip < 0 || skip >= rank) skip = 0; if (skip < 0 || skip >= rank) skip = 0;
Nd4jLong total = 1; Nd4jLong total = 1;
@ -539,7 +542,7 @@ namespace sd {
template<> template<>
FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip) _CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip)
{ {
if (skip < 0 || skip >= rank) skip = 0; if (skip < 0 || skip >= rank) skip = 0;
Nd4jLong total = 1; Nd4jLong total = 1;
@ -552,7 +555,7 @@ namespace sd {
template<bool Last_Index_Faster = true> template<bool Last_Index_Faster = true>
FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) _CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
{ {
if (skip < 0 || skip >= rank) skip = 0; if (skip < 0 || skip >= rank) skip = 0;
Nd4jLong total = 1; Nd4jLong total = 1;
@ -573,7 +576,7 @@ namespace sd {
template<> template<>
FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) _CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
{ {
if (skip < 0 || skip >= rank) skip = 0; if (skip < 0 || skip >= rank) skip = 0;
if (skip > 0) { if (skip > 0) {
@ -602,7 +605,7 @@ namespace sd {
if squash is True then it will attempt to minimize the output ( for both orders) and the tail if squash is True then it will attempt to minimize the output ( for both orders) and the tail
*/ */
FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) { _CUDA_HD FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) {
bool indices[MAX_RANK] = {}; bool indices[MAX_RANK] = {};
int ind = 0; int ind = 0;