LoopsCoordsHelper: enable functions for Cuda usage
Signed-off-by: AbdelRauf <rauf@konduit.ai>
master
parent
40c2e592ac
commit
b66454d593
|
@ -41,7 +41,10 @@ namespace sd {
|
|||
#define unlikely(x) (x)
|
||||
#endif
|
||||
|
||||
using zip_size_t = std::pair<size_t, size_t>;
|
||||
struct zip_size_t{
|
||||
Nd4jLong first;
|
||||
Nd4jLong second;
|
||||
};
|
||||
|
||||
template<size_t Index>
|
||||
struct CoordsState :CoordsState<Index - 1> {
|
||||
|
@ -96,7 +99,7 @@ namespace sd {
|
|||
#define ZIP_OF_ADJUST2(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust2)
|
||||
|
||||
|
||||
FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
|
||||
_CUDA_HD FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
|
||||
for (size_t i = rank - 1; i > 0; --i) {
|
||||
coords[i] = index % bases[i];
|
||||
index /= bases[i];
|
||||
|
@ -104,7 +107,7 @@ namespace sd {
|
|||
coords[0] = index; // last iteration
|
||||
}
|
||||
|
||||
FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
|
||||
_CUDA_HD FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
|
||||
|
||||
for (size_t i = 0; i < rank - 1; i++) {
|
||||
coords[i] = index % bases[i];
|
||||
|
@ -113,7 +116,7 @@ namespace sd {
|
|||
coords[rank - 1] = index; // last iteration
|
||||
}
|
||||
|
||||
FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
_CUDA_HD FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
|
||||
size_t offset = 0;
|
||||
size_t rank_4 = rank & -4;
|
||||
|
@ -131,7 +134,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
|
||||
FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
_CUDA_HD FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
|
||||
zip_size_t offset = { 0,0 };
|
||||
size_t rank_4 = rank & -4;
|
||||
|
@ -160,7 +163,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == Index), size_t>::type
|
||||
coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {
|
||||
|
||||
|
@ -178,7 +181,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != Index), size_t >::type
|
||||
coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {
|
||||
|
||||
|
@ -200,13 +203,13 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t Index = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) {
|
||||
_CUDA_HD FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) {
|
||||
|
||||
return coord_inc_n<Rank, Index, Last_Index_Faster>(cbs,/* 1,*/ last_offset/*, 0*/);
|
||||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) {
|
||||
_CUDA_HD FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) {
|
||||
if (ews == 1) {
|
||||
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
|
||||
return last_offset + STRIDE(cbs, Ind);
|
||||
|
@ -215,7 +218,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
|
||||
coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
|
||||
|
||||
|
@ -234,7 +237,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type
|
||||
coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
|
||||
|
||||
|
@ -259,14 +262,14 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
|
||||
_CUDA_HD FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
|
||||
|
||||
return coord_inc_n<Rank, rankIndex, Last_Index_Faster>(cbs, last_offset);
|
||||
}
|
||||
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type
|
||||
init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
|
||||
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
|
||||
|
@ -281,7 +284,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type
|
||||
init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
|
||||
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
|
||||
|
@ -297,14 +300,14 @@ namespace sd {
|
|||
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
|
||||
eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
|
||||
return COORDS(cbs, rankIndex) == coords[rankIndex];
|
||||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
|
||||
eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
|
||||
return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords<Rank, rankIndex + 1>(cbs, coords);
|
||||
|
@ -312,21 +315,21 @@ namespace sd {
|
|||
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
|
||||
eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
|
||||
return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex];
|
||||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
|
||||
eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
|
||||
return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords<Rank, rankIndex + 1>(cbs, coords);
|
||||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
|
||||
init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
|
||||
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
|
||||
|
@ -342,7 +345,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
|
||||
FORCEINLINE
|
||||
_CUDA_HD FORCEINLINE
|
||||
typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type
|
||||
init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
|
||||
constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
|
||||
|
@ -360,7 +363,7 @@ namespace sd {
|
|||
|
||||
//inc coords for non constant Ranks
|
||||
template<bool Last_Index_Faster = true>
|
||||
FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
_CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
|
||||
Nd4jLong val;
|
||||
for (int i = rank - skip - 1; i >= 0; i--) {
|
||||
|
@ -379,7 +382,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<>
|
||||
FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) {
|
||||
_CUDA_HD FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) {
|
||||
|
||||
Nd4jLong val;
|
||||
for (int i = skip; i < rank; i++) {
|
||||
|
@ -399,7 +402,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<bool Last_Index_Faster = true>
|
||||
FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
_CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
|
||||
Nd4jLong val = 0;
|
||||
for (int i = rank - skip - 1; i >= 0; i--) {
|
||||
|
@ -420,7 +423,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<>
|
||||
FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) {
|
||||
_CUDA_HD FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) {
|
||||
|
||||
Nd4jLong val = 0;
|
||||
for (int i = skip; i < rank; i++) {
|
||||
|
@ -450,7 +453,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<bool Last_Index_Faster = true>
|
||||
FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
_CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) {
|
||||
|
||||
Nd4jLong val = 0;
|
||||
for (int i = rank - skip - 1; i >= 0; i--) {
|
||||
|
@ -473,7 +476,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
template<>
|
||||
FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) {
|
||||
_CUDA_HD FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) {
|
||||
|
||||
Nd4jLong val = 0;
|
||||
for (int i = skip; i < rank; i++) {
|
||||
|
@ -496,7 +499,7 @@ namespace sd {
|
|||
return last_offset;
|
||||
}
|
||||
|
||||
FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
_CUDA_HD FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* y_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
|
||||
|
||||
triple_size_t offset = { 0,0 ,0 };
|
||||
size_t rank_4 = rank & -4;
|
||||
|
@ -527,7 +530,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<bool Last_Index_Faster = true>
|
||||
FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0)
|
||||
_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0)
|
||||
{
|
||||
if (skip < 0 || skip >= rank) skip = 0;
|
||||
Nd4jLong total = 1;
|
||||
|
@ -539,7 +542,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<>
|
||||
FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip)
|
||||
_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip)
|
||||
{
|
||||
if (skip < 0 || skip >= rank) skip = 0;
|
||||
Nd4jLong total = 1;
|
||||
|
@ -552,7 +555,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<bool Last_Index_Faster = true>
|
||||
FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
|
||||
_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
|
||||
{
|
||||
if (skip < 0 || skip >= rank) skip = 0;
|
||||
Nd4jLong total = 1;
|
||||
|
@ -573,7 +576,7 @@ namespace sd {
|
|||
|
||||
|
||||
template<>
|
||||
FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
|
||||
_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
|
||||
{
|
||||
if (skip < 0 || skip >= rank) skip = 0;
|
||||
if (skip > 0) {
|
||||
|
@ -602,7 +605,7 @@ namespace sd {
|
|||
if squash is True then it will attempt to minimize the output ( for both orders) and the tail
|
||||
*/
|
||||
|
||||
FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) {
|
||||
_CUDA_HD FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) {
|
||||
|
||||
bool indices[MAX_RANK] = {};
|
||||
int ind = 0;
|
||||
|
|
Loading…
Reference in New Issue