LoopsCoordsHelper: enable functions for CUDA usage

Signed-off-by: AbdelRauf <rauf@konduit.ai>
2021-02-28 19:16:56 +01:00 · 2021-02-28 19:16:56 +01:00 · b66454d593
commit b66454d593
parent 40c2e592ac
1 changed files with 35 additions and 32 deletions
--- a/libnd4j/include/helpers/LoopsCoordsHelper.h
+++ b/libnd4j/include/helpers/LoopsCoordsHelper.h
@ -41,7 +41,10 @@ namespace sd {
 #define unlikely(x)  (x)
 #endif

-	using zip_size_t = std::pair<size_t, size_t>;
+	// Pair of offsets used by the two-stride (x_strides / z_strides) helpers in this header.
+	// Plain aggregate that keeps the member names of std::pair (first / second), which the removed alias was based on.
+	// NOTE(review): presumably first is the x offset and second the z offset -- confirm against the helper bodies.
+	// NOTE(review): the element type changed from size_t to Nd4jLong -- confirm no caller relies on unsigned arithmetic.
+	struct zip_size_t{
+		Nd4jLong first;
+		Nd4jLong second;
+	};

 	template<size_t Index>
 	struct CoordsState :CoordsState<Index - 1> {
@ -96,7 +99,7 @@ namespace sd {
 #define ZIP_OF_ADJUST2(x,index)  ((x).::sd::ZipCoordsState<(index)>::adjust2)


-	FORCEINLINE void   index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
+	_CUDA_HD FORCEINLINE void   index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
 		for (size_t i = rank - 1; i > 0; --i) {
 			coords[i] = index % bases[i];
 			index /= bases[i];
@ -104,7 +107,7 @@ namespace sd {
 		coords[0] = index;      // last iteration 
 	}

-	FORCEINLINE void   index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {
+	_CUDA_HD FORCEINLINE void   index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) {

 		for (size_t i = 0; i < rank - 1; i++) {
 			coords[i] = index % bases[i];
@ -113,7 +116,7 @@ namespace sd {
 		coords[rank - 1] = index;      // last iteration
 	}

-	FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const  Nd4jLong& rank) {
+	_CUDA_HD FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const  Nd4jLong& rank) {

 		size_t offset = 0;
 		size_t rank_4 = rank & -4;
@ -131,7 +134,7 @@ namespace sd {
 	}


-	FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
+	_CUDA_HD FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {

 		zip_size_t offset = { 0,0 };
 		size_t rank_4 = rank & -4;
@ -160,7 +163,7 @@ namespace sd {
 	}

 	template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == Index), size_t>::type
 		coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {

@ -178,7 +181,7 @@ namespace sd {
 	}

 	template<size_t Rank, size_t Index, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != Index), size_t >::type
 		coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) {

@ -200,13 +203,13 @@ namespace sd {
 	}

 	/**
 	 * Entry point for advancing a compile-time-rank CoordsState.
 	 * Forwards to coord_inc_n<Rank, Index, Last_Index_Faster> and returns its result unchanged.
 	 *
 	 * @tparam Rank               rank of the state; the state type is CoordsState<Rank - 1>
 	 * @tparam Index              index forwarded to coord_inc_n (defaults to 0)
 	 * @tparam Last_Index_Faster  ordering flag forwarded to coord_inc_n (defaults to true)
 	 * @param cbs          state object, passed by reference to coord_inc_n
 	 * @param last_offset  offset value passed through to coord_inc_n
 	 * @return the value returned by coord_inc_n -- presumably the offset after the increment
 	 *         (TODO confirm; the coord_inc_n bodies are not fully visible in this diff)
 	 */
 	template<size_t Rank, size_t Index = 0, bool Last_Index_Faster = true>
-	FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) {
+	_CUDA_HD FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) {
 
 		return coord_inc_n<Rank, Index, Last_Index_Faster>(cbs,/* 1,*/ last_offset/*, 0*/);
 	}

 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) {
+	_CUDA_HD FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) {
 		if (ews == 1) {
 			constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
 			return last_offset + STRIDE(cbs, Ind);
@ -215,7 +218,7 @@ namespace sd {
 	}

 	template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
 		coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {

@ -234,7 +237,7 @@ namespace sd {
 	}

 	template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type
 		coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {

@ -259,14 +262,14 @@ namespace sd {
 	}

 	/**
 	 * Entry point for advancing a compile-time-rank ZipCoordsState.
 	 * Forwards to the ZipCoordsState overload of coord_inc_n<Rank, rankIndex, Last_Index_Faster>
 	 * and returns its result unchanged.
 	 *
 	 * @tparam Rank               rank of the state; the state type is ZipCoordsState<Rank - 1>
 	 * @tparam rankIndex          index forwarded to coord_inc_n (defaults to 0)
 	 * @tparam Last_Index_Faster  ordering flag forwarded to coord_inc_n (defaults to true)
 	 * @param cbs          state object, passed by reference to coord_inc_n
 	 * @param last_offset  pair of offsets passed through to coord_inc_n
 	 * @return the zip_size_t returned by coord_inc_n -- presumably the pair of offsets after the increment
 	 *         (TODO confirm; the coord_inc_n bodies are not fully visible in this diff)
 	 */
 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
+	_CUDA_HD FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) {
 
 		return coord_inc_n<Rank, rankIndex, Last_Index_Faster>(cbs, last_offset);
 	}


 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type
 		init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -281,7 +284,7 @@ namespace sd {


 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type
 		init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) {
 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -297,14 +300,14 @@ namespace sd {


 	/**
 	 * Terminating overload of eq_coords, selected through enable_if when rankIndex == Rank - 1.
 	 * Compares only the coordinate stored for rankIndex in the state with coords[rankIndex].
 	 * Last_Index_Faster is accepted as a template parameter but is not used in the body.
 	 *
 	 * @param cbs     state whose coordinate is read through the COORDS macro
 	 * @param coords  array to compare against; only element rankIndex is read here
 	 * @return true when the two values are equal
 	 */
 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
 		eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
 		return COORDS(cbs, rankIndex) == coords[rankIndex];
 	}

 	template<size_t Rank, size_t rankIndex = 0>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
 		eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
 		return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords<Rank, rankIndex + 1>(cbs, coords);
@ -312,21 +315,21 @@ namespace sd {


 	/**
 	 * Terminating overload of eq_zip_coords, selected through enable_if when rankIndex == Rank - 1.
 	 * Compares only the coordinate stored for rankIndex in the state with coords[rankIndex].
 	 * Last_Index_Faster is accepted as a template parameter but is not used in the body.
 	 *
 	 * @param cbs     state whose coordinate is read through the ZIP_COORDS macro
 	 * @param coords  array to compare against; only element rankIndex is read here
 	 * @return true when the two values are equal
 	 */
 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == rankIndex), bool>::type
 		eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
 		return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex];
 	}

 	/**
 	 * Recursive overload of eq_zip_coords, selected through enable_if when rankIndex != Rank - 1.
 	 * Compares the coordinate at rankIndex and, only if it matches (short-circuit &&),
 	 * continues with rankIndex + 1.
 	 * The recursive call names two template arguments, so when it resolves to the terminating
 	 * overload that overload's Last_Index_Faster takes its default value.
 	 *
 	 * @param cbs     state whose coordinates are read through the ZIP_COORDS macro
 	 * @param coords  array to compare against; elements rankIndex and above may be read
 	 * @return true when every compared coordinate equals the matching coords element
 	 */
 	template<size_t Rank, size_t rankIndex = 0>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != rankIndex), bool>::type
 		eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) {
 		return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords<Rank, rankIndex + 1>(cbs, coords);
 	}

 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type
 		init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -342,7 +345,7 @@ namespace sd {
 	}

 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true>
-	FORCEINLINE
+	_CUDA_HD FORCEINLINE
 		typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type
 		init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) {
 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>();
@ -360,7 +363,7 @@ namespace sd {

 	//inc coords for non constant Ranks
 	template<bool Last_Index_Faster = true>
-	FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) {
+	_CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) {

 		Nd4jLong  val;
 		for (int i = rank - skip - 1; i >= 0; i--) {
@ -379,7 +382,7 @@ namespace sd {
 	}

 	template<>
-	FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) {
+	_CUDA_HD FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) {

 		Nd4jLong  val;
 		for (int i = skip; i < rank; i++) {
@ -399,7 +402,7 @@ namespace sd {


 	template<bool Last_Index_Faster = true>
-	FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) {
+	_CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) {

 		Nd4jLong  val = 0;
 		for (int i = rank - skip - 1; i >= 0; i--) {
@ -420,7 +423,7 @@ namespace sd {
 	}

 	template<>
-	FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) {
+	_CUDA_HD FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) {

 		Nd4jLong  val = 0;
 		for (int i = skip; i < rank; i++) {
@ -450,7 +453,7 @@ namespace sd {


 	template<bool Last_Index_Faster = true>
-	FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) {
+	_CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) {

 		Nd4jLong  val = 0;
 		for (int i = rank - skip - 1; i >= 0; i--) {
@ -473,7 +476,7 @@ namespace sd {
 	}

 	template<>
-	FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) {
+	_CUDA_HD FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) {

 		Nd4jLong  val = 0;
 		for (int i = skip; i < rank; i++) {
@ -496,7 +499,7 @@ namespace sd {
 		return last_offset;
 	}

-	FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {
+	_CUDA_HD FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) {

 		triple_size_t offset = { 0,0 ,0 };
 		size_t rank_4 = rank & -4;
@ -527,7 +530,7 @@ namespace sd {


 	template<bool Last_Index_Faster = true>
-	FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0)
+	_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0)
 	{
 		if (skip < 0 || skip >= rank) skip = 0;
 		Nd4jLong total = 1;
@ -539,7 +542,7 @@ namespace sd {


 	template<>
-	FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip)
+	_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip)
 	{
 		if (skip < 0 || skip >= rank) skip = 0;
 		Nd4jLong total = 1;
@ -552,7 +555,7 @@ namespace sd {


 	template<bool Last_Index_Faster = true>
-	FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
+	_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
 	{
 		if (skip < 0 || skip >= rank) skip = 0;
 		Nd4jLong total = 1;
@ -573,7 +576,7 @@ namespace sd {


 	template<>
-	FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
+	_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength)
 	{
 		if (skip < 0 || skip >= rank) skip = 0;
 		if (skip > 0) {
@ -602,7 +605,7 @@ namespace sd {
 	if squash is True then it will attempt to minimize the output (for both orders) and the tail
 */

-	FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) {
+	_CUDA_HD FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) {

 		bool indices[MAX_RANK] = {};
 		int ind = 0;