LoopsCoordsHelper: enable functions for CUDA usage
Signed-off-by: AbdelRauf <rauf@konduit.ai>
This commit is contained in:
		
							parent
							
								
									40c2e592ac
								
							
						
					
					
						commit
						b66454d593
					
				| @ -41,7 +41,10 @@ namespace sd { | ||||
| #define unlikely(x)  (x) | ||||
| #endif | ||||
| 
 | ||||
| 	using zip_size_t = std::pair<size_t, size_t>; | ||||
| 	struct zip_size_t{ | ||||
| 		Nd4jLong first; | ||||
| 		Nd4jLong second; | ||||
| 	}; | ||||
| 
 | ||||
| 	template<size_t Index> | ||||
| 	struct CoordsState :CoordsState<Index - 1> { | ||||
| @ -96,7 +99,7 @@ namespace sd { | ||||
| #define ZIP_OF_ADJUST2(x,index)  ((x).::sd::ZipCoordsState<(index)>::adjust2) | ||||
| 
 | ||||
| 
 | ||||
| 	FORCEINLINE void   index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { | ||||
| 	_CUDA_HD FORCEINLINE void   index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { | ||||
| 		for (size_t i = rank - 1; i > 0; --i) { | ||||
| 			coords[i] = index % bases[i]; | ||||
| 			index /= bases[i]; | ||||
| @ -104,7 +107,7 @@ namespace sd { | ||||
| 		coords[0] = index;      // last iteration 
 | ||||
| 	} | ||||
| 
 | ||||
| 	FORCEINLINE void   index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { | ||||
| 	_CUDA_HD FORCEINLINE void   index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { | ||||
| 
 | ||||
| 		for (size_t i = 0; i < rank - 1; i++) { | ||||
| 			coords[i] = index % bases[i]; | ||||
| @ -113,7 +116,7 @@ namespace sd { | ||||
| 		coords[rank - 1] = index;      // last iteration
 | ||||
| 	} | ||||
| 
 | ||||
| 	FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const  Nd4jLong& rank) { | ||||
| 	_CUDA_HD FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const  Nd4jLong& rank) { | ||||
| 
 | ||||
| 		size_t offset = 0; | ||||
| 		size_t rank_4 = rank & -4; | ||||
| @ -131,7 +134,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
| 	FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { | ||||
| 	_CUDA_HD FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong* x_strides, const Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { | ||||
| 
 | ||||
| 		zip_size_t offset = { 0,0 }; | ||||
| 		size_t rank_4 = rank & -4; | ||||
| @ -160,7 +163,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t Index, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == Index), size_t>::type | ||||
| 		coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) { | ||||
| 
 | ||||
| @ -178,7 +181,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t Index, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != Index), size_t >::type | ||||
| 		coord_inc_n(CoordsState<Rank - 1>& cbs, size_t last_offset) { | ||||
| 
 | ||||
| @ -200,13 +203,13 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t Index = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) { | ||||
| 	_CUDA_HD FORCEINLINE size_t inc_coords(CoordsState<Rank - 1>& cbs, size_t last_offset) { | ||||
| 
 | ||||
| 		return coord_inc_n<Rank, Index, Last_Index_Faster>(cbs,/* 1,*/ last_offset/*, 0*/); | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) { | ||||
| 	_CUDA_HD FORCEINLINE size_t inc_coords_ews(CoordsState<Rank - 1>& cbs, size_t last_offset, size_t ews) { | ||||
| 		if (ews == 1) { | ||||
| 			constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); | ||||
| 			return last_offset + STRIDE(cbs, Ind); | ||||
| @ -215,7 +218,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type | ||||
| 		coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { | ||||
| 
 | ||||
| @ -234,7 +237,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type | ||||
| 		coord_inc_n(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { | ||||
| 
 | ||||
| @ -259,14 +262,14 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { | ||||
| 	_CUDA_HD FORCEINLINE zip_size_t inc_coords(ZipCoordsState<Rank - 1>& cbs, zip_size_t last_offset) { | ||||
| 
 | ||||
| 		return coord_inc_n<Rank, rankIndex, Last_Index_Faster>(cbs, last_offset); | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type | ||||
| 		init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { | ||||
| 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); | ||||
| @ -281,7 +284,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type | ||||
| 		init_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { | ||||
| 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); | ||||
| @ -297,14 +300,14 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == rankIndex), bool>::type | ||||
| 		eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { | ||||
| 		return COORDS(cbs, rankIndex) == coords[rankIndex]; | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != rankIndex), bool>::type | ||||
| 		eq_coords(CoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { | ||||
| 		return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords<Rank, rankIndex + 1>(cbs, coords); | ||||
| @ -312,21 +315,21 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == rankIndex), bool>::type | ||||
| 		eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { | ||||
| 		return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex]; | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != rankIndex), bool>::type | ||||
| 		eq_zip_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong* coords) { | ||||
| 		return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords<Rank, rankIndex + 1>(cbs, coords); | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type | ||||
| 		init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { | ||||
| 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); | ||||
| @ -342,7 +345,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t Rank, size_t rankIndex = 0, bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE | ||||
| 	_CUDA_HD FORCEINLINE | ||||
| 		typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type | ||||
| 		init_coords(ZipCoordsState<Rank - 1>& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { | ||||
| 		constexpr size_t Ind = StridesOrderInd<Rank, rankIndex, Last_Index_Faster>(); | ||||
| @ -360,7 +363,7 @@ namespace sd { | ||||
| 
 | ||||
| 	// inc coords for ranks passed at runtime (not compile-time constants)
 | ||||
| 	template<bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 	_CUDA_HD FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 
 | ||||
| 		Nd4jLong  val; | ||||
| 		for (int i = rank - skip - 1; i >= 0; i--) { | ||||
| @ -379,7 +382,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<> | ||||
| 	FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 	_CUDA_HD FORCEINLINE size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 
 | ||||
| 		Nd4jLong  val; | ||||
| 		for (int i = skip; i < rank; i++) { | ||||
| @ -399,7 +402,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 	_CUDA_HD FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 
 | ||||
| 		Nd4jLong  val = 0; | ||||
| 		for (int i = rank - skip - 1; i >= 0; i--) { | ||||
| @ -420,7 +423,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<> | ||||
| 	FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 	_CUDA_HD FORCEINLINE zip_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 
 | ||||
| 		Nd4jLong  val = 0; | ||||
| 		for (int i = skip; i < rank; i++) { | ||||
| @ -450,7 +453,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 	_CUDA_HD FORCEINLINE triple_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip = 0) { | ||||
| 
 | ||||
| 		Nd4jLong  val = 0; | ||||
| 		for (int i = rank - skip - 1; i >= 0; i--) { | ||||
| @ -473,7 +476,7 @@ namespace sd { | ||||
| 	} | ||||
| 
 | ||||
| 	template<> | ||||
| 	FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 	_CUDA_HD FORCEINLINE triple_size_t inc_coords<false>(const Nd4jLong* bases, const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, Nd4jLong* coords, triple_size_t last_offset, const size_t rank, const size_t skip) { | ||||
| 
 | ||||
| 		Nd4jLong  val = 0; | ||||
| 		for (int i = skip; i < rank; i++) { | ||||
| @ -496,7 +499,7 @@ namespace sd { | ||||
| 		return last_offset; | ||||
| 	} | ||||
| 
 | ||||
| 	FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { | ||||
| 	_CUDA_HD FORCEINLINE triple_size_t offset_from_coords(const Nd4jLong* x_strides, const  Nd4jLong* y_strides, const  Nd4jLong* z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { | ||||
| 
 | ||||
| 		triple_size_t offset = { 0,0 ,0 }; | ||||
| 		size_t rank_4 = rank & -4; | ||||
| @ -527,7 +530,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0) | ||||
| 	_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip = 0) | ||||
| 	{ | ||||
| 		if (skip < 0 || skip >= rank) skip = 0; | ||||
| 		Nd4jLong total = 1; | ||||
| @ -539,7 +542,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<> | ||||
| 	FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip) | ||||
| 	_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip) | ||||
| 	{ | ||||
| 		if (skip < 0 || skip >= rank) skip = 0; | ||||
| 		Nd4jLong total = 1; | ||||
| @ -552,7 +555,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<bool Last_Index_Faster = true> | ||||
| 	FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) | ||||
| 	_CUDA_HD FORCEINLINE Nd4jLong getLength(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) | ||||
| 	{ | ||||
| 		if (skip < 0 || skip >= rank) skip = 0; | ||||
| 		Nd4jLong total = 1; | ||||
| @ -573,7 +576,7 @@ namespace sd { | ||||
| 
 | ||||
| 
 | ||||
| 	template<> | ||||
| 	FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) | ||||
| 	_CUDA_HD FORCEINLINE Nd4jLong getLength<false>(const Nd4jLong* bases, int rank, int skip, Nd4jLong& outSkippedLength) | ||||
| 	{ | ||||
| 		if (skip < 0 || skip >= rank) skip = 0; | ||||
| 		if (skip > 0) { | ||||
| @ -602,7 +605,7 @@ namespace sd { | ||||
| 	if squash is true, it will attempt to minimize the output (for both orders) and the tail | ||||
| */ | ||||
| 
 | ||||
| 	FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) { | ||||
| 	_CUDA_HD FORCEINLINE void rePartition(char order, const std::vector<int>& dimensions, const size_t rank, const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong(&new_bases)[MAX_RANK], Nd4jLong(&new_strides)[MAX_RANK], int& first_begin, int& first_end, int& second_begin, int& second_end, bool first_squash = false, bool second_squash = true) { | ||||
| 
 | ||||
| 		bool indices[MAX_RANK] = {}; | ||||
| 		int ind = 0; | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user