/******************************************************************************* * * Copyright (c) 2019 Konduit K.K. * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ // // @author AbdelRauf // #ifndef LIBND4J_LOOPCOORDSHELPER_H #define LIBND4J_LOOPCOORDSHELPER_H #include #include #include #include #include namespace nd4j { #if defined(__GNUC__) #define likely(x) __builtin_expect( (x), 1) #define unlikely(x) __builtin_expect( (x), 0) #else #define likely(x) (x) #define unlikely(x) (x) #endif using zip_size_t = std::pair; template struct CoordsState :CoordsState { Nd4jLong coord; Nd4jLong last_num; Nd4jLong stride; Nd4jLong adjust; CoordsState() :CoordsState() {} }; template<> struct CoordsState<0> { Nd4jLong coord; Nd4jLong last_num; Nd4jLong stride; Nd4jLong adjust; CoordsState() {} }; template struct ZipCoordsState :ZipCoordsState { Nd4jLong coord; Nd4jLong last_num; Nd4jLong stride1; Nd4jLong stride2; Nd4jLong adjust1; Nd4jLong adjust2; ZipCoordsState() : ZipCoordsState() {} }; template<> struct ZipCoordsState<0> { Nd4jLong coord; Nd4jLong last_num; Nd4jLong stride1; Nd4jLong stride2; Nd4jLong adjust1; Nd4jLong adjust2; ZipCoordsState() {} }; #define COORDS(x,index) ((x).::nd4j::CoordsState<(index)>::coord) #define STRIDE(x,index) ((x).::nd4j::CoordsState<(index)>::stride) #define LAST_NUM(x,index) ((x).::nd4j::CoordsState<(index)>::last_num) #define OF_ADJUST(x,index) ((x).::nd4j::CoordsState<(index)>::adjust) #define ZIP_LAST_NUM(x,index) ((x).::nd4j::ZipCoordsState<(index)>::last_num) #define ZIP_COORDS(x,index) ((x).::nd4j::ZipCoordsState<(index)>::coord) #define ZIP_STRIDE1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride1) #define ZIP_STRIDE2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride2) #define ZIP_OF_ADJUST1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust1) #define ZIP_OF_ADJUST2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust2) FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { for (size_t i = rank - 1; i > 0; --i) { coords[i] = index % bases[i]; index /= bases[i]; } coords[0] = index; // last iteration } FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { for (size_t i = 0; i < rank - 1; i++) { coords[i] = index % bases[i]; index /= bases[i]; } coords[rank - 1] = index; // last iteration } FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) { size_t offset = 0; size_t rank_4 = rank & -4; for (int i = 0; i < rank_4; i += 4) { offset = offset + coords[i] * strides[i] + coords[i + 1] * strides[i + 1] + coords[i + 2] * strides[i + 2] + coords[i + 3] * strides[i + 3]; } for (int i = rank_4; i < rank; i++) { offset += coords[i] * strides[i]; } return offset; } FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong*& x_strides, const Nd4jLong*& z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { zip_size_t offset = { 0,0 }; size_t rank_4 = rank & -4; for (int i = 0; i < rank_4; i += 4) { offset.first = offset.first + coords[i] * x_strides[i] + coords[i + 1] * x_strides[i + 1] + coords[i + 2] * x_strides[i + 2] + coords[i + 3] * x_strides[i + 3]; offset.second = offset.second + coords[i] * z_strides[i] + coords[i + 1] * z_strides[i + 1] + coords[i + 2] * z_strides[i + 2] + coords[i + 3] * z_strides[i + 3]; } for (int i = rank_4; i < rank; i++) { offset.first += coords[i] * x_strides[i]; offset.second += coords[i] * z_strides[i]; } return offset; } template constexpr size_t StridesOrderInd() { return Last_Index_Faster ? Rank - Index - 1 : Index; } template FORCEINLINE typename std::enable_if<(Rank - 1 == Index), size_t>::type coord_inc_n(CoordsState& cbs, size_t last_offset) { constexpr size_t Ind = StridesOrderInd(); if (likely(COORDS(cbs, Ind) < LAST_NUM(cbs, Ind))) { last_offset += cbs.CoordsState::stride; COORDS(cbs, Ind) = COORDS(cbs, Ind) + 1; return last_offset; } //overflow case should not happen COORDS(cbs, Ind) = 0; //last_offset = 0;// last_offset + strides[Ind] - adjust_stride; return 0; } template FORCEINLINE typename std::enable_if<(Rank - 1 != Index), size_t >::type coord_inc_n(CoordsState& cbs, size_t last_offset) { constexpr size_t Ind = StridesOrderInd(); if (likely(COORDS(cbs, Ind) < LAST_NUM(cbs, Ind))) { last_offset = last_offset + cbs.CoordsState::stride; COORDS(cbs, Ind) = COORDS(cbs, Ind) + 1; } else { //lets adjust offset last_offset -= OF_ADJUST(cbs, Ind); COORDS(cbs, Ind) = 0; last_offset = coord_inc_n(cbs, last_offset); } return last_offset; } template FORCEINLINE size_t inc_coords(CoordsState& cbs, size_t last_offset) { return coord_inc_n(cbs,/* 1,*/ last_offset/*, 0*/); } template FORCEINLINE size_t inc_coords_ews(CoordsState& cbs, size_t last_offset, size_t ews) { if (ews == 1) { constexpr size_t Ind = StridesOrderInd(); return last_offset + STRIDE(cbs, Ind); } return coord_inc_n(cbs,/* 1,*/ last_offset/*, 0*/); } template FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { constexpr size_t Ind = StridesOrderInd(); if (likely(ZIP_COORDS(cbs, Ind) < ZIP_LAST_NUM(cbs, Ind))) { last_offset.first += ZIP_STRIDE1(cbs, Ind); last_offset.second += ZIP_STRIDE2(cbs, Ind); ZIP_COORDS(cbs, Ind) = ZIP_COORDS(cbs, Ind) + 1; return last_offset; } //overflow case should not happen ZIP_COORDS(cbs, Ind) = 0; //last_offset = 0;// last_offset + strides[Ind] - adjust_stride; return { 0,0 }; } template FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { constexpr size_t Ind = StridesOrderInd(); if (likely(ZIP_COORDS(cbs, Ind) < ZIP_LAST_NUM(cbs, Ind))) { last_offset.first += ZIP_STRIDE1(cbs, Ind); last_offset.second += ZIP_STRIDE2(cbs, Ind); ZIP_COORDS(cbs, Ind) = ZIP_COORDS(cbs, Ind) + 1; } else { //lets adjust offset last_offset.first -= ZIP_OF_ADJUST1(cbs, Ind); last_offset.second -= ZIP_OF_ADJUST2(cbs, Ind); ZIP_COORDS(cbs, Ind) = 0; last_offset = coord_inc_n(cbs, last_offset); } return last_offset; } template FORCEINLINE zip_size_t inc_coords(ZipCoordsState& cbs, zip_size_t last_offset) { return coord_inc_n(cbs, last_offset); } template FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { constexpr size_t Ind = StridesOrderInd(); COORDS(cbs, Ind) = index % bases[Ind]; LAST_NUM(cbs, Ind) = bases[Ind] - 1; STRIDE(cbs, Ind) = strides[Ind]; OF_ADJUST(cbs, Ind) = bases[Ind] * strides[Ind] - strides[Ind]; offset += COORDS(cbs, Ind) * strides[Ind]; return offset; } template FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { constexpr size_t Ind = StridesOrderInd(); COORDS(cbs, Ind) = index % bases[Ind]; LAST_NUM(cbs, Ind) = bases[Ind] - 1; STRIDE(cbs, Ind) = strides[Ind]; OF_ADJUST(cbs, Ind) = bases[Ind] * strides[Ind] - strides[Ind]; offset += COORDS(cbs, Ind) * strides[Ind]; return init_coords(cbs, index / bases[Ind], bases, strides, offset); } template FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), bool>::type eq_coords(CoordsState& cbs, const Nd4jLong* coords) { return COORDS(cbs, rankIndex) == coords[rankIndex]; } template FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), bool>::type eq_coords(CoordsState& cbs, const Nd4jLong* coords) { return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords(cbs, coords); } template FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), bool>::type eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex]; } template FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), bool>::type eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords(cbs, coords); } template FORCEINLINE typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { constexpr size_t Ind = StridesOrderInd(); ZIP_COORDS(cbs, Ind) = index % bases[Ind]; ZIP_LAST_NUM(cbs, Ind) = bases[Ind] - 1; ZIP_STRIDE1(cbs, Ind) = x_strides[Ind]; ZIP_STRIDE2(cbs, Ind) = z_strides[Ind]; ZIP_OF_ADJUST1(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); ZIP_OF_ADJUST2(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); offset.first += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); offset.second += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); return offset; } template FORCEINLINE typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { constexpr size_t Ind = StridesOrderInd(); ZIP_COORDS(cbs, Ind) = index % bases[Ind]; ZIP_LAST_NUM(cbs, Ind) = bases[Ind] - 1; ZIP_STRIDE1(cbs, Ind) = x_strides[Ind]; ZIP_STRIDE2(cbs, Ind) = z_strides[Ind]; ZIP_OF_ADJUST1(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); ZIP_OF_ADJUST2(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); offset.first += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); offset.second += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); return init_coords(cbs, index / bases[Ind], bases, x_strides, z_strides, offset); } //inc coords for non constant Ranks template FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { Nd4jLong val; for (int i = rank - skip - 1; i >= 0; i--) { val = coords[i] + 1; if (likely(val < bases[i])) { coords[i] = val; last_offset += strides[i]; break; } else { last_offset -= coords[i] * strides[i]; coords[i] = 0; } } return last_offset; } template<> FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { Nd4jLong val; for (int i = skip; i < rank; i++) { val = coords[i] + 1; if (likely(val < bases[i])) { coords[i] = val; last_offset += strides[i]; break; } else { last_offset -= coords[i] * strides[i]; coords[i] = 0; } } return last_offset; } template FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { Nd4jLong val = 0; for (int i = rank - skip - 1; i >= 0; i--) { val = coords[i] + 1; if (likely(val < bases[i])) { coords[i] = val; last_offset.first += x_strides[i]; last_offset.second += z_strides[i]; break; } else { last_offset.first -= coords[i] * x_strides[i]; last_offset.second -= coords[i] * z_strides[i]; coords[i] = 0; } } return last_offset; } template<> FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { Nd4jLong val = 0; for (int i = skip; i < rank; i++) { val = coords[i] + 1; if (likely(val < bases[i])) { coords[i] = val; last_offset.first += x_strides[i]; last_offset.second += z_strides[i]; break; } else { last_offset.first -= coords[i] * x_strides[i]; last_offset.second -= coords[i] * z_strides[i]; coords[i] = 0; } } return last_offset; } } #endif