From bead656febe60db11caab7d22281eb3ab1757adc Mon Sep 17 00:00:00 2001
From: Abdelrauf
Date: Sat, 8 Feb 2020 16:31:30 +0400
Subject: [PATCH] Initial performance improvement for Bias Add and etc #8556 (#217)

* Initial performance improvement for Bias Add, loop coords helpers and increment aligned parallel threading

Signed-off-by: AbdelRauf

* One more test for Rauf

Signed-off-by: raver119

* disable couple of perf tests

Signed-off-by: raver119

Co-authored-by: raver119
---
 libnd4j/include/execution/Threads.h          |  14 +-
 libnd4j/include/execution/impl/Threads.cpp   |  82 ++
 libnd4j/include/helpers/LoopsCoordsHelper.h  | 440 +++++++++++
 .../ops/declarable/helpers/cpu/addBias.cpp   | 726 ++++++++++++++----
 .../layers_tests/LoopCoordsHelperTests.cpp   | 223 ++++++
 .../layers_tests/PlaygroundTests.cpp         |  82 ++
 6 files changed, 1427 insertions(+), 140 deletions(-)
 create mode 100644 libnd4j/include/helpers/LoopsCoordsHelper.h
 create mode 100644 libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp

diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h
index 14467883f..3a1fd8951 100644
--- a/libnd4j/include/execution/Threads.h
+++ b/libnd4j/include/execution/Threads.h
@@ -14,9 +14,9 @@
  * SPDX-License-Identifier: Apache-2.0
  ******************************************************************************/

-//
-// @author raver119@gmail.com
-//
+ //
+ // @author raver119@gmail.com
+ //

 #ifndef SAMEDIFF_THREADS_H
 #define SAMEDIFF_THREADS_H
@@ -165,6 +165,14 @@
         static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads());
         static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads());

+
+        /**
+         * This method will execute the function in parallel, partitioning the work so that each part stays aligned to the increment size.
+         * PLEASE NOTE: this function may use a smaller number of threads than requested.
+ * + */ + static int parallel_aligned_increment(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, size_t type_size = sizeof(float), uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + }; } diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp index 982b59a4c..94710731e 100644 --- a/libnd4j/include/execution/impl/Threads.cpp +++ b/libnd4j/include/execution/impl/Threads.cpp @@ -638,4 +638,86 @@ namespace samediff { return intermediatery[0]; } + + int Threads::parallel_aligned_increment(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, size_t type_size , uint32_t req_numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + auto num_elements = (stop - start); + //this way we preserve increment starts offset + //so we will parition considering delta but not total elements + auto delta = (stop - start) / increment; + + // in some cases we just fire func as is + if (delta == 0 || req_numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + int numThreads = 0; + + int adjusted_numThreads = samediff::ThreadsHelper::numberOfThreads(req_numThreads, (num_elements * sizeof(double)) / (200 * type_size)); + + if (adjusted_numThreads > delta) + adjusted_numThreads = delta; + // shortcut + if (adjusted_numThreads <= 1) { + function(0, start, stop, increment); + return 1; + } + //take span as ceil + auto spand = std::ceil((double)delta / (double)adjusted_numThreads); + numThreads = static_cast(std::ceil((double)delta / spand)); + auto span = static_cast(spand); + + auto ticket = samediff::ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + //tail_add is additional value of the last part + //it could be negative or positive + //we will spread that value across + auto tail_add = delta - numThreads * span; + Nd4jLong begin = 0; + Nd4jLong end = 0; + + //we will try enqueu bigger parts first + decltype(span) span1, span2; + int last = 0; + if (tail_add >= 0) { + //for span == 1 , tail_add is 0 + last = tail_add; + span1 = span + 1; + span2 = span; + } + else { + last = numThreads + tail_add;// -std::abs(tail_add); + span1 = span; + span2 = span - 1; + } + for (int i = 0; i < last; i++) { + end = begin + span1 * increment; + // putting the task into the queue for a given thread + ticket->enqueue(i, numThreads, function, begin, end, increment); + begin = end; + } + for (int i = last; i < numThreads - 1; i++) { + end = begin + span2 * increment; + // putting the task into the queue for a given thread + ticket->enqueue(i, numThreads, function, begin, end, increment); + begin = end; + } + //for last one enqueue last offset as stop + //we need it in case our ((stop-start) % increment ) > 0 + ticket->enqueue(numThreads - 1, numThreads, function, begin, stop, increment); + // block and wait till all threads finished the job + ticket->waitAndRelease(); + // we tell that parallelism request succeeded + return numThreads; + } + else { + // if there were no threads available - we'll execute function right within current thread + function(0, start, stop, increment); + // we tell that parallelism request declined + return 1; + } + } + + } \ No newline at end of file diff --git a/libnd4j/include/helpers/LoopsCoordsHelper.h b/libnd4j/include/helpers/LoopsCoordsHelper.h new file mode 100644 index 000000000..35f9d2063 --- /dev/null +++ b/libnd4j/include/helpers/LoopsCoordsHelper.h @@ -0,0 +1,440 @@ 
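Before moving on to the new coordinate helpers, it may help to see the partitioning rule used by parallel_aligned_increment above in isolation. The following is a minimal standalone sketch of the same begin/end arithmetic (hypothetical name alignedRanges, no ThreadPool or ticket machinery): a ceil span per thread, the remainder spread over the first threads, and the final range closed with stop so that any (stop - start) % increment tail is preserved.

#include <cstdint>
#include <cmath>
#include <utility>
#include <vector>

// Sketch only: reproduces the begin/end math of parallel_aligned_increment
// without enqueueing anything. Each pair is a [begin, end) range whose length
// is a multiple of `increment`, except possibly the last one, which is closed
// with `stop` so that a trailing (stop - start) % increment tail is not lost.
std::vector<std::pair<int64_t, int64_t>> alignedRanges(int64_t start, int64_t stop,
                                                       int64_t increment, int requestedThreads) {
    const int64_t delta = (stop - start) / increment;          // number of whole increment steps
    if (delta == 0 || requestedThreads <= 1)
        return { { start, stop } };

    const int64_t span = static_cast<int64_t>(std::ceil((double)delta / requestedThreads));
    const int numThreads = static_cast<int>(std::ceil((double)delta / span));
    const int64_t tail = delta - (int64_t)numThreads * span;   // spread over the first threads

    // threads [0, last) get span1 steps, the remaining threads get span2 steps
    const int last = tail >= 0 ? (int)tail : numThreads + (int)tail;
    const int64_t span1 = tail >= 0 ? span + 1 : span;
    const int64_t span2 = tail >= 0 ? span : span - 1;

    std::vector<std::pair<int64_t, int64_t>> ranges;
    int64_t begin = start;
    for (int i = 0; i < numThreads - 1; i++) {
        const int64_t steps = i < last ? span1 : span2;
        ranges.emplace_back(begin, begin + steps * increment);
        begin = ranges.back().second;
    }
    ranges.emplace_back(begin, stop);                           // last range absorbs the remainder
    return ranges;
}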
+/******************************************************************************* + * + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + // + // @author AbdelRauf + // +#ifndef LIBND4J_LOOPCOORDSHELPER_H +#define LIBND4J_LOOPCOORDSHELPER_H + +#include +#include +#include +#include +#include +namespace nd4j { + +#if defined(__GNUC__) +#define likely(x) __builtin_expect( (x), 1) +#define unlikely(x) __builtin_expect( (x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + + using zip_size_t = std::pair; + + template + struct CoordsState :CoordsState { + Nd4jLong coord; + Nd4jLong last_num; + Nd4jLong stride; + Nd4jLong adjust; + CoordsState() :CoordsState() {} + }; + + template<> + struct CoordsState<0> { + Nd4jLong coord; + Nd4jLong last_num; + Nd4jLong stride; + Nd4jLong adjust; + CoordsState() {} + }; + + + template + struct ZipCoordsState :ZipCoordsState { + Nd4jLong coord; + Nd4jLong last_num; + Nd4jLong stride1; + Nd4jLong stride2; + Nd4jLong adjust1; + Nd4jLong adjust2; + ZipCoordsState() : ZipCoordsState() {} + }; + + template<> + struct ZipCoordsState<0> { + Nd4jLong coord; + Nd4jLong last_num; + Nd4jLong stride1; + Nd4jLong stride2; + Nd4jLong adjust1; + Nd4jLong adjust2; + ZipCoordsState() {} + }; + +#define COORDS(x,index) ((x).::nd4j::CoordsState<(index)>::coord) +#define STRIDE(x,index) ((x).::nd4j::CoordsState<(index)>::stride) +#define LAST_NUM(x,index) ((x).::nd4j::CoordsState<(index)>::last_num) +#define OF_ADJUST(x,index) ((x).::nd4j::CoordsState<(index)>::adjust) +#define ZIP_LAST_NUM(x,index) ((x).::nd4j::ZipCoordsState<(index)>::last_num) +#define ZIP_COORDS(x,index) ((x).::nd4j::ZipCoordsState<(index)>::coord) +#define ZIP_STRIDE1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride1) +#define ZIP_STRIDE2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride2) +#define ZIP_OF_ADJUST1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust1) +#define ZIP_OF_ADJUST2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust2) + + + FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { + for (size_t i = rank - 1; i > 0; --i) { + coords[i] = index % bases[i]; + index /= bases[i]; + } + coords[0] = index; // last iteration + } + + FORCEINLINE void index2coords_F(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { + + for (size_t i = 0; i < rank - 1; i++) { + coords[i] = index % bases[i]; + index /= bases[i]; + } + coords[rank - 1] = index; // last iteration + } + + FORCEINLINE size_t offset_from_coords(const Nd4jLong* strides, const Nd4jLong* coords, const Nd4jLong& rank) { + + size_t offset = 0; + size_t rank_4 = rank & -4; + for (int i = 0; i < rank_4; i += 4) { + offset = offset + + coords[i] * strides[i] + + coords[i + 1] * strides[i + 1] + + coords[i + 2] * strides[i + 2] + + coords[i + 3] * strides[i + 3]; + } + for (int i = rank_4; i < rank; 
i++) { + offset += coords[i] * strides[i]; + } + return offset; + } + + + FORCEINLINE zip_size_t offset_from_coords(const Nd4jLong*& x_strides, const Nd4jLong*& z_strides, const Nd4jLong* coords, const Nd4jLong& rank) { + + zip_size_t offset = { 0,0 }; + size_t rank_4 = rank & -4; + for (int i = 0; i < rank_4; i += 4) { + offset.first = offset.first + + coords[i] * x_strides[i] + + coords[i + 1] * x_strides[i + 1] + + coords[i + 2] * x_strides[i + 2] + + coords[i + 3] * x_strides[i + 3]; + offset.second = offset.second + + coords[i] * z_strides[i] + + coords[i + 1] * z_strides[i + 1] + + coords[i + 2] * z_strides[i + 2] + + coords[i + 3] * z_strides[i + 3]; + } + for (int i = rank_4; i < rank; i++) { + offset.first += coords[i] * x_strides[i]; + offset.second += coords[i] * z_strides[i]; + } + return offset; + } + + template + constexpr size_t StridesOrderInd() { + return Last_Index_Faster ? Rank - Index - 1 : Index; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == Index), size_t>::type + coord_inc_n(CoordsState& cbs, size_t last_offset) { + + constexpr size_t Ind = StridesOrderInd(); + + if (likely(COORDS(cbs, Ind) < LAST_NUM(cbs, Ind))) { + last_offset += cbs.CoordsState::stride; + COORDS(cbs, Ind) = COORDS(cbs, Ind) + 1; + return last_offset; + } + //overflow case should not happen + COORDS(cbs, Ind) = 0; + //last_offset = 0;// last_offset + strides[Ind] - adjust_stride; + return 0; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != Index), size_t >::type + coord_inc_n(CoordsState& cbs, size_t last_offset) { + + constexpr size_t Ind = StridesOrderInd(); + + if (likely(COORDS(cbs, Ind) < LAST_NUM(cbs, Ind))) { + last_offset = last_offset + cbs.CoordsState::stride; + COORDS(cbs, Ind) = COORDS(cbs, Ind) + 1; + } + else { + //lets adjust offset + last_offset -= OF_ADJUST(cbs, Ind); + COORDS(cbs, Ind) = 0; + last_offset = coord_inc_n(cbs, last_offset); + } + + return last_offset; + + } + + template + FORCEINLINE size_t inc_coords(CoordsState& cbs, size_t last_offset) { + + return coord_inc_n(cbs,/* 1,*/ last_offset/*, 0*/); + } + + template + FORCEINLINE size_t inc_coords_ews(CoordsState& cbs, size_t last_offset, size_t ews) { + if (ews == 1) { + constexpr size_t Ind = StridesOrderInd(); + return last_offset + STRIDE(cbs, Ind); + } + return coord_inc_n(cbs,/* 1,*/ last_offset/*, 0*/); + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type + coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { + + constexpr size_t Ind = StridesOrderInd(); + + if (likely(ZIP_COORDS(cbs, Ind) < ZIP_LAST_NUM(cbs, Ind))) { + last_offset.first += ZIP_STRIDE1(cbs, Ind); + last_offset.second += ZIP_STRIDE2(cbs, Ind); + ZIP_COORDS(cbs, Ind) = ZIP_COORDS(cbs, Ind) + 1; + return last_offset; + } + //overflow case should not happen + ZIP_COORDS(cbs, Ind) = 0; + //last_offset = 0;// last_offset + strides[Ind] - adjust_stride; + return { 0,0 }; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t >::type + coord_inc_n(ZipCoordsState& cbs, zip_size_t last_offset) { + + constexpr size_t Ind = StridesOrderInd(); + + if (likely(ZIP_COORDS(cbs, Ind) < ZIP_LAST_NUM(cbs, Ind))) { + last_offset.first += ZIP_STRIDE1(cbs, Ind); + last_offset.second += ZIP_STRIDE2(cbs, Ind); + ZIP_COORDS(cbs, Ind) = ZIP_COORDS(cbs, Ind) + 1; + } + else { + + //lets adjust offset + last_offset.first -= ZIP_OF_ADJUST1(cbs, Ind); + last_offset.second -= ZIP_OF_ADJUST2(cbs, Ind); + ZIP_COORDS(cbs, Ind) = 0; + last_offset = 
coord_inc_n(cbs, last_offset); + } + + return last_offset; + + } + + template + FORCEINLINE zip_size_t inc_coords(ZipCoordsState& cbs, zip_size_t last_offset) { + + return coord_inc_n(cbs, last_offset); + } + + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == rankIndex), size_t>::type + init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { + constexpr size_t Ind = StridesOrderInd(); + COORDS(cbs, Ind) = index % bases[Ind]; + LAST_NUM(cbs, Ind) = bases[Ind] - 1; + STRIDE(cbs, Ind) = strides[Ind]; + OF_ADJUST(cbs, Ind) = bases[Ind] * strides[Ind] - strides[Ind]; + offset += COORDS(cbs, Ind) * strides[Ind]; + return offset; + } + + + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != rankIndex), size_t>::type + init_coords(CoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* strides, size_t offset = 0) { + constexpr size_t Ind = StridesOrderInd(); + COORDS(cbs, Ind) = index % bases[Ind]; + LAST_NUM(cbs, Ind) = bases[Ind] - 1; + STRIDE(cbs, Ind) = strides[Ind]; + OF_ADJUST(cbs, Ind) = bases[Ind] * strides[Ind] - strides[Ind]; + offset += COORDS(cbs, Ind) * strides[Ind]; + return init_coords(cbs, index / bases[Ind], bases, strides, offset); + } + + + + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == rankIndex), bool>::type + eq_coords(CoordsState& cbs, const Nd4jLong* coords) { + return COORDS(cbs, rankIndex) == coords[rankIndex]; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != rankIndex), bool>::type + eq_coords(CoordsState& cbs, const Nd4jLong* coords) { + return COORDS(cbs, rankIndex) == coords[rankIndex] && eq_coords(cbs, coords); + } + + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == rankIndex), bool>::type + eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { + return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex]; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != rankIndex), bool>::type + eq_zip_coords(ZipCoordsState& cbs, const Nd4jLong* coords) { + return ZIP_COORDS(cbs, rankIndex) == coords[rankIndex] && eq_zip_coords(cbs, coords); + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 == rankIndex), zip_size_t>::type + init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { + constexpr size_t Ind = StridesOrderInd(); + ZIP_COORDS(cbs, Ind) = index % bases[Ind]; + ZIP_LAST_NUM(cbs, Ind) = bases[Ind] - 1; + ZIP_STRIDE1(cbs, Ind) = x_strides[Ind]; + ZIP_STRIDE2(cbs, Ind) = z_strides[Ind]; + ZIP_OF_ADJUST1(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); + ZIP_OF_ADJUST2(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); + offset.first += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); + offset.second += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); + return offset; + } + + template + FORCEINLINE + typename std::enable_if<(Rank - 1 != rankIndex), zip_size_t>::type + init_coords(ZipCoordsState& cbs, const Nd4jLong index, const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, zip_size_t offset = {}) { + constexpr size_t Ind = StridesOrderInd(); + ZIP_COORDS(cbs, Ind) = index % bases[Ind]; + ZIP_LAST_NUM(cbs, Ind) = bases[Ind] - 1; + ZIP_STRIDE1(cbs, Ind) = x_strides[Ind]; + ZIP_STRIDE2(cbs, Ind) = z_strides[Ind]; + ZIP_OF_ADJUST1(cbs, Ind) = ZIP_LAST_NUM(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); + ZIP_OF_ADJUST2(cbs, Ind) = ZIP_LAST_NUM(cbs, 
Ind) * ZIP_STRIDE2(cbs, Ind); + offset.first += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE1(cbs, Ind); + offset.second += ZIP_COORDS(cbs, Ind) * ZIP_STRIDE2(cbs, Ind); + return init_coords(cbs, index / bases[Ind], bases, x_strides, z_strides, offset); + } + + + //inc coords for non constant Ranks + template + FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip = 0) { + + Nd4jLong val; + for (int i = rank - skip - 1; i >= 0; i--) { + val = coords[i] + 1; + if (likely(val < bases[i])) { + coords[i] = val; + last_offset += strides[i]; + break; + } + else { + last_offset -= coords[i] * strides[i]; + coords[i] = 0; + } + } + return last_offset; + } + + template<> + FORCEINLINE size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* strides, Nd4jLong* coords, size_t last_offset, const size_t rank, const size_t skip) { + + Nd4jLong val; + for (int i = skip; i < rank; i++) { + val = coords[i] + 1; + if (likely(val < bases[i])) { + coords[i] = val; + last_offset += strides[i]; + break; + } + else { + last_offset -= coords[i] * strides[i]; + coords[i] = 0; + } + } + return last_offset; + } + + + template + FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip = 0) { + + Nd4jLong val = 0; + for (int i = rank - skip - 1; i >= 0; i--) { + val = coords[i] + 1; + if (likely(val < bases[i])) { + coords[i] = val; + last_offset.first += x_strides[i]; + last_offset.second += z_strides[i]; + break; + } + else { + last_offset.first -= coords[i] * x_strides[i]; + last_offset.second -= coords[i] * z_strides[i]; + coords[i] = 0; + } + } + return last_offset; + } + + template<> + FORCEINLINE zip_size_t inc_coords(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, Nd4jLong* coords, zip_size_t last_offset, const size_t rank, const size_t skip) { + + Nd4jLong val = 0; + for (int i = skip; i < rank; i++) { + val = coords[i] + 1; + if (likely(val < bases[i])) { + coords[i] = val; + + last_offset.first += x_strides[i]; + last_offset.second += z_strides[i]; + break; + } + else { + last_offset.first -= coords[i] * x_strides[i]; + last_offset.second -= coords[i] * z_strides[i]; + coords[i] = 0; + } + } + return last_offset; + } + +} + +#endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index a910a854c..e5242a5be 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
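The dynamic-rank inc_coords overloads above implement an odometer-style walk: rather than recomputing coordinates and an offset from a flat index for every element, only the fastest-varying coordinate is bumped, and on overflow that axis is reset and the carry moves to the next slower axis. A plain restatement of that loop, with the hypothetical name incCoordsC, C order, and no skip handling:

#include <cstdint>

// Bump coords to the next element in C (row-major) order and return the new
// linear offset. On overflow of an axis, undo its contribution to the offset,
// reset it to zero, and carry into the next slower axis.
inline int64_t incCoordsC(const int64_t* shape, const int64_t* strides,
                          int64_t* coords, int64_t offset, int rank) {
    for (int i = rank - 1; i >= 0; i--) {
        if (coords[i] + 1 < shape[i]) {
            coords[i] += 1;
            return offset + strides[i];
        }
        offset -= coords[i] * strides[i];   // undo this axis before carrying
        coords[i] = 0;
    }
    return offset;   // walked past the last element; coords wrapped back to all zeros
}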
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -14,161 +15,612 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -// -// @author Yurii Shyrma, created on 26.02.2018 -// + // + // @author Yurii Shyrma, created on 26.02.2018 + // + // + // @author AbdelRauf + // - -#include +#include +#include +#include +#include #include +#include +#include +#include -namespace nd4j { -namespace ops { -namespace helpers { +#if defined(__GNUC__) +#define align32 __attribute__((aligned(32))) +#elif defined(_MSC_VER) +#define align32 __declspec(align(32)) +#else +#define align32 +#endif + +namespace nd4j { + namespace ops { + namespace helpers { + + template + static FORCEINLINE void _add(const T* __restrict xx, const T* __restrict yy, T* __restrict zz, const size_t& N) { + PRAGMA_OMP_SIMD + for (size_t c = 0; c < N; c++) + zz[c] = xx[c] + yy[c]; + } + + template + static FORCEINLINE void _add_inplace(T* __restrict xx, const T* __restrict yy, const size_t& N) { + PRAGMA_OMP_SIMD + for (size_t c = 0; c < N; c++) + xx[c] = xx[c] + yy[c]; + } + + template + static FORCEINLINE void _add_broadcast_inplace(T* __restrict xx, const T yy, const size_t& N) { + PRAGMA_OMP_SIMD + for (size_t c = 0; c < N; c++) + xx[c] = xx[c] + yy; + } + + template + static FORCEINLINE void _add_broadcast(const T* __restrict xx, const T yy, T* __restrict zz, const size_t& N) { + PRAGMA_OMP_SIMD + for (size_t c = 0; c < N; c++) + zz[c] = xx[c] + yy; + } + + static constexpr size_t MIN_NN = 32; + static constexpr size_t MIN_NN_K = 2; + + template + static typename std::enable_if::value, const X*>::type + flattened_bias(const Y* b_real, X* b_stack, const size_t b_stack_size, std::unique_ptr& b_heap, const Nd4jLong num, Nd4jLong yStrideC) + { + //best results when buffer used much , may result bad perf if buffer is used once + X* b_new = nullptr; + if (yStrideC != 1) { + if (num > b_stack_size) { + b_heap.reset(new X[num]); + b_new = b_heap.get(); + } + else { + b_new = b_stack; + } + for (size_t i = 0; i < num; i++) { + b_new[i] = b_real[i * yStrideC]; + } + } + else { + //no need , just pass normal bias + return static_cast(b_real); + } + return const_cast(b_new); + } + + template + static typename std::enable_if::value, const X*>::type + flattened_bias(const Y* b_real, X* b_stack, const size_t b_stack_size, std::unique_ptr& b_heap, const Nd4jLong num, Nd4jLong yStrideC) + { + //best results when buffer used much , may result bad perf if buffer is used once + X* b_new = nullptr; + if (num > b_stack_size) { + b_heap.reset(new X[num]); + b_new = b_heap.get(); + } + else { + b_new = b_stack; + } + if (yStrideC != 1) { + for (size_t i = 0; i < num; i++) { + b_new[i] = static_cast(b_real[i * yStrideC]); + } + } + else { + for (size_t i = 0; i < num; i++) { + b_new[i] = static_cast(b_real[i]); + } + } + return const_cast(b_new); + } + + template + static void channel_atTheEnd_stride1_C(const Nd4jLong*& x_strides, const Nd4jLong*& bases, T* x, const T* b, T* z, const bool& inplace, const Nd4jLong& start, const Nd4jLong& stop, const Nd4jLong& inc) + { + size_t loop_count = (stop - start) / inc; + nd4j::CoordsState cst; + size_t offset = nd4j::init_coords(cst, start, bases, x_strides); + + if (!inplace) { + for (size_t i = 0; i < loop_count; i++) { + _add(&(x[offset]), b, &(z[offset]), inc); + offset = nd4j::inc_coords(cst, offset); + } + } + else { + for (size_t i = 0; i < 
loop_count; i++) { + _add_inplace(&(x[offset]), b, inc); + offset = nd4j::inc_coords(cst, offset); + } + } + } -////////////////////////////////////////////////////////////////////////// -template -static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const bool isNCHW) { + template + static void channel_atTheEnd_generic_C(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, const bool& inplaceOp, const bool same_stride, const bool same_order, T* x, const T* b, T* z, Nd4jLong start, Nd4jLong stop, Nd4jLong inc) { - // bias [oC] + //just ensure that passed sameStride is correct, because when bases are equal orders matters + bool sameOrderStride = same_order && same_stride; + if (sameOrderStride && x_strides[constRank - 1] == 1) { + channel_atTheEnd_stride1_C(x_strides, bases, x, b, z, inplaceOp, start, stop, inc); + } + else { + size_t loop_count = (stop - start) / inc; + nd4j::ZipCoordsState cst; + nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + Nd4jLong x_stride = ZIP_STRIDE1(cst, constRank - 1); + Nd4jLong z_stride = ZIP_STRIDE2(cst, constRank - 1); - // if(input_rank == 4) - // input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) - // if(input_rank == 5) - // input and output have same shapes: [bS, oD, oH, oW, oC] (NHWC) or [bS, oD, oC, oH, oW] (NCHW) - // else - // apply applyBroadCast + if (same_order && x_stride == 1 && z_stride == 1) { + /* bases are equal with different strides , but the last one is 1. So we can still vectorize it */ + for (size_t i = 0; i < loop_count; i++) { + _add(&(x[offset.first]), b, &(z[offset.second]), inc); + offset = nd4j::inc_coords(cst, offset); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T* xx = &(x[offset.first]); + T* zz = &(z[offset.second]); + for (size_t j = 0; j < inc; j++) + zz[j * z_stride] = xx[j * x_stride] + b[j]; + offset = nd4j::inc_coords(cst, offset); + } + } + } + + } + + /** + * this is our main optimization which benefits from everything for the continuous last_channel C order case + * as it is intended for full continous we do not need any rank info + */ + template + void channel_atTheEnd_continous_C(T* x, const T* b, T* z, bool inplaceOp, Nd4jLong start, Nd4jLong stop, Nd4jLong inc) { + size_t nums = (stop - start); + size_t num_inc = nums - nums % inc; + if (inplaceOp) { + + size_t offset_p = start; + for (size_t i = 0; i < num_inc; i += inc) { + _add_inplace(&(x[offset_p]), b, inc); + offset_p += inc; + } + if (nums > num_inc) + _add_inplace(&(x[offset_p]), b, nums - num_inc); + } + else { + size_t offset_p = start; + for (size_t i = 0; i < num_inc; i += inc) { + _add(&(x[offset_p]), b, &(z[offset_p]), inc); + offset_p += inc; + } + if (nums > num_inc) + _add(&(x[offset_p]), b, &(z[offset_p]), nums - num_inc); + } + } + + template + static void channel_NC_stride1_C(const Nd4jLong*& x_strides, const Nd4jLong*& bases, T* x, const T2* b, T* z, const bool& inplace, const Nd4jLong yStrideC, const Nd4jLong& start, const Nd4jLong& stop, const Nd4jLong& inc) + { + size_t loop_count = (stop - start) / inc; + nd4j::CoordsState cst; + size_t offset = nd4j::init_coords(cst, start, bases, x_strides); + + if (!inplace) { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); + _add_broadcast(&(x[offset]), yy, &(z[offset]), inc); + offset = nd4j::inc_coords(cst, offset); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, 
1) * yStrideC]); + _add_broadcast_inplace(&(x[offset]), yy, inc); + offset = nd4j::inc_coords(cst, offset); + } + } + } + + template + void channel_NC_generic_C(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, const bool& inplaceOp, const bool same_stride, const bool same_order, const Nd4jLong yStrideC, T* x, const T2* b, T* z, Nd4jLong start, Nd4jLong stop, Nd4jLong inc) { + + //just ensure that passed sameStride is correct, because when bases are equal orders matters + + bool sameOrderStride = same_order && same_stride; + + if (sameOrderStride && x_strides[constRank - 1] == 1) { + channel_NC_stride1_C(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, inc); + } + else { + + // (stop-start) % inc == 0 because we handled inside partitioning using the channel size + size_t loop_count = (stop - start) / inc; + nd4j::ZipCoordsState cst; + nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + Nd4jLong x_stride = ZIP_STRIDE1(cst, constRank - 1); + Nd4jLong z_stride = ZIP_STRIDE2(cst, constRank - 1); + if (same_order && z_stride == 1 && x_stride == 1) { + /* bases are equal with different strides , but the last one is 1. So we can still vectorize it */ + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[ZIP_COORDS(cst, 1) * yStrideC]); + _add_broadcast(&(x[offset.first]), yy, &(z[offset.second]), inc); + offset = nd4j::inc_coords(cst, offset); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T* xx = &(x[offset.first]); + T* zz = &(z[offset.second]); + T yy = static_cast(b[ZIP_COORDS(cst, 1) * yStrideC]); + for (size_t j = 0; j < inc; j++) + zz[j * z_stride] = xx[j * x_stride] + yy; + offset = nd4j::inc_coords(cst, offset); + } + } + } + } + + /// + template + void channel_NC_continous_numHW_C(Nd4jLong rank, const Nd4jLong* bases, const Nd4jLong* x_strides, T* x, const T2* b, T* z, bool inplaceOp, const Nd4jLong yStrideC, Nd4jLong start, Nd4jLong stop, Nd4jLong inc) { + + // (stop-start) % inc == 0 because we handled inside partitioning using the channel size + size_t loop_count = (stop - start) / inc; + + nd4j::CoordsState<1> cst; + //note: we had to manually pass index + size_t offset_p = nd4j::init_coords<2>(cst, start / inc, bases, x_strides); + + //partitioning was done using numHW, so we can increment from rank 2 + if (inplaceOp) { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); + _add_broadcast_inplace(&(x[offset_p]), yy, inc); + offset_p = nd4j::inc_coords<2>(cst, offset_p); + } + } + else { + if (yStrideC == 1) { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, 1)]); + _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); + offset_p = nd4j::inc_coords<2>(cst, offset_p); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); + _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); + offset_p = nd4j::inc_coords<2>(cst, offset_p); + } + } + } + } + + // + template + static void channel_generic_stride_skip_F(const Nd4jLong*& x_strides, const Nd4jLong*& bases, T* x, const T2* b, T* z, const bool& inplace, const Nd4jLong yStrideC, const Nd4jLong& start, const Nd4jLong& stop, const Nd4jLong& inc) + { + // (stop-start) % inc == 0 because we handled inside partitioning using the channel size + size_t loop_count = (stop - start) / inc; + nd4j::CoordsState cst; + size_t offset_p = nd4j::init_coords(cst, start, bases, x_strides); + if (!inplace) { + for (size_t 
i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, b_index) * yStrideC]); + _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); + offset_p = nd4j::inc_coords(cst, offset_p); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[COORDS(cst, b_index) * yStrideC]); + _add_broadcast_inplace(&(x[offset_p]), yy, inc); + offset_p = nd4j::inc_coords(cst, offset_p); + } + } + } + + /// + template + void channel_generic_F(const Nd4jLong* bases, const Nd4jLong* x_strides, const Nd4jLong* z_strides, const bool& inplaceOp, const bool same_stride, const bool same_order, const Nd4jLong yStrideC, T* x, const T2* b, T* z, Nd4jLong start, Nd4jLong stop, Nd4jLong inc) { + //just ensure that passed sameStride is correct, because when bases are equal orders matters + bool sameOrderStride = same_order && same_stride; + if (sameOrderStride && x_strides[0] == 1) { + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, inc); + } + else { + // (stop-start) % inc == 0 because we handled inside partitioning using the channel size + + size_t loop_count = (stop - start) / inc; + nd4j::ZipCoordsState cst; + nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + Nd4jLong x_stride = ZIP_STRIDE1(cst, 0); + Nd4jLong z_stride = ZIP_STRIDE2(cst, 0); + if (same_order && z_stride == 1 && x_stride == 1) { + + for (size_t i = 0; i < loop_count; i++) { + T yy = static_cast(b[ZIP_COORDS(cst, b_index) * yStrideC]); + _add_broadcast(&(x[offset.first]), yy, &(z[offset.second]), inc); + offset = nd4j::inc_coords(cst, offset); + } + } + else { + for (size_t i = 0; i < loop_count; i++) { + T* xx = &(x[offset.first]); + T* zz = &(z[offset.second]); + T yy = static_cast(b[ZIP_COORDS(cst, b_index) * yStrideC]); + for (size_t j = 0; j < inc; j++) + zz[j * z_stride] = xx[j * x_stride] + yy; + offset = nd4j::inc_coords(cst, offset); + } + } + } + } - const X* x = input.bufferAsT(); - const Y* y = bias.bufferAsT(); - X* z = output.bufferAsT(); + template + static void addBias_(const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { + Nd4jLong* x_shapeInfo = input.getShapeInfo(); + Nd4jLong* z_shapeInfo = output.getShapeInfo(); + X* x = input.bufferAsT(); + X* z = output.bufferAsT(); + const Y* b = bias.bufferAsT(); + const Nd4jLong rank = x_shapeInfo[0]; + const Nd4jLong* bases = &(x_shapeInfo[1]); + const Nd4jLong* x_strides = &(x_shapeInfo[rank + 1]); + const Nd4jLong* z_strides = &(z_shapeInfo[rank + 1]); + const bool inplaceOp = (x == z); + const bool same_order = inplaceOp || (input.ordering() == output.ordering()); + const bool channel_atTheEnd = !isNCHW; + const bool same_stride = inplaceOp || shape::strideEquals(x_shapeInfo, z_shapeInfo); + bool isContinuous = false; + int posOfNonUnityDim; + bias.isCommonVector(posOfNonUnityDim); + const Nd4jLong yStrideC = bias.strideAt(posOfNonUnityDim); + char order = input.ordering(); - const bool inOutAreSame = x == z; + //for rank>5 + if (rank > 5) { + const int channelDim = isNCHW ? 
1 : input.rankOf() - 1; // second or last + const_cast(input).applyBroadcast(nd4j::broadcast::Add, { channelDim }, bias, output); + return; + } - int posOfNonUnityDim; - bias.isCommonVector(posOfNonUnityDim); + if (same_order && same_stride) { + isContinuous = shape::elementWiseStride(x_shapeInfo) == 1 && shape::elementWiseStride(z_shapeInfo) == 1; + // check_continuity(order, bases, x_strides, rank); + }//if ( sameOrder && same_stride) - const uint bS = output.sizeAt(0); // batch size - const Nd4jLong yStrideC = bias.strideAt(posOfNonUnityDim); - const Nd4jLong zStrideB = output.strideAt(0); + bool treat_as_lastC = false; + // + if (rank == 2 && isNCHW) { + //we believe we better treat it as channel at the end case; + treat_as_lastC = true; + } + if (channel_atTheEnd || treat_as_lastC) { + //N..HWC case here + //flattened bias variables + constexpr size_t BSIZE1 = 3 * MIN_NN * MIN_NN; + constexpr size_t BSIZE2 = BSIZE1 + MIN_NN * MIN_NN; + X flatBias_stack[BSIZE2] align32; + std::unique_ptr flatBias_heap; + const X* bias_new; + X* bias_extra = nullptr; + size_t total_num = 1; + for (size_t i = 0; i < rank; i++) { + total_num *= bases[i]; + } + Nd4jLong inc; + size_t rank_skip = 1; + if (order == 'c') { + size_t b_stack_size = BSIZE2; + inc = bases[rank - 1]; + if (isContinuous) { + //for continous we need extra stack memory + // to create vectorizable bias from small size + b_stack_size = BSIZE1; + bias_extra = &(flatBias_stack[BSIZE1]); + } + bias_new = flattened_bias(b, (X*)flatBias_stack, b_stack_size, flatBias_heap, inc, yStrideC); + if (isContinuous && inc < MIN_NN_K * MIN_NN && total_num > inc * MIN_NN_K) { + //for small size where total_num is sufficient we need to recreate vectorizable buffer + size_t old_inc = inc; + //sizeof bias_extra is MIN_NN * MIN_NN + size_t new_inc = inc < MIN_NN ? inc * MIN_NN : inc * MIN_NN / MIN_NN_K; + //if there is a room then lets multiply + new_inc = (new_inc * MIN_NN_K <= total_num && new_inc < MIN_NN * MIN_NN / MIN_NN_K) ? 
MIN_NN_K * new_inc : new_inc; + for (size_t i = 0; i < new_inc; i += inc) { + //copy to our buffer + X* cp = &(bias_extra[i]); + for (size_t j = 0; j < inc; j++) { + cp[j] = bias_new[j]; + } + } + //vectorizable buffer + inc = new_inc; + bias_new = bias_extra; + } + } + else { + inc = bases[0]; + if (isContinuous) { + //we can choose other inc and index for that case + //but for now lets choose all till the last one + uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads(); + isContinuous = false; + if (rank > 2) { + if (req_numThreads < 2 || bases[rank - 1] >= req_numThreads) { + inc = total_num / bases[rank - 1]; + isContinuous = true; + rank_skip = rank - 1; + } + else if (rank > 3 && bases[rank - 1] * bases[rank - 2] >= req_numThreads) { + inc = total_num / bases[rank - 1] / bases[rank - 2]; //for continuous case it is its stride + rank_skip = rank - 2; + isContinuous = true; + } + } + } + } - if(output.rankOf() == 4) { + FUNC_1D func = [order, isContinuous, rank, x, b, bias_new, z, x_shapeInfo, z_shapeInfo, same_stride, same_order, yStrideC, rank_skip] + (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> void { + const Nd4jLong rank = x_shapeInfo[0]; + const Nd4jLong* bases = &(x_shapeInfo[1]); + const Nd4jLong* x_strides = &(x_shapeInfo[rank + 1]); + const Nd4jLong* z_strides = &(z_shapeInfo[rank + 1]); + const bool inplaceOp = (x == z); + if (order == 'c') { + if (isContinuous) { + channel_atTheEnd_continous_C(x, bias_new, z, inplaceOp, start, stop, increment); + } + // rank is in [2,5] + else if (rank == 4) { + channel_atTheEnd_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, x, bias_new, z, start, stop, increment); - const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(3); // channels - const uint oH = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // height - const uint oW = isNCHW ? 
output.sizeAt(3) : output.sizeAt(2); // width + } + else if (rank == 5) { + channel_atTheEnd_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, x, bias_new, z, start, stop, increment); + } + else if (rank == 2) { + channel_atTheEnd_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, x, bias_new, z, start, stop, increment); + } + else if (rank == 3) { + channel_atTheEnd_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, x, bias_new, z, start, stop, increment); + } + } + else { + //generic F case + if (isContinuous) { + if (rank == 4) { + if (rank_skip == rank - 2) { + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + else { + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + } + else if (rank == 5) { + if (rank_skip == rank - 2) { + //skip==3 + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + else { + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + } + else if (rank == 3) { + channel_generic_stride_skip_F(x_strides, bases, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + } + else if (rank == 4) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 5) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 2) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 3) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } - const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[3]; - const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; - const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; + } + }; + // + samediff::Threads::parallel_aligned_increment(func, 0, total_num, inc); + } + else { + //NC...HW case here + size_t numNC = 1; + size_t numHW = 1; + for (size_t i = 0; i < 2; i++) { + numNC *= bases[i]; + } + for (size_t i = 2; i < rank; i++) { + numHW *= bases[i]; + } + Nd4jLong total_num = numNC * numHW; + Nd4jLong inc = (order == 'c') ? 
bases[rank - 1] : bases[0]; + if (order == 'c' && isContinuous) { + //sometimes last dimension is too big and multithreading could suffer using unfair partitioning + //so we will do it only when inc is smaller our value or multithreading turned off + uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads(); + if (req_numThreads < 2 || numNC >= req_numThreads || inc <= 2 * 8196 || rank == 3) { + inc = numHW; + } + else { + //treat it as stride1c case + isContinuous = false; + } + } + FUNC_1D func = [order, isContinuous, rank, x, b, z, x_shapeInfo, z_shapeInfo, same_stride, same_order, yStrideC] + (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> void { + const Nd4jLong rank = x_shapeInfo[0]; + const Nd4jLong* bases = &(x_shapeInfo[1]); + const Nd4jLong* x_strides = &(x_shapeInfo[rank + 1]); + const Nd4jLong* z_strides = &(z_shapeInfo[rank + 1]); + const bool inplaceOp = (x == z); + if (order == 'c') { + if (isContinuous) { + channel_NC_continous_numHW_C(rank, bases, x_strides, x, b, z, inplaceOp, yStrideC, start, stop, increment); + } + // rank is in [3,5] + else if (rank == 4) { + channel_NC_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); - if(inOutAreSame) { + } + else if (rank == 5) { + channel_NC_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 3) { + channel_NC_generic_C(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + } + else { + //the same can be applied for NCHW case + //generic F case + //continous case is missing - auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) - for (uint c = start_y; c < stop_y; c += inc_y) - for (uint h = start_z; h < stop_z; h += inc_z) - for (uint w = 0; w < oW; ++w) - z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); - }; + if (rank == 4) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 5) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + else if (rank == 3) { + channel_generic_F(bases, x_strides, z_strides, inplaceOp, same_stride, same_order, yStrideC, x, b, z, start, stop, increment); + } + } + }; + // + samediff::Threads::parallel_aligned_increment(func, 0, total_num, inc); + } + } + ////////////////////////////////////////////////////////////////////////// + void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { - samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); - } - else { + // bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()}) + BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES); + } - const Nd4jLong xStrideB = input.stridesOf()[0]; - const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[3]; - const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; - const Nd4jLong xStrideW = isNCHW ? 
input.stridesOf()[3] : input.stridesOf()[2]; - if (isNCHW) { - - auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) - for (uint c = start_y; c < stop_y; c += inc_y) - for (uint h = start_z; h < stop_z; h += inc_z) - for (uint w = 0; w < oW; ++w) - z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); - }; - - samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); - } else { - auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b++) - for (uint h = start_y; h < stop_y; h++) - for (uint w = start_z; w < stop_z; w++) - for (uint c = 0; c < C; c++) - z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + h * xStrideH + w * xStrideW] + y[c * yStrideC]; - }; - - samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1, 0, oW, 1); - } - } - } - else if(output.rankOf() == 5) { - - const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(4); // channels - const uint oD = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // depth - const uint oH = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // height - const uint oW = isNCHW ? output.sizeAt(4) : output.sizeAt(3); // width - - const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[4]; - const Nd4jLong zStrideD = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; - const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; - const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[4] : output.stridesOf()[3]; - - if(inOutAreSame) { - - auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) - for (uint c = start_y; c < stop_y; c += inc_y) - for (uint d = start_z; d < stop_z; d += inc_z) - for (uint h = 0; h < oH; ++h) - for (uint w = 0; w < oW; ++w) - z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); - }; - - samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); - } - else { - - const Nd4jLong xStrideB = input.stridesOf()[0]; - const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[4]; - const Nd4jLong xStrideD = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; - const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; - const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[4] : input.stridesOf()[3]; - - auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) - for (uint c = start_y; c < stop_y; c += inc_y) - for (uint d = start_z; d < stop_z; d += inc_z) - for (uint h = 0; h < oH; ++h) - for (uint w = 0; w < oW; ++w) - z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + d * xStrideD + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); - }; - - samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); - } - } - else { - const int channelDim = isNCHW ? 
1 : input.rankOf() - 1; // second or last - const_cast(input).applyBroadcast(nd4j::broadcast::Add, {channelDim}, bias, output); - } + BUILD_DOUBLE_TEMPLATE(template void addBias_, (const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); + } + } } - -////////////////////////////////////////////////////////////////////////// -void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { - - // bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()}) - BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES); -} - - -BUILD_DOUBLE_TEMPLATE(template void addBias_, (const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); - -} -} -} - diff --git a/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp new file mode 100644 index 000000000..1a65c09ae --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp @@ -0,0 +1,223 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
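As a summary of the main fast path in addBias_ above: when the data is channel-last ('c' order, fully contiguous), the whole operation collapses to adding the length-C bias vector to every consecutive block of C values, which is what channel_atTheEnd_continous_C exploits. A reduced sketch of that case, assuming the total length is a multiple of C and ignoring the partitioning and tail handling done by the real helper:

#include <cstddef>

// Channel-last, fully contiguous case: every block of C consecutive values
// shares the same bias vector, so the inner loop is trivially vectorizable.
template <typename T>
void addBiasNHWCContiguous(const T* x, const T* bias, T* z, size_t totalLen, size_t C) {
    for (size_t block = 0; block + C <= totalLen; block += C) {
        for (size_t c = 0; c < C; c++)
            z[block + c] = x[block + c] + bias[c];
    }
}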
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + // + // @author Abdelrauf + // + +#include "testlayers.h" +#include +#include +using namespace nd4j; + +class LoopCoordsHelper : public testing::Test { +public: + +}; + + +template +FORCEINLINE +typename std::enable_if<(Rank - 1 == rankIndex), bool>::type +eq_strides(CoordsState& cbs, const Nd4jLong* strides) { + return STRIDE(cbs, rankIndex) == strides[rankIndex]; +} + +template +FORCEINLINE +typename std::enable_if<(Rank - 1 != rankIndex), bool>::type +eq_strides(CoordsState& cbs, const Nd4jLong* strides) { + return STRIDE(cbs, rankIndex) == strides[rankIndex] && eq_strides(cbs, strides); +} + +template +FORCEINLINE +typename std::enable_if<(Rank - 1 == rankIndex), bool>::type +eq_zip_strides(ZipCoordsState& cbs, const Nd4jLong* strides1, const Nd4jLong* strides2) { + return ZIP_STRIDE1(cbs, rankIndex) == strides1[rankIndex] && ZIP_STRIDE2(cbs, rankIndex) == strides2[rankIndex]; +} + +template +FORCEINLINE +typename std::enable_if<(Rank - 1 != rankIndex), bool>::type +eq_zip_strides(ZipCoordsState& cbs, const Nd4jLong* strides1, const Nd4jLong* strides2) { + return ZIP_STRIDE1(cbs, rankIndex) == strides1[rankIndex] && ZIP_STRIDE2(cbs, rankIndex) == strides2[rankIndex] + && eq_zip_strides(cbs, strides1, strides2); +} + + + + +TEST_F(LoopCoordsHelper, Init_Tests) { + + constexpr size_t test_Index = 131; + constexpr size_t Rank = 5; + + Nd4jLong shape[Rank] = { 3, 5 ,7, 8, 9}; + Nd4jLong multiply_st[] = { 2,3,3,5,6,7,9,3 }; + Nd4jLong strides_c[Rank] ; + Nd4jLong strides_f[Rank]; + + Nd4jLong coords[Rank]; + Nd4jLong coords_f[Rank]; + + strides_f[0] = multiply_st[0] * shape[0]; + strides_c[Rank-1] = multiply_st[Rank-1] * shape[Rank-1]; + + for (int i = 1; i < Rank; i++) { + strides_f[i] = strides_f[i - 1] * multiply_st[i] * shape[i]; + } + + for (int i = Rank-2; i >=0; i--) { + strides_c[i] = strides_c[i+1] * multiply_st[i] * shape[i]; + } + + //init our base coords + index2coords_C(test_Index, Rank, shape, coords); + index2coords_F(test_Index, Rank, shape, coords_f); + + + size_t offset_calc = offset_from_coords(strides_c, coords, Rank); + size_t offset_calc_f = offset_from_coords(strides_f, coords_f, Rank); + + CoordsState cts; + CoordsState cts_f; + + ZipCoordsState zcts; + ZipCoordsState zcts_f; + + size_t offset = init_coords(cts, test_Index, shape, strides_c); + size_t offset_f = init_coords(cts_f, test_Index, shape, strides_f); + + zip_size_t zoffset = init_coords(zcts, test_Index, shape, strides_c, strides_c); + zip_size_t zoffset_f = init_coords(zcts_f, test_Index, shape, strides_f, strides_f); + + ASSERT_TRUE(eq_coords(cts, coords)); + ASSERT_TRUE(eq_coords(cts_f, coords_f)); + + ASSERT_TRUE(eq_zip_coords(zcts, coords)); + ASSERT_TRUE(eq_zip_coords(zcts_f, coords_f)); + + ASSERT_TRUE(eq_strides(cts,strides_c)); + ASSERT_TRUE(eq_strides(cts_f,strides_f)); + + ASSERT_TRUE(eq_zip_strides(zcts, strides_c, strides_c)); + ASSERT_TRUE(eq_zip_strides(zcts_f, strides_f, strides_f)); + + + ASSERT_EQ(offset , offset_calc); + ASSERT_EQ(zoffset.first , offset_calc); + ASSERT_EQ(zoffset.second , offset_calc); + ASSERT_EQ(offset_f , offset_calc_f); + ASSERT_EQ(zoffset_f.first , offset_calc_f); + ASSERT_EQ(zoffset_f.second , offset_calc_f); +} + +TEST_F(LoopCoordsHelper, Increment_Use_Tests) { + + + constexpr size_t Rank = 4; + + Nd4jLong shape[Rank] = { 3, 5 ,7, 8 }; + Nd4jLong multiply_st[] = { 2,3,3,5,6,7,9,3 }; + Nd4jLong strides_c[Rank]; + Nd4jLong 
strides_f[Rank]; + + Nd4jLong coords[Rank] = {}; + Nd4jLong coords_f[Rank] = {}; + Nd4jLong coords2[Rank] = {}; + Nd4jLong coords2_f[Rank] = {}; + Nd4jLong zcoords2[Rank] = {}; + Nd4jLong zcoords2_f[Rank] = {}; + + strides_f[0] = multiply_st[0] * shape[0]; + strides_c[Rank - 1] = multiply_st[Rank - 1] * shape[Rank - 1]; + + for (int i = 1; i < Rank; i++) { + strides_f[i] = strides_f[i - 1] * multiply_st[i] * shape[i]; + } + + for (int i = Rank - 2; i >= 0; i--) { + strides_c[i] = strides_c[i + 1] * multiply_st[i] * shape[i]; + } + + int total = 1; + for (int i = 0; i < Rank; i++) { + total *= shape[i]; + } + + CoordsState cts; + CoordsState cts_f; + + ZipCoordsState zcts; + ZipCoordsState zcts_f; + + size_t offset = init_coords(cts, 0, shape, strides_c); + size_t offset_f = init_coords(cts_f, 0, shape, strides_f); + + zip_size_t zoffset = init_coords(zcts, 0, shape, strides_c, strides_c); + zip_size_t zoffset_f = init_coords(zcts_f, 0, shape, strides_f, strides_f); + + size_t offset2 = 0; + size_t offset2_f = 0; + zip_size_t zoffset2 = {}; + zip_size_t zoffset2_f = {}; + + for (int j = 0; j < total; j++) { + + + index2coords_C(j, Rank, shape, coords); + index2coords_F(j, Rank, shape, coords_f); + + size_t offset_calc = offset_from_coords(strides_c, coords, Rank); + size_t offset_calc_f = offset_from_coords(strides_f, coords_f, Rank); + + + ASSERT_TRUE(eq_coords(cts, coords)); + ASSERT_TRUE(eq_coords(cts_f, coords_f)); + + ASSERT_TRUE(eq_zip_coords(zcts, coords)); + ASSERT_TRUE(eq_zip_coords(zcts_f, coords_f)); + + ASSERT_EQ(offset, offset_calc); + ASSERT_EQ(zoffset.first, offset_calc); + ASSERT_EQ(zoffset.second, offset_calc); + ASSERT_EQ(offset_f, offset_calc_f); + ASSERT_EQ(zoffset_f.first, offset_calc_f); + ASSERT_EQ(zoffset_f.second, offset_calc_f); + + + ASSERT_EQ(offset2, offset_calc); + ASSERT_EQ(zoffset2.first, offset_calc); + ASSERT_EQ(zoffset2.second, offset_calc); + ASSERT_EQ(offset2_f, offset_calc_f); + ASSERT_EQ(zoffset2_f.first, offset_calc_f); + ASSERT_EQ(zoffset2_f.second, offset_calc_f); + + offset = inc_coords(cts, offset); + offset_f = inc_coords(cts_f, offset_f); + zoffset = inc_coords(zcts, zoffset); + zoffset_f = inc_coords(zcts_f, zoffset_f); + + offset2 = inc_coords(shape,strides_c, coords2, offset2, Rank); + offset2_f = inc_coords(shape, strides_f, coords2_f, offset2_f, Rank); + zoffset2 = inc_coords(shape, strides_c, strides_c, zcoords2, zoffset2, Rank); + zoffset2_f = inc_coords(shape, strides_f, strides_f, zcoords2_f, zoffset2_f, Rank); + + } + +} + diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 4d7a0f783..9f75beca1 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -45,6 +45,7 @@ #include #include +#include using namespace nd4j; using namespace nd4j::graph; @@ -64,6 +65,87 @@ TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } +/* + +TEST_F(PlaygroundTests, test_s_0) { + std::vector> shapes = {{32, 224, 224, 3}, {32, 56, 56, 64}, {32, 7, 7, 512}}; + std::vector threads = {1, 2, 4, 8, 16}; + + for (auto shape: shapes) { + for (auto t: threads) { + nd4j::Environment::getInstance()->setMaxMasterThreads(t); + + auto x = NDArrayFactory::create('c', shape); + auto y = NDArrayFactory::create('c', {shape[3]}); + auto z = x.ulike(); + + std::vector values; + Context ctx(1); + ctx.setInputArray(0, &x); + ctx.setInputArray(1, &y); + 
ctx.setOutputArray(0, &z); + + nd4j::ops::biasadd op; + + + for (int e = 0; e < 10000; e++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute(&ctx); + nd4j::ops::helpers::addBias(ctx, x, y, z, false); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Shape: [%lld, %lld, %lld, %lld]; Threads: [%i]; Time: %lld us;\n", shape[0], shape[1], shape[2], shape[3], t, values[values.size() / 2]); + } + } +} + +TEST_F(PlaygroundTests, test_s_1) { + std::vector> shapes = {{32, 3, 224, 224}, {32, 64, 56, 56}, {32, 512, 7, 7}}; + std::vector threads = {1, 2, 4, 8, 16}; + + for (auto shape: shapes) { + for (auto t: threads) { + nd4j::Environment::getInstance()->setMaxMasterThreads(t); + + auto x = NDArrayFactory::create('c', shape); + auto y = NDArrayFactory::create('c', {shape[1]}); + auto z = x.ulike(); + + std::vector values; + Context ctx(1); + ctx.setInputArray(0, &x); + ctx.setInputArray(1, &y); + ctx.setOutputArray(0, &z); + + nd4j::ops::biasadd op; + + + for (int e = 0; e < 10000; e++) { + auto timeStart = std::chrono::system_clock::now(); + + //op.execute({&x, &y}, {&z}, {true}); + nd4j::ops::helpers::addBias(ctx, x, y, z, true); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Shape: [%lld, %lld, %lld, %lld]; Threads: [%i]; Time: %lld us;\n", shape[0], shape[1], shape[2], shape[3], t, values[values.size() / 2]); + } + } +} +*/ + /* TEST_F(PlaygroundTests, test_s_0) { auto x = NDArrayFactory::create('c', {32, 112, 112, 16});
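For completeness, the way addBias_ drives the new threading entry point can be condensed into the rough usage sketch below. It assumes a fully contiguous channel-last buffer, a bias already flattened to stride 1, and that the include path matches Threads.h as shown earlier in this patch; the function name is hypothetical, and the lambda signature (thread_id, start, stop, increment) follows the FUNC_1D lambdas in addBias.cpp.

#include <execution/Threads.h>   // assumed include path for samediff::Threads

// Rough usage sketch: partition [0, total_num) so each thread's sub-range is a
// multiple of `inc` (the channel count), keeping every range on a channel boundary.
static void addBiasParallelSketch(float* x, const float* bias, float* z,
                                  int64_t total_num, int64_t inc) {
    auto func = [x, bias, z](uint64_t thread_id, int64_t start, int64_t stop,
                             int64_t increment) -> void {
        for (int64_t block = start; block < stop; block += increment)
            for (int64_t c = 0; c < increment; c++)
                z[block + c] = x[block + c] + bias[c];
    };
    // May still run single-threaded if the workload is too small or no threads are free.
    samediff::Threads::parallel_aligned_increment(func, 0, total_num, inc);
}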