From 9bb11d5b062886ea8ef0c91d535e5a2b0e2364e0 Mon Sep 17 00:00:00 2001 From: Samuel Audet Date: Thu, 18 Jul 2019 20:13:56 +0900 Subject: [PATCH] Fix OpenMP by replacing pragmas with macros (#8026) Signed-off-by: Samuel Audet --- libnd4j/blas/NDArray.hpp | 4 +- libnd4j/include/helpers/cpu/MmulHelper.cpp | 18 ++++---- libnd4j/include/helpers/impl/DebugHelper.cpp | 4 +- libnd4j/include/helpers/shape.h | 46 +++++++++---------- .../include/loops/cpu/type_conversions.cpp | 12 ++--- libnd4j/include/openmp_pragmas.h | 14 ++++++ .../declarable/helpers/cpu/activations.cpp | 6 +-- .../ops/declarable/helpers/cpu/gather.cpp | 6 +-- .../ops/declarable/helpers/cpu/lstm.cpp | 8 ++-- .../ops/declarable/helpers/cpu/scatter.cpp | 22 ++++----- .../ops/declarable/helpers/cuda/adjust_hue.cu | 4 +- .../helpers/cuda/adjust_saturation.cu | 4 +- .../ops/declarable/helpers/cuda/col2im.cppc | 8 ++-- .../ops/declarable/helpers/cuda/im2col.cppc | 6 +-- libnd4j/include/ops/impl/specials.cpp | 20 ++++---- libnd4j/include/ops/impl/specials_sparse.cpp | 6 +-- .../layers_tests/PlaygroundTests.cpp | 38 +++++++-------- 17 files changed, 120 insertions(+), 106 deletions(-) diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 5c616f605..3d3ce59fd 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -2412,7 +2412,7 @@ double NDArray::getTrace() const { double sum = 0.; -#pragma omp parallel for reduction(sumT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < minDim; ++i) sum += e(i * offset); @@ -4477,4 +4477,4 @@ void NDArray::setShapeInfo(const ConstantDataBuffer& shapeBuffer) { // } // } // return true; -// } \ No newline at end of file +// } diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp index 246d70187..293360a25 100644 --- a/libnd4j/include/helpers/cpu/MmulHelper.cpp +++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp @@ -39,7 +39,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c const bool flagA = (flagC && transA) || (!flagC && !transA); const bool flagB = (flagC && transB) || (!flagC && !transB); - // #pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) + // PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // for(uint row = 0; row < M; ++row) { // T3* c = flagC ? (C + row) : (C + row * ldc); @@ -53,7 +53,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // T3* a = flagA ? (A + row * lda + i) : (A + row + i * lda); // if(flagC) { - // #pragma omp simd + // PRAGMA_OMP_SIMD // for(uint col = 0; col < N; ++col) { // if(betaZ) // c[col * ldc] += a * b[flagB ? col : col * ldb] + betaZ * c[col * ldc]; @@ -62,7 +62,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // } // } // else { - // #pragma omp simd + // PRAGMA_OMP_SIMD // for(uint col = 0; col < N; ++col) { // if(betaZ) // c[col] += a * b[flagB ? col : col * ldb] + betaZ * c[col]; @@ -73,14 +73,14 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // } // } - #pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2) + PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)) for(uint row = 0; row < M; ++row) { for(uint col = 0; col < N; ++col) { T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); T3 val = 0; - #pragma omp simd + PRAGMA_OMP_SIMD for(uint i = 0; i < K; ++i) { T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); @@ -107,13 +107,13 @@ static void usualGemv(const char aOrder, const int M, const int N, const double const bool flagA = aOrder == 'f'; - #pragma omp parallel for if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided) + PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int row = 0; row < M; ++row) { T3* y = Y + row * incy; T3 val = 0; - #pragma omp simd + PRAGMA_OMP_SIMD for(int i = 0; i < N; ++i) { T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i); T3 x = *(X + i * incx); @@ -138,7 +138,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX, T3 alphaZ(alpha), betaZ(beta); T3 sum = 0; - #pragma omp parallel for if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(sumT:sum) + PRAGMA_OMP_PARALLEL_FOR_ARGS(if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum)) for(int i = 0; i < length; ++i) sum = sum + X[i * incx] * Y[i * incy]; @@ -325,4 +325,4 @@ BUILD_TRIPLE_TEMPLATE(template void usualGemm, (const char cOrder, const bool tr BUILD_TRIPLE_TEMPLATE(template void usualGemv, (const char aOrder, const int M, const int N, const double alpha, const void* A, const int lda, const void* B, const int incx, const double beta, void* C, const int incy), LIBND4J_TYPES, FLOAT_TYPES, FLOAT_TYPES); BUILD_TRIPLE_TEMPLATE(template void usualDot, (const Nd4jLong length, const double alpha, const void* vX, const Nd4jLong incx, const void* vY, const Nd4jLong incy, const double beta, void* vZ), LIBND4J_TYPES, FLOAT_TYPES, FLOAT_TYPES); -} \ No newline at end of file +} diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp index 5cb7a5531..f1ba8a755 100644 --- a/libnd4j/include/helpers/impl/DebugHelper.cpp +++ b/libnd4j/include/helpers/impl/DebugHelper.cpp @@ -67,7 +67,7 @@ namespace nd4j { auto _infCount = nd4j::math::nd4j_isinf(input->e(0)) ? 1L : 0L; auto _nanCount = nd4j::math::nd4j_isnan(input->e(0)) ? 1L : 0L; -#pragma omp parallel for schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue)) for (Nd4jLong e = 1; e < input->lengthOf(); e++) { auto current = input->e(e); auto n = e + 1.; @@ -88,7 +88,7 @@ namespace nd4j { } *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount}; _stdDevValue = 0; //math::nd4j_sqrt(info->_stdDevValue / (input->lengthOf() - 1)); -#pragma omp parallel for schedule (static) reduction(+:_stdDevValue) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue)) for (Nd4jLong e = 0; e < input->lengthOf(); e++) { double current = input->e(e); _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index acff46a24..312df8c7b 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -4761,14 +4761,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs } else if(xEws == 1) { xOffsets = nullptr; - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { yOffsets = new Nd4jLong[len]; shape::calcOffsets(yShapeInfo, yOffsets, xOrder); } - #pragma omp section + PRAGMA_OMP_SECTION { zOffsets = new Nd4jLong[len]; shape::calcOffsets(zShapeInfo, zOffsets, xOrder); @@ -4777,14 +4777,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs } else if(yEws == 1) { yOffsets = nullptr; - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets, yOrder); } - #pragma omp section + PRAGMA_OMP_SECTION { zOffsets = new Nd4jLong[len]; shape::calcOffsets(zShapeInfo, zOffsets, yOrder); @@ -4793,14 +4793,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs } else if(zEws == 1) { zOffsets = nullptr; - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets, zOrder); } - #pragma omp section + PRAGMA_OMP_SECTION { yOffsets = new Nd4jLong[len]; shape::calcOffsets(yShapeInfo, yOffsets, zOrder); @@ -4813,14 +4813,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs yOffsets = zOffsets = xOffsets; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { zOffsets = new Nd4jLong[len]; shape::calcOffsets(zShapeInfo, zOffsets); @@ -4829,14 +4829,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs yOffsets = xOffsets; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { yOffsets = new Nd4jLong[len]; shape::calcOffsets(yShapeInfo, yOffsets); @@ -4845,19 +4845,19 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs zOffsets = xOffsets; } else { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { yOffsets = new Nd4jLong[len]; shape::calcOffsets(yShapeInfo, yOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { zOffsets = new Nd4jLong[len]; shape::calcOffsets(zShapeInfo, zOffsets); @@ -4899,14 +4899,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs yOffsets = xOffsets; } else { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { xOffsets = new Nd4jLong[len]; shape::calcOffsets(xShapeInfo, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { yOffsets = new Nd4jLong[len]; shape::calcOffsets(yShapeInfo, yOffsets); @@ -4919,4 +4919,4 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs } -#endif /* SHAPE_H_ */ \ No newline at end of file +#endif /* SHAPE_H_ */ diff --git a/libnd4j/include/loops/cpu/type_conversions.cpp b/libnd4j/include/loops/cpu/type_conversions.cpp index c7a610e82..3c923de39 100644 --- a/libnd4j/include/loops/cpu/type_conversions.cpp +++ b/libnd4j/include/loops/cpu/type_conversions.cpp @@ -122,7 +122,7 @@ namespace nd4j { for (int e = start; e < stop; e++) { bool flag_load; -#pragma omp atomic read +PRAGMA_OMP_ATOMIC_ARGS(read) flag_load = flag; if (flag_load) break; @@ -130,11 +130,11 @@ namespace nd4j { T cUpd = x[e]; if (cUpd >= tt) { int idx; -#pragma omp atomic capture +PRAGMA_OMP_ATOMIC_ARGS(capture) idx = cnt++; if (idx >= flimit) { -#pragma omp atomic write +PRAGMA_OMP_ATOMIC_ARGS(write) flag = true; break; } @@ -143,11 +143,11 @@ namespace nd4j { x[e] -= tt; } else if (cUpd <= mtt) { int idx; -#pragma omp atomic capture +PRAGMA_OMP_ATOMIC_ARGS(capture) idx = cnt++; if (idx >= flimit) { -#pragma omp atomic write +PRAGMA_OMP_ATOMIC_ARGS(write) flag = true; break; } @@ -237,4 +237,4 @@ namespace nd4j { #ifndef __CLION_IDE__ BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES) #endif -} \ No newline at end of file +} diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/openmp_pragmas.h index e219dd027..174bd624b 100644 --- a/libnd4j/include/openmp_pragmas.h +++ b/libnd4j/include/openmp_pragmas.h @@ -29,6 +29,8 @@ #define OMP_MAXT #define OMP_SUMT #define OMP_REDUCTION(args) +#define PRAGMA_OMP_ATOMIC +#define PRAGMA_OMP_ATOMIC_ARGS(args) #define PRAGMA_OMP_CRITICAL #define PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD_ARGS(args) @@ -50,6 +52,11 @@ #define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args) #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args) #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threads, loops) +#define PRAGMA_OMP_PARALLEL_SECTIONS +#define PRAGMA_OMP_SECTION +#define PRAGMA_OMP_SINGLE +#define PRAGMA_OMP_SINGLE_ARGS(args) +#define PRAGMA_OMP_TASK #else @@ -59,6 +66,8 @@ #define OMP_MAXT maxT #define OMP_SUMT sumT #define OMP_REDUCTION(args) reduction(args) +#define PRAGMA_OMP_ATOMIC _Pragma(OMP_STRINGIFY(omp atomic)) +#define PRAGMA_OMP_ATOMIC_ARGS(args) _Pragma(OMP_STRINGIFY(omp atomic args)) #define PRAGMA_OMP_CRITICAL _Pragma(OMP_STRINGIFY(omp critical)) #define PRAGMA_OMP_SIMD _Pragma(OMP_STRINGIFY(omp simd)) #define PRAGMA_OMP_SIMD_ARGS(args) _Pragma(OMP_STRINGIFY(omp simd args)) @@ -80,6 +89,11 @@ #define PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(loops) _Pragma(OMP_STRINGIFY(omp parallel for simd default(shared) collapse(loops))) #define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args) _Pragma(OMP_STRINGIFY(omp parallel for simd reduction(args) default(shared))) #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args) _Pragma(OMP_STRINGIFY(omp parallel for simd num_threads(args) if(args > 1) default(shared))) +#define PRAGMA_OMP_PARALLEL_SECTIONS _Pragma(OMP_STRINGIFY(omp parallel sections)) +#define PRAGMA_OMP_SECTION _Pragma(OMP_STRINGIFY(omp section)) +#define PRAGMA_OMP_SINGLE _Pragma(OMP_STRINGIFY(omp single)) +#define PRAGMA_OMP_SINGLE_ARGS(args) _Pragma(OMP_STRINGIFY(omp single args)) +#define PRAGMA_OMP_TASK _Pragma(OMP_STRINGIFY(omp task)) #endif diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 2be2dbcb4..d15ce3266 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -89,19 +89,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, T sum = 0.; int length = shape::length(inShapeInfo); -#pragma omp simd reduction(maxT:max) +PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); max = nd4j::math::nd4j_max(max, inBuff[offset]); } -#pragma omp parallel for simd reduction(sumT:sum) +PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } -#pragma omp simd +PRAGMA_OMP_SIMD for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); outBuff[offset] /= sum; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 4fc6fa52a..f2e4e77bc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -56,7 +56,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); -#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); @@ -72,7 +72,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; -#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < numOfSubArrs; ++i) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i+1], {axis}); @@ -85,4 +85,4 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } } -} \ No newline at end of file +} diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 261ee32bf..f0f5697d0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -204,15 +204,15 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast } PRAGMA_OMP_PARALLEL - #pragma omp single + PRAGMA_OMP_SINGLE { - #pragma omp task + PRAGMA_OMP_TASK zz.applyTransform(transform::Tanh, z); //z = tanh(zz) - #pragma omp task + PRAGMA_OMP_TASK zi.applyTransform(transform::Sigmoid, i); //i = sigmoid(zi) - #pragma omp task + PRAGMA_OMP_TASK zf.applyTransform(transform::Sigmoid, f); //f = sigmoid(zf); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index e950ce8e5..4446953e6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -34,8 +34,8 @@ namespace nd4j { if(outRank == 1) { -// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided) -#pragma omp parallel for if(!lock) schedule(guided) +// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided)) for(Nd4jLong i = 0; i < indLen; ++i) { Nd4jLong idx = indices.e(i); @@ -53,8 +53,8 @@ namespace nd4j { std::vector dimsToExcludeUpd(sizeOfDims); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); -// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided) // causes known openMP asan bug ! -#pragma omp parallel for if(!lock) schedule(guided) +// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug ! +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided)) for(Nd4jLong i = 0; i < indLen; ++i) { NDArray outSubArr = output(indices.e(i), std::vector({0})); @@ -75,8 +75,8 @@ namespace nd4j { if(outRank == 1) { -// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided) -#pragma omp parallel for if(!lock) schedule(guided) +// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided)) for(Nd4jLong i = 0; i < indLen; ++i) { Nd4jLong idx = indices.e(i); @@ -92,8 +92,8 @@ namespace nd4j { std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); std::vector idxRangeOut(2*outRank, 0); -// #pragma omp parallel for if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut) -#pragma omp parallel for if(!lock) schedule(guided) firstprivate(idxRangeOut) +// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)) +PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided) firstprivate(idxRangeOut)) for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) { NDArray indSubArr = indices(i, dimsToExcludeInd); @@ -124,7 +124,7 @@ namespace nd4j { std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1}); if(!calcGrad) { -#pragma omp parallel for schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) for(Nd4jLong i = 0; i < indicesLen; ++i) { auto subArr = updates(i, dimsToExclude); @@ -132,7 +132,7 @@ namespace nd4j { } } else { -#pragma omp parallel for schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) for(Nd4jLong i = 0; i < indicesLen; ++i) { auto subArr = updates(i, dimsToExclude); @@ -143,4 +143,4 @@ namespace nd4j { } } } -} \ No newline at end of file +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu index 814a1280d..7e8bb99f7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu @@ -38,7 +38,7 @@ namespace helpers { auto tadsOut = output->allTensorsAlongDimension({0}); // FIXME: template selector should be moved out of loop -#pragma omp parallel for +PRAGMA_OMP_PARALLEL_FOR for (int e = 0; e < tadsIn->size(); e++) { BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); } @@ -55,4 +55,4 @@ namespace helpers { } } -} \ No newline at end of file +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu index 141b54e1f..cdd6640c4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu @@ -39,7 +39,7 @@ namespace helpers { auto tadsOut = output->allTensorsAlongDimension({0}); // FIXME: template selector should be moved out of loop -#pragma omp parallel for +PRAGMA_OMP_PARALLEL_FOR for (int e = 0; e < tadsIn->size(); e++) { BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); } @@ -56,4 +56,4 @@ namespace helpers { } } -} \ No newline at end of file +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc index 5b8b4a60a..aefb97963 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc @@ -60,13 +60,13 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); } else if (imEWS > 1) { -#pragma omp parallel for schedule(static) proc_bind(close) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) imBuff[i] = static_cast(0.f); } else { const auto len = shape::length(imShapeBuffer); -#pragma omp parallel for schedule(static) proc_bind(close) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) for (int i = 0; i < len; i++) imBuff[shape::getIndexOffset(i, imShapeBuffer, len)] = static_cast(0.f); } @@ -76,7 +76,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { -#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) for (int b = 0; b < bS; b++) { for (int c = 0; c < iC; ++c) { for (int kRow = 0; kRow < kH; ++kRow) { @@ -101,7 +101,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp } else { -#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc index d19352ff8..67f5650bd 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc +++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc @@ -62,7 +62,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { -#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) for (int b = 0; b < bS; b++) { for (int c = 0; c < iC; ++c) { for (int kRow = 0; kRow < kH; ++kRow) { @@ -89,7 +89,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra } else { -#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { @@ -126,4 +126,4 @@ BUILD_SINGLE_TEMPLATE(template void im2col_, (nd4j::LaunchContext & context, con } } -} \ No newline at end of file +} diff --git a/libnd4j/include/ops/impl/specials.cpp b/libnd4j/include/ops/impl/specials.cpp index 074b2eaa6..3a07ba1e2 100644 --- a/libnd4j/include/ops/impl/specials.cpp +++ b/libnd4j/include/ops/impl/specials.cpp @@ -224,9 +224,9 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint if (i < right){ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); } }else{ -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal(array, xShapeInfo, left, j, cutoff, descending); } -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); } } } @@ -238,7 +238,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint PRAGMA_OMP_PARALLEL_THREADS(numThreads) { -#pragma omp single nowait +PRAGMA_OMP_SINGLE_ARGS(nowait) { quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending); } @@ -350,7 +350,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint Nd4jLong retVal = 0L; -#pragma omp parallel for schedule(guided) proc_bind(close) reduction(+:retVal) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) for (Nd4jLong x = 0; x < N; x += 16) { int byte = 0; @@ -451,9 +451,9 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } }else{ -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); } -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); } } } @@ -517,9 +517,9 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); } }else{ -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); } -#pragma omp task +PRAGMA_OMP_TASK { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); } } } @@ -533,7 +533,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint PRAGMA_OMP_PARALLEL_THREADS(numThreads) { -#pragma omp single nowait +PRAGMA_OMP_SINGLE_ARGS(nowait) { quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); } @@ -548,7 +548,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint PRAGMA_OMP_PARALLEL_THREADS(numThreads) { -#pragma omp single nowait +PRAGMA_OMP_SINGLE_ARGS(nowait) { quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); } diff --git a/libnd4j/include/ops/impl/specials_sparse.cpp b/libnd4j/include/ops/impl/specials_sparse.cpp index 790005148..6a76c9209 100644 --- a/libnd4j/include/ops/impl/specials_sparse.cpp +++ b/libnd4j/include/ops/impl/specials_sparse.cpp @@ -185,9 +185,9 @@ namespace nd4j { if (i < right){ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); } }else{ -#pragma omp task +PRAGMA_OMP_TASK { coo_quickSort_parallel_internal(indices, array, left, j, cutoff, rank); } -#pragma omp task +PRAGMA_OMP_TASK { coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); } } @@ -200,7 +200,7 @@ namespace nd4j { PRAGMA_OMP_PARALLEL_THREADS(numThreads) { -#pragma omp single nowait +PRAGMA_OMP_SINGLE_ARGS(nowait) { coo_quickSort_parallel_internal(indices, array, 0, lenArray-1, cutoff, rank); } diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index f77f87e75..c295b7e75 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -823,7 +823,7 @@ TEST_F(PlaygroundTests, ScalarTest_2) { float * array = reinterpret_cast(source.buffer()); for (int e = 0; e < 1000; e++) { -#pragma omp simd +PRAGMA_OMP_SIMD for (int i = 0; i < source.lengthOf(); i++) { array[i] = simdOps::Add::op(array[i], 2.0f); } @@ -1215,7 +1215,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) { //*********************************** auto timeStart = std::chrono::system_clock::now(); -#pragma omp parallel for schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) for(Nd4jLong i = 0; i < len; ++i) { Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len); @@ -1230,7 +1230,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) { //*********************************** timeStart = std::chrono::system_clock::now(); -#pragma omp parallel for schedule(guided) +PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) for(Nd4jLong i = 0; i < len; ++i) { Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len); @@ -1255,7 +1255,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI int zEws = shape::elementWiseStride(zShapeInfo); BlockInformation info(len, ELEMENT_THRESHOLD); - #pragma omp parallel num_threads(info.threads) if (info.threads > 1) default(shared) + PRAGMA_OMP_PARALLEL_ARGS(num_threads(info.threads) if (info.threads > 1)) { auto i = omp_get_thread_num(); Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder; @@ -1263,7 +1263,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI auto xi = x + xEws * index; auto yi = y + yEws * index; auto zi = z + zEws * index; - #pragma omp simd + PRAGMA_OMP_SIMD for (Nd4jLong j = 0; j < itemsToLoop; j++) zi[j * zEws] = simdOps::LogPoissonLoss::op(xi[j * xEws], yi[j * yEws]); } @@ -1278,7 +1278,7 @@ static void loopSimple(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShap int threads = 6; int span_size = len / threads + 1; - #pragma omp parallel for simd schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close) default(shared) + PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close)) for(Nd4jLong i = 0; i < len; ++i) z[i * zEws] = simdOps::LogPoissonLoss::op(x[i * xEws], y[i * yEws]); @@ -1347,11 +1347,11 @@ static void loop1(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo int zEws = shape::elementWiseStride(zShapeInfo); nd4j::OmpLaunchHelper info(len); - #pragma omp parallel num_threads(info._numThreads) default(shared) + PRAGMA_OMP_PARALLEL_ARGS(num_threads(info._numThreads)) { auto threadNum = omp_get_thread_num(); Nd4jLong threadOffset = info.getThreadOffset(threadNum); - #pragma omp simd + PRAGMA_OMP_SIMD for (Nd4jLong j = 0; j < info.getItersPerThread(threadNum); j++) { Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len); Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len); @@ -1370,7 +1370,7 @@ static void loop2(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo int threads = 6; int span_size = len / threads + 1; - #pragma omp parallel for simd schedule(static) default(shared) + PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static)) for(Nd4jLong i = 0; i < len; ++i) { Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len); Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len); @@ -1615,7 +1615,7 @@ TEST_F(PlaygroundTests, test_manual_loop) { auto timeStart = std::chrono::system_clock::now(); for (int i = 0; i < iterations; i++) { -#pragma omp parallel for num_threads(4) schedule(static, 32768) +PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(4) schedule(static, 32768)) for (unsigned int e = 0; e < len; e++) z[e] = array[e]; } @@ -1931,19 +1931,19 @@ TEST_F(PlaygroundTests, loops_2) { for (int i = 0; i < N; ++i) { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, yOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, zOffsets); @@ -2110,19 +2110,19 @@ TEST_F(PlaygroundTests, loops_3) { for (int i = 0; i < N; ++i) { - #pragma omp parallel sections + PRAGMA_OMP_PARALLEL_SECTIONS { - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, xOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, yOffsets); } - #pragma omp section + PRAGMA_OMP_SECTION { shape::calcOffsets(3, shape, strides, zOffsets); @@ -2350,4 +2350,4 @@ TEST_F(PlaygroundTests, mmulMxM_1) { auto duration1 = std::chrono::duration_cast ((timeEnd - timeStart) / numOfIters).count(); printf("duration %ld\n", duration1); } -*/ \ No newline at end of file +*/