Fix OpenMP by replacing pragmas with macros (#8026)
Signed-off-by: Samuel Audet <samuel.audet@gmail.com>
master
parent
5a0904cb4c
commit
9bb11d5b06
|
@ -2412,7 +2412,7 @@ double NDArray::getTrace() const {
|
||||||
|
|
||||||
double sum = 0.;
|
double sum = 0.;
|
||||||
|
|
||||||
#pragma omp parallel for reduction(sumT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
for(int i = 0; i < minDim; ++i)
|
for(int i = 0; i < minDim; ++i)
|
||||||
sum += e<double>(i * offset);
|
sum += e<double>(i * offset);
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
|
||||||
const bool flagA = (flagC && transA) || (!flagC && !transA);
|
const bool flagA = (flagC && transA) || (!flagC && !transA);
|
||||||
const bool flagB = (flagC && transB) || (!flagC && !transB);
|
const bool flagB = (flagC && transB) || (!flagC && !transB);
|
||||||
|
|
||||||
// #pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
// for(uint row = 0; row < M; ++row) {
|
// for(uint row = 0; row < M; ++row) {
|
||||||
|
|
||||||
// T3* c = flagC ? (C + row) : (C + row * ldc);
|
// T3* c = flagC ? (C + row) : (C + row * ldc);
|
||||||
|
@ -53,7 +53,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
|
||||||
// T3* a = flagA ? (A + row * lda + i) : (A + row + i * lda);
|
// T3* a = flagA ? (A + row * lda + i) : (A + row + i * lda);
|
||||||
|
|
||||||
// if(flagC) {
|
// if(flagC) {
|
||||||
// #pragma omp simd
|
// PRAGMA_OMP_SIMD
|
||||||
// for(uint col = 0; col < N; ++col) {
|
// for(uint col = 0; col < N; ++col) {
|
||||||
// if(betaZ)
|
// if(betaZ)
|
||||||
// c[col * ldc] += a * b[flagB ? col : col * ldb] + betaZ * c[col * ldc];
|
// c[col * ldc] += a * b[flagB ? col : col * ldb] + betaZ * c[col * ldc];
|
||||||
|
@ -62,7 +62,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
// else {
|
// else {
|
||||||
// #pragma omp simd
|
// PRAGMA_OMP_SIMD
|
||||||
// for(uint col = 0; col < N; ++col) {
|
// for(uint col = 0; col < N; ++col) {
|
||||||
// if(betaZ)
|
// if(betaZ)
|
||||||
// c[col] += a * b[flagB ? col : col * ldb] + betaZ * c[col];
|
// c[col] += a * b[flagB ? col : col * ldb] + betaZ * c[col];
|
||||||
|
@ -73,14 +73,14 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
#pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2))
|
||||||
for(uint row = 0; row < M; ++row) {
|
for(uint row = 0; row < M; ++row) {
|
||||||
for(uint col = 0; col < N; ++col) {
|
for(uint col = 0; col < N; ++col) {
|
||||||
|
|
||||||
T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col);
|
T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col);
|
||||||
T3 val = 0;
|
T3 val = 0;
|
||||||
|
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for(uint i = 0; i < K; ++i) {
|
for(uint i = 0; i < K; ++i) {
|
||||||
T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda);
|
T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda);
|
||||||
T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i);
|
T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i);
|
||||||
|
@ -107,13 +107,13 @@ static void usualGemv(const char aOrder, const int M, const int N, const double
|
||||||
|
|
||||||
const bool flagA = aOrder == 'f';
|
const bool flagA = aOrder == 'f';
|
||||||
|
|
||||||
#pragma omp parallel for if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
for(int row = 0; row < M; ++row) {
|
for(int row = 0; row < M; ++row) {
|
||||||
|
|
||||||
T3* y = Y + row * incy;
|
T3* y = Y + row * incy;
|
||||||
T3 val = 0;
|
T3 val = 0;
|
||||||
|
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for(int i = 0; i < N; ++i) {
|
for(int i = 0; i < N; ++i) {
|
||||||
T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i);
|
T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i);
|
||||||
T3 x = *(X + i * incx);
|
T3 x = *(X + i * incx);
|
||||||
|
@ -138,7 +138,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX,
|
||||||
T3 alphaZ(alpha), betaZ(beta);
|
T3 alphaZ(alpha), betaZ(beta);
|
||||||
|
|
||||||
T3 sum = 0;
|
T3 sum = 0;
|
||||||
#pragma omp parallel for if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(sumT:sum)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum))
|
||||||
for(int i = 0; i < length; ++i)
|
for(int i = 0; i < length; ++i)
|
||||||
sum = sum + X[i * incx] * Y[i * incy];
|
sum = sum + X[i * incx] * Y[i * incy];
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,7 @@ namespace nd4j {
|
||||||
auto _infCount = nd4j::math::nd4j_isinf(input->e<double>(0)) ? 1L : 0L;
|
auto _infCount = nd4j::math::nd4j_isinf(input->e<double>(0)) ? 1L : 0L;
|
||||||
auto _nanCount = nd4j::math::nd4j_isnan(input->e<double>(0)) ? 1L : 0L;
|
auto _nanCount = nd4j::math::nd4j_isnan(input->e<double>(0)) ? 1L : 0L;
|
||||||
|
|
||||||
#pragma omp parallel for schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue))
|
||||||
for (Nd4jLong e = 1; e < input->lengthOf(); e++) {
|
for (Nd4jLong e = 1; e < input->lengthOf(); e++) {
|
||||||
auto current = input->e<double>(e);
|
auto current = input->e<double>(e);
|
||||||
auto n = e + 1.;
|
auto n = e + 1.;
|
||||||
|
@ -88,7 +88,7 @@ namespace nd4j {
|
||||||
}
|
}
|
||||||
*info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount};
|
*info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount};
|
||||||
_stdDevValue = 0; //math::nd4j_sqrt<double, double>(info->_stdDevValue / (input->lengthOf() - 1));
|
_stdDevValue = 0; //math::nd4j_sqrt<double, double>(info->_stdDevValue / (input->lengthOf() - 1));
|
||||||
#pragma omp parallel for schedule (static) reduction(+:_stdDevValue)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue))
|
||||||
for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
|
||||||
double current = input->e<double>(e);
|
double current = input->e<double>(e);
|
||||||
_stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue;
|
_stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue;
|
||||||
|
|
|
@ -4761,14 +4761,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
}
|
}
|
||||||
else if(xEws == 1) {
|
else if(xEws == 1) {
|
||||||
xOffsets = nullptr;
|
xOffsets = nullptr;
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
yOffsets = new Nd4jLong[len];
|
yOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(yShapeInfo, yOffsets, xOrder);
|
shape::calcOffsets(yShapeInfo, yOffsets, xOrder);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
zOffsets = new Nd4jLong[len];
|
zOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(zShapeInfo, zOffsets, xOrder);
|
shape::calcOffsets(zShapeInfo, zOffsets, xOrder);
|
||||||
|
@ -4777,14 +4777,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
}
|
}
|
||||||
else if(yEws == 1) {
|
else if(yEws == 1) {
|
||||||
yOffsets = nullptr;
|
yOffsets = nullptr;
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets, yOrder);
|
shape::calcOffsets(xShapeInfo, xOffsets, yOrder);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
zOffsets = new Nd4jLong[len];
|
zOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(zShapeInfo, zOffsets, yOrder);
|
shape::calcOffsets(zShapeInfo, zOffsets, yOrder);
|
||||||
|
@ -4793,14 +4793,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
}
|
}
|
||||||
else if(zEws == 1) {
|
else if(zEws == 1) {
|
||||||
zOffsets = nullptr;
|
zOffsets = nullptr;
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets, zOrder);
|
shape::calcOffsets(xShapeInfo, xOffsets, zOrder);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
yOffsets = new Nd4jLong[len];
|
yOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(yShapeInfo, yOffsets, zOrder);
|
shape::calcOffsets(yShapeInfo, yOffsets, zOrder);
|
||||||
|
@ -4813,14 +4813,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
yOffsets = zOffsets = xOffsets;
|
yOffsets = zOffsets = xOffsets;
|
||||||
}
|
}
|
||||||
else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
|
else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets);
|
shape::calcOffsets(xShapeInfo, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
zOffsets = new Nd4jLong[len];
|
zOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(zShapeInfo, zOffsets);
|
shape::calcOffsets(zShapeInfo, zOffsets);
|
||||||
|
@ -4829,14 +4829,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
yOffsets = xOffsets;
|
yOffsets = xOffsets;
|
||||||
}
|
}
|
||||||
else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) {
|
else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) {
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets);
|
shape::calcOffsets(xShapeInfo, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
yOffsets = new Nd4jLong[len];
|
yOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(yShapeInfo, yOffsets);
|
shape::calcOffsets(yShapeInfo, yOffsets);
|
||||||
|
@ -4845,19 +4845,19 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
zOffsets = xOffsets;
|
zOffsets = xOffsets;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets);
|
shape::calcOffsets(xShapeInfo, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
yOffsets = new Nd4jLong[len];
|
yOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(yShapeInfo, yOffsets);
|
shape::calcOffsets(yShapeInfo, yOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
zOffsets = new Nd4jLong[len];
|
zOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(zShapeInfo, zOffsets);
|
shape::calcOffsets(zShapeInfo, zOffsets);
|
||||||
|
@ -4899,14 +4899,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
|
||||||
yOffsets = xOffsets;
|
yOffsets = xOffsets;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
xOffsets = new Nd4jLong[len];
|
xOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(xShapeInfo, xOffsets);
|
shape::calcOffsets(xShapeInfo, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
yOffsets = new Nd4jLong[len];
|
yOffsets = new Nd4jLong[len];
|
||||||
shape::calcOffsets(yShapeInfo, yOffsets);
|
shape::calcOffsets(yShapeInfo, yOffsets);
|
||||||
|
|
|
@ -122,7 +122,7 @@ namespace nd4j {
|
||||||
|
|
||||||
for (int e = start; e < stop; e++) {
|
for (int e = start; e < stop; e++) {
|
||||||
bool flag_load;
|
bool flag_load;
|
||||||
#pragma omp atomic read
|
PRAGMA_OMP_ATOMIC_ARGS(read)
|
||||||
flag_load = flag;
|
flag_load = flag;
|
||||||
if (flag_load)
|
if (flag_load)
|
||||||
break;
|
break;
|
||||||
|
@ -130,11 +130,11 @@ namespace nd4j {
|
||||||
T cUpd = x[e];
|
T cUpd = x[e];
|
||||||
if (cUpd >= tt) {
|
if (cUpd >= tt) {
|
||||||
int idx;
|
int idx;
|
||||||
#pragma omp atomic capture
|
PRAGMA_OMP_ATOMIC_ARGS(capture)
|
||||||
idx = cnt++;
|
idx = cnt++;
|
||||||
|
|
||||||
if (idx >= flimit) {
|
if (idx >= flimit) {
|
||||||
#pragma omp atomic write
|
PRAGMA_OMP_ATOMIC_ARGS(write)
|
||||||
flag = true;
|
flag = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -143,11 +143,11 @@ namespace nd4j {
|
||||||
x[e] -= tt;
|
x[e] -= tt;
|
||||||
} else if (cUpd <= mtt) {
|
} else if (cUpd <= mtt) {
|
||||||
int idx;
|
int idx;
|
||||||
#pragma omp atomic capture
|
PRAGMA_OMP_ATOMIC_ARGS(capture)
|
||||||
idx = cnt++;
|
idx = cnt++;
|
||||||
|
|
||||||
if (idx >= flimit) {
|
if (idx >= flimit) {
|
||||||
#pragma omp atomic write
|
PRAGMA_OMP_ATOMIC_ARGS(write)
|
||||||
flag = true;
|
flag = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,8 @@
|
||||||
#define OMP_MAXT
|
#define OMP_MAXT
|
||||||
#define OMP_SUMT
|
#define OMP_SUMT
|
||||||
#define OMP_REDUCTION(args)
|
#define OMP_REDUCTION(args)
|
||||||
|
#define PRAGMA_OMP_ATOMIC
|
||||||
|
#define PRAGMA_OMP_ATOMIC_ARGS(args)
|
||||||
#define PRAGMA_OMP_CRITICAL
|
#define PRAGMA_OMP_CRITICAL
|
||||||
#define PRAGMA_OMP_SIMD
|
#define PRAGMA_OMP_SIMD
|
||||||
#define PRAGMA_OMP_SIMD_ARGS(args)
|
#define PRAGMA_OMP_SIMD_ARGS(args)
|
||||||
|
@ -50,6 +52,11 @@
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args)
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args)
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args)
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args)
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threads, loops)
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threads, loops)
|
||||||
|
#define PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
|
#define PRAGMA_OMP_SECTION
|
||||||
|
#define PRAGMA_OMP_SINGLE
|
||||||
|
#define PRAGMA_OMP_SINGLE_ARGS(args)
|
||||||
|
#define PRAGMA_OMP_TASK
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -59,6 +66,8 @@
|
||||||
#define OMP_MAXT maxT
|
#define OMP_MAXT maxT
|
||||||
#define OMP_SUMT sumT
|
#define OMP_SUMT sumT
|
||||||
#define OMP_REDUCTION(args) reduction(args)
|
#define OMP_REDUCTION(args) reduction(args)
|
||||||
|
#define PRAGMA_OMP_ATOMIC _Pragma(OMP_STRINGIFY(omp atomic))
|
||||||
|
#define PRAGMA_OMP_ATOMIC_ARGS(args) _Pragma(OMP_STRINGIFY(omp atomic args))
|
||||||
#define PRAGMA_OMP_CRITICAL _Pragma(OMP_STRINGIFY(omp critical))
|
#define PRAGMA_OMP_CRITICAL _Pragma(OMP_STRINGIFY(omp critical))
|
||||||
#define PRAGMA_OMP_SIMD _Pragma(OMP_STRINGIFY(omp simd))
|
#define PRAGMA_OMP_SIMD _Pragma(OMP_STRINGIFY(omp simd))
|
||||||
#define PRAGMA_OMP_SIMD_ARGS(args) _Pragma(OMP_STRINGIFY(omp simd args))
|
#define PRAGMA_OMP_SIMD_ARGS(args) _Pragma(OMP_STRINGIFY(omp simd args))
|
||||||
|
@ -80,6 +89,11 @@
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(loops) _Pragma(OMP_STRINGIFY(omp parallel for simd default(shared) collapse(loops)))
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(loops) _Pragma(OMP_STRINGIFY(omp parallel for simd default(shared) collapse(loops)))
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args) _Pragma(OMP_STRINGIFY(omp parallel for simd reduction(args) default(shared)))
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args) _Pragma(OMP_STRINGIFY(omp parallel for simd reduction(args) default(shared)))
|
||||||
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args) _Pragma(OMP_STRINGIFY(omp parallel for simd num_threads(args) if(args > 1) default(shared)))
|
#define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args) _Pragma(OMP_STRINGIFY(omp parallel for simd num_threads(args) if(args > 1) default(shared)))
|
||||||
|
#define PRAGMA_OMP_PARALLEL_SECTIONS _Pragma(OMP_STRINGIFY(omp parallel sections))
|
||||||
|
#define PRAGMA_OMP_SECTION _Pragma(OMP_STRINGIFY(omp section))
|
||||||
|
#define PRAGMA_OMP_SINGLE _Pragma(OMP_STRINGIFY(omp single))
|
||||||
|
#define PRAGMA_OMP_SINGLE_ARGS(args) _Pragma(OMP_STRINGIFY(omp single args))
|
||||||
|
#define PRAGMA_OMP_TASK _Pragma(OMP_STRINGIFY(omp task))
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -89,19 +89,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output,
|
||||||
T sum = 0.;
|
T sum = 0.;
|
||||||
int length = shape::length(inShapeInfo);
|
int length = shape::length(inShapeInfo);
|
||||||
|
|
||||||
#pragma omp simd reduction(maxT:max)
|
PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max))
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
||||||
max = nd4j::math::nd4j_max<T>(max, inBuff[offset]);
|
max = nd4j::math::nd4j_max<T>(max, inBuff[offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for simd reduction(sumT:sum)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum))
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
||||||
outBuff[offset] = nd4j::math::nd4j_exp<T, T>(inBuff[offset] - max);
|
outBuff[offset] = nd4j::math::nd4j_exp<T, T>(inBuff[offset] - max);
|
||||||
sum += outBuff[offset];
|
sum += outBuff[offset];
|
||||||
}
|
}
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
||||||
outBuff[offset] /= sum;
|
outBuff[offset] /= sum;
|
||||||
|
|
|
@ -56,7 +56,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
|
||||||
std::vector<int> dimsOut(indices->rankOf());
|
std::vector<int> dimsOut(indices->rankOf());
|
||||||
std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1
|
std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... axis+indices->rankOf()-1
|
||||||
const Nd4jLong numOfSubArrs = indices->lengthOf();
|
const Nd4jLong numOfSubArrs = indices->lengthOf();
|
||||||
#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
for(int i = 0; i < numOfSubArrs; ++i) {
|
for(int i = 0; i < numOfSubArrs; ++i) {
|
||||||
NDArray subArrOut = (*output)(i, dimsOut);
|
NDArray subArrOut = (*output)(i, dimsOut);
|
||||||
NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
|
NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
|
||||||
|
@ -72,7 +72,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
|
||||||
}
|
}
|
||||||
else { // vector case
|
else { // vector case
|
||||||
const Nd4jLong numOfSubArrs = intArgs.size() - 1;
|
const Nd4jLong numOfSubArrs = intArgs.size() - 1;
|
||||||
#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
for(int i = 0; i < numOfSubArrs; ++i) {
|
for(int i = 0; i < numOfSubArrs; ++i) {
|
||||||
NDArray subArrOut = (*output)(i, {axis});
|
NDArray subArrOut = (*output)(i, {axis});
|
||||||
NDArray subArrIn = (*input)(intArgs[i+1], {axis});
|
NDArray subArrIn = (*input)(intArgs[i+1], {axis});
|
||||||
|
|
|
@ -204,15 +204,15 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast
|
||||||
}
|
}
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL
|
PRAGMA_OMP_PARALLEL
|
||||||
#pragma omp single
|
PRAGMA_OMP_SINGLE
|
||||||
{
|
{
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
zz.applyTransform(transform::Tanh, z); //z = tanh(zz)
|
zz.applyTransform(transform::Tanh, z); //z = tanh(zz)
|
||||||
|
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
zi.applyTransform(transform::Sigmoid, i); //i = sigmoid(zi)
|
zi.applyTransform(transform::Sigmoid, i); //i = sigmoid(zi)
|
||||||
|
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
zf.applyTransform(transform::Sigmoid, f); //f = sigmoid(zf);
|
zf.applyTransform(transform::Sigmoid, f); //f = sigmoid(zf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,8 +34,8 @@ namespace nd4j {
|
||||||
|
|
||||||
if(outRank == 1) {
|
if(outRank == 1) {
|
||||||
|
|
||||||
// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
#pragma omp parallel for if(!lock) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < indLen; ++i) {
|
for(Nd4jLong i = 0; i < indLen; ++i) {
|
||||||
|
|
||||||
Nd4jLong idx = indices.e<Nd4jLong>(i);
|
Nd4jLong idx = indices.e<Nd4jLong>(i);
|
||||||
|
@ -53,8 +53,8 @@ namespace nd4j {
|
||||||
std::vector<int> dimsToExcludeUpd(sizeOfDims);
|
std::vector<int> dimsToExcludeUpd(sizeOfDims);
|
||||||
std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
|
std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
|
||||||
|
|
||||||
// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided) // causes known openMP asan bug !
|
// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug !
|
||||||
#pragma omp parallel for if(!lock) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < indLen; ++i) {
|
for(Nd4jLong i = 0; i < indLen; ++i) {
|
||||||
|
|
||||||
NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
|
NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
|
||||||
|
@ -75,8 +75,8 @@ namespace nd4j {
|
||||||
|
|
||||||
if(outRank == 1) {
|
if(outRank == 1) {
|
||||||
|
|
||||||
// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
|
||||||
#pragma omp parallel for if(!lock) schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < indLen; ++i) {
|
for(Nd4jLong i = 0; i < indLen; ++i) {
|
||||||
|
|
||||||
Nd4jLong idx = indices.e<Nd4jLong>(i);
|
Nd4jLong idx = indices.e<Nd4jLong>(i);
|
||||||
|
@ -92,8 +92,8 @@ namespace nd4j {
|
||||||
std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
|
std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
|
||||||
std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);
|
std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);
|
||||||
|
|
||||||
// #pragma omp parallel for if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)
|
// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut))
|
||||||
#pragma omp parallel for if(!lock) schedule(guided) firstprivate(idxRangeOut)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided) firstprivate(idxRangeOut))
|
||||||
for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) {
|
for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) {
|
||||||
|
|
||||||
NDArray indSubArr = indices(i, dimsToExcludeInd);
|
NDArray indSubArr = indices(i, dimsToExcludeInd);
|
||||||
|
@ -124,7 +124,7 @@ namespace nd4j {
|
||||||
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1});
|
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1});
|
||||||
|
|
||||||
if(!calcGrad) {
|
if(!calcGrad) {
|
||||||
#pragma omp parallel for schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < indicesLen; ++i) {
|
for(Nd4jLong i = 0; i < indicesLen; ++i) {
|
||||||
|
|
||||||
auto subArr = updates(i, dimsToExclude);
|
auto subArr = updates(i, dimsToExclude);
|
||||||
|
@ -132,7 +132,7 @@ namespace nd4j {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
#pragma omp parallel for schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < indicesLen; ++i) {
|
for(Nd4jLong i = 0; i < indicesLen; ++i) {
|
||||||
|
|
||||||
auto subArr = updates(i, dimsToExclude);
|
auto subArr = updates(i, dimsToExclude);
|
||||||
|
|
|
@ -38,7 +38,7 @@ namespace helpers {
|
||||||
auto tadsOut = output->allTensorsAlongDimension({0});
|
auto tadsOut = output->allTensorsAlongDimension({0});
|
||||||
|
|
||||||
// FIXME: template selector should be moved out of loop
|
// FIXME: template selector should be moved out of loop
|
||||||
#pragma omp parallel for
|
PRAGMA_OMP_PARALLEL_FOR
|
||||||
for (int e = 0; e < tadsIn->size(); e++) {
|
for (int e = 0; e < tadsIn->size(); e++) {
|
||||||
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ namespace helpers {
|
||||||
auto tadsOut = output->allTensorsAlongDimension({0});
|
auto tadsOut = output->allTensorsAlongDimension({0});
|
||||||
|
|
||||||
// FIXME: template selector should be moved out of loop
|
// FIXME: template selector should be moved out of loop
|
||||||
#pragma omp parallel for
|
PRAGMA_OMP_PARALLEL_FOR
|
||||||
for (int e = 0; e < tadsIn->size(); e++) {
|
for (int e = 0; e < tadsIn->size(); e++) {
|
||||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,13 +60,13 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
|
||||||
memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T));
|
memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T));
|
||||||
}
|
}
|
||||||
else if (imEWS > 1) {
|
else if (imEWS > 1) {
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
|
||||||
for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS)
|
for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS)
|
||||||
imBuff[i] = static_cast<T>(0.f);
|
imBuff[i] = static_cast<T>(0.f);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
const auto len = shape::length(imShapeBuffer);
|
const auto len = shape::length(imShapeBuffer);
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
|
||||||
for (int i = 0; i < len; i++)
|
for (int i = 0; i < len; i++)
|
||||||
imBuff[shape::getIndexOffset(i, imShapeBuffer, len)] = static_cast<T>(0.f);
|
imBuff[shape::getIndexOffset(i, imShapeBuffer, len)] = static_cast<T>(0.f);
|
||||||
}
|
}
|
||||||
|
@ -76,7 +76,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
|
||||||
|
|
||||||
if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) {
|
if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) {
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol))
|
||||||
for (int b = 0; b < bS; b++) {
|
for (int b = 0; b < bS; b++) {
|
||||||
for (int c = 0; c < iC; ++c) {
|
for (int c = 0; c < iC; ++c) {
|
||||||
for (int kRow = 0; kRow < kH; ++kRow) {
|
for (int kRow = 0; kRow < kH; ++kRow) {
|
||||||
|
@ -101,7 +101,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol))
|
||||||
for (int b = 0; b < bS; b++) {
|
for (int b = 0; b < bS; b++) {
|
||||||
for (int colH = 0; colH < oH; ++colH) {
|
for (int colH = 0; colH < oH; ++colH) {
|
||||||
for (int colW = 0; colW < oW; ++colW) {
|
for (int colW = 0; colW < oW; ++colW) {
|
||||||
|
|
|
@ -62,7 +62,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
|
||||||
|
|
||||||
if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
|
if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol))
|
||||||
for (int b = 0; b < bS; b++) {
|
for (int b = 0; b < bS; b++) {
|
||||||
for (int c = 0; c < iC; ++c) {
|
for (int c = 0; c < iC; ++c) {
|
||||||
for (int kRow = 0; kRow < kH; ++kRow) {
|
for (int kRow = 0; kRow < kH; ++kRow) {
|
||||||
|
@ -89,7 +89,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol))
|
||||||
for (int b = 0; b < bS; b++) {
|
for (int b = 0; b < bS; b++) {
|
||||||
for (int colH = 0; colH < oH; ++colH) {
|
for (int colH = 0; colH < oH; ++colH) {
|
||||||
for (int colW = 0; colW < oW; ++colW) {
|
for (int colW = 0; colW < oW; ++colW) {
|
||||||
|
|
|
@ -224,9 +224,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
if (i < right){ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
|
if (i < right){ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal(array, xShapeInfo, left, j, cutoff, descending); }
|
{ quickSort_parallel_internal(array, xShapeInfo, left, j, cutoff, descending); }
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
|
{ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -238,7 +238,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
||||||
{
|
{
|
||||||
#pragma omp single nowait
|
PRAGMA_OMP_SINGLE_ARGS(nowait)
|
||||||
{
|
{
|
||||||
quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending);
|
quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending);
|
||||||
}
|
}
|
||||||
|
@ -350,7 +350,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
|
|
||||||
Nd4jLong retVal = 0L;
|
Nd4jLong retVal = 0L;
|
||||||
|
|
||||||
#pragma omp parallel for schedule(guided) proc_bind(close) reduction(+:retVal)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
|
||||||
for (Nd4jLong x = 0; x < N; x += 16) {
|
for (Nd4jLong x = 0; x < N; x += 16) {
|
||||||
|
|
||||||
int byte = 0;
|
int byte = 0;
|
||||||
|
@ -451,9 +451,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
|
if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
|
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
|
{ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -517,9 +517,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
|
if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
|
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
|
{ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -533,7 +533,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
||||||
{
|
{
|
||||||
#pragma omp single nowait
|
PRAGMA_OMP_SINGLE_ARGS(nowait)
|
||||||
{
|
{
|
||||||
quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
|
quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
|
||||||
}
|
}
|
||||||
|
@ -548,7 +548,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
||||||
{
|
{
|
||||||
#pragma omp single nowait
|
PRAGMA_OMP_SINGLE_ARGS(nowait)
|
||||||
{
|
{
|
||||||
quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
|
quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
|
||||||
}
|
}
|
||||||
|
|
|
@ -185,9 +185,9 @@ namespace nd4j {
|
||||||
if (i < right){ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
|
if (i < right){ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
|
||||||
|
|
||||||
}else{
|
}else{
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ coo_quickSort_parallel_internal(indices, array, left, j, cutoff, rank); }
|
{ coo_quickSort_parallel_internal(indices, array, left, j, cutoff, rank); }
|
||||||
#pragma omp task
|
PRAGMA_OMP_TASK
|
||||||
{ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
|
{ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -200,7 +200,7 @@ namespace nd4j {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
PRAGMA_OMP_PARALLEL_THREADS(numThreads)
|
||||||
{
|
{
|
||||||
#pragma omp single nowait
|
PRAGMA_OMP_SINGLE_ARGS(nowait)
|
||||||
{
|
{
|
||||||
coo_quickSort_parallel_internal(indices, array, 0, lenArray-1, cutoff, rank);
|
coo_quickSort_parallel_internal(indices, array, 0, lenArray-1, cutoff, rank);
|
||||||
}
|
}
|
||||||
|
|
|
@ -823,7 +823,7 @@ TEST_F(PlaygroundTests, ScalarTest_2) {
|
||||||
float * array = reinterpret_cast<float*>(source.buffer());
|
float * array = reinterpret_cast<float*>(source.buffer());
|
||||||
for (int e = 0; e < 1000; e++) {
|
for (int e = 0; e < 1000; e++) {
|
||||||
|
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < source.lengthOf(); i++) {
|
for (int i = 0; i < source.lengthOf(); i++) {
|
||||||
array[i] = simdOps::Add<float, float, float>::op(array[i], 2.0f);
|
array[i] = simdOps::Add<float, float, float>::op(array[i], 2.0f);
|
||||||
}
|
}
|
||||||
|
@ -1215,7 +1215,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) {
|
||||||
//***********************************
|
//***********************************
|
||||||
|
|
||||||
auto timeStart = std::chrono::system_clock::now();
|
auto timeStart = std::chrono::system_clock::now();
|
||||||
#pragma omp parallel for schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < len; ++i) {
|
for(Nd4jLong i = 0; i < len; ++i) {
|
||||||
|
|
||||||
Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
|
Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
|
||||||
|
@ -1230,7 +1230,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) {
|
||||||
//***********************************
|
//***********************************
|
||||||
|
|
||||||
timeStart = std::chrono::system_clock::now();
|
timeStart = std::chrono::system_clock::now();
|
||||||
#pragma omp parallel for schedule(guided)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
|
||||||
for(Nd4jLong i = 0; i < len; ++i) {
|
for(Nd4jLong i = 0; i < len; ++i) {
|
||||||
|
|
||||||
Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
|
Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
|
||||||
|
@ -1255,7 +1255,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI
|
||||||
int zEws = shape::elementWiseStride(zShapeInfo);
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
||||||
|
|
||||||
BlockInformation info(len, ELEMENT_THRESHOLD);
|
BlockInformation info(len, ELEMENT_THRESHOLD);
|
||||||
#pragma omp parallel num_threads(info.threads) if (info.threads > 1) default(shared)
|
PRAGMA_OMP_PARALLEL_ARGS(num_threads(info.threads) if (info.threads > 1))
|
||||||
{
|
{
|
||||||
auto i = omp_get_thread_num();
|
auto i = omp_get_thread_num();
|
||||||
Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder;
|
Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder;
|
||||||
|
@ -1263,7 +1263,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI
|
||||||
auto xi = x + xEws * index;
|
auto xi = x + xEws * index;
|
||||||
auto yi = y + yEws * index;
|
auto yi = y + yEws * index;
|
||||||
auto zi = z + zEws * index;
|
auto zi = z + zEws * index;
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong j = 0; j < itemsToLoop; j++)
|
for (Nd4jLong j = 0; j < itemsToLoop; j++)
|
||||||
zi[j * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(xi[j * xEws], yi[j * yEws]);
|
zi[j * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(xi[j * xEws], yi[j * yEws]);
|
||||||
}
|
}
|
||||||
|
@ -1278,7 +1278,7 @@ static void loopSimple(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShap
|
||||||
int threads = 6;
|
int threads = 6;
|
||||||
int span_size = len / threads + 1;
|
int span_size = len / threads + 1;
|
||||||
|
|
||||||
#pragma omp parallel for simd schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close) default(shared)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close))
|
||||||
for(Nd4jLong i = 0; i < len; ++i)
|
for(Nd4jLong i = 0; i < len; ++i)
|
||||||
z[i * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(x[i * xEws], y[i * yEws]);
|
z[i * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(x[i * xEws], y[i * yEws]);
|
||||||
|
|
||||||
|
@ -1347,11 +1347,11 @@ static void loop1(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo
|
||||||
int zEws = shape::elementWiseStride(zShapeInfo);
|
int zEws = shape::elementWiseStride(zShapeInfo);
|
||||||
|
|
||||||
nd4j::OmpLaunchHelper info(len);
|
nd4j::OmpLaunchHelper info(len);
|
||||||
#pragma omp parallel num_threads(info._numThreads) default(shared)
|
PRAGMA_OMP_PARALLEL_ARGS(num_threads(info._numThreads))
|
||||||
{
|
{
|
||||||
auto threadNum = omp_get_thread_num();
|
auto threadNum = omp_get_thread_num();
|
||||||
Nd4jLong threadOffset = info.getThreadOffset(threadNum);
|
Nd4jLong threadOffset = info.getThreadOffset(threadNum);
|
||||||
#pragma omp simd
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong j = 0; j < info.getItersPerThread(threadNum); j++) {
|
for (Nd4jLong j = 0; j < info.getItersPerThread(threadNum); j++) {
|
||||||
Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len);
|
Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len);
|
||||||
Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len);
|
Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len);
|
||||||
|
@ -1370,7 +1370,7 @@ static void loop2(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo
|
||||||
int threads = 6;
|
int threads = 6;
|
||||||
int span_size = len / threads + 1;
|
int span_size = len / threads + 1;
|
||||||
|
|
||||||
#pragma omp parallel for simd schedule(static) default(shared)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static))
|
||||||
for(Nd4jLong i = 0; i < len; ++i) {
|
for(Nd4jLong i = 0; i < len; ++i) {
|
||||||
Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
||||||
Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
||||||
|
@ -1615,7 +1615,7 @@ TEST_F(PlaygroundTests, test_manual_loop) {
|
||||||
auto timeStart = std::chrono::system_clock::now();
|
auto timeStart = std::chrono::system_clock::now();
|
||||||
for (int i = 0; i < iterations; i++) {
|
for (int i = 0; i < iterations; i++) {
|
||||||
|
|
||||||
#pragma omp parallel for num_threads(4) schedule(static, 32768)
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(4) schedule(static, 32768))
|
||||||
for (unsigned int e = 0; e < len; e++)
|
for (unsigned int e = 0; e < len; e++)
|
||||||
z[e] = array[e];
|
z[e] = array[e];
|
||||||
}
|
}
|
||||||
|
@ -1931,19 +1931,19 @@ TEST_F(PlaygroundTests, loops_2) {
|
||||||
for (int i = 0; i < N; ++i)
|
for (int i = 0; i < N; ++i)
|
||||||
{
|
{
|
||||||
|
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, xOffsets);
|
shape::calcOffsets(3, shape, strides, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, yOffsets);
|
shape::calcOffsets(3, shape, strides, yOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, zOffsets);
|
shape::calcOffsets(3, shape, strides, zOffsets);
|
||||||
|
@ -2110,19 +2110,19 @@ TEST_F(PlaygroundTests, loops_3) {
|
||||||
for (int i = 0; i < N; ++i)
|
for (int i = 0; i < N; ++i)
|
||||||
{
|
{
|
||||||
|
|
||||||
#pragma omp parallel sections
|
PRAGMA_OMP_PARALLEL_SECTIONS
|
||||||
{
|
{
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, xOffsets);
|
shape::calcOffsets(3, shape, strides, xOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, yOffsets);
|
shape::calcOffsets(3, shape, strides, yOffsets);
|
||||||
}
|
}
|
||||||
#pragma omp section
|
PRAGMA_OMP_SECTION
|
||||||
{
|
{
|
||||||
|
|
||||||
shape::calcOffsets(3, shape, strides, zOffsets);
|
shape::calcOffsets(3, shape, strides, zOffsets);
|
||||||
|
|
Loading…
Reference in New Issue