From 9bb11d5b062886ea8ef0c91d535e5a2b0e2364e0 Mon Sep 17 00:00:00 2001
From: Samuel Audet <samuel.audet@gmail.com>
Date: Thu, 18 Jul 2019 20:13:56 +0900
Subject: [PATCH] Fix OpenMP by replacing pragmas with macros (#8026)

Signed-off-by: Samuel Audet <samuel.audet@gmail.com>
---
 libnd4j/blas/NDArray.hpp                      |  4 +-
 libnd4j/include/helpers/cpu/MmulHelper.cpp    | 18 ++++----
 libnd4j/include/helpers/impl/DebugHelper.cpp  |  4 +-
 libnd4j/include/helpers/shape.h               | 46 +++++++++----------
 .../include/loops/cpu/type_conversions.cpp    | 12 ++---
 libnd4j/include/openmp_pragmas.h              | 14 ++++++
 .../declarable/helpers/cpu/activations.cpp    |  6 +--
 .../ops/declarable/helpers/cpu/gather.cpp     |  6 +--
 .../ops/declarable/helpers/cpu/lstm.cpp       |  8 ++--
 .../ops/declarable/helpers/cpu/scatter.cpp    | 22 ++++-----
 .../ops/declarable/helpers/cuda/adjust_hue.cu |  4 +-
 .../helpers/cuda/adjust_saturation.cu         |  4 +-
 .../ops/declarable/helpers/cuda/col2im.cppc   |  8 ++--
 .../ops/declarable/helpers/cuda/im2col.cppc   |  6 +--
 libnd4j/include/ops/impl/specials.cpp         | 20 ++++----
 libnd4j/include/ops/impl/specials_sparse.cpp  |  6 +--
 .../layers_tests/PlaygroundTests.cpp          | 38 +++++++--------
 17 files changed, 120 insertions(+), 106 deletions(-)
diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp
index 5c616f605..3d3ce59fd 100644
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@@ -2412,7 +2412,7 @@ double NDArray::getTrace() const {
 
     double sum = 0.;
 
-#pragma omp parallel for reduction(sumT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) if(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
     for(int i = 0; i < minDim; ++i)
         sum += e<double>(i * offset);
 
@@ -4477,4 +4477,4 @@ void NDArray::setShapeInfo(const ConstantDataBuffer& shapeBuffer) {
 //             }
 //         }
 //     return true;
-// }
\ No newline at end of file
+// }
diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp
index 246d70187..293360a25 100644
--- a/libnd4j/include/helpers/cpu/MmulHelper.cpp
+++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp
@@ -39,7 +39,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
     const bool flagA = (flagC && transA) || (!flagC && !transA);
     const bool flagB = (flagC && transB) || (!flagC && !transB);
 
-    // #pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
+    // PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
     // for(uint row = 0; row < M; ++row) {
 
     //     T3* c = flagC ? (C + row) : (C + row * ldc);
@@ -53,7 +53,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
     //         T3* a = flagA ? (A + row * lda + i) : (A + row + i * lda);
 
     //         if(flagC) {
-    //             #pragma omp simd
+    //             PRAGMA_OMP_SIMD
     //             for(uint col = 0; col < N; ++col) {
     //                 if(betaZ)
     //                     c[col * ldc] += a * b[flagB ? col : col * ldb] + betaZ * c[col * ldc];
@@ -62,7 +62,7 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
     //             }
     //         }
     //         else {
-    //             #pragma omp simd
+    //             PRAGMA_OMP_SIMD
     //             for(uint col = 0; col < N; ++col) {
     //                 if(betaZ)
     //                     c[col] += a * b[flagB ? col : col * ldb] + betaZ * c[col];
@@ -73,14 +73,14 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c
     //     }
     // }   
 
-    #pragma omp parallel for if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)    
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M*N > Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2))
     for(uint row = 0; row < M; ++row) {
        for(uint col = 0; col < N; ++col) {
             
             T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col);
             T3 val = 0;  
 
-            #pragma omp simd
+            PRAGMA_OMP_SIMD
             for(uint i = 0; i < K; ++i) {
                 T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda);
                 T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i);             
@@ -107,13 +107,13 @@ static void usualGemv(const char aOrder, const int M, const int N, const double
     
     const bool flagA = aOrder == 'f';
 
-    #pragma omp parallel for if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(if(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
     for(int row = 0; row < M; ++row) {
                         
         T3* y = Y + row * incy;
         T3 val = 0;
 
-        #pragma omp simd
+        PRAGMA_OMP_SIMD
         for(int i = 0; i < N; ++i) {
             T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i);
             T3 x = *(X + i * incx);
@@ -138,7 +138,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX,
     T3 alphaZ(alpha), betaZ(beta);
 
     T3 sum = 0;
-    #pragma omp parallel for if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(sumT:sum)
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(if(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum))
     for(int i = 0; i < length; ++i)
             sum = sum + X[i * incx] * Y[i * incy];        
     
@@ -325,4 +325,4 @@ BUILD_TRIPLE_TEMPLATE(template void usualGemm, (const char cOrder, const bool tr
 BUILD_TRIPLE_TEMPLATE(template void usualGemv, (const char aOrder, const int M, const int N, const double alpha, const void* A, const int lda, const void* B, const int incx, const double beta, void* C, const int incy), LIBND4J_TYPES, FLOAT_TYPES, FLOAT_TYPES);
 BUILD_TRIPLE_TEMPLATE(template void usualDot,  (const Nd4jLong length, const double alpha, const void* vX, const Nd4jLong incx, const void* vY, const Nd4jLong incy, const double beta, void* vZ), LIBND4J_TYPES, FLOAT_TYPES, FLOAT_TYPES);
 
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp
index 5cb7a5531..f1ba8a755 100644
--- a/libnd4j/include/helpers/impl/DebugHelper.cpp
+++ b/libnd4j/include/helpers/impl/DebugHelper.cpp
@@ -67,7 +67,7 @@ namespace nd4j {
             auto _infCount = nd4j::math::nd4j_isinf(input->e<double>(0)) ? 1L : 0L;
             auto _nanCount = nd4j::math::nd4j_isnan(input->e<double>(0)) ? 1L : 0L;
 
-#pragma omp parallel for schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue))
             for (Nd4jLong e = 1; e < input->lengthOf(); e++) {
                 auto current = input->e<double>(e);
                 auto n = e + 1.;
@@ -88,7 +88,7 @@ namespace nd4j {
             }
             *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount};
             _stdDevValue = 0; //math::nd4j_sqrt<double, double>(info->_stdDevValue / (input->lengthOf() - 1));
-#pragma omp parallel for schedule (static) reduction(+:_stdDevValue)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue))
             for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
                 double current = input->e<double>(e);
                 _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue;
diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h
index acff46a24..312df8c7b 100644
--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@@ -4761,14 +4761,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
     }
     else if(xEws == 1) {
         xOffsets = nullptr;
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 yOffsets = new Nd4jLong[len];
                 shape::calcOffsets(yShapeInfo, yOffsets, xOrder);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 zOffsets = new Nd4jLong[len];
                 shape::calcOffsets(zShapeInfo, zOffsets, xOrder);
@@ -4777,14 +4777,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
     }
     else if(yEws == 1) {
         yOffsets = nullptr;
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets, yOrder);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 zOffsets = new Nd4jLong[len];
                 shape::calcOffsets(zShapeInfo, zOffsets, yOrder);
@@ -4793,14 +4793,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
     }
     else if(zEws == 1) {
         zOffsets = nullptr;
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets, zOrder);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 yOffsets = new Nd4jLong[len];
                 shape::calcOffsets(yShapeInfo, yOffsets, zOrder);
@@ -4813,14 +4813,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
         yOffsets = zOffsets = xOffsets;
     }
     else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 zOffsets = new Nd4jLong[len];
                 shape::calcOffsets(zShapeInfo, zOffsets);
@@ -4829,14 +4829,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
         yOffsets = xOffsets;
     }
     else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) {
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 yOffsets = new Nd4jLong[len];
                 shape::calcOffsets(yShapeInfo, yOffsets);
@@ -4845,19 +4845,19 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
         zOffsets = xOffsets;
     }
     else {
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 yOffsets = new Nd4jLong[len];
                 shape::calcOffsets(yShapeInfo, yOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 zOffsets = new Nd4jLong[len];
                 shape::calcOffsets(zShapeInfo, zOffsets);
@@ -4899,14 +4899,14 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
         yOffsets = xOffsets;
     }
     else {
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 xOffsets = new Nd4jLong[len];
                 shape::calcOffsets(xShapeInfo, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
                 yOffsets = new Nd4jLong[len];
                 shape::calcOffsets(yShapeInfo, yOffsets);
@@ -4919,4 +4919,4 @@ INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffs
 
 }
 
-#endif /* SHAPE_H_ */
\ No newline at end of file
+#endif /* SHAPE_H_ */
diff --git a/libnd4j/include/loops/cpu/type_conversions.cpp b/libnd4j/include/loops/cpu/type_conversions.cpp
index c7a610e82..3c923de39 100644
--- a/libnd4j/include/loops/cpu/type_conversions.cpp
+++ b/libnd4j/include/loops/cpu/type_conversions.cpp
@@ -122,7 +122,7 @@ namespace nd4j {
 
             for (int e = start; e < stop; e++) {
                 bool flag_load;
-#pragma omp atomic read
+PRAGMA_OMP_ATOMIC_ARGS(read)
                 flag_load = flag;
                 if (flag_load)
                     break;
@@ -130,11 +130,11 @@ namespace nd4j {
                 T cUpd = x[e];
                 if (cUpd >= tt) {
                     int idx;
-#pragma omp atomic capture
+PRAGMA_OMP_ATOMIC_ARGS(capture)
                     idx = cnt++;
 
                     if (idx >= flimit) {
-#pragma omp atomic write
+PRAGMA_OMP_ATOMIC_ARGS(write)
                         flag = true;
                         break;
                     }
@@ -143,11 +143,11 @@ namespace nd4j {
                     x[e] -= tt;
                 } else if (cUpd <= mtt) {
                     int idx;
-#pragma omp atomic capture
+PRAGMA_OMP_ATOMIC_ARGS(capture)
                     idx = cnt++;
 
                     if (idx >= flimit) {
-#pragma omp atomic write
+PRAGMA_OMP_ATOMIC_ARGS(write)
                         flag = true;
                         break;
                     }
@@ -237,4 +237,4 @@ namespace nd4j {
 #ifndef __CLION_IDE__
     BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES)
 #endif
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/openmp_pragmas.h
index e219dd027..174bd624b 100644
--- a/libnd4j/include/openmp_pragmas.h
+++ b/libnd4j/include/openmp_pragmas.h
@@ -29,6 +29,8 @@
 #define OMP_MAXT
 #define OMP_SUMT
 #define OMP_REDUCTION(args)
+#define PRAGMA_OMP_ATOMIC // expands to nothing in this branch of the conditional
+#define PRAGMA_OMP_ATOMIC_ARGS(args) // expands to nothing in this branch; args are discarded
 #define PRAGMA_OMP_CRITICAL
 #define PRAGMA_OMP_SIMD
 #define PRAGMA_OMP_SIMD_ARGS(args)
@@ -50,6 +52,11 @@
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args)
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args)
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threads, loops)
+#define PRAGMA_OMP_PARALLEL_SECTIONS // expands to nothing in this branch of the conditional
+#define PRAGMA_OMP_SECTION // expands to nothing in this branch
+#define PRAGMA_OMP_SINGLE // expands to nothing in this branch
+#define PRAGMA_OMP_SINGLE_ARGS(args) // expands to nothing in this branch; args are discarded
+#define PRAGMA_OMP_TASK // expands to nothing in this branch
 
 #else
 
@@ -59,6 +66,8 @@
 #define OMP_MAXT maxT
 #define OMP_SUMT sumT
 #define OMP_REDUCTION(args) reduction(args)
+#define PRAGMA_OMP_ATOMIC _Pragma(OMP_STRINGIFY(omp atomic)) // "omp atomic" with no clause; OMP_STRINGIFY is defined outside this hunk (presumably stringifies its argument)
+#define PRAGMA_OMP_ATOMIC_ARGS(args) _Pragma(OMP_STRINGIFY(omp atomic args)) // "omp atomic" followed by the caller-supplied clause text, e.g. read, write or capture
 #define PRAGMA_OMP_CRITICAL _Pragma(OMP_STRINGIFY(omp critical))
 #define PRAGMA_OMP_SIMD _Pragma(OMP_STRINGIFY(omp simd))
 #define PRAGMA_OMP_SIMD_ARGS(args) _Pragma(OMP_STRINGIFY(omp simd args))
@@ -80,6 +89,11 @@
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(loops) _Pragma(OMP_STRINGIFY(omp parallel for simd default(shared) collapse(loops)))
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(args) _Pragma(OMP_STRINGIFY(omp parallel for simd reduction(args) default(shared)))
 #define PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(args) _Pragma(OMP_STRINGIFY(omp parallel for simd num_threads(args) if(args > 1) default(shared)))
+#define PRAGMA_OMP_PARALLEL_SECTIONS _Pragma(OMP_STRINGIFY(omp parallel sections)) // "omp parallel sections" with no clauses; OMP_STRINGIFY is defined outside this hunk (presumably stringifies its argument)
+#define PRAGMA_OMP_SECTION _Pragma(OMP_STRINGIFY(omp section)) // "omp section" with no clauses
+#define PRAGMA_OMP_SINGLE _Pragma(OMP_STRINGIFY(omp single)) // "omp single" with no clauses
+#define PRAGMA_OMP_SINGLE_ARGS(args) _Pragma(OMP_STRINGIFY(omp single args)) // "omp single" followed by the caller-supplied clause text, e.g. nowait
+#define PRAGMA_OMP_TASK _Pragma(OMP_STRINGIFY(omp task)) // "omp task" with no clauses
 
 #endif
 
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
index 2be2dbcb4..d15ce3266 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
@@ -89,19 +89,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output,
         T sum = 0.;
         int length = shape::length(inShapeInfo);
 
-#pragma omp simd reduction(maxT:max)
+PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max))
         for (int i = 0; i < length; i++) {
             const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
             max = nd4j::math::nd4j_max<T>(max, inBuff[offset]);
         }
 
-#pragma omp parallel for simd reduction(sumT:sum)
+PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum))
         for (int i = 0; i < length; i++) {
             const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
             outBuff[offset] = nd4j::math::nd4j_exp<T, T>(inBuff[offset] - max);
             sum += outBuff[offset];
         }
-#pragma omp simd
+PRAGMA_OMP_SIMD
         for (int i = 0; i < length; i++) {
             const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
             outBuff[offset] /= sum;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
index 4fc6fa52a..f2e4e77bc 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
@@ -56,7 +56,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
             std::vector<int> dimsOut(indices->rankOf());
             std::iota(dimsOut.begin(), dimsOut.end(), axis);   // fill with axis, axis+1, ... axis+indices->rankOf()-1
             const Nd4jLong numOfSubArrs = indices->lengthOf();
-#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
             for(int i = 0; i < numOfSubArrs; ++i) {
                 NDArray subArrOut = (*output)(i, dimsOut);
                 NDArray subArrIn  = (*input)(indices->e<Nd4jLong>(i), {axis});
@@ -72,7 +72,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
         }
         else { // vector case
             const Nd4jLong numOfSubArrs = intArgs.size() - 1;
-#pragma omp parallel for if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(numOfSubArrs > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
             for(int i = 0; i < numOfSubArrs; ++i) {
                 NDArray subArrOut = (*output)(i, {axis});
                 NDArray subArrIn  = (*input)(intArgs[i+1], {axis});
@@ -85,4 +85,4 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 
 }
 }
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
index 261ee32bf..f0f5697d0 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
@@ -204,15 +204,15 @@ void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast
     }
 
     PRAGMA_OMP_PARALLEL
-    #pragma omp single
+    PRAGMA_OMP_SINGLE
     {
-        #pragma omp task
+        PRAGMA_OMP_TASK
         zz.applyTransform(transform::Tanh, z);      //z = tanh(zz)
 
-        #pragma omp task
+        PRAGMA_OMP_TASK
         zi.applyTransform(transform::Sigmoid, i);   //i = sigmoid(zi)
 
-        #pragma omp task
+        PRAGMA_OMP_TASK
         zf.applyTransform(transform::Sigmoid, f);   //f = sigmoid(zf);
     }
 
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
index e950ce8e5..4446953e6 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@@ -34,8 +34,8 @@ namespace nd4j {
 
                 if(outRank == 1) {
 
-// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
-#pragma omp parallel for if(!lock) schedule(guided)
+// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
                     for(Nd4jLong i = 0; i < indLen; ++i) {
 
                         Nd4jLong idx = indices.e<Nd4jLong>(i);
@@ -53,8 +53,8 @@ namespace nd4j {
                     std::vector<int> dimsToExcludeUpd(sizeOfDims);
                     std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
 
-// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided) // causes known openMP asan bug !
-#pragma omp parallel for if(!lock) schedule(guided)
+// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug !
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
                     for(Nd4jLong i = 0; i < indLen; ++i) {
 
                         NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
@@ -75,8 +75,8 @@ namespace nd4j {
 
                 if(outRank == 1) {
 
-// #pragma omp parallel for if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
-#pragma omp parallel for if(!lock) schedule(guided)
+// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided))
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided))
                     for(Nd4jLong i = 0; i < indLen; ++i) {
 
                         Nd4jLong idx = indices.e<Nd4jLong>(i);
@@ -92,8 +92,8 @@ namespace nd4j {
                     std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);
                     std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);
 
-// #pragma omp parallel for if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)
-#pragma omp parallel for if(!lock) schedule(guided) firstprivate(idxRangeOut)
+// PRAGMA_OMP_PARALLEL_FOR_ARGS(if(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut))
+PRAGMA_OMP_PARALLEL_FOR_ARGS(if(!lock) schedule(guided) firstprivate(idxRangeOut))
                     for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) {
 
                         NDArray indSubArr = indices(i, dimsToExcludeInd);
@@ -124,7 +124,7 @@ namespace nd4j {
                 std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1});
 
                 if(!calcGrad) {
-#pragma omp parallel for schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
                     for(Nd4jLong i = 0; i < indicesLen; ++i) {
 
                         auto subArr = updates(i, dimsToExclude);
@@ -132,7 +132,7 @@ namespace nd4j {
                     }
                 }
                 else {
-#pragma omp parallel for schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
                     for(Nd4jLong i = 0; i < indicesLen; ++i) {
 
                         auto subArr = updates(i, dimsToExclude);
@@ -143,4 +143,4 @@ namespace nd4j {
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index 814a1280d..7e8bb99f7 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -38,7 +38,7 @@ namespace helpers {
             auto tadsOut = output->allTensorsAlongDimension({0});
 
             // FIXME: template selector should be moved out of loop
-#pragma omp parallel for
+PRAGMA_OMP_PARALLEL_FOR
             for (int e = 0; e < tadsIn->size(); e++) {
                 BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
             }
@@ -55,4 +55,4 @@ namespace helpers {
 
 }
 }
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
index 141b54e1f..cdd6640c4 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu
@@ -39,7 +39,7 @@ namespace helpers {
             auto tadsOut = output->allTensorsAlongDimension({0});
 
             // FIXME: template selector should be moved out of loop
-#pragma omp parallel for
+PRAGMA_OMP_PARALLEL_FOR
             for (int e = 0; e < tadsIn->size(); e++) {
                 BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
             }
@@ -56,4 +56,4 @@ namespace helpers {
 
 }
 }
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc
index 5b8b4a60a..aefb97963 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc
+++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc
@@ -60,13 +60,13 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
         memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T));
     } 
     else if (imEWS > 1) {
-#pragma omp parallel for schedule(static) proc_bind(close)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
         for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS)
             imBuff[i] = static_cast<T>(0.f);
     } 
     else {        
         const auto len = shape::length(imShapeBuffer);
-#pragma omp parallel for schedule(static) proc_bind(close)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
         for (int i = 0; i < len; i++)            
             imBuff[shape::getIndexOffset(i, imShapeBuffer, len)] = static_cast<T>(0.f);
     }
@@ -76,7 +76,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
 
     if (shape::order(colShapeBuffer) == 'c' &&  shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) {
             
-#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol))
     	for (int b = 0; b < bS; b++) {        
       		for (int c = 0; c < iC; ++c) {                    
             	for (int kRow = 0; kRow < kH; ++kRow) {                        
@@ -101,7 +101,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
     }
     else {
 
-#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol))
     	for (int b = 0; b < bS; b++) {        
         	for (int colH = 0; colH < oH; ++colH) {
             	for (int colW = 0; colW < oW; ++colW) {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc
index d19352ff8..67f5650bd 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc
@@ -62,7 +62,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input,  NDArra
             
     if (shape::order(imShapeBuffer) == 'c' &&  shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
 
-#pragma omp parallel for schedule(static) proc_bind(close) private(col, im, imRow, imCol)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol))
     	for (int b = 0; b < bS; b++) {
         	for (int c = 0; c < iC; ++c) {        
             	for (int kRow = 0; kRow < kH; ++kRow) {                        
@@ -89,7 +89,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input,  NDArra
     }
     else {
  
-#pragma omp parallel for schedule(static) proc_bind(close) private(im, col, imRow, imCol)    
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol))
     	for (int b = 0; b < bS; b++) {
         	for (int colH = 0; colH < oH; ++colH) {
             	for (int colW = 0; colW < oW; ++colW) {
@@ -126,4 +126,4 @@ BUILD_SINGLE_TEMPLATE(template void im2col_, (nd4j::LaunchContext & context, con
 
 }
 }
-}
\ No newline at end of file
+}
diff --git a/libnd4j/include/ops/impl/specials.cpp b/libnd4j/include/ops/impl/specials.cpp
index 074b2eaa6..3a07ba1e2 100644
--- a/libnd4j/include/ops/impl/specials.cpp
+++ b/libnd4j/include/ops/impl/specials.cpp
@@ -224,9 +224,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
             if (i < right){ quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
 
         }else{
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal(array, xShapeInfo, left, j, cutoff, descending); }
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal(array, xShapeInfo, i, right, cutoff, descending); }
         }
     }
@@ -238,7 +238,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
 
         PRAGMA_OMP_PARALLEL_THREADS(numThreads)
         {
-#pragma omp single nowait
+PRAGMA_OMP_SINGLE_ARGS(nowait)
             {
                 quickSort_parallel_internal(array, xShapeInfo, 0, lenArray-1, cutoff, descending);
             }
@@ -350,7 +350,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
 
         Nd4jLong retVal = 0L;
 
-#pragma omp parallel for schedule(guided) proc_bind(close) reduction(+:retVal)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
         for (Nd4jLong x = 0; x < N; x += 16) {
 
             int byte = 0;
@@ -451,9 +451,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
             if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
 
         }else{
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
         }
     }
@@ -517,9 +517,9 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
             if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
 
         }else{
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
-#pragma omp task
+PRAGMA_OMP_TASK
             { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
         }
     }
@@ -533,7 +533,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
 
         PRAGMA_OMP_PARALLEL_THREADS(numThreads)
         {
-#pragma omp single nowait
+PRAGMA_OMP_SINGLE_ARGS(nowait)
             {
                 quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
             }
@@ -548,7 +548,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
 
         PRAGMA_OMP_PARALLEL_THREADS(numThreads)
         {
-#pragma omp single nowait
+PRAGMA_OMP_SINGLE_ARGS(nowait)
             {
                 quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
             }
diff --git a/libnd4j/include/ops/impl/specials_sparse.cpp b/libnd4j/include/ops/impl/specials_sparse.cpp
index 790005148..6a76c9209 100644
--- a/libnd4j/include/ops/impl/specials_sparse.cpp
+++ b/libnd4j/include/ops/impl/specials_sparse.cpp
@@ -185,9 +185,9 @@ namespace nd4j {
                 if (i < right){ coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
 
             }else{
-#pragma omp task
+PRAGMA_OMP_TASK
                 { coo_quickSort_parallel_internal(indices, array, left, j, cutoff, rank); }
-#pragma omp task
+PRAGMA_OMP_TASK
                 { coo_quickSort_parallel_internal(indices, array, i, right, cutoff, rank); }
             }
 
@@ -200,7 +200,7 @@ namespace nd4j {
 
             PRAGMA_OMP_PARALLEL_THREADS(numThreads)
             {
-#pragma omp single nowait
+PRAGMA_OMP_SINGLE_ARGS(nowait)
                 {
                     coo_quickSort_parallel_internal(indices, array, 0, lenArray-1, cutoff, rank);
                 }
diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
index f77f87e75..c295b7e75 100644
--- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
@@ -823,7 +823,7 @@ TEST_F(PlaygroundTests, ScalarTest_2) {
     float * array = reinterpret_cast<float*>(source.buffer());
     for (int e = 0; e < 1000; e++) {
 
-#pragma omp simd
+PRAGMA_OMP_SIMD
         for (int i = 0; i < source.lengthOf(); i++) {
             array[i] = simdOps::Add<float, float, float>::op(array[i], 2.0f);
         }
@@ -1215,7 +1215,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) {
     //***********************************
   
     auto timeStart = std::chrono::system_clock::now();
-#pragma omp parallel for schedule(guided) 
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) 
     for(Nd4jLong i = 0; i < len; ++i) {
                 
         Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
@@ -1230,7 +1230,7 @@ TEST_F(PlaygroundTests, loopThroughArrs_test1) {
     //***********************************
     
     timeStart = std::chrono::system_clock::now();
-#pragma omp parallel for schedule(guided)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided))
     for(Nd4jLong i = 0; i < len; ++i) {
         
         Nd4jLong offset1 = shape::getIndexOffset(i, x.getShapeInfo(), len);
@@ -1255,7 +1255,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI
     int zEws = shape::elementWiseStride(zShapeInfo);
             
     BlockInformation info(len, ELEMENT_THRESHOLD);
-    #pragma omp parallel num_threads(info.threads) if (info.threads > 1) default(shared)
+    PRAGMA_OMP_PARALLEL_ARGS(num_threads(info.threads) if (info.threads > 1))
     {                
         auto i = omp_get_thread_num();            
         Nd4jLong itemsToLoop = (i < info.threads-1) ? info.items : info.items + info.remainder;
@@ -1263,7 +1263,7 @@ static void loopSpan(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeI
         auto xi = x + xEws * index;
         auto yi = y + yEws * index;
         auto zi = z + zEws * index;        
-        #pragma omp simd
+        PRAGMA_OMP_SIMD
         for (Nd4jLong j = 0; j < itemsToLoop; j++) 
             zi[j * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(xi[j * xEws], yi[j * yEws]);
     }
@@ -1278,7 +1278,7 @@ static void loopSimple(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShap
     int threads = 6;
     int span_size = len / threads + 1;
     
-    #pragma omp parallel for simd schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close) default(shared)
+    PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static, span_size) if (len > ELEMENT_THRESHOLD) proc_bind(close))
     for(Nd4jLong i = 0; i < len; ++i)
         z[i * zEws] = simdOps::LogPoissonLoss<float, float, float>::op(x[i * xEws], y[i * yEws]);
 
@@ -1347,11 +1347,11 @@ static void loop1(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo
     int zEws = shape::elementWiseStride(zShapeInfo);
             
     nd4j::OmpLaunchHelper info(len);
-    #pragma omp parallel num_threads(info._numThreads) default(shared)
+    PRAGMA_OMP_PARALLEL_ARGS(num_threads(info._numThreads))
     {                
         auto threadNum = omp_get_thread_num();
         Nd4jLong threadOffset = info.getThreadOffset(threadNum);        
-        #pragma omp simd
+        PRAGMA_OMP_SIMD
         for (Nd4jLong j = 0; j < info.getItersPerThread(threadNum); j++)  {
             Nd4jLong xOffset = shape::getIndexOffset(j+threadOffset, xShapeInfo, len);
             Nd4jLong yOffset = shape::getIndexOffset(j+threadOffset, yShapeInfo, len);
@@ -1370,7 +1370,7 @@ static void loop2(float* x, Nd4jLong* xShapeInfo, float* y, Nd4jLong* yShapeInfo
     int threads = 6;
     int span_size = len / threads + 1;
     
-    #pragma omp parallel for simd schedule(static) default(shared)
+    PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static))
     for(Nd4jLong i = 0; i < len; ++i) {
         Nd4jLong xOffset = shape::getIndexOffset(i, xShapeInfo, len);
         Nd4jLong yOffset = shape::getIndexOffset(i, yShapeInfo, len);
@@ -1615,7 +1615,7 @@ TEST_F(PlaygroundTests, test_manual_loop) {
     auto timeStart = std::chrono::system_clock::now();
     for (int i = 0; i < iterations; i++) {
 
-#pragma omp parallel for num_threads(4) schedule(static, 32768)
+PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(4) schedule(static, 32768))
         for (unsigned int e = 0; e < len; e++)
             z[e] = array[e];
     }
@@ -1931,19 +1931,19 @@ TEST_F(PlaygroundTests, loops_2) {
     for (int i = 0; i < N; ++i)
     {
 
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, yOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, zOffsets);
@@ -2110,19 +2110,19 @@ TEST_F(PlaygroundTests, loops_3) {
     for (int i = 0; i < N; ++i)
     {
 
-        #pragma omp parallel sections
+        PRAGMA_OMP_PARALLEL_SECTIONS
         {
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, xOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, yOffsets);
             }
-            #pragma omp section
+            PRAGMA_OMP_SECTION
             {
 
                 shape::calcOffsets(3, shape, strides, zOffsets);
@@ -2350,4 +2350,4 @@ TEST_F(PlaygroundTests, mmulMxM_1) {
     auto duration1 = std::chrono::duration_cast<std::chrono::microseconds> ((timeEnd - timeStart) / numOfIters).count();
     printf("duration  %ld\n", duration1);
 }
-*/
\ No newline at end of file
+*/