diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 660bd9354..9d947e5ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -33,34 +33,45 @@ namespace nd4j { auto yBuffer = y.bufferAsT(); Nd4jLong distance = 0; + auto lengthOf = x.lengthOf(); + const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); + Nd4jLong intermediate[256]; + + // nullify temp values + for (int e = 0; e < maxThreads; e++) + intermediate[e] = 0; if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:distance) - for (Nd4jLong e = 0; e < x.lengthOf(); e++) { + PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong e = 0; e < lengthOf; e++) { auto _x = static_cast(xBuffer[e]); auto _y = static_cast(yBuffer[e]); - distance += __builtin_popcountll(_x ^ _y); + intermediate[omp_get_thread_num()] += __builtin_popcountll(_x ^ _y); } } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:distance) - for (Nd4jLong e = 0; e < x.lengthOf(); e++) { + PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong e = 0; e < lengthOf; e++) { auto _x = static_cast(xBuffer[e * xEws]); auto _y = static_cast(yBuffer[e * yEws]); - distance += __builtin_popcountll(_x ^ _y); + intermediate[omp_get_thread_num()] += __builtin_popcountll(_x ^ _y); } } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:distance) - for (Nd4jLong e = 0; e < x.lengthOf(); e++) { + PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong e = 0; e < lengthOf; e++) { auto _x = static_cast(x.e(e)); auto _y = static_cast(y.e(e)); - distance += __builtin_popcountll(_x ^ _y); + intermediate[omp_get_thread_num()] += __builtin_popcountll(_x ^ _y); } } + // accumulate intermediate variables into output array + for (int e = 0; e < maxThreads; e++) + distance += intermediate[e]; + z.p(0, distance); }