diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 81e42d159..a6b22ba6d 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -2969,10 +2969,8 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons if (shapeInfo1[0] == 0) return true; - int range = 2 * shapeInfo1[0]; - - for (int e = 1; e <= range; e++) - if (shapeInfo1[e] != shapeInfo2[e]) + for (uint e = 0; e < static_cast(shape::rank(shapeInfo1)); ++e) + if (shape::shapeOf(shapeInfo1)[e] != shape::shapeOf(shapeInfo2)[e] || shape::stride(shapeInfo1)[e] != shape::stride(shapeInfo2)[e]) return false; return true; diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 2b24dc17a..55f9338fb 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -623,12 +623,22 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const auto func = PRAGMA_THREADS_FOR{ - if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1) + if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) { for (auto i0 = start; i0 < stop; ++i0) - z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]); - else + z[i0] = OpType::op(x[i0], *y); + } + else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(*x, y[i0]); + } + else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(x[i0], y[i0]); + } + else { for (auto i0 = start; i0 < stop; ++i0) z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]); + } }; samediff::Threads::parallel_tad(func, 0, zAxis0); } @@ -644,9 +654,15 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const auto y0 = y + i0 * yStrd0; auto z0 = z + i0 * zStrd0; - if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1) + if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0) for (uint i1 = 0; i1 < zAxis1; ++i1) - z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]); + z0[i1] = OpType::op(x0[i1], *y0); + else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(*x0, y0[i1]); + else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(x0[i1], y0[i1]); else for (uint i1 = 0; i1 < zAxis1; ++i1) z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]); @@ -658,7 +674,6 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const case 3: { - auto func = PRAGMA_THREADS_FOR_2D { for (auto i0 = start_x; i0 < stop_x; ++i0) { @@ -668,9 +683,15 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const auto y1 = y + i0 * yStrd0 + i1 * yStrd1; auto z1 = z + i0 * zStrd0 + i1 * zStrd1; - if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1) + if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0) for (uint i2 = 0; i2 < zAxis2; ++i2) - z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]); + z1[i2] = OpType::op(x1[i2], *y1); + else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(*x1, y1[i2]); + else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(x1[i2], y1[i2]); else for (uint i2 = 0; i2 < zAxis2; ++i2) z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]); @@ -693,9 +714,15 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; - if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1) + if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0) for (uint i3 = 0; i3 < zAxis3; ++i3) - z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]); + z2[i3] = OpType::op(x2[i3], *y2); + else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(*x2, y2[i3]); + else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(x2[i3], y2[i3]); else for (uint i3 = 0; i3 < zAxis3; ++i3) z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]); @@ -720,9 +747,15 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; - if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1) + if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0) for (uint i4 = 0; i4 < zAxis4; ++i4) - z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]); + z3[i4] = OpType::op(x3[i4], *y3); + else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(*x3, y3[i4]); + else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(x3[i4], y3[i4]); else for (uint i4 = 0; i4 < zAxis4; ++i4) z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]); @@ -737,6 +770,9 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const default: { + const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); + auto func = PRAGMA_THREADS_FOR{ int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; @@ -750,9 +786,9 @@ void Broadcast::exec(const void *vx, const Nd4jLong *xShapeInfo, const yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; } - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); z[zOffset] = OpType::op(x[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.hpp b/libnd4j/include/loops/cpu/broadcasting_bool.hpp index ef8a35c48..b1b7eb27b 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.hpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.hpp @@ -29,7 +29,7 @@ using namespace simdOps; namespace functions { - namespace broadcast { +namespace broadcast { template void BroadcastBool::exec(const int opNum, @@ -277,202 +277,6 @@ namespace functions { } } -//////////////////////////////////////////////////////////////////////// -template -template -void BroadcastBool::exec(const void *vx, const Nd4jLong *xShapeInfo, - const void *vy, const Nd4jLong *yShapeInfo, - void *vz, const Nd4jLong *zShapeInfo, - void *vextraParams) { - - const X* x = reinterpret_cast(vx); - const X* y = reinterpret_cast(vy); - Z* z = reinterpret_cast(vz); - - X* extraParams = reinterpret_cast(vextraParams); - - const int rank = shape::rank(zShapeInfo); // xRank = yRank = zRank - const char zOrder = shape::order(zShapeInfo); - - uint xAxis0 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1); - uint xAxis1 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2); - uint xAxis2 = rank > 2 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - uint xAxis3 = rank > 3 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - uint xAxis4 = rank > 4 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1); - Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2); - Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - - uint yAxis0 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1); - uint yAxis1 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2); - uint yAxis2 = rank > 2 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - uint yAxis3 = rank > 3 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - uint yAxis4 = rank > 4 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1); - Nd4jLong yStrd1 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2); - Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - - uint zAxis0 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1); - uint zAxis1 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2); - uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1); - Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2); - Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; - Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; - Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; - - switch (rank) { - - case 1: { - - auto func = PRAGMA_THREADS_FOR{ - - if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1) - for (auto i0 = start; i0 < stop; ++i0) - z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0], extraParams); - else - for (auto i0 = start; i0 < stop; ++i0) - z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams); - }; - samediff::Threads::parallel_tad(func, 0, zAxis0); - } - break; - - case 2: { - - auto func = PRAGMA_THREADS_FOR{ - - for (auto i0 = start; i0 < stop; ++i0) { - - auto x0 = x + i0 * xStrd0; - auto y0 = y + i0 * yStrd0; - auto z0 = z + i0 * zStrd0; - - if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1) - for (uint i1 = 0; i1 < zAxis1; ++i1) - z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0], extraParams); - else - for (uint i1 = 0; i1 < zAxis1; ++i1) - z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams); - } - }; - samediff::Threads::parallel_tad(func, 0, zAxis0); - } - break; - - case 3: { - - - auto func = PRAGMA_THREADS_FOR_2D { - - for (auto i0 = start_x; i0 < stop_x; ++i0) { - for (auto i1 = start_y; i1 < stop_y; ++i1) { - - auto x1 = x + i0 * xStrd0 + i1 * xStrd1; - auto y1 = y + i0 * yStrd0 + i1 * yStrd1; - auto z1 = z + i0 * zStrd0 + i1 * zStrd1; - - if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1) - for (uint i2 = 0; i2 < zAxis2; ++i2) - z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0], extraParams); - else - for (uint i2 = 0; i2 < zAxis2; ++i2) - z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams); - } - } - }; - samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1); - } - break; - - case 4: { - - auto func = PRAGMA_THREADS_FOR_3D { - - for (auto i0 = start_x; i0 < stop_x; ++i0) { - for (auto i1 = start_y; i1 < stop_y; ++i1) { - for (auto i2 = start_z; i2 < stop_z; ++i2) { - - auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2; - auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; - auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; - - if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1) - for (uint i3 = 0; i3 < zAxis3; ++i3) - z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0], extraParams); - else - for (uint i3 = 0; i3 < zAxis3; ++i3) - z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams); - } - } - } - }; - samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1); - } - break; - - case 5: { - - auto func = PRAGMA_THREADS_FOR_3D { - - for (auto i0 = start_x; i0 < stop_x; ++i0) { - for (auto i1 = start_y; i1 < stop_y; ++i1) { - for (auto i2 = start_z; i2 < stop_z; ++i2) { - for (uint i3 = 0; i3 < zAxis3; ++i3) { - - auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3; - auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; - auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; - - if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1) - for (uint i4 = 0; i4 < zAxis4; ++i4) - z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0], extraParams); - else - for (uint i4 = 0; i4 < zAxis4; ++i4) - z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams); - } - } - } - } - }; - samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1); - } - break; - - default: { - - auto func = PRAGMA_THREADS_FOR{ - - int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; - - for (auto i = start; i < stop; ++i) { - - shape::index2coordsCPU(start, i, zShapeInfo, zCoords); - - for (uint j = 0; j < rank; ++j) { - xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j]; - yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; - } - - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); - const auto zOffset = shape::getOffset(zShapeInfo, zCoords); - - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - }; - - samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo)); - } - } -} - template template void BroadcastBool::execInverse(void *vx, @@ -649,6 +453,240 @@ void BroadcastBool::exec(const void *vx, const Nd4jLong *xShapeInfo, } } - //BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); +//////////////////////////////////////////////////////////////////////// +template +template +void BroadcastBool::exec(const void *vx, const Nd4jLong *xShapeInfo, + const void *vy, const Nd4jLong *yShapeInfo, + void *vz, const Nd4jLong *zShapeInfo, + void *vextraParams) { + + const X* x = reinterpret_cast(vx); + const X* y = reinterpret_cast(vy); + Z* z = reinterpret_cast(vz); + + X* extraParams = reinterpret_cast(vextraParams); + + const int rank = shape::rank(zShapeInfo); // xRank = yRank = zRank + const char zOrder = shape::order(zShapeInfo); + + uint xAxis0 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1); + uint xAxis1 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2); + uint xAxis2 = rank > 2 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + uint xAxis3 = rank > 3 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + uint xAxis4 = rank > 4 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1); + Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2); + Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + + uint yAxis0 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1); + uint yAxis1 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2); + uint yAxis2 = rank > 2 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + uint yAxis3 = rank > 3 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + uint yAxis4 = rank > 4 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1); + Nd4jLong yStrd1 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2); + Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + + uint zAxis0 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1); + uint zAxis1 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2); + uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1); + Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2); + Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0; + Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0; + Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0; + + switch (rank) { + + case 1: { + + auto func = PRAGMA_THREADS_FOR{ + + if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(x[i0], *y, extraParams); + } + else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(*x, y[i0], extraParams); + } + else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(x[i0], y[i0], extraParams); + } + else { + for (auto i0 = start; i0 < stop; ++i0) + z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams); + } + }; + samediff::Threads::parallel_tad(func, 0, zAxis0); + } + break; + + case 2: { + + auto func = PRAGMA_THREADS_FOR{ + + for (auto i0 = start; i0 < stop; ++i0) { + + auto x0 = x + i0 * xStrd0; + auto y0 = y + i0 * yStrd0; + auto z0 = z + i0 * zStrd0; + + if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(x0[i1], *y0, extraParams); + else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(*x0, y0[i1], extraParams); + else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(x0[i1], y0[i1], extraParams); + else + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams); + } + }; + samediff::Threads::parallel_tad(func, 0, zAxis0); + } + break; + + case 3: { + + auto func = PRAGMA_THREADS_FOR_2D { + + for (auto i0 = start_x; i0 < stop_x; ++i0) { + for (auto i1 = start_y; i1 < stop_y; ++i1) { + + auto x1 = x + i0 * xStrd0 + i1 * xStrd1; + auto y1 = y + i0 * yStrd0 + i1 * yStrd1; + auto z1 = z + i0 * zStrd0 + i1 * zStrd1; + + if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(x1[i2], *y1, extraParams); + else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(*x1, y1[i2], extraParams); + else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(x1[i2], y1[i2], extraParams); + else + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams); + } + } + }; + samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1); + } + break; + + case 4: { + + auto func = PRAGMA_THREADS_FOR_3D { + + for (auto i0 = start_x; i0 < stop_x; ++i0) { + for (auto i1 = start_y; i1 < stop_y; ++i1) { + for (auto i2 = start_z; i2 < stop_z; ++i2) { + + auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2; + auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; + auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; + + if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(x2[i3], *y2, extraParams); + else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(*x2, y2[i3], extraParams); + else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(x2[i3], y2[i3], extraParams); + else + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams); + } + } + } + }; + samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1); + } + break; + + case 5: { + + auto func = PRAGMA_THREADS_FOR_3D { + + for (auto i0 = start_x; i0 < stop_x; ++i0) { + for (auto i1 = start_y; i1 < stop_y; ++i1) { + for (auto i2 = start_z; i2 < stop_z; ++i2) { + for (uint i3 = 0; i3 < zAxis3; ++i3) { + + auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3; + auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; + auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; + + if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(x3[i4], *y3, extraParams); + else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(*x3, y3[i4], extraParams); + else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(x3[i4], y3[i4], extraParams); + else + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams); + } + } + } + } + }; + samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1); + } + break; + + default: { + + const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); + + auto func = PRAGMA_THREADS_FOR{ + + int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; + + for (auto i = start; i < stop; ++i) { + + shape::index2coordsCPU(start, i, zShapeInfo, zCoords); + + for (uint j = 0; j < rank; ++j) { + xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j]; + yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; + } + + const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); + + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + } + }; + + samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo)); + } } +} + + //BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); + + +} } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/broadcasting_int.hpp b/libnd4j/include/loops/cpu/broadcasting_int.hpp index 95f54881d..deb8c2ea3 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.hpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.hpp @@ -444,8 +444,8 @@ namespace functions { template template void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, - const void *vy, const Nd4jLong *yShapeInfo, - void *vz, const Nd4jLong *zShapeInfo) { + const void *vy, const Nd4jLong *yShapeInfo, + void *vz, const Nd4jLong *zShapeInfo) { const X* x = reinterpret_cast(vx); const X* y = reinterpret_cast(vy); @@ -493,12 +493,22 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, auto func = PRAGMA_THREADS_FOR{ - if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1) + if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) { for (auto i0 = start; i0 < stop; ++i0) - z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]); - else + z[i0] = OpType::op(x[i0], *y); + } + else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(*x, y[i0]); + } + else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) { + for (auto i0 = start; i0 < stop; ++i0) + z[i0] = OpType::op(x[i0], y[i0]); + } + else { for (auto i0 = start; i0 < stop; ++i0) z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]); + } }; samediff::Threads::parallel_tad(func, 0, zAxis0); } @@ -514,9 +524,15 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, auto y0 = y + i0 * yStrd0; auto z0 = z + i0 * zStrd0; - if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1) + if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0) for (uint i1 = 0; i1 < zAxis1; ++i1) - z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]); + z0[i1] = OpType::op(x0[i1], *y0); + else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(*x0, y0[i1]); + else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1) + for (uint i1 = 0; i1 < zAxis1; ++i1) + z0[i1] = OpType::op(x0[i1], y0[i1]); else for (uint i1 = 0; i1 < zAxis1; ++i1) z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]); @@ -528,7 +544,6 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, case 3: { - auto func = PRAGMA_THREADS_FOR_2D { for (auto i0 = start_x; i0 < stop_x; ++i0) { @@ -538,9 +553,15 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, auto y1 = y + i0 * yStrd0 + i1 * yStrd1; auto z1 = z + i0 * zStrd0 + i1 * zStrd1; - if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1) + if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0) for (uint i2 = 0; i2 < zAxis2; ++i2) - z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]); + z1[i2] = OpType::op(x1[i2], *y1); + else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(*x1, y1[i2]); + else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1) + for (uint i2 = 0; i2 < zAxis2; ++i2) + z1[i2] = OpType::op(x1[i2], y1[i2]); else for (uint i2 = 0; i2 < zAxis2; ++i2) z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]); @@ -563,9 +584,15 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; - if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1) + if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0) for (uint i3 = 0; i3 < zAxis3; ++i3) - z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]); + z2[i3] = OpType::op(x2[i3], *y2); + else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(*x2, y2[i3]); + else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1) + for (uint i3 = 0; i3 < zAxis3; ++i3) + z2[i3] = OpType::op(x2[i3], y2[i3]); else for (uint i3 = 0; i3 < zAxis3; ++i3) z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]); @@ -590,9 +617,15 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; - if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1) + if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0) for (uint i4 = 0; i4 < zAxis4; ++i4) - z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]); + z3[i4] = OpType::op(x3[i4], *y3); + else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(*x3, y3[i4]); + else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1) + for (uint i4 = 0; i4 < zAxis4; ++i4) + z3[i4] = OpType::op(x3[i4], y3[i4]); else for (uint i4 = 0; i4 < zAxis4; ++i4) z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]); @@ -607,6 +640,9 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, default: { + const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); + auto func = PRAGMA_THREADS_FOR{ int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; @@ -620,9 +656,9 @@ void BroadcastInt::exec(const void *vx, const Nd4jLong *xShapeInfo, yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; } - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); z[zOffset] = OpType::op(x[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp index 49270ddcc..848522a35 100644 --- a/libnd4j/include/loops/cuda/broadcasting.chpp +++ b/libnd4j/include/loops/cuda/broadcasting.chpp @@ -264,11 +264,15 @@ __device__ void Broadcast::transformCuda( __shared__ Nd4jLong zLen; __shared__ int rank; + __shared__ bool xzSameOffsets, yzSameOffsets; if (threadIdx.x == 0) { zLen = shape::length(zShapeInfo); rank = shape::rank(zShapeInfo); + + xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); } __syncthreads(); @@ -286,9 +290,9 @@ __device__ void Broadcast::transformCuda( yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; } - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); z[zOffset] = OpType::op(x[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index aae6bb141..1c7bc358e 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -280,11 +280,15 @@ __device__ void BroadcastBool::transformCuda(const void *vx, const Nd4jLong __shared__ Nd4jLong zLen; __shared__ int rank; + __shared__ bool xzSameOffsets, yzSameOffsets; if (threadIdx.x == 0) { zLen = shape::length(zShapeInfo); rank = shape::rank(zShapeInfo); + + xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); } __syncthreads(); @@ -302,9 +306,9 @@ __device__ void BroadcastBool::transformCuda(const void *vx, const Nd4jLong yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; } - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index f9ad3218c..998ac9ae8 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -260,11 +260,15 @@ __device__ void BroadcastInt::transformCuda(const void *vx, const Nd4jLong *x __shared__ Nd4jLong zLen; __shared__ int rank; + __shared__ bool xzSameOffsets, yzSameOffsets; if (threadIdx.x == 0) { zLen = shape::length(zShapeInfo); rank = shape::rank(zShapeInfo); + + xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); + yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo); } __syncthreads(); @@ -282,9 +286,9 @@ __device__ void BroadcastInt::transformCuda(const void *vx, const Nd4jLong *x yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; } - const auto xOffset = shape::getOffset(xShapeInfo, xCoords); - const auto yOffset = shape::getOffset(yShapeInfo, yCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords); + const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords); + const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords); z[zOffset] = OpType::op(x[xOffset], y[yOffset]); }