Shyrma broadcast2 (#309)
* profiling broadcast ops for aurora
  Signed-off-by: Yurii <iuriish@yahoo.com>
* correct loop limit type in shape::haveSameShapeAndStrides
  Signed-off-by: Yurii <iuriish@yahoo.com>
master
parent
58550b7c98
commit
ebab6b6410
|
@ -2969,10 +2969,8 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons
|
|||
if (shapeInfo1[0] == 0)
|
||||
return true;
|
||||
|
||||
int range = 2 * shapeInfo1[0];
|
||||
|
||||
for (int e = 1; e <= range; e++)
|
||||
if (shapeInfo1[e] != shapeInfo2[e])
|
||||
for (uint e = 0; e < static_cast<uint>(shape::rank(shapeInfo1)); ++e)
|
||||
if (shape::shapeOf(shapeInfo1)[e] != shape::shapeOf(shapeInfo2)[e] || shape::stride(shapeInfo1)[e] != shape::stride(shapeInfo2)[e])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
|
|
@ -623,12 +623,22 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1)
|
||||
if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]);
|
||||
else
|
||||
z[i0] = OpType::op(x[i0], *y);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(*x, y[i0]);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[i0], y[i0]);
|
||||
}
|
||||
else {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]);
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
|
@ -644,9 +654,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
auto y0 = y + i0 * yStrd0;
|
||||
auto z0 = z + i0 * zStrd0;
|
||||
|
||||
if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1)
|
||||
if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]);
|
||||
z0[i1] = OpType::op(x0[i1], *y0);
|
||||
else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(*x0, y0[i1]);
|
||||
else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[i1], y0[i1]);
|
||||
else
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]);
|
||||
|
@ -658,7 +674,6 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
|
||||
case 3: {
|
||||
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
|
@ -668,9 +683,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
|
||||
auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
|
||||
|
||||
if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1)
|
||||
if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]);
|
||||
z1[i2] = OpType::op(x1[i2], *y1);
|
||||
else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(*x1, y1[i2]);
|
||||
else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[i2], y1[i2]);
|
||||
else
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]);
|
||||
|
@ -693,9 +714,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
|
||||
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
|
||||
|
||||
if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1)
|
||||
if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]);
|
||||
z2[i3] = OpType::op(x2[i3], *y2);
|
||||
else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(*x2, y2[i3]);
|
||||
else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[i3], y2[i3]);
|
||||
else
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]);
|
||||
|
@ -720,9 +747,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
|
||||
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
|
||||
|
||||
if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1)
|
||||
if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]);
|
||||
z3[i4] = OpType::op(x3[i4], *y3);
|
||||
else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(*x3, y3[i4]);
|
||||
else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[i4], y3[i4]);
|
||||
else
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]);
|
||||
|
@ -737,6 +770,9 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
|
||||
default: {
|
||||
|
||||
const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
@ -750,9 +786,9 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
using namespace simdOps;
|
||||
|
||||
namespace functions {
|
||||
namespace broadcast {
|
||||
namespace broadcast {
|
||||
|
||||
template <typename X, typename Y>
|
||||
void BroadcastBool<X, Y>::exec(const int opNum,
|
||||
|
@ -277,202 +277,6 @@ namespace functions {
|
|||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <typename X, typename Z>
|
||||
template<typename OpType>
|
||||
void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
||||
const void *vy, const Nd4jLong *yShapeInfo,
|
||||
void *vz, const Nd4jLong *zShapeInfo,
|
||||
void *vextraParams) {
|
||||
|
||||
const X* x = reinterpret_cast<const X*>(vx);
|
||||
const X* y = reinterpret_cast<const X*>(vy);
|
||||
Z* z = reinterpret_cast<Z*>(vz);
|
||||
|
||||
X* extraParams = reinterpret_cast<X*>(vextraParams);
|
||||
|
||||
const int rank = shape::rank(zShapeInfo); // xRank = yRank = zRank
|
||||
const char zOrder = shape::order(zShapeInfo);
|
||||
|
||||
uint xAxis0 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint xAxis1 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint xAxis2 = rank > 2 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint xAxis3 = rank > 3 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint xAxis4 = rank > 4 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
uint yAxis0 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint yAxis1 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint yAxis2 = rank > 2 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint yAxis3 = rank > 3 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint yAxis4 = rank > 4 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong yStrd1 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
uint zAxis0 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint zAxis1 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
switch (rank) {
|
||||
|
||||
case 1: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1)
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0], extraParams);
|
||||
else
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams);
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
break;
|
||||
|
||||
case 2: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
for (auto i0 = start; i0 < stop; ++i0) {
|
||||
|
||||
auto x0 = x + i0 * xStrd0;
|
||||
auto y0 = y + i0 * yStrd0;
|
||||
auto z0 = z + i0 * zStrd0;
|
||||
|
||||
if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0], extraParams);
|
||||
else
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams);
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
break;
|
||||
|
||||
case 3: {
|
||||
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
|
||||
auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
|
||||
auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
|
||||
auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
|
||||
|
||||
if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0], extraParams);
|
||||
else
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams);
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1);
|
||||
}
|
||||
break;
|
||||
|
||||
case 4: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_3D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
for (auto i2 = start_z; i2 < stop_z; ++i2) {
|
||||
|
||||
auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
|
||||
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
|
||||
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
|
||||
|
||||
if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0], extraParams);
|
||||
else
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
|
||||
}
|
||||
break;
|
||||
|
||||
case 5: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_3D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
for (auto i2 = start_z; i2 < stop_z; ++i2) {
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3) {
|
||||
|
||||
auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
|
||||
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
|
||||
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
|
||||
|
||||
if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0], extraParams);
|
||||
else
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
|
||||
}
|
||||
break;
|
||||
|
||||
default: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
|
||||
|
||||
for (uint j = 0; j < rank; ++j) {
|
||||
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||
}
|
||||
};
|
||||
|
||||
samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename X, typename Z>
|
||||
template<typename OpType>
|
||||
void BroadcastBool<X, Z>::execInverse(void *vx,
|
||||
|
@ -649,6 +453,240 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
}
|
||||
}
|
||||
|
||||
//BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES);
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <typename X, typename Z>
|
||||
template<typename OpType>
|
||||
void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
||||
const void *vy, const Nd4jLong *yShapeInfo,
|
||||
void *vz, const Nd4jLong *zShapeInfo,
|
||||
void *vextraParams) {
|
||||
|
||||
const X* x = reinterpret_cast<const X*>(vx);
|
||||
const X* y = reinterpret_cast<const X*>(vy);
|
||||
Z* z = reinterpret_cast<Z*>(vz);
|
||||
|
||||
X* extraParams = reinterpret_cast<X*>(vextraParams);
|
||||
|
||||
const int rank = shape::rank(zShapeInfo); // xRank = yRank = zRank
|
||||
const char zOrder = shape::order(zShapeInfo);
|
||||
|
||||
uint xAxis0 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint xAxis1 = shape::sizeAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint xAxis2 = rank > 2 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint xAxis3 = rank > 3 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint xAxis4 = rank > 4 ? shape::sizeAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong xStrd1 = shape::strideAt(xShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
uint yAxis0 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint yAxis1 = shape::sizeAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint yAxis2 = rank > 2 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint yAxis3 = rank > 3 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint yAxis4 = rank > 4 ? shape::sizeAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong yStrd1 = shape::strideAt(yShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
uint zAxis0 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
uint zAxis1 = shape::sizeAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 0 : rank-1);
|
||||
Nd4jLong zStrd1 = shape::strideAt(zShapeInfo, zOrder == 'c' ? 1 : rank-2);
|
||||
Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 2 : rank - 3) : 0;
|
||||
Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 3 : rank - 4) : 0;
|
||||
Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, zOrder == 'c' ? 4 : rank - 5) : 0;
|
||||
|
||||
switch (rank) {
|
||||
|
||||
case 1: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[i0], *y, extraParams);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(*x, y[i0], extraParams);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[i0], y[i0], extraParams);
|
||||
}
|
||||
else {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams);
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
break;
|
||||
|
||||
case 2: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
for (auto i0 = start; i0 < stop; ++i0) {
|
||||
|
||||
auto x0 = x + i0 * xStrd0;
|
||||
auto y0 = y + i0 * yStrd0;
|
||||
auto z0 = z + i0 * zStrd0;
|
||||
|
||||
if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[i1], *y0, extraParams);
|
||||
else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(*x0, y0[i1], extraParams);
|
||||
else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[i1], y0[i1], extraParams);
|
||||
else
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams);
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
break;
|
||||
|
||||
case 3: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
|
||||
auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
|
||||
auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
|
||||
auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
|
||||
|
||||
if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[i2], *y1, extraParams);
|
||||
else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(*x1, y1[i2], extraParams);
|
||||
else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[i2], y1[i2], extraParams);
|
||||
else
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams);
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1);
|
||||
}
|
||||
break;
|
||||
|
||||
case 4: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_3D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
for (auto i2 = start_z; i2 < stop_z; ++i2) {
|
||||
|
||||
auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
|
||||
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
|
||||
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
|
||||
|
||||
if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[i3], *y2, extraParams);
|
||||
else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(*x2, y2[i3], extraParams);
|
||||
else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[i3], y2[i3], extraParams);
|
||||
else
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
|
||||
}
|
||||
break;
|
||||
|
||||
case 5: {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_3D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
for (auto i1 = start_y; i1 < stop_y; ++i1) {
|
||||
for (auto i2 = start_z; i2 < stop_z; ++i2) {
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3) {
|
||||
|
||||
auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
|
||||
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
|
||||
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
|
||||
|
||||
if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[i4], *y3, extraParams);
|
||||
else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(*x3, y3[i4], extraParams);
|
||||
else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[i4], y3[i4], extraParams);
|
||||
else
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
|
||||
}
|
||||
break;
|
||||
|
||||
default: {
|
||||
|
||||
const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
|
||||
|
||||
for (uint j = 0; j < rank; ++j) {
|
||||
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||
}
|
||||
};
|
||||
|
||||
samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES);
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -444,8 +444,8 @@ namespace functions {
|
|||
template <typename X>
|
||||
template<typename OpType>
|
||||
void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
||||
const void *vy, const Nd4jLong *yShapeInfo,
|
||||
void *vz, const Nd4jLong *zShapeInfo) {
|
||||
const void *vy, const Nd4jLong *yShapeInfo,
|
||||
void *vz, const Nd4jLong *zShapeInfo) {
|
||||
|
||||
const X* x = reinterpret_cast<const X*>(vx);
|
||||
const X* y = reinterpret_cast<const X*>(vy);
|
||||
|
@ -493,12 +493,22 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1)
|
||||
if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]);
|
||||
else
|
||||
z[i0] = OpType::op(x[i0], *y);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(*x, y[i0]);
|
||||
}
|
||||
else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0] = OpType::op(x[i0], y[i0]);
|
||||
}
|
||||
else {
|
||||
for (auto i0 = start; i0 < stop; ++i0)
|
||||
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]);
|
||||
}
|
||||
};
|
||||
samediff::Threads::parallel_tad(func, 0, zAxis0);
|
||||
}
|
||||
|
@ -514,9 +524,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
auto y0 = y + i0 * yStrd0;
|
||||
auto z0 = z + i0 * zStrd0;
|
||||
|
||||
if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1)
|
||||
if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]);
|
||||
z0[i1] = OpType::op(x0[i1], *y0);
|
||||
else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(*x0, y0[i1]);
|
||||
else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1] = OpType::op(x0[i1], y0[i1]);
|
||||
else
|
||||
for (uint i1 = 0; i1 < zAxis1; ++i1)
|
||||
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]);
|
||||
|
@ -528,7 +544,6 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
|
||||
case 3: {
|
||||
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
|
||||
for (auto i0 = start_x; i0 < stop_x; ++i0) {
|
||||
|
@ -538,9 +553,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
|
||||
auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
|
||||
|
||||
if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1)
|
||||
if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]);
|
||||
z1[i2] = OpType::op(x1[i2], *y1);
|
||||
else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(*x1, y1[i2]);
|
||||
else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2] = OpType::op(x1[i2], y1[i2]);
|
||||
else
|
||||
for (uint i2 = 0; i2 < zAxis2; ++i2)
|
||||
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]);
|
||||
|
@ -563,9 +584,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
|
||||
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
|
||||
|
||||
if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1)
|
||||
if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]);
|
||||
z2[i3] = OpType::op(x2[i3], *y2);
|
||||
else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(*x2, y2[i3]);
|
||||
else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3] = OpType::op(x2[i3], y2[i3]);
|
||||
else
|
||||
for (uint i3 = 0; i3 < zAxis3; ++i3)
|
||||
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]);
|
||||
|
@ -590,9 +617,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
|
||||
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
|
||||
|
||||
if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1)
|
||||
if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]);
|
||||
z3[i4] = OpType::op(x3[i4], *y3);
|
||||
else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(*x3, y3[i4]);
|
||||
else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4] = OpType::op(x3[i4], y3[i4]);
|
||||
else
|
||||
for (uint i4 = 0; i4 < zAxis4; ++i4)
|
||||
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]);
|
||||
|
@ -607,6 +640,9 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
|
||||
default: {
|
||||
|
||||
const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
@ -620,9 +656,9 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
|
||||
}
|
||||
|
|
|
@ -264,11 +264,15 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
|
|||
|
||||
__shared__ Nd4jLong zLen;
|
||||
__shared__ int rank;
|
||||
__shared__ bool xzSameOffsets, yzSameOffsets;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
zLen = shape::length(zShapeInfo);
|
||||
rank = shape::rank(zShapeInfo);
|
||||
|
||||
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
@ -286,9 +290,9 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
|
|||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
|
||||
}
|
||||
|
|
|
@ -280,11 +280,15 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
|
|||
|
||||
__shared__ Nd4jLong zLen;
|
||||
__shared__ int rank;
|
||||
__shared__ bool xzSameOffsets, yzSameOffsets;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
zLen = shape::length(zShapeInfo);
|
||||
rank = shape::rank(zShapeInfo);
|
||||
|
||||
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
@ -302,9 +306,9 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
|
|||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||
}
|
||||
|
|
|
@ -260,11 +260,15 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
|
|||
|
||||
__shared__ Nd4jLong zLen;
|
||||
__shared__ int rank;
|
||||
__shared__ bool xzSameOffsets, yzSameOffsets;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
zLen = shape::length(zShapeInfo);
|
||||
rank = shape::rank(zShapeInfo);
|
||||
|
||||
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
|
@ -282,9 +286,9 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
|
|||
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
|
||||
const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
|
||||
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
|
||||
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
|
||||
|
||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue