Shyrma broadcast2 (#309)

* - profiling broadcast ops for aurora

Signed-off-by: Yurii <iuriish@yahoo.com>

* - correct loop limit type in shape::haveSameShapeAndStrides

Signed-off-by: Yurii <iuriish@yahoo.com>
master
Yurii Shyrma 2020-03-11 16:58:53 +02:00 committed by GitHub
parent 58550b7c98
commit ebab6b6410
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 358 additions and 238 deletions

View File

@ -2969,10 +2969,8 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons
if (shapeInfo1[0] == 0) if (shapeInfo1[0] == 0)
return true; return true;
int range = 2 * shapeInfo1[0]; for (uint e = 0; e < static_cast<uint>(shape::rank(shapeInfo1)); ++e)
if (shape::shapeOf(shapeInfo1)[e] != shape::shapeOf(shapeInfo2)[e] || shape::stride(shapeInfo1)[e] != shape::stride(shapeInfo2)[e])
for (int e = 1; e <= range; e++)
if (shapeInfo1[e] != shapeInfo2[e])
return false; return false;
return true; return true;

View File

@ -623,12 +623,22 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1) if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
for (auto i0 = start; i0 < stop; ++i0) for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]); z[i0] = OpType::op(x[i0], *y);
else }
else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(*x, y[i0]);
}
else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(x[i0], y[i0]);
}
else {
for (auto i0 = start; i0 < stop; ++i0) for (auto i0 = start; i0 < stop; ++i0)
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]); z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]);
}
}; };
samediff::Threads::parallel_tad(func, 0, zAxis0); samediff::Threads::parallel_tad(func, 0, zAxis0);
} }
@ -644,9 +654,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto y0 = y + i0 * yStrd0; auto y0 = y + i0 * yStrd0;
auto z0 = z + i0 * zStrd0; auto z0 = z + i0 * zStrd0;
if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1) if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
for (uint i1 = 0; i1 < zAxis1; ++i1) for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]); z0[i1] = OpType::op(x0[i1], *y0);
else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(*x0, y0[i1]);
else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(x0[i1], y0[i1]);
else else
for (uint i1 = 0; i1 < zAxis1; ++i1) for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]); z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]);
@ -658,7 +674,6 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
case 3: { case 3: {
auto func = PRAGMA_THREADS_FOR_2D { auto func = PRAGMA_THREADS_FOR_2D {
for (auto i0 = start_x; i0 < stop_x; ++i0) { for (auto i0 = start_x; i0 < stop_x; ++i0) {
@ -668,9 +683,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto y1 = y + i0 * yStrd0 + i1 * yStrd1; auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
auto z1 = z + i0 * zStrd0 + i1 * zStrd1; auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1) if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
for (uint i2 = 0; i2 < zAxis2; ++i2) for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]); z1[i2] = OpType::op(x1[i2], *y1);
else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(*x1, y1[i2]);
else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(x1[i2], y1[i2]);
else else
for (uint i2 = 0; i2 < zAxis2; ++i2) for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]); z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]);
@ -693,9 +714,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1) if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
for (uint i3 = 0; i3 < zAxis3; ++i3) for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]); z2[i3] = OpType::op(x2[i3], *y2);
else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(*x2, y2[i3]);
else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(x2[i3], y2[i3]);
else else
for (uint i3 = 0; i3 < zAxis3; ++i3) for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]); z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]);
@ -720,9 +747,15 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1) if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
for (uint i4 = 0; i4 < zAxis4; ++i4) for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]); z3[i4] = OpType::op(x3[i4], *y3);
else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(*x3, y3[i4]);
else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(x3[i4], y3[i4]);
else else
for (uint i4 = 0; i4 < zAxis4; ++i4) for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]); z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]);
@ -737,6 +770,9 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
default: { default: {
const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
@ -750,9 +786,9 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
} }
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
const auto zOffset = shape::getOffset(zShapeInfo, zCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
z[zOffset] = OpType::op(x[xOffset], y[yOffset]); z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
} }

View File

@ -277,202 +277,6 @@ namespace functions {
} }
} }
////////////////////////////////////////////////////////////////////////
/**
 * CPU broadcast of a pairwise op: z = OpType::op(x, y, extraParams).
 *
 * x and y are both read as X, z is written as Z and the extra parameters are
 * handed to the op as X*. Rank and ordering are taken from zShapeInfo; x and y
 * are expected to have the same rank as z.
 *
 * Ranks 1..5 are processed with explicit nested loops whose order follows the
 * ordering of z ('c' walks dimension 0 outermost, anything else walks the last
 * dimension outermost). Every other rank goes through a generic
 * coordinate-based path.
 *
 * @param vx           input buffer x (interpreted as const X*)
 * @param xShapeInfo   shape descriptor of x
 * @param vy           input buffer y (interpreted as const X*)
 * @param yShapeInfo   shape descriptor of y
 * @param vz           output buffer z (interpreted as Z*)
 * @param zShapeInfo   shape descriptor of z
 * @param vextraParams extra op arguments (interpreted as X*)
 */
template <typename X, typename Z>
template<typename OpType>
void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
                               const void *vy, const Nd4jLong *yShapeInfo,
                               void *vz, const Nd4jLong *zShapeInfo,
                               void *vextraParams) {

    const X* x = reinterpret_cast<const X*>(vx);
    const X* y = reinterpret_cast<const X*>(vy);
    Z* z = reinterpret_cast<Z*>(vz);

    X* extraParams = reinterpret_cast<X*>(vextraParams);

    const int rank = shape::rank(zShapeInfo);     // xRank = yRank = zRank
    const char zOrder = shape::order(zShapeInfo);

    // Maps a loop nesting level (0 = outermost) to the dimension it iterates over.
    const auto dimAt = [rank, zOrder](const int level) -> int {
        return zOrder == 'c' ? level : rank - 1 - level;
    };

    // The contiguous fast path indexes with `strd ? i : 0`, which is only correct for
    // strides of exactly 0 or 1. A plain `strd <= 1` test would also let negative
    // strides through (assuming Nd4jLong is a signed type) and read the wrong elements.
    const auto zeroOrUnit = [](const Nd4jLong strd) -> bool {
        return strd == 0 || strd == 1;
    };

    // Only the sizes of z drive the loops; sizes of x and y are not needed here.
    // Levels that do not exist for the current rank are set to 0 and never used.
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dimAt(0));
    const Nd4jLong xStrd1 = rank > 1 ? shape::strideAt(xShapeInfo, dimAt(1)) : 0;
    const Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, dimAt(2)) : 0;
    const Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, dimAt(3)) : 0;
    const Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, dimAt(4)) : 0;

    const Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, dimAt(0));
    const Nd4jLong yStrd1 = rank > 1 ? shape::strideAt(yShapeInfo, dimAt(1)) : 0;
    const Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, dimAt(2)) : 0;
    const Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, dimAt(3)) : 0;
    const Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, dimAt(4)) : 0;

    const uint zAxis0 = shape::sizeAt(zShapeInfo, dimAt(0));
    const uint zAxis1 = rank > 1 ? shape::sizeAt(zShapeInfo, dimAt(1)) : 0;
    const uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, dimAt(2)) : 0;
    const uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, dimAt(3)) : 0;
    const uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, dimAt(4)) : 0;

    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, dimAt(0));
    const Nd4jLong zStrd1 = rank > 1 ? shape::strideAt(zShapeInfo, dimAt(1)) : 0;
    const Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, dimAt(2)) : 0;
    const Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, dimAt(3)) : 0;
    const Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, dimAt(4)) : 0;

    switch (rank) {

        case 1: {

            auto func = PRAGMA_THREADS_FOR{

                // fast path: z is contiguous, x and y are either contiguous or broadcast (stride 0)
                if(zStrd0 == 1 && zeroOrUnit(xStrd0) && zeroOrUnit(yStrd0))
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0], extraParams);
                else
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams);
            };

            samediff::Threads::parallel_tad(func, 0, zAxis0);
        }
            break;

        case 2: {

            auto func = PRAGMA_THREADS_FOR{

                for (auto i0 = start; i0 < stop; ++i0) {

                    auto x0 = x + i0 * xStrd0;
                    auto y0 = y + i0 * yStrd0;
                    auto z0 = z + i0 * zStrd0;

                    if(zStrd1 == 1 && zeroOrUnit(xStrd1) && zeroOrUnit(yStrd1))
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0], extraParams);
                    else
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams);
                }
            };

            samediff::Threads::parallel_tad(func, 0, zAxis0);
        }
            break;

        case 3: {

            auto func = PRAGMA_THREADS_FOR_2D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {

                        auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
                        auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
                        auto z1 = z + i0 * zStrd0 + i1 * zStrd1;

                        if(zStrd2 == 1 && zeroOrUnit(xStrd2) && zeroOrUnit(yStrd2))
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0], extraParams);
                        else
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams);
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1);
        }
            break;

        case 4: {

            auto func = PRAGMA_THREADS_FOR_3D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {
                        for (auto i2 = start_z; i2 < stop_z; ++i2) {

                            auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
                            auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
                            auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;

                            if(zStrd3 == 1 && zeroOrUnit(xStrd3) && zeroOrUnit(yStrd3))
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0], extraParams);
                            else
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams);
                        }
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
        }
            break;

        case 5: {

            auto func = PRAGMA_THREADS_FOR_3D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {
                        for (auto i2 = start_z; i2 < stop_z; ++i2) {
                            for (uint i3 = 0; i3 < zAxis3; ++i3) {

                                auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
                                auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
                                auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;

                                if(zStrd4 == 1 && zeroOrUnit(xStrd4) && zeroOrUnit(yStrd4))
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0], extraParams);
                                else
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams);
                            }
                        }
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
        }
            break;

        default: {

            auto func = PRAGMA_THREADS_FOR{

                int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

                for (auto i = start; i < stop; ++i) {

                    shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                    // a dimension of size 1 in x or y is broadcast: always read its coordinate 0
                    for (int j = 0; j < rank; ++j) {
                        xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
                        yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
                    }

                    const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
                    const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
                    const auto zOffset = shape::getOffset(zShapeInfo, zCoords);

                    z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
                }
            };

            samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));
        }
    }
}
template <typename X, typename Z> template <typename X, typename Z>
template<typename OpType> template<typename OpType>
void BroadcastBool<X, Z>::execInverse(void *vx, void BroadcastBool<X, Z>::execInverse(void *vx,
@ -649,6 +453,240 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
} }
} }
////////////////////////////////////////////////////////////////////////
/**
 * CPU broadcast of a pairwise op: z = OpType::op(x, y, extraParams).
 *
 * x and y are both read as X, z is written as Z and the extra parameters are
 * handed to the op as X*. Rank and ordering are taken from zShapeInfo; x and y
 * are expected to have the same rank as z.
 *
 * Ranks 1..5 are processed with explicit nested loops whose order follows the
 * ordering of z ('c' walks dimension 0 outermost, anything else walks the last
 * dimension outermost). The innermost loop is specialised for three layouts
 * with a contiguous z: x contiguous / y broadcast, x broadcast / y contiguous,
 * and both contiguous. Every other stride combination uses the general strided
 * loop. Ranks outside 1..5 go through a generic coordinate-based path.
 *
 * @param vx           input buffer x (interpreted as const X*)
 * @param xShapeInfo   shape descriptor of x
 * @param vy           input buffer y (interpreted as const X*)
 * @param yShapeInfo   shape descriptor of y
 * @param vz           output buffer z (interpreted as Z*)
 * @param zShapeInfo   shape descriptor of z
 * @param vextraParams extra op arguments (interpreted as X*)
 */
template <typename X, typename Z>
template<typename OpType>
void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
                               const void *vy, const Nd4jLong *yShapeInfo,
                               void *vz, const Nd4jLong *zShapeInfo,
                               void *vextraParams) {

    const X* x = reinterpret_cast<const X*>(vx);
    const X* y = reinterpret_cast<const X*>(vy);
    Z* z = reinterpret_cast<Z*>(vz);

    X* extraParams = reinterpret_cast<X*>(vextraParams);

    const int rank = shape::rank(zShapeInfo);     // xRank = yRank = zRank
    const char zOrder = shape::order(zShapeInfo);

    // Maps a loop nesting level (0 = outermost) to the dimension it iterates over.
    const auto dimAt = [rank, zOrder](const int level) -> int {
        return zOrder == 'c' ? level : rank - 1 - level;
    };

    // Only the sizes of z drive the loops; sizes of x and y are not needed here.
    // Levels that do not exist for the current rank are set to 0 and never used.
    const Nd4jLong xStrd0 = shape::strideAt(xShapeInfo, dimAt(0));
    const Nd4jLong xStrd1 = rank > 1 ? shape::strideAt(xShapeInfo, dimAt(1)) : 0;
    const Nd4jLong xStrd2 = rank > 2 ? shape::strideAt(xShapeInfo, dimAt(2)) : 0;
    const Nd4jLong xStrd3 = rank > 3 ? shape::strideAt(xShapeInfo, dimAt(3)) : 0;
    const Nd4jLong xStrd4 = rank > 4 ? shape::strideAt(xShapeInfo, dimAt(4)) : 0;

    const Nd4jLong yStrd0 = shape::strideAt(yShapeInfo, dimAt(0));
    const Nd4jLong yStrd1 = rank > 1 ? shape::strideAt(yShapeInfo, dimAt(1)) : 0;
    const Nd4jLong yStrd2 = rank > 2 ? shape::strideAt(yShapeInfo, dimAt(2)) : 0;
    const Nd4jLong yStrd3 = rank > 3 ? shape::strideAt(yShapeInfo, dimAt(3)) : 0;
    const Nd4jLong yStrd4 = rank > 4 ? shape::strideAt(yShapeInfo, dimAt(4)) : 0;

    const uint zAxis0 = shape::sizeAt(zShapeInfo, dimAt(0));
    const uint zAxis1 = rank > 1 ? shape::sizeAt(zShapeInfo, dimAt(1)) : 0;
    const uint zAxis2 = rank > 2 ? shape::sizeAt(zShapeInfo, dimAt(2)) : 0;
    const uint zAxis3 = rank > 3 ? shape::sizeAt(zShapeInfo, dimAt(3)) : 0;
    const uint zAxis4 = rank > 4 ? shape::sizeAt(zShapeInfo, dimAt(4)) : 0;

    const Nd4jLong zStrd0 = shape::strideAt(zShapeInfo, dimAt(0));
    const Nd4jLong zStrd1 = rank > 1 ? shape::strideAt(zShapeInfo, dimAt(1)) : 0;
    const Nd4jLong zStrd2 = rank > 2 ? shape::strideAt(zShapeInfo, dimAt(2)) : 0;
    const Nd4jLong zStrd3 = rank > 3 ? shape::strideAt(zShapeInfo, dimAt(3)) : 0;
    const Nd4jLong zStrd4 = rank > 4 ? shape::strideAt(zShapeInfo, dimAt(4)) : 0;

    switch (rank) {

        case 1: {

            auto func = PRAGMA_THREADS_FOR{

                // x contiguous, y broadcast along this dimension
                if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0] = OpType::op(x[i0], *y, extraParams);
                }
                // x broadcast along this dimension, y contiguous
                else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0] = OpType::op(*x, y[i0], extraParams);
                }
                // everything contiguous
                else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0] = OpType::op(x[i0], y[i0], extraParams);
                }
                // arbitrary strides
                else {
                    for (auto i0 = start; i0 < stop; ++i0)
                        z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0], extraParams);
                }
            };

            samediff::Threads::parallel_tad(func, 0, zAxis0);
        }
            break;

        case 2: {

            auto func = PRAGMA_THREADS_FOR{

                for (auto i0 = start; i0 < stop; ++i0) {

                    auto x0 = x + i0 * xStrd0;
                    auto y0 = y + i0 * yStrd0;
                    auto z0 = z + i0 * zStrd0;

                    if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1] = OpType::op(x0[i1], *y0, extraParams);
                    else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1] = OpType::op(*x0, y0[i1], extraParams);
                    else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1] = OpType::op(x0[i1], y0[i1], extraParams);
                    else
                        for (uint i1 = 0; i1 < zAxis1; ++i1)
                            z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1], extraParams);
                }
            };

            samediff::Threads::parallel_tad(func, 0, zAxis0);
        }
            break;

        case 3: {

            auto func = PRAGMA_THREADS_FOR_2D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {

                        auto x1 = x + i0 * xStrd0 + i1 * xStrd1;
                        auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
                        auto z1 = z + i0 * zStrd0 + i1 * zStrd1;

                        if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2] = OpType::op(x1[i2], *y1, extraParams);
                        else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2] = OpType::op(*x1, y1[i2], extraParams);
                        else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2] = OpType::op(x1[i2], y1[i2], extraParams);
                        else
                            for (uint i2 = 0; i2 < zAxis2; ++i2)
                                z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2], extraParams);
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1);
        }
            break;

        case 4: {

            auto func = PRAGMA_THREADS_FOR_3D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {
                        for (auto i2 = start_z; i2 < stop_z; ++i2) {

                            auto x2 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2;
                            auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
                            auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;

                            if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3] = OpType::op(x2[i3], *y2, extraParams);
                            else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3] = OpType::op(*x2, y2[i3], extraParams);
                            else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3] = OpType::op(x2[i3], y2[i3], extraParams);
                            else
                                for (uint i3 = 0; i3 < zAxis3; ++i3)
                                    z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3], extraParams);
                        }
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
        }
            break;

        case 5: {

            auto func = PRAGMA_THREADS_FOR_3D {

                for (auto i0 = start_x; i0 < stop_x; ++i0) {
                    for (auto i1 = start_y; i1 < stop_y; ++i1) {
                        for (auto i2 = start_z; i2 < stop_z; ++i2) {
                            for (uint i3 = 0; i3 < zAxis3; ++i3) {

                                auto x3 = x + i0 * xStrd0 + i1 * xStrd1 + i2 * xStrd2 + i3 * xStrd3;
                                auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
                                auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;

                                if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4] = OpType::op(x3[i4], *y3, extraParams);
                                else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4] = OpType::op(*x3, y3[i4], extraParams);
                                else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4] = OpType::op(x3[i4], y3[i4], extraParams);
                                else
                                    for (uint i4 = 0; i4 < zAxis4; ++i4)
                                        z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4], extraParams);
                            }
                        }
                    }
                }
            };

            samediff::Threads::parallel_for(func, 0,zAxis0,1, 0,zAxis1,1, 0,zAxis2,1);
        }
            break;

        default: {

            // When an input has exactly the shape and strides of z, its element offset equals
            // the z offset, so the separate offset computation for it can be skipped.
            const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
            const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);

            auto func = PRAGMA_THREADS_FOR{

                int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

                for (auto i = start; i < stop; ++i) {

                    shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                    // a dimension of size 1 in x or y is broadcast: always read its coordinate 0
                    for (int j = 0; j < rank; ++j) {
                        xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
                        yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
                    }

                    const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
                    const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
                    const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);

                    z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
                }
            };

            samediff::Threads::parallel_for(func, 0, shape::length(zShapeInfo));
        }
    }
}
//BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); //BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES);
} }
} }

View File

@ -493,12 +493,22 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
if(zStrd0 == 1 && xStrd0 <= 1 && yStrd0 <= 1) if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 0) {
for (auto i0 = start; i0 < stop; ++i0) for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(x[xStrd0 ? i0 : 0], y[yStrd0 ? i0 : 0]); z[i0] = OpType::op(x[i0], *y);
else }
else if(zStrd0 == 1 && xStrd0 == 0 && yStrd0 == 1) {
for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(*x, y[i0]);
}
else if(zStrd0 == 1 && xStrd0 == 1 && yStrd0 == 1) {
for (auto i0 = start; i0 < stop; ++i0)
z[i0] = OpType::op(x[i0], y[i0]);
}
else {
for (auto i0 = start; i0 < stop; ++i0) for (auto i0 = start; i0 < stop; ++i0)
z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]); z[i0 * zStrd0] = OpType::op(x[i0 * xStrd0], y[i0 * yStrd0]);
}
}; };
samediff::Threads::parallel_tad(func, 0, zAxis0); samediff::Threads::parallel_tad(func, 0, zAxis0);
} }
@ -514,9 +524,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto y0 = y + i0 * yStrd0; auto y0 = y + i0 * yStrd0;
auto z0 = z + i0 * zStrd0; auto z0 = z + i0 * zStrd0;
if(zStrd1 == 1 && xStrd1 <= 1 && yStrd1 <= 1) if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 0)
for (uint i1 = 0; i1 < zAxis1; ++i1) for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(x0[xStrd1 ? i1 : 0], y0[yStrd1 ? i1 : 0]); z0[i1] = OpType::op(x0[i1], *y0);
else if(zStrd1 == 1 && xStrd1 == 0 && yStrd1 == 1)
for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(*x0, y0[i1]);
else if(zStrd1 == 1 && xStrd1 == 1 && yStrd1 == 1)
for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1] = OpType::op(x0[i1], y0[i1]);
else else
for (uint i1 = 0; i1 < zAxis1; ++i1) for (uint i1 = 0; i1 < zAxis1; ++i1)
z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]); z0[i1 * zStrd1] = OpType::op(x0[i1 * xStrd1], y0[i1 * yStrd1]);
@ -528,7 +544,6 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
case 3: { case 3: {
auto func = PRAGMA_THREADS_FOR_2D { auto func = PRAGMA_THREADS_FOR_2D {
for (auto i0 = start_x; i0 < stop_x; ++i0) { for (auto i0 = start_x; i0 < stop_x; ++i0) {
@ -538,9 +553,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto y1 = y + i0 * yStrd0 + i1 * yStrd1; auto y1 = y + i0 * yStrd0 + i1 * yStrd1;
auto z1 = z + i0 * zStrd0 + i1 * zStrd1; auto z1 = z + i0 * zStrd0 + i1 * zStrd1;
if(zStrd2 == 1 && xStrd2 <= 1 && yStrd2 <= 1) if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 0)
for (uint i2 = 0; i2 < zAxis2; ++i2) for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(x1[xStrd2 ? i2 : 0], y1[yStrd2 ? i2 : 0]); z1[i2] = OpType::op(x1[i2], *y1);
else if(zStrd2 == 1 && xStrd2 == 0 && yStrd2 == 1)
for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(*x1, y1[i2]);
else if(zStrd2 == 1 && xStrd2 == 1 && yStrd2 == 1)
for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2] = OpType::op(x1[i2], y1[i2]);
else else
for (uint i2 = 0; i2 < zAxis2; ++i2) for (uint i2 = 0; i2 < zAxis2; ++i2)
z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]); z1[i2 * zStrd2] = OpType::op(x1[i2 * xStrd2], y1[i2 * yStrd2]);
@ -563,9 +584,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2; auto y2 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2;
auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2; auto z2 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2;
if(zStrd3 == 1 && xStrd3 <= 1 && yStrd3 <= 1) if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 0)
for (uint i3 = 0; i3 < zAxis3; ++i3) for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(x2[xStrd3 ? i3 : 0], y2[yStrd3 ? i3 : 0]); z2[i3] = OpType::op(x2[i3], *y2);
else if(zStrd3 == 1 && xStrd3 == 0 && yStrd3 == 1)
for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(*x2, y2[i3]);
else if(zStrd3 == 1 && xStrd3 == 1 && yStrd3 == 1)
for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3] = OpType::op(x2[i3], y2[i3]);
else else
for (uint i3 = 0; i3 < zAxis3; ++i3) for (uint i3 = 0; i3 < zAxis3; ++i3)
z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]); z2[i3 * zStrd3] = OpType::op(x2[i3 * xStrd3], y2[i3 * yStrd3]);
@ -590,9 +617,15 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3; auto y3 = y + i0 * yStrd0 + i1 * yStrd1 + i2 * yStrd2 + i3 * yStrd3;
auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3; auto z3 = z + i0 * zStrd0 + i1 * zStrd1 + i2 * zStrd2 + i3 * zStrd3;
if(zStrd4 == 1 && xStrd4 <= 1 && yStrd4 <= 1) if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 0)
for (uint i4 = 0; i4 < zAxis4; ++i4) for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(x3[xStrd4 ? i4 : 0], y3[yStrd4 ? i4 : 0]); z3[i4] = OpType::op(x3[i4], *y3);
else if(zStrd4 == 1 && xStrd4 == 0 && yStrd4 == 1)
for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(*x3, y3[i4]);
else if(zStrd4 == 1 && xStrd4 == 1 && yStrd4 == 1)
for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4] = OpType::op(x3[i4], y3[i4]);
else else
for (uint i4 = 0; i4 < zAxis4; ++i4) for (uint i4 = 0; i4 < zAxis4; ++i4)
z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]); z3[i4 * zStrd4] = OpType::op(x3[i4 * xStrd4], y3[i4 * yStrd4]);
@ -607,6 +640,9 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
default: { default: {
const bool xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
const bool yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK]; int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
@ -620,9 +656,9 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
} }
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
const auto zOffset = shape::getOffset(zShapeInfo, zCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
z[zOffset] = OpType::op(x[xOffset], y[yOffset]); z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
} }

View File

@ -264,11 +264,15 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
__shared__ Nd4jLong zLen; __shared__ Nd4jLong zLen;
__shared__ int rank; __shared__ int rank;
__shared__ bool xzSameOffsets, yzSameOffsets;
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
zLen = shape::length(zShapeInfo); zLen = shape::length(zShapeInfo);
rank = shape::rank(zShapeInfo); rank = shape::rank(zShapeInfo);
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
} }
__syncthreads(); __syncthreads();
@ -286,9 +290,9 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
} }
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
const auto zOffset = shape::getOffset(zShapeInfo, zCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
z[zOffset] = OpType::op(x[xOffset], y[yOffset]); z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
} }

View File

@ -280,11 +280,15 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
__shared__ Nd4jLong zLen; __shared__ Nd4jLong zLen;
__shared__ int rank; __shared__ int rank;
__shared__ bool xzSameOffsets, yzSameOffsets;
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
zLen = shape::length(zShapeInfo); zLen = shape::length(zShapeInfo);
rank = shape::rank(zShapeInfo); rank = shape::rank(zShapeInfo);
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
} }
__syncthreads(); __syncthreads();
@ -302,9 +306,9 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
} }
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
const auto zOffset = shape::getOffset(zShapeInfo, zCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
} }

View File

@ -260,11 +260,15 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
__shared__ Nd4jLong zLen; __shared__ Nd4jLong zLen;
__shared__ int rank; __shared__ int rank;
__shared__ bool xzSameOffsets, yzSameOffsets;
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
zLen = shape::length(zShapeInfo); zLen = shape::length(zShapeInfo);
rank = shape::rank(zShapeInfo); rank = shape::rank(zShapeInfo);
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
yzSameOffsets = shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo);
} }
__syncthreads(); __syncthreads();
@ -282,9 +286,9 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j]; yCoords[j] = shape::sizeAt(yShapeInfo, j) == 1 ? 0 : zCoords[j];
} }
const auto xOffset = shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = shape::getOffset(yShapeInfo, yCoords);
const auto zOffset = shape::getOffset(zShapeInfo, zCoords); const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
const auto xOffset = xzSameOffsets ? zOffset : shape::getOffset(xShapeInfo, xCoords);
const auto yOffset = yzSameOffsets ? zOffset : shape::getOffset(yShapeInfo, yCoords);
z[zOffset] = OpType::op(x[xOffset], y[yOffset]); z[zOffset] = OpType::op(x[xOffset], y[yOffset]);
} }