Loops auto-vectorization problem fix (#274)
* libnd4j cast loop types Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j more type casting added to loops Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j sync casting types of iterated variable in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j more loops reviewed for vectorization problem fix Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j fixed several typos Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j several more files reviewed to fix auto-vectorization problem in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j merge master and reviewed more files to fix auto-vectorization problem in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j several type casting added in broadcasting that were missed, fixed mac builds Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j double check all files and fix several more places in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j fixed builds Signed-off-by: Oleg <oleg.semeniv@gmail.com> * libnd4j revert changes for lup.cpp Signed-off-by: Oleg <oleg.semeniv@gmail.com> master
parent
5c806d2fb5
commit
b4575d11e9
|
@ -14,9 +14,9 @@
|
|||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019
|
||||
//
|
||||
//
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019
|
||||
//
|
||||
|
||||
#ifndef LIBND4J_LOOPS_H
|
||||
#define LIBND4J_LOOPS_H
|
||||
|
@ -45,7 +45,7 @@ namespace nd4j {
|
|||
};
|
||||
|
||||
template <typename X, typename Z>
|
||||
class ReductionFloatLoops : public ReductionLoops<X,Z,Z> {
|
||||
class ReductionFloatLoops : public ReductionLoops<X, Z, Z> {
|
||||
public:
|
||||
static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop);
|
||||
|
||||
|
@ -54,7 +54,7 @@ namespace nd4j {
|
|||
};
|
||||
|
||||
template <typename X, typename Z>
|
||||
class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops<X,Z,X> {
|
||||
class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops<X, Z, X> {
|
||||
public:
|
||||
static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
|
||||
|
||||
|
@ -63,7 +63,7 @@ namespace nd4j {
|
|||
};
|
||||
|
||||
template <typename X, typename Z>
|
||||
class ND4J_EXPORT ReductionLongLoops : public ReductionLoops<X,Z,X> {
|
||||
class ND4J_EXPORT ReductionLongLoops : public ReductionLoops<X, Z, X> {
|
||||
public:
|
||||
static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
|
||||
|
||||
|
@ -72,7 +72,7 @@ namespace nd4j {
|
|||
};
|
||||
|
||||
template <typename X>
|
||||
class ND4J_EXPORT ReductionSameLoops : public ReductionLoops<X,X,X> {
|
||||
class ND4J_EXPORT ReductionSameLoops : public ReductionLoops<X, X, X> {
|
||||
public:
|
||||
static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop);
|
||||
|
||||
|
@ -125,10 +125,10 @@ namespace nd4j {
|
|||
|
||||
|
||||
|
||||
/*
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename X, typename Y, typename Z>
|
||||
void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||
/*
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename X, typename Y, typename Z>
|
||||
void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||
const Y* y, const Nd4jLong* yShapeInfo,
|
||||
Z* z, const Nd4jLong* zShapeInfo,
|
||||
Z* extraParams,
|
||||
|
@ -255,12 +255,12 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename X, typename Z, typename E>
|
||||
template <typename OpType>
|
||||
void nd4j::ReductionLoops<X, Z, E>::loopReduce(X* x, Nd4jLong* xShapeInfo,
|
||||
|
@ -324,7 +324,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint j = 0; j < tadLen; j++)
|
||||
for (Nd4jLong j = 0; j < tadLen; j++)
|
||||
s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -338,7 +338,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint j = 0; j < tadLen; j++)
|
||||
for (Nd4jLong j = 0; j < tadLen; j++)
|
||||
s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
|
||||
|
||||
z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -352,7 +352,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadLen; ++i0)
|
||||
for (Nd4jLong i0 = 0; i0 < tadLen; ++i0)
|
||||
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -366,8 +366,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -381,9 +381,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -397,10 +397,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3)
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
|
||||
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -414,11 +414,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3)
|
||||
for (uint i4 = 0; i4 < tadShape[4]; ++i4)
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
|
||||
for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4)
|
||||
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -435,7 +435,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint j = 0; j < tadLen; j++)
|
||||
for (Nd4jLong j = 0; j < tadLen; j++)
|
||||
s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
|
||||
|
||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||
|
@ -453,7 +453,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint j = 0; j < tadLen; j++) {
|
||||
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||
s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams);
|
||||
}
|
||||
|
@ -475,7 +475,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto tad = x + tadOffsets[i];
|
||||
auto s = OpType::startingValue(tad);
|
||||
|
||||
for (uint j = 0; j < tadLen; j++)
|
||||
for (Nd4jLong j = 0; j < tadLen; j++)
|
||||
s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
|
||||
|
||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||
|
@ -492,7 +492,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//////////////////////////////////////////////////////////////////////////////
|
||||
template <typename X, typename Z, typename E>
|
||||
template <typename OpType>
|
||||
void nd4j::TransformLoops<X,Z,E>::loopTransform(X* x, Nd4jLong* xShapeInfo,
|
||||
void nd4j::TransformLoops<X, Z, E>::loopTransform(X* x, Nd4jLong* xShapeInfo,
|
||||
Z* z, Nd4jLong* zShapeInfo,
|
||||
E* extraParams, uint64_t threadId, uint64_t numThreads) {
|
||||
|
||||
|
@ -528,7 +528,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
int64_t start = span.startX(), stop = span.stopX();
|
||||
|
||||
for (auto i = start; i < stop; i++)
|
||||
z[i*zEws] = OpType::op(x[i*xEws], extraParams);
|
||||
z[i * zEws] = OpType::op(x[i * xEws], extraParams);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -546,7 +546,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX);
|
||||
z[i * zEws] = OpType::op(x[xOffset], extraParams);
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX);
|
||||
z[i] = OpType::op(x[xOffset], extraParams);
|
||||
|
@ -576,7 +577,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto z0 = i0 * zStride[0];
|
||||
auto x0 = i0 * xStride[0];
|
||||
|
||||
for (uint i1 = span.startY(); i1 < span.stopY(); ++i1)
|
||||
for (auto i1 = span.startY(); i1 < span.stopY(); ++i1)
|
||||
z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams);
|
||||
}
|
||||
}
|
||||
|
@ -584,9 +585,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
|
||||
//*********************************************//
|
||||
case LoopKind::RANK3: {
|
||||
auto uXShape0 = static_cast<uint>(xShape[0]);
|
||||
auto uXShape1 = static_cast<uint>(xShape[1]);
|
||||
auto uXShape2 = static_cast<uint>(xShape[2]);
|
||||
auto uXShape0 = xShape[0];
|
||||
auto uXShape1 = xShape[1];
|
||||
auto uXShape2 = xShape[2];
|
||||
|
||||
auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1);
|
||||
auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1);
|
||||
|
@ -597,7 +598,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto z0 = i0 * zStride[0] + i1 * zStride[1];
|
||||
auto x0 = i0 * xStride[0] + i1 * xStride[1];
|
||||
|
||||
for (uint i2 = 0; i2 < uXShape2; ++i2)
|
||||
for (Nd4jLong i2 = 0; i2 < uXShape2; ++i2)
|
||||
z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams);
|
||||
}
|
||||
}
|
||||
|
@ -605,10 +606,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
|
||||
//*********************************************//
|
||||
case LoopKind::RANK4: {
|
||||
auto uXShape0 = static_cast<uint>(xShape[0]);
|
||||
auto uXShape1 = static_cast<uint>(xShape[1]);
|
||||
auto uXShape2 = static_cast<uint>(xShape[2]);
|
||||
auto uXShape3 = static_cast<uint>(xShape[3]);
|
||||
auto uXShape0 = xShape[0];
|
||||
auto uXShape1 = xShape[1];
|
||||
auto uXShape2 = xShape[2];
|
||||
auto uXShape3 = xShape[3];
|
||||
|
||||
auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2);
|
||||
auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1);
|
||||
|
@ -619,7 +620,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2];
|
||||
auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2];
|
||||
|
||||
for (uint i3 = 0; i3 < uXShape3; ++i3)
|
||||
for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3)
|
||||
z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams);
|
||||
}
|
||||
}
|
||||
|
@ -627,11 +628,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
|
||||
//*********************************************//
|
||||
case LoopKind::RANK5: {
|
||||
auto uXShape0 = static_cast<uint>(xShape[0]);
|
||||
auto uXShape1 = static_cast<uint>(xShape[1]);
|
||||
auto uXShape2 = static_cast<uint>(xShape[2]);
|
||||
auto uXShape3 = static_cast<uint>(xShape[3]);
|
||||
auto uXShape4 = static_cast<uint>(xShape[4]);
|
||||
auto uXShape0 = xShape[0];
|
||||
auto uXShape1 = xShape[1];
|
||||
auto uXShape2 = xShape[2];
|
||||
auto uXShape3 = xShape[3];
|
||||
auto uXShape4 = xShape[4];
|
||||
|
||||
auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2);
|
||||
auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1);
|
||||
|
@ -643,12 +644,12 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2];
|
||||
auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2];
|
||||
|
||||
for (uint i3 = 0; i3 < uXShape3; ++i3) {
|
||||
for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) {
|
||||
|
||||
auto z1 = z0 + i3 * zStride[3];
|
||||
auto x1 = x0 + i3 * xStride[3];
|
||||
|
||||
for (uint i4 = 0; i4 < uXShape4; ++i4)
|
||||
for (Nd4jLong i4 = 0; i4 < uXShape4; ++i4)
|
||||
z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams);
|
||||
|
||||
}
|
||||
|
@ -678,7 +679,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename X, typename Z>
|
||||
template <typename OpType>
|
||||
void nd4j::Reduction3Loops<X, Z>::loopReduce3(X* x, Nd4jLong* xShapeInfo,
|
||||
|
@ -694,11 +695,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const Nd4jLong xLen = shape::length(xShapeInfo);
|
||||
const Nd4jLong yLen = shape::length(yShapeInfo);
|
||||
|
||||
Nd4jLong *xTadShapeInfo = nullptr, *yTadShapeInfo = nullptr, *xTadOffsets = nullptr, *yTadOffsets = nullptr;
|
||||
Nd4jLong* xTadShapeInfo = nullptr, * yTadShapeInfo = nullptr, * xTadOffsets = nullptr, * yTadOffsets = nullptr;
|
||||
TadPack tadPackX, tadPackY;
|
||||
std::vector<Nd4jLong> zeroOffsets;
|
||||
|
||||
if(xLen == yLen) {
|
||||
if (xLen == yLen) {
|
||||
tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen);
|
||||
tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen);
|
||||
xTadShapeInfo = tadPackX.primaryShapeInfo();
|
||||
|
@ -706,7 +707,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
xTadOffsets = tadPackX.primaryOffsets();
|
||||
yTadOffsets = tadPackY.primaryOffsets();
|
||||
}
|
||||
else if(yLen > xLen) {
|
||||
else if (yLen > xLen) {
|
||||
tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen);
|
||||
xTadShapeInfo = xShapeInfo;
|
||||
yTadShapeInfo = tadPackY.primaryShapeInfo();
|
||||
|
@ -749,7 +750,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j)
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j)
|
||||
s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams);
|
||||
|
||||
z[i] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -769,7 +770,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j)
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j)
|
||||
s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams);
|
||||
|
||||
z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -789,7 +790,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadLen; ++i0) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) {
|
||||
const auto xTadOffset = i0 * xTadStride[0];
|
||||
const auto yTadOffset = i0 * yTadStride[0];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -812,8 +813,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -836,9 +837,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -862,10 +863,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -890,11 +891,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (uint i4 = 0; i4 < tadShape[4]; ++i4) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -913,7 +914,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
uint castXTadShapeInfo[MAX_RANK];
|
||||
const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo<uint>(xTadShapeInfo, castXTadShapeInfo);
|
||||
|
||||
if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) {
|
||||
if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) {
|
||||
Z extraParams[3];
|
||||
for (auto i = start; i < stop; i++) {
|
||||
extraParams[0] = param0;
|
||||
|
@ -924,7 +925,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||
s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
||||
}
|
||||
|
@ -946,7 +947,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
|
||||
auto s = OpType::startingValue(xTad);
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -958,7 +959,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename X, typename Z>
|
||||
template <typename OpType>
|
||||
void nd4j::Reduction3Loops<X, Z>::loopReduce3All(X* x, Nd4jLong* xShapeInfo,
|
||||
|
@ -990,14 +991,14 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
|
||||
const auto startVal = OpType::startingValue(x);
|
||||
|
||||
int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads);
|
||||
int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads * numYTads);
|
||||
|
||||
switch (kindOfLoop) {
|
||||
//*********************************************//
|
||||
case LoopKind::EWS1: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1007,7 +1008,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j)
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j)
|
||||
s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams);
|
||||
|
||||
z[zInd] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -1019,8 +1020,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::EWSNONZERO: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1030,7 +1031,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j)
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j)
|
||||
s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams);
|
||||
|
||||
z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams);
|
||||
|
@ -1042,8 +1043,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::RANK1: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1053,7 +1054,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint i0 = 0; i0 < tadLen; ++i0) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) {
|
||||
const auto xTadOffset = i0 * xTadStride[0];
|
||||
const auto yTadOffset = i0 * yTadStride[0];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -1067,8 +1068,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::RANK2: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1078,8 +1079,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -1094,8 +1095,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::RANK3: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1105,9 +1106,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -1123,8 +1124,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::RANK4: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1134,10 +1135,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -1154,8 +1155,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
//*********************************************//
|
||||
case LoopKind::RANK5: {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1165,11 +1166,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (uint i4 = 0; i4 < tadShape[4]; ++i4) {
|
||||
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||
for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) {
|
||||
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4];
|
||||
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4];
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
@ -1189,10 +1190,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
uint castXTadShapeInfo[MAX_RANK];
|
||||
const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo<uint>(xTadShapeInfo, castXTadShapeInfo);
|
||||
|
||||
if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) {
|
||||
if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) {
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1202,7 +1203,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||
s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
||||
}
|
||||
|
@ -1215,8 +1216,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo<uint>(yTadShapeInfo, castYTadShapeInfo);
|
||||
|
||||
Z extraParams[3];
|
||||
for (auto ix = 0; ix < numXTads; ix++) {
|
||||
for (auto iy = 0; iy < numYTads; iy++) {
|
||||
for (Nd4jLong ix = 0; ix < numXTads; ix++) {
|
||||
for (Nd4jLong iy = 0; iy < numYTads; iy++) {
|
||||
extraParams[0] = param0;
|
||||
extraParams[1] = param1;
|
||||
extraParams[2] = param2;
|
||||
|
@ -1226,7 +1227,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
|||
const auto zInd = ix * numYTads + iy;
|
||||
auto s = startVal;
|
||||
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
|
||||
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||
|
|
|
@ -50,12 +50,12 @@ namespace nd4j {
|
|||
1 == zArr.ews() && 'c' == zArr.ordering());
|
||||
|
||||
if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) {
|
||||
auto yLen = (uint32_t)yArr.lengthOf();
|
||||
auto yLen = yArr.lengthOf();
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
for (uint32_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto rZ = z + (i * yLen);
|
||||
auto v = x[i];
|
||||
for (uint32_t j = 0; j < yLen; j++) {
|
||||
for (Nd4jLong j = 0; j < yLen; j++) {
|
||||
rZ[j] = OpType::op(v, y[j]);
|
||||
}
|
||||
}
|
||||
|
@ -74,13 +74,13 @@ namespace nd4j {
|
|||
|
||||
if (bSpecialCase && bSpecialCase2) {
|
||||
|
||||
int zDim1 = zArr.sizeAt(-2);
|
||||
int zDim2 = zArr.sizeAt(-1);
|
||||
uint32_t zDim1 = zArr.sizeAt(-2);
|
||||
uint32_t zDim2 = zArr.sizeAt(-1);
|
||||
|
||||
int nLen = zArr.lengthOf() / yArr.sizeAt(-1);
|
||||
uint32_t nLen = zArr.lengthOf() / yArr.sizeAt(-1);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
for (uint32_t total = start; total < stop; total++) {
|
||||
for (auto total = start; total < stop; total++) {
|
||||
|
||||
uint32_t i = total / zDim1;
|
||||
uint32_t j = total % zDim1;
|
||||
|
|
|
@ -184,7 +184,7 @@ namespace functions {
|
|||
const auto oX = x[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < loopLength; f++)
|
||||
for (Nd4jLong f = 0; f < loopLength; f++)
|
||||
oZ[f] = OpType::op(oX, oY[f]);
|
||||
}
|
||||
} else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){
|
||||
|
@ -198,7 +198,7 @@ namespace functions {
|
|||
const auto oY = y[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < loopLength; f++)
|
||||
for (Nd4jLong f = 0; f < loopLength; f++)
|
||||
oZ[f] = OpType::op(oX[f], oY);
|
||||
}
|
||||
}
|
||||
|
@ -213,14 +213,14 @@ namespace functions {
|
|||
Nd4jLong yStrides[3] = { 0,0,0 };
|
||||
nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
|
||||
|
||||
uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1);
|
||||
uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2);
|
||||
uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
|
||||
uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
|
||||
|
||||
for (uint32_t index0 = start; index0 < stop; index0++) {
|
||||
for (auto index0 = start; index0 < stop; index0++) {
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint32_t index1 = 0; index1 < nSize1; index1++) {
|
||||
for (uint32_t index2 = 0; index2 < nSize2; index2++) {
|
||||
for (uint64_t index1 = 0; index1 < nSize1; index1++) {
|
||||
for (uint64_t index2 = 0; index2 < nSize2; index2++) {
|
||||
auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2);
|
||||
auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2);
|
||||
auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2);
|
||||
|
@ -242,18 +242,18 @@ namespace functions {
|
|||
Nd4jLong yStrides[4] = { 0,0,0,0 };
|
||||
nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
|
||||
|
||||
uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1);
|
||||
uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2);
|
||||
uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3);
|
||||
uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
|
||||
uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
|
||||
uint64_t nSize3 = shape::sizeAt(zShapeInfo, 3);
|
||||
|
||||
for (uint32_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
uint32_t index0 = i / nSize1;
|
||||
uint32_t index1 = i % nSize1;
|
||||
uint64_t index0 = i / nSize1;
|
||||
uint64_t index1 = i % nSize1;
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint32_t index2 = 0; index2 < nSize2; index2++) {
|
||||
for (uint32_t index3 = 0; index3 < nSize3; index3++) {
|
||||
for (uint64_t index2 = 0; index2 < nSize2; index2++) {
|
||||
for (uint64_t index3 = 0; index3 < nSize3; index3++) {
|
||||
auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3);
|
||||
auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3);
|
||||
auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3);
|
||||
|
@ -279,7 +279,7 @@ namespace functions {
|
|||
uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3);
|
||||
uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4);
|
||||
|
||||
for (uint32_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
uint32_t index0 = i / nSize1;
|
||||
uint32_t index1 = i % nSize1;
|
||||
|
@ -326,7 +326,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
||||
|
@ -344,7 +344,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
||||
|
@ -362,7 +362,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
||||
|
@ -382,7 +382,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
@ -497,7 +497,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
||||
|
@ -515,7 +515,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
||||
|
@ -533,7 +533,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
||||
|
@ -553,7 +553,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
|
|
@ -183,7 +183,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(oX[offset], y[offset], extraParams);
|
||||
}
|
||||
|
@ -200,7 +200,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams);
|
||||
|
@ -218,7 +218,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams);
|
||||
|
@ -237,7 +237,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams);
|
||||
|
@ -257,7 +257,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
@ -357,7 +357,7 @@ namespace functions {
|
|||
auto oZ = z + zTadOffset[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(x[offset], oY[offset], extraParams);
|
||||
}
|
||||
|
@ -375,7 +375,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams);
|
||||
|
@ -394,7 +394,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams);
|
||||
|
@ -413,7 +413,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams);
|
||||
|
@ -434,7 +434,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
|
|
@ -177,7 +177,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
||||
}
|
||||
|
@ -194,7 +194,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
||||
|
@ -212,7 +212,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
||||
|
@ -230,7 +230,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
||||
|
@ -250,7 +250,7 @@ namespace functions {
|
|||
auto oX = x + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (unsigned int f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
@ -347,7 +347,7 @@ namespace functions {
|
|||
auto oZ = z + zTadOffset[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (uint f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
||||
}
|
||||
|
@ -364,7 +364,7 @@ namespace functions {
|
|||
auto oZ = z + zTadOffset[i];
|
||||
auto oY = y + tadOffsets[i];
|
||||
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (uint f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
||||
|
@ -382,7 +382,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (uint f = 0; f < tadLength; f++) {
|
||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
||||
|
@ -400,7 +400,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (uint f = 0; f < tadLength; f++) {
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
||||
|
@ -420,7 +420,7 @@ namespace functions {
|
|||
auto oY = y + tadOffsets[i];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int f = 0; f < tadLength; f++) {
|
||||
for (uint f = 0; f < tadLength; f++) {
|
||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||
|
|
|
@ -124,7 +124,7 @@ void IndexReduce<X, Z>::exec(void *vx, Nd4jLong *xShapeInfo,
|
|||
return;
|
||||
const auto indexValue = OpType::startingIndexValue(x);
|
||||
|
||||
for (uint i = 0; i < zLen; i++)
|
||||
for (Nd4jLong i = 0; i < zLen; i++)
|
||||
z[i] = (Z) indexValue.index;
|
||||
|
||||
return;
|
||||
|
|
|
@ -93,7 +93,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||
z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
|
||||
|
@ -111,7 +111,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
|
||||
|
@ -129,7 +129,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
|
||||
|
@ -149,7 +149,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
|
||||
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||
|
@ -197,7 +197,7 @@ namespace functions {
|
|||
else{
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
|
||||
}
|
||||
|
@ -213,7 +213,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||
z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
|
||||
|
@ -255,7 +255,7 @@ namespace functions {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint64_t i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||
z[offset] = OpClass::op(i, length, rng, extraArguments);
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
@ -68,7 +68,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (auto i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||
|
||||
z[0] = OpType::postProcess(startingValue, length, extraParams);
|
||||
|
@ -94,7 +94,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (auto i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||
|
||||
return OpType::postProcess(startingValue, length, extraParams);
|
||||
|
@ -156,7 +156,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < resultLength; i++)
|
||||
for (Nd4jLong i = 0; i < resultLength; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -59,7 +59,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
z[i] = startingVal;
|
||||
|
||||
return;
|
||||
|
@ -113,7 +113,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (auto i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||
|
||||
return OpType::postProcess(startingValue, length, extraParams);
|
||||
|
@ -184,7 +184,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x));
|
||||
|
||||
for (uint i = 0; i < resultLength; i++)
|
||||
for (Nd4jLong i = 0; i < resultLength; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
@ -110,7 +110,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (auto i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||
|
||||
return OpType::postProcess(startingValue, length, extraParams);
|
||||
|
@ -173,7 +173,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < resultLength; i++)
|
||||
for (Nd4jLong i = 0; i < resultLength; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
@ -111,7 +111,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (auto i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||
|
||||
return OpType::postProcess(startingValue, length, extraParams);
|
||||
|
@ -182,7 +182,7 @@ namespace functions {
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < zLength; i++)
|
||||
for (Nd4jLong i = 0; i < zLength; i++)
|
||||
z[i] = startingVal;
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -53,7 +53,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
|
|||
return;
|
||||
const auto startingVal = OpType::startingValue(x);
|
||||
|
||||
for (uint i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
z[i] = startingVal;
|
||||
|
||||
return;
|
||||
|
|
|
@ -73,7 +73,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
@ -83,7 +83,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ namespace functions {
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
@ -84,7 +84,7 @@ namespace functions {
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ namespace functions {
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
@ -84,7 +84,7 @@ namespace functions {
|
|||
auto oX = x + xTadOffsets[r];
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (unsigned int f = 0; f < tadLength; f++)
|
||||
for (int f = 0; f < tadLength; f++)
|
||||
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -91,7 +91,7 @@ namespace functions {
|
|||
uint xShapeInfoCast[MAX_RANK];
|
||||
const bool canCast = nd4j::DataTypeUtils::castShapeInfo<uint>(xShapeInfo, xShapeInfoCast);
|
||||
|
||||
for (uint64_t i = 0; i < length; i++) {
|
||||
for (Nd4jLong i = 0; i < length; i++) {
|
||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast);
|
||||
|
||||
SummaryStatsData<X> curr;
|
||||
|
@ -116,7 +116,7 @@ namespace functions {
|
|||
auto x = reinterpret_cast<X *>(vx);
|
||||
auto z = reinterpret_cast<Z *>(vz);
|
||||
auto extraParams = reinterpret_cast<Z *>(vextraParams);
|
||||
int resultLength = shape::length(zShapeInfo);
|
||||
auto resultLength = shape::length(zShapeInfo);
|
||||
|
||||
if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) {
|
||||
if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
|
||||
|
@ -124,7 +124,7 @@ namespace functions {
|
|||
SummaryStatsData<X> comp;
|
||||
comp.initWithValue(x[0]);
|
||||
|
||||
for (uint i = 0; i < resultLength; i++)
|
||||
for (Nd4jLong i = 0; i < resultLength; i++)
|
||||
z[i] = OpType::getValue(biasCorrected, comp);
|
||||
return;
|
||||
}
|
||||
|
@ -166,14 +166,14 @@ namespace functions {
|
|||
comp.initWithValue(tx[0]);
|
||||
|
||||
if (tadEWS == 1 && tadOrder == 'c') {
|
||||
for (int i = 1; i < tadLength; i++) {
|
||||
for (Nd4jLong i = 1; i < tadLength; i++) {
|
||||
SummaryStatsData <X> indexVal2;
|
||||
indexVal2.initWithValue(tx[i]);
|
||||
|
||||
comp = update(comp, OpType::op(indexVal2, extraParams), extraParams);
|
||||
}
|
||||
} else {
|
||||
for (int i = 1; i < tadLength; i++) {
|
||||
for (Nd4jLong i = 1; i < tadLength; i++) {
|
||||
auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast);
|
||||
|
||||
SummaryStatsData <X> indexVal2;
|
||||
|
|
|
@ -61,7 +61,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
|
|||
else
|
||||
axes.push_back(inRank-1); // default dimension to reduce along is last dimension
|
||||
|
||||
const int numOfAxes = axes.size();
|
||||
const uint numOfAxes = axes.size();
|
||||
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
|
||||
|
||||
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
|
||||
|
@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
|
|||
REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
|
||||
|
||||
// types of all input arrays should be the same
|
||||
for(int i = 1; i < block.width(); ++i)
|
||||
for(unsigned long i = 1; i < block.width(); ++i)
|
||||
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !");
|
||||
|
||||
nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0);
|
||||
|
@ -167,7 +167,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
|
|||
else
|
||||
axes.push_back(inRank-1); // default dimension to reduce along is last dimension
|
||||
|
||||
const int numOfAxes = axes.size();
|
||||
const uint numOfAxes = axes.size();
|
||||
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
|
||||
|
||||
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
|
||||
|
@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
|
|||
REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str());
|
||||
|
||||
// types of all input arrays should be the same (except dLdO)
|
||||
for(int i = 1; i < block.width() - 2; ++i)
|
||||
for(unsigned long i = 1; i < block.width() - 2; ++i)
|
||||
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !");
|
||||
|
||||
// ***** calculations ***** //
|
||||
|
|
|
@ -30,7 +30,7 @@ namespace helpers {
|
|||
int* pRowCounts = reinterpret_cast<int*>(rowCounts.buffer());
|
||||
int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
|
||||
int const* pCols = reinterpret_cast<int const*>(colP->getBuffer());
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (Nd4jLong n = 0; n < N; n++) {
|
||||
int begin = pRows[n];//->e<int>(n);
|
||||
int end = pRows[n + 1];//rowP->e<int>(n + 1);
|
||||
for (int i = begin; i < end; i++) {
|
||||
|
@ -72,7 +72,7 @@ namespace helpers {
|
|||
int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
|
||||
int* symRowP = reinterpret_cast<int*>(outputRows->buffer());
|
||||
symRowP[0] = 0;
|
||||
for (int n = 0; n < N; n++)
|
||||
for (Nd4jLong n = 0; n < N; n++)
|
||||
symRowP[n + 1] = symRowP[n] + rowCounts->e<int>(n);
|
||||
// outputRows->printBuffer("output rows");
|
||||
|
||||
|
@ -86,7 +86,7 @@ namespace helpers {
|
|||
std::vector<int> offset(N);// = NDArrayFactory::create<int>('c', {N});
|
||||
|
||||
//PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset))
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (Nd4jLong n = 0; n < N; n++) {
|
||||
int begin = pRows[n];
|
||||
int bound = pRows[n + 1];
|
||||
|
||||
|
|
|
@ -146,17 +146,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
|
|||
auto length = shape::length(inShapeInfo);
|
||||
|
||||
if (inEWS == 1) {
|
||||
for (int i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
max = nd4j::math::nd4j_max<T>(max, inBuff[i]);
|
||||
|
||||
PRAGMA_OMP_SIMD_SUM(sum)
|
||||
for (int i = 0; i < length; i++) {
|
||||
for (Nd4jLong i = 0; i < length; i++) {
|
||||
outBuff[i] = nd4j::math::nd4j_exp<T,T>(inBuff[i] - max);
|
||||
sum += outBuff[i];
|
||||
}
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int i = 0; i < length; i++) {
|
||||
for (Nd4jLong i = 0; i < length; i++) {
|
||||
outBuff[i] /= sum;
|
||||
outBuff[i] = nd4j::math::nd4j_log<T,T>(outBuff[i]);
|
||||
}
|
||||
|
@ -164,17 +164,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
|
|||
else if (inEWS > 1) {
|
||||
|
||||
PRAGMA_OMP_SIMD_MAX(max)
|
||||
for (int i = 0; i < length; i++)
|
||||
for (Nd4jLong i = 0; i < length; i++)
|
||||
max = nd4j::math::nd4j_max<T>(max, inBuff[i * inEWS]);
|
||||
|
||||
PRAGMA_OMP_SIMD_SUM(sum)
|
||||
for (int i = 0; i < length; i++) {
|
||||
for (Nd4jLong i = 0; i < length; i++) {
|
||||
outBuff[i * inEWS] = nd4j::math::nd4j_exp<T,T>(inBuff[i * inEWS] - max);
|
||||
sum += outBuff[i * inEWS];
|
||||
}
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int i = 0; i < length; i++) {
|
||||
for (Nd4jLong i = 0; i < length; i++) {
|
||||
outBuff[i * inEWS] /= sum;
|
||||
outBuff[i * inEWS] = nd4j::math::nd4j_log<T, T>(outBuff[i * inEWS]);
|
||||
}
|
||||
|
|
|
@ -443,7 +443,7 @@ namespace nd4j {
|
|||
const X* bias_new;
|
||||
X* bias_extra = nullptr;
|
||||
size_t total_num = 1;
|
||||
for (size_t i = 0; i < rank; i++) {
|
||||
for (Nd4jLong i = 0; i < rank; i++) {
|
||||
total_num *= bases[i];
|
||||
}
|
||||
Nd4jLong inc;
|
||||
|
@ -574,7 +574,7 @@ namespace nd4j {
|
|||
for (size_t i = 0; i < 2; i++) {
|
||||
numNC *= bases[i];
|
||||
}
|
||||
for (size_t i = 2; i < rank; i++) {
|
||||
for (Nd4jLong i = 2; i < rank; i++) {
|
||||
numHW *= bases[i];
|
||||
}
|
||||
Nd4jLong total_num = numNC * numHW;
|
||||
|
|
|
@ -27,7 +27,7 @@ namespace helpers {
|
|||
|
||||
void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) {
|
||||
output.resize(axisVector->lengthOf());
|
||||
for (int e = 0; e < axisVector->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < axisVector->lengthOf(); e++) {
|
||||
auto ca = axisVector->e<int>(e);
|
||||
if (ca < 0)
|
||||
ca += rank;
|
||||
|
@ -37,7 +37,7 @@ namespace helpers {
|
|||
}
|
||||
|
||||
void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) {
|
||||
for (int e = 0; e < axisVector.size(); e++) {
|
||||
for (size_t e = 0; e < axisVector.size(); e++) {
|
||||
auto a = axisVector[e];
|
||||
if (a < 0)
|
||||
axisVector[e] = a + rank;
|
||||
|
|
|
@ -66,7 +66,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
|
|||
Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
|
||||
Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
|
||||
|
||||
for (int j = 0; j < lenSmall; ++j) {
|
||||
for (Nd4jLong j = 0; j < lenSmall; ++j) {
|
||||
|
||||
const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads);
|
||||
|
||||
|
@ -96,7 +96,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
|
|||
shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data());
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (uint i = 0; i < steps; ++i)
|
||||
for (Nd4jLong i = 0; i < steps; ++i)
|
||||
z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal;
|
||||
}
|
||||
|
||||
|
|
|
@ -65,8 +65,8 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
|
|||
T *col, *im;
|
||||
int imRow, imCol;
|
||||
|
||||
for (uint b = start_x; b < stop_x; b += inc_x) {
|
||||
for (uint c = start_y; c < stop_y; c += inc_y) {
|
||||
for (auto b = start_x; b < stop_x; b += inc_x) {
|
||||
for (auto c = start_y; c < stop_y; c += inc_y) {
|
||||
for (int kRow = 0; kRow < kH; ++kRow) {
|
||||
for (int kCol = 0; kCol < kW; ++kCol) {
|
||||
for (int colH = 0; colH < oH; ++colH) {
|
||||
|
@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
T *col, *im;
|
||||
|
||||
for (uint b = start; b < stop; b++) {
|
||||
for (auto b = start; b < stop; b++) {
|
||||
T *im0 = imBuff + b * imStride0;
|
||||
T *col4 = colBuff + b * colStride0;
|
||||
for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
|
||||
|
|
|
@ -55,8 +55,8 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
|
||||
for (uint b = start_x; b < stop_x; b += inc_x) {
|
||||
for (uint oh = start_y; oh < stop_y; oh += inc_y) {
|
||||
for (auto b = start_x; b < stop_x; b += inc_x) {
|
||||
for (auto oh = start_y; oh < stop_y; oh += inc_y) {
|
||||
for (uint ow = 0; ow < oW; ++ow) {
|
||||
for (uint c = 0; c < iC; ++c) {
|
||||
|
||||
|
@ -70,7 +70,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
|
|||
const int iw = ow * sW - pW + kw * dW;
|
||||
if (iw < 0 || iw >= iW) continue;
|
||||
|
||||
uint xCoords[4] = {b, (uint)ih, (uint)iw, c};
|
||||
uint xCoords[4] = { static_cast<uint>(b), static_cast<uint>(ih), static_cast<uint>(iw), c};
|
||||
uint yCoords[3] = {kh, kw, c};
|
||||
|
||||
const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)];
|
||||
|
@ -79,7 +79,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
|
|||
}
|
||||
}
|
||||
|
||||
uint zCoords[4] = {b, oh, ow, c};
|
||||
uint zCoords[4] = { static_cast<uint>(b), static_cast<uint>(oh), ow, c};
|
||||
z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,7 +63,7 @@ namespace helpers {
|
|||
std::vector<Nd4jLong> dims(reduceShape->lengthOf());
|
||||
|
||||
bool fit = true;
|
||||
for( int i = 0; i < dims.size(); i++ ) {
|
||||
for(auto i = 0; i < dims.size(); i++ ) {
|
||||
if (fit) {
|
||||
dims[i] = reduceShape->e<Nd4jLong>(i);
|
||||
for (int e = 0; e < input->rankOf(); ++e)
|
||||
|
|
|
@ -53,7 +53,7 @@ namespace nd4j {
|
|||
outputs[i].second = 0;
|
||||
|
||||
//PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold())
|
||||
for (int e = 0; e < indices->lengthOf(); ++e)
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
|
||||
if ((*indices).e<Nd4jLong>(e) == i)
|
||||
listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e));
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ namespace nd4j {
|
|||
for (auto i = start; i < stop; i++) {
|
||||
outputs[i].first = outputList[i];
|
||||
outputs[i].second = 0;
|
||||
for (int e = 0; e < indices->lengthOf(); ++e)
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
|
||||
if (indices->e<Nd4jLong>(e) == i)
|
||||
outputs[i].first->p(outputs[i].second++, input->e<T>(e));
|
||||
}
|
||||
|
@ -83,7 +83,7 @@ namespace nd4j {
|
|||
for (int e = 0; e < numOfData; e++) {
|
||||
auto data = inputs[e];
|
||||
auto index = indices[e];
|
||||
for (int i = 0; i < index->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
|
||||
Nd4jLong pos = index->e<Nd4jLong>(i);
|
||||
if (pos < 0) {
|
||||
nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
|
||||
|
@ -100,7 +100,7 @@ namespace nd4j {
|
|||
}
|
||||
else {
|
||||
std::vector<int> restDims(output->rankOf() - 1);
|
||||
for (int i = restDims.size(); i > 0; i--)
|
||||
for (auto i = restDims.size(); i > 0; i--)
|
||||
restDims[restDims.size() - i] = output->rankOf() - i;
|
||||
|
||||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
@ -109,12 +109,12 @@ namespace nd4j {
|
|||
auto data = inputs[e];
|
||||
auto index = indices[e];
|
||||
std::vector<int> sourceDims(data->rankOf() - index->rankOf());
|
||||
for (int i = sourceDims.size(); i > 0; i--)
|
||||
for (auto i = sourceDims.size(); i > 0; i--)
|
||||
sourceDims[sourceDims.size() - i] = data->rankOf() - i;
|
||||
|
||||
ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims) ;
|
||||
|
||||
for (int i = 0; i < index->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
|
||||
auto pos = index->e<Nd4jLong>(i);
|
||||
if (pos < 0) {
|
||||
nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
|
||||
|
@ -146,7 +146,7 @@ namespace nd4j {
|
|||
|
||||
ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims);
|
||||
|
||||
for (unsigned int i = 0; i < inputGradientList.size(); i++) {
|
||||
for (auto i = 0; i < inputGradientList.size(); i++) {
|
||||
outputs[i].first = inputGradientList[i];
|
||||
if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs
|
||||
std::vector<int> outDims(outputs[i].first->rankOf() - 1);
|
||||
|
@ -158,7 +158,7 @@ namespace nd4j {
|
|||
|
||||
outputs[i].second = 0;
|
||||
|
||||
for (int e = 0; e < indices->lengthOf(); ++e)
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
|
||||
if (indices->e<Nd4jLong>(e) == i)
|
||||
listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++));
|
||||
}
|
||||
|
@ -171,7 +171,7 @@ namespace nd4j {
|
|||
for (auto i = start; i < stop; i++) {
|
||||
outputs[i].first = inputGradientList[i];
|
||||
outputs[i].second = 0;
|
||||
for (int e = 0; e < indices->lengthOf(); ++e)
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
|
||||
if (indices->e<Nd4jLong>(e) == i)
|
||||
output->p<T>(e, outputs[i].first->e<T>(outputs[i].second++));
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ namespace nd4j {
|
|||
auto xShapeInfo = inputs[e]->shapeInfo();
|
||||
auto xLength = inputs[e]->lengthOf();
|
||||
|
||||
for (uint i = 0; i < xLength; i++)
|
||||
for (Nd4jLong i = 0; i < xLength; i++)
|
||||
z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ namespace nd4j {
|
|||
namespace helpers {
|
||||
template <typename T>
|
||||
static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) {
|
||||
auto blockSize = 32;
|
||||
Nd4jLong blockSize = 32;
|
||||
auto length = array.lengthOf();
|
||||
int numBlocks = length / blockSize + ((length % blockSize == 0) ? 0 : 1);
|
||||
auto tempA = NDArrayFactory::create<Nd4jLong>('c', {numBlocks}, context);
|
||||
|
@ -42,11 +42,11 @@ namespace nd4j {
|
|||
|
||||
// we divide array into 32 element chunks, and store intermediate results once
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto b = 0; b < stop; b++) {
|
||||
for (auto b = start; b < stop; b++) {
|
||||
auto blockBuffer = buffer + b * numBlocks;
|
||||
|
||||
Nd4jLong r = 1;
|
||||
for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) {
|
||||
for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < length; e++) {
|
||||
auto v = longBytes<T>(blockBuffer[e]);
|
||||
r = 31 * r + v;
|
||||
}
|
||||
|
@ -68,7 +68,7 @@ namespace nd4j {
|
|||
auto blockBuffer = tempBuffer + b * numBlocks;
|
||||
|
||||
Nd4jLong r = 1;
|
||||
for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) {
|
||||
for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) {
|
||||
auto v = longBytes<T>(blockBuffer[e]);
|
||||
r = 31 * r + v;
|
||||
}
|
||||
|
@ -103,4 +103,3 @@ namespace nd4j {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
PRAGMA_OMP_SIMD
|
||||
for (int x = 0; x < numBins; x++) {
|
||||
for (Nd4jLong x = 0; x < numBins; x++) {
|
||||
result[x] += bins[x];
|
||||
}
|
||||
|
||||
|
|
|
@ -64,8 +64,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
|
|||
if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
for (int b = start_x; b < stop_x; b++) {
|
||||
for (int c = start_y; c < stop_y; c++) {
|
||||
for (auto b = start_x; b < stop_x; b++) {
|
||||
for (auto c = start_y; c < stop_y; c++) {
|
||||
for (int kRow = 0; kRow < kH; ++kRow) {
|
||||
for (int kCol = 0; kCol < kW; ++kCol) {
|
||||
for (int colH = 0; colH < oH; ++colH) {
|
||||
|
@ -98,8 +98,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
|
|||
T *col, *im;
|
||||
int imRow, imCol;
|
||||
|
||||
for (int b = start_x; b < stop_x; b += inc_x) {
|
||||
for (int colH = start_y; colH < stop_y; colH += inc_y) {
|
||||
for (auto b = start_x; b < stop_x; b += inc_x) {
|
||||
for (auto colH = start_y; colH < stop_y; colH += inc_y) {
|
||||
for (int colW = 0; colW < oW; ++colW) {
|
||||
for (int c = 0; c < iC; ++c) {
|
||||
for (int kRow = 0; kRow < kH; ++kRow) {
|
||||
|
|
|
@ -219,16 +219,16 @@ namespace helpers {
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto batch = start; batch < stop; ++batch) {
|
||||
auto pInput = pInputBuf + batch * inBatchNumValues;
|
||||
for (auto y = 0; y < outHeight; ++y) {
|
||||
for (Nd4jLong y = 0; y < outHeight; ++y) {
|
||||
auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize;
|
||||
const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize;
|
||||
const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize;
|
||||
double yVal = ys[y]._interpolarValue;
|
||||
for (auto x = 0; x < outWidth; ++x) {
|
||||
for (Nd4jLong x = 0; x < outWidth; ++x) {
|
||||
auto xsBottom = xsPtr[x]._bottomIndex;
|
||||
auto xsTop = xsPtr[x]._topIndex;
|
||||
auto xVal = xsPtr[x]._interpolarValue;
|
||||
for (auto c = 0; c < channels; ++c) {
|
||||
for (Nd4jLong c = 0; c < channels; ++c) {
|
||||
double topLeft(ysInputLowerPtr[xsBottom + c]);
|
||||
double topRight(ysInputLowerPtr[xsTop + c]);
|
||||
double bottomLeft(ysInputUpperPtr[xsBottom + c]);
|
||||
|
@ -310,14 +310,14 @@ namespace helpers {
|
|||
if (halfPixelCenter) {
|
||||
inY = nd4j::math::nd4j_max(0LL, inY);
|
||||
}
|
||||
for (auto x = 0; x < outWidth; ++x) {
|
||||
for (Nd4jLong x = 0; x < outWidth; ++x) {
|
||||
auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round<float>(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor<float>(scaler(x, st.widthScale)));
|
||||
Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1);
|
||||
if (halfPixelCenter) {
|
||||
inX = nd4j::math::nd4j_max(0LL, inX);
|
||||
}
|
||||
// copy pixel over all channels
|
||||
for (auto e = 0; e < channels; e++)
|
||||
for (Nd4jLong e = 0; e < channels; e++)
|
||||
output->t<T>(b, y, x, e) = images->t<T>(b, inY, inX, e);
|
||||
}
|
||||
}
|
||||
|
@ -613,7 +613,7 @@ namespace helpers {
|
|||
for (auto b = start; b < stop; ++b) {
|
||||
auto pInput = inputPtr + b * inBatchWidth;
|
||||
|
||||
for (auto y = 0; y < outHeight; ++y) {
|
||||
for (Nd4jLong y = 0; y < outHeight; ++y) {
|
||||
auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels];
|
||||
|
||||
WeightsAndIndices yWai;
|
||||
|
@ -635,7 +635,7 @@ namespace helpers {
|
|||
F cached_value_0[4] = {0};
|
||||
F cached_value_1[4] = {0};
|
||||
F cached_value_2[4] = {0};
|
||||
for (auto x = 0; x < resizerState.outWidth; ++x) {
|
||||
for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
|
||||
const WeightsAndIndices &xWai = xWais[x];
|
||||
// Shift values in cached_value_* to fill first '_advance' values.
|
||||
switch (xWai._advance) {
|
||||
|
@ -712,7 +712,7 @@ namespace helpers {
|
|||
xWai._weight2, xWai._weight3);
|
||||
}
|
||||
} else {
|
||||
for (auto x = 0; x < resizerState.outWidth; ++x) {
|
||||
for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
|
||||
const WeightsAndIndices &xWai = xWais[x];
|
||||
// Shift values in cachedValue to fill first '_advance' values.
|
||||
switch (xWai._advance) {
|
||||
|
@ -828,7 +828,7 @@ namespace helpers {
|
|||
float sum_0 = 0;
|
||||
float sum_1 = 0;
|
||||
float sum_2 = 0;
|
||||
for (int i = 0; i < yPtrs.size(); ++i) {
|
||||
for (size_t i = 0; i < yPtrs.size(); ++i) {
|
||||
const T* ptr = yPtrs[i].yPtr;
|
||||
float scaleX = xCache.startScale;
|
||||
Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth);
|
||||
|
@ -879,7 +879,7 @@ namespace helpers {
|
|||
const auto numChannels = st.channels;
|
||||
for (Nd4jLong c = 0; c < numChannels; ++c) {
|
||||
float sum = 0;
|
||||
for (int i = 0; i < yPtrs.size(); ++i) {
|
||||
for (size_t i = 0; i < yPtrs.size(); ++i) {
|
||||
T const* ptr = yPtrs[i].yPtr;
|
||||
float scaleX = xCache.startScale;
|
||||
float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX;
|
||||
|
|
|
@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
|
|||
if(inTadEws == 1 && outTadEws == 1) {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (uint i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
const T *x = inBuff + inTadOffsets[i];
|
||||
T *y = outBuff + outTadOffsets[i];
|
||||
|
||||
|
@ -70,7 +70,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
|
|||
|
||||
// calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
|
||||
// we store each squared sum in corresponding element of y array
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
@ -100,7 +100,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
|
|||
}
|
||||
else {
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (uint i = 0; i < numOfTads; ++i) {
|
||||
for (Nd4jLong i = 0; i < numOfTads; ++i) {
|
||||
const T *x = inBuff + inTadOffsets[i];
|
||||
T *y = outBuff + outTadOffsets[i];
|
||||
|
||||
|
@ -108,7 +108,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
|
|||
|
||||
// calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
|
||||
// we store each squared sum in corresponding element of y array
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
@ -179,13 +179,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
|
|||
if(inTadEws == 1 && gradITadEws == 1) {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (uint i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
const X *x = inBuff + inTadOffsets[i];
|
||||
Y *y = gradIBuff + gradITadOffsets[i];
|
||||
|
||||
// this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
|
||||
// we store each squared sum in corresponding element of y array
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
@ -208,7 +208,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
|
|||
|
||||
Y prev = 0;
|
||||
// second loop calculates derivatives using information gained in first loop above
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
@ -247,13 +247,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
|
|||
else {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (uint i = start; i < stop; i++) {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
const X *x = inBuff + inTadOffsets[i];
|
||||
Y *y = gradIBuff + gradITadOffsets[i];
|
||||
|
||||
// this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
|
||||
// we store each squared sum in corresponding element of y array
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
@ -280,7 +280,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
|
|||
|
||||
Y prev = 0;
|
||||
// second loop calculates derivatives using information gained in first loop above
|
||||
for (uint j = 0; j < tadLen; ++j) {
|
||||
for (Nd4jLong j = 0; j < tadLen; ++j) {
|
||||
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
|
||||
const uint last = depth + j + 1;
|
||||
const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
|
||||
|
|
|
@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
|
|||
auto h_ = h->bufferAsT<T>();
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (uint e = start; e < stop; e++) {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
|
||||
h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@ namespace helpers {
|
|||
Nd4jLong preLastDim = input->rankOf() - 2;
|
||||
ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
|
||||
ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
|
||||
for (Nd4jLong e = 0; e < listOut.size(); ++e) {
|
||||
for (Nd4jLong e = 0; e < static_cast<Nd4jLong>(listOut.size()); ++e) {
|
||||
NDArray* inputMatrix = listDiag.at(e);
|
||||
NDArray* outputMatrix = listOut.at(e);
|
||||
if (outputMatrix != inputMatrix) // if not inplace
|
||||
|
|
|
@ -68,7 +68,7 @@ namespace nd4j {
|
|||
if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 &&
|
||||
shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') {
|
||||
|
||||
for (int e = 0; e < length; e++) {
|
||||
for (Nd4jLong e = 0; e < length; e++) {
|
||||
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[e]) : simdOps::Multiply<T, T, T>::op(sum, x[e]);
|
||||
|
||||
if (!exclusive)
|
||||
|
@ -81,7 +81,7 @@ namespace nd4j {
|
|||
}
|
||||
else {
|
||||
|
||||
for (int e = 0; e < length; e++) {
|
||||
for (Nd4jLong e = 0; e < length; e++) {
|
||||
|
||||
auto xOffset = shape::getIndexOffset(e, xShapeInfo);
|
||||
auto zOffset = shape::getIndexOffset(e, zShapeInfo);
|
||||
|
|
|
@ -43,8 +43,8 @@ namespace helpers {
|
|||
T const* vBuf = v.getDataBuffer()->primaryAsT<T>();
|
||||
T* resBuf = res.dataBuffer()->primaryAsT<T>();
|
||||
auto interloop = PRAGMA_THREADS_FOR_2D {
|
||||
for (int i = start_x; i < n; i += inc_x)
|
||||
for (int j = start_y; j < n; j += inc_y)
|
||||
for (auto i = start_x; i < n; i += inc_x)
|
||||
for (auto j = start_y; j < n; j += inc_y)
|
||||
resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0));
|
||||
};
|
||||
|
||||
|
@ -63,7 +63,7 @@ namespace helpers {
|
|||
NDArray z = *matrix;
|
||||
NDArray e('c', {M}, DataTypeUtils::fromT<T>()); // two internal buffers and scalar for squared norm
|
||||
|
||||
for (auto k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number
|
||||
for (Nd4jLong k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number
|
||||
e.nullify();
|
||||
z = matrixMinor<T>(z, k); // minor computing for current column with given matrix z (initally is a input matrix)
|
||||
// z.printIndexedBuffer("Minor!!!");
|
||||
|
@ -87,7 +87,7 @@ namespace helpers {
|
|||
}
|
||||
resQ.assign(q[0]); //
|
||||
// MmulHelper::matmul(&q[0], matrix, &resR, false, false);
|
||||
for (int i = 1; i < N && i < M - 1; i++) {
|
||||
for (Nd4jLong i = 1; i < N && i < M - 1; i++) {
|
||||
auto tempResQ = resQ;
|
||||
MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM?
|
||||
resQ = std::move(tempResQ);
|
||||
|
|
|
@ -57,10 +57,10 @@ namespace helpers {
|
|||
T* outputBuf = output->dataBuffer()->primaryAsT<T>();
|
||||
|
||||
PRAGMA_OMP_PARALLEL_FOR
|
||||
for (auto k = 0; k < shift; k++) {
|
||||
for (Nd4jLong k = 0; k < shift; k++) {
|
||||
auto pos = k * step;
|
||||
auto u = rng.relativeT<T>(k, 0., 1.);
|
||||
for (auto e = 0; e < step; e++)
|
||||
for (Nd4jLong e = 0; e < step; e++)
|
||||
if (directOutput) {
|
||||
outputBuf[pos + e] = math::nd4j_igamma<T, T, T>(copyAlpha->t<T>(e),
|
||||
beta != nullptr ? copyBeta->t<T>(e) * u : u);
|
||||
|
@ -104,10 +104,10 @@ namespace helpers {
|
|||
bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c';
|
||||
bool directOut = output->ews() == 1 && output->ordering() == 'c';
|
||||
PRAGMA_OMP_PARALLEL_FOR
|
||||
for (auto k = 0; k < shift; k++) {
|
||||
for (Nd4jLong k = 0; k < shift; k++) {
|
||||
auto pos = k * step;
|
||||
auto u = rng.relativeT<T>(k, 0., 1.);
|
||||
for (auto e = 0; e < step; e++) {
|
||||
for (Nd4jLong e = 0; e < step; e++) {
|
||||
auto p = math::nd4j_exp<T, T>(-lambda->t<T>(e));
|
||||
auto s = p;
|
||||
auto x = T(0.f);
|
||||
|
@ -143,7 +143,7 @@ namespace helpers {
|
|||
RandomLauncher::fillUniform(context, rng, output, minVal, maxVal);
|
||||
else {
|
||||
PRAGMA_OMP_PARALLEL_FOR
|
||||
for (auto i = 0; i < output->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < output->lengthOf(); i++) {
|
||||
output->t<T>(i) = rng.relativeT<T>(i, minVal, maxVal);
|
||||
}
|
||||
}
|
||||
|
@ -184,7 +184,7 @@ namespace helpers {
|
|||
|
||||
auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples;
|
||||
auto nClassesPerSample = nSampleIndexInBatch * numOfClassX;
|
||||
for (auto nClass = 0; nClass < numOfClassX; nClass += 1) {
|
||||
for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) {
|
||||
auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass;
|
||||
auto unifornLog = nd4j::math::nd4j_log<Tx, Tx>(-nd4j::math::nd4j_log<Tx, Tx>(rng.relativeT<Tx>(nIndex, minVal, maxVal)));
|
||||
Tx tValue = (xTad[nClass * xDimAstride] - unifornLog);
|
||||
|
|
|
@ -50,7 +50,7 @@ namespace helpers {
|
|||
width = lastDim;
|
||||
}
|
||||
|
||||
for (int i = 0; i < input->lengthOf(); i += lastDim) {
|
||||
for (Nd4jLong i = 0; i < input->lengthOf(); i += lastDim) {
|
||||
for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) {
|
||||
output->p(pos++, input->e<T>(i + k));
|
||||
}
|
||||
|
|
|
@ -110,7 +110,7 @@ namespace helpers {
|
|||
}
|
||||
else {
|
||||
std::vector<int> dims(source->rankOf() - axe - 1);
|
||||
for (int i = 0; i < dims.size(); ++i)
|
||||
for (size_t i = 0; i < dims.size(); ++i)
|
||||
dims[i] = axe + 1 + i;
|
||||
|
||||
ResultSet listOfTensors = source->allTensorsAlongDimension({dims});
|
||||
|
|
|
@ -55,9 +55,9 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop
|
|||
|
||||
// loop through output array
|
||||
auto func = PRAGMA_THREADS_FOR_3D {
|
||||
for (uint b = start_x; b < stop_x; b += inc_x) {
|
||||
for (uint h = start_y; h < stop_y; h += inc_y) {
|
||||
for (uint w = start_z; w < stop_z; w += inc_z) {
|
||||
for (auto b = start_x; b < stop_x; b += inc_x) {
|
||||
for (auto h = start_y; h < stop_y; h += inc_y) {
|
||||
for (auto w = start_z; w < stop_z; w += inc_z) {
|
||||
for (uint c = 0; c < iC; ++c) {
|
||||
const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8];
|
||||
const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8];
|
||||
|
@ -146,11 +146,11 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
|
|||
|
||||
std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
|
||||
|
||||
int i;
|
||||
uint i;
|
||||
for(i = 0; i < numOfSpatialDims; ++i)
|
||||
temp[i] = blockShape.e<Nd4jLong>(i);
|
||||
temp[i++] = output.sizeAt(0);
|
||||
for(int j = 1; j < rank; ++i, ++j)
|
||||
for(uint j = 1; j < rank; ++i, ++j)
|
||||
temp[i] = input.sizeAt(j);
|
||||
|
||||
NDArray inputRearranged0 = input.reshape(input.ordering(), temp);
|
||||
|
@ -163,7 +163,7 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
|
|||
temp[2*i - 1] = numOfSpatialDims + i;
|
||||
temp[2*i] = i - 1;
|
||||
}
|
||||
for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
|
||||
for(i = 2 * numOfSpatialDims + 1; i < static_cast<uint>(temp.size()); ++i)
|
||||
temp[i] = i;
|
||||
|
||||
inputRearranged0.permutei(temp);
|
||||
|
@ -216,8 +216,8 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB
|
|||
|
||||
// loop through output array
|
||||
auto func = PRAGMA_THREADS_FOR_2D {
|
||||
for (uint b = start_x; b < stop_x; b += inc_x) {
|
||||
for (uint h = start_y; h < stop_y; h += inc_y) {
|
||||
for (auto b = start_x; b < stop_x; b += inc_x) {
|
||||
for (auto h = start_y; h < stop_y; h += inc_y) {
|
||||
for (uint w = 0; w < oW; ++w) {
|
||||
for (uint c = 0; c < iC; ++c) {
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ namespace helpers {
|
|||
if (input->isVector()) {
|
||||
T val = input->e<T>(0);
|
||||
|
||||
for (int e = 1; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
|
||||
if (idx == indices->e<Nd4jLong>(e)) {
|
||||
// min
|
||||
val = nd4j::math::nd4j_min<T>(val, input->t<T>(e));
|
||||
|
@ -115,7 +115,7 @@ namespace helpers {
|
|||
for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
|
||||
if (indices->e<Nd4jLong>(i) == idx) {
|
||||
|
||||
for (int e = 0; e < minT->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < minT->lengthOf(); e++) {
|
||||
minT->p(e, nd4j::math::nd4j_min(minT->e<T>(e), listOfTensors.at(i)->e<T>(e)));
|
||||
}
|
||||
}
|
||||
|
@ -138,7 +138,7 @@ namespace helpers {
|
|||
T val = T(0.f);
|
||||
int count = 0;
|
||||
|
||||
for (int e = 0; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
|
||||
if (idx == indices->e<int>(e)) {
|
||||
// mean
|
||||
val += input->e<T>(e);
|
||||
|
@ -166,7 +166,7 @@ namespace helpers {
|
|||
auto meanV = meanT->dup();
|
||||
meanV.assign(listOfTensors.at(0));
|
||||
|
||||
for (int i = 1; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
|
||||
if (indices->e<int>(i) == idx) {
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
|
@ -198,7 +198,7 @@ namespace helpers {
|
|||
if (input->isVector()) {
|
||||
T val = T(0.f);
|
||||
int count = 0;
|
||||
for (int e = 0; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
|
||||
if (idx == indices->e<int>(e)) {
|
||||
// sum
|
||||
val += input->t<T>(e);
|
||||
|
@ -220,7 +220,7 @@ namespace helpers {
|
|||
std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
|
||||
auto sumT = listOfOutTensors.at(idx);
|
||||
|
||||
for (int i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
if (indices->e<int>(i) == idx) {
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
|
@ -248,7 +248,7 @@ namespace helpers {
|
|||
T val = input->e<T>(0);
|
||||
int count = 0;
|
||||
|
||||
for (int e = 1; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
|
||||
if (idx == indices->e<int>(e)) {
|
||||
// product
|
||||
val *= input->e<T>(e);
|
||||
|
@ -269,7 +269,7 @@ namespace helpers {
|
|||
int numOfClasses = output->sizeAt(0); // number of classes
|
||||
auto sumT = listOfOutTensors.at(idx);
|
||||
sumT->assign(listOfTensors.at(0));
|
||||
for (int i = 1; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
|
||||
if (indices->e<int>(i) == idx) {
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
|
@ -313,7 +313,7 @@ namespace helpers {
|
|||
|
||||
bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) {
|
||||
auto val = indices->e(0);
|
||||
for (int e = 1; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
|
||||
output = indices->e(e);
|
||||
if (val.e<Nd4jLong>(0) > output.e<Nd4jLong>(0))
|
||||
return false;
|
||||
|
@ -362,7 +362,7 @@ namespace helpers {
|
|||
|
||||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
T val = input->e<T>(fi->second.at(0));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
|
||||
val = nd4j::math::nd4j_max(val, input->e<T>(fi->second.at(idx)));
|
||||
}
|
||||
output->p(fi->first, val);
|
||||
|
@ -380,7 +380,7 @@ namespace helpers {
|
|||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
auto outputT = listOfOutTensors.at(fi->first);
|
||||
outputT->assign(listOfTensors.at(fi->second.at(0)));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
|
||||
auto maxT = listOfTensors.at(fi->second.at(idx));
|
||||
for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
|
||||
T val = nd4j::math::nd4j_max(maxT->e<T>(e), outputT->e<T>(e));
|
||||
|
@ -432,7 +432,7 @@ namespace helpers {
|
|||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
auto outputT = listOfOutTensors.at(fi->first);
|
||||
outputT->assign(listOfTensors.at(fi->second.at(0)));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (size_t idx = 1; idx < fi->second.size(); ++idx) {
|
||||
auto minT = listOfTensors.at(fi->second.at(idx));
|
||||
|
||||
for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
|
||||
|
@ -560,7 +560,7 @@ namespace helpers {
|
|||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
auto outputT = listOfOutTensors.at(fi->first);
|
||||
outputT->assign(listOfTensors.at(fi->second.at(0)));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (size_t idx = 1; idx < fi->second.size(); ++idx) {
|
||||
auto current = listOfTensors.at(fi->second.at(idx));
|
||||
|
||||
*outputT *= *current;
|
||||
|
@ -584,7 +584,7 @@ namespace helpers {
|
|||
if (input->isVector()) { // 1D case
|
||||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
double sumValue = input->e<double>(fi->second.at(0));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (size_t idx = 1; idx < fi->second.size(); ++idx) {
|
||||
sumValue += input->e<double>(fi->second.at(idx));
|
||||
}
|
||||
output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt<Nd4jLong, double>(fi->second.size()));
|
||||
|
@ -599,7 +599,7 @@ namespace helpers {
|
|||
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
|
||||
auto outputT = listOfOutTensors.at(fi->first);
|
||||
outputT->assign(listOfTensors.at(fi->second.at(0)));
|
||||
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
|
||||
for (size_t idx = 1; idx < fi->second.size(); ++idx) {
|
||||
auto current = listOfTensors.at(fi->second.at(idx));
|
||||
*outputT += *current;
|
||||
}
|
||||
|
@ -651,7 +651,7 @@ namespace helpers {
|
|||
auto currentOut = listOfOutTensors.at(i);
|
||||
auto currentGradOut = listOfGradOuts.at(classNum);
|
||||
|
||||
for (uint64_t e = 0; e < current->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
|
||||
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<T>(e) - current->e<T>(e)) <= T(1.e-6))
|
||||
currentOut->p(e, currentGradOut->e<T>(e));
|
||||
}
|
||||
|
@ -703,7 +703,7 @@ namespace helpers {
|
|||
auto currentOut = listOfOutTensors.at(i);
|
||||
auto currentGradOut = listOfGradOuts.at(classNum);
|
||||
|
||||
for (int e = 0; e < current->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
|
||||
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<double>(e) - current->e<double>(e)) <
|
||||
1.e-5)
|
||||
currentOut->p(e, currentGradOut->e<double>(e));
|
||||
|
@ -746,13 +746,13 @@ namespace helpers {
|
|||
|
||||
int pos = 0;
|
||||
//auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
auto currentGradOut = listOfGradOuts.at(classNum);
|
||||
|
||||
for (int e = 0; e < current->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
|
||||
currentOut->p(e, currentGradOut->e<double>(e) / classCount.at(classNum));
|
||||
}
|
||||
}
|
||||
|
@ -781,7 +781,7 @@ namespace helpers {
|
|||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
|
@ -817,7 +817,7 @@ namespace helpers {
|
|||
//std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
|
@ -860,7 +860,7 @@ namespace helpers {
|
|||
ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
|
||||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
for (int i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
Nd4jLong classNum = indices->e<Nd4jLong>(i);
|
||||
NDArray* current = listOfTensors.at(i);
|
||||
NDArray* currentOut = listOfOutTensors.at(i);
|
||||
|
@ -905,13 +905,13 @@ namespace helpers {
|
|||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
auto currentGradOut = listOfGradOuts.at(classNum);
|
||||
|
||||
for (int e = 0; e < current->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
|
||||
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t<T>(e) - current->t<T>(e)) < 1.e-6)
|
||||
currentOut->t<T>(e) = currentGradOut->t<T>(e);
|
||||
}
|
||||
|
@ -955,7 +955,7 @@ namespace helpers {
|
|||
ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
|
||||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
for (int i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
Nd4jLong classNum = indices->e<Nd4jLong>(i);
|
||||
NDArray* current = listOfTensors.at(i);
|
||||
NDArray* currentOut = listOfOutTensors.at(i);
|
||||
|
@ -984,7 +984,7 @@ namespace helpers {
|
|||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
auto currentGradOut = listOfGradOuts.at(classNum);
|
||||
|
@ -1021,7 +1021,7 @@ namespace helpers {
|
|||
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
|
@ -1053,7 +1053,7 @@ namespace helpers {
|
|||
// if input is a vector: (as if in doc sample)
|
||||
if (input->isVector()) {
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = 0; e < indices->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
|
||||
auto classNum = indices->e<Nd4jLong>(e);
|
||||
output->p(e, gradOut->e<double>(classNum) / nd4j::math::nd4j_sqrt<double, double>(classCount[classNum]));
|
||||
}
|
||||
|
@ -1069,7 +1069,7 @@ namespace helpers {
|
|||
ResultSet listOfOutTensors =output->allTensorsAlongDimension(restDims);
|
||||
|
||||
//auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = 0; i < indices->lengthOf(); i++) {
|
||||
for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
|
||||
auto classNum = indices->e<Nd4jLong>(i);
|
||||
auto current = listOfTensors.at(i);
|
||||
auto currentOut = listOfOutTensors.at(i);
|
||||
|
|
|
@ -378,7 +378,7 @@ namespace nd4j {
|
|||
int irow = 0;
|
||||
auto cShift = t * idxShift;
|
||||
|
||||
for (int e = 0; e < hsRounds; e++) {
|
||||
for (Nd4jLong e = 0; e < hsRounds; e++) {
|
||||
irow = bIndices[e + cShift];
|
||||
if (irow < 0 || irow >= vocabSize)
|
||||
continue;
|
||||
|
@ -457,7 +457,7 @@ namespace nd4j {
|
|||
T sneu1[600];
|
||||
T sneu1e[600];
|
||||
|
||||
for (int e = start; e < stop; e++) {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
|
||||
T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
|
||||
|
||||
|
@ -500,7 +500,7 @@ namespace nd4j {
|
|||
|
||||
// hierarchic softmax step
|
||||
if (!indices.isEmpty()) {
|
||||
for (int i = 0; i < numIndices; i++) {
|
||||
for (Nd4jLong i = 0; i < numIndices; i++) {
|
||||
const int cIndex = bIndices[(e * numIndices) + i];
|
||||
const int cCode = bCodes[(e * numIndices) + i];
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@ namespace helpers {
|
|||
|
||||
auto batchLoop = PRAGMA_THREADS_FOR {
|
||||
for (auto batch = start; batch < stop; batch++) {
|
||||
for (auto r = 0; r < rows; r++) {
|
||||
for (auto c = 0; c < r; c++) {
|
||||
for (Nd4jLong r = 0; r < rows; r++) {
|
||||
for (Nd4jLong c = 0; c < r; c++) {
|
||||
math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));
|
||||
}
|
||||
}
|
||||
|
@ -66,7 +66,7 @@ namespace helpers {
|
|||
auto permutationsPart = permutations.allTensorsAlongDimension({-1});
|
||||
|
||||
for (auto batch = 0; batch < permutationsPart.size(); ++batch) {
|
||||
for (auto row = 0; row < PPart[batch]->rows(); ++row) {
|
||||
for (Nd4jLong row = 0; row < PPart[batch]->rows(); ++row) {
|
||||
PPart[batch]->t<T>(row, permutationsPart[batch]->t<int>(row)) = T(1.f);
|
||||
}
|
||||
}
|
||||
|
@ -77,7 +77,7 @@ namespace helpers {
|
|||
MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0);
|
||||
ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1});
|
||||
for (auto i = 0; i < leftLowerPart.size(); i++) {
|
||||
for (auto r = 0; r < leftLowerPart[i]->rows(); r++)
|
||||
for (Nd4jLong r = 0; r < leftLowerPart[i]->rows(); r++)
|
||||
leftLowerPart[i]->t<T>(r,r) = (T)1.f;
|
||||
}
|
||||
// stage 2: triangularSolveFunctor for Lower with given b
|
||||
|
|
|
@ -29,7 +29,7 @@ namespace helpers {
|
|||
//////////////////////////////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) {
|
||||
int numSplits = outArrs.size();
|
||||
uint numSplits = outArrs.size();
|
||||
|
||||
const auto sizeofT = input.sizeOfT();
|
||||
|
||||
|
@ -73,9 +73,9 @@ namespace helpers {
|
|||
|
||||
if (luckCase2) {
|
||||
|
||||
const uint xDim = input.sizeAt(axis);
|
||||
const auto xDim = input.sizeAt(axis);
|
||||
|
||||
for (uint i = 0; i < input.lengthOf() / xDim; ++i) {
|
||||
for (Nd4jLong i = 0; i < input.lengthOf() / xDim; ++i) {
|
||||
|
||||
T* x = xBuff + xDim * i;
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ namespace helpers {
|
|||
// }
|
||||
// ----------------------------------------------------------------------------------------------- //
|
||||
std::vector<int> dimsToExclude(input->rankOf() - 1);
|
||||
for (int d = 0; d < dimsToExclude.size(); ++d)
|
||||
for (size_t d = 0; d < dimsToExclude.size(); ++d)
|
||||
dimsToExclude[d] = d;
|
||||
|
||||
const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude);
|
||||
|
@ -72,7 +72,7 @@ namespace helpers {
|
|||
NDArray topValues = NDArrayFactory::create<T>('c', {k});
|
||||
NDArray sortedVals = NDArrayFactory::create<T>('c', {k});
|
||||
NDArray topIndices = NDArrayFactory::create<Nd4jLong>('c', {k});
|
||||
for (Nd4jLong pos = 0; pos < k; ++pos) {
|
||||
for (uint pos = 0; pos < k; ++pos) {
|
||||
topIndices.t<Nd4jLong>(pos) = pos;
|
||||
topValues.t<T>(pos) = trial.t<T>(pos);
|
||||
}
|
||||
|
@ -80,7 +80,7 @@ namespace helpers {
|
|||
sortedVals.assign(topValues);// = NDArrayFactory::create<T>('c', {k});
|
||||
//std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order
|
||||
SpecialMethods<T>::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false);
|
||||
for (int i = k; i < width; ++i) {
|
||||
for (Nd4jLong i = static_cast<Nd4jLong>(k); i < width; ++i) {
|
||||
T val = trial.e<T>(i);
|
||||
T minTopVal = sortedVals.t<T>(0);
|
||||
if (minTopVal < val) { // value should be inserted to top k
|
||||
|
@ -104,15 +104,15 @@ namespace helpers {
|
|||
if (needSort) {
|
||||
SpecialMethods<T>::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true);
|
||||
|
||||
for (int j = 0; j < width; j++)
|
||||
for (int pos = 0; pos < k; ++pos)
|
||||
for (Nd4jLong j = 0; j < width; j++)
|
||||
for (uint pos = 0; pos < k; ++pos)
|
||||
if (topValues.t<T>(pos) == trial.t<T>(j))
|
||||
topIndices.t<Nd4jLong>(pos) = j;
|
||||
}
|
||||
else { // else sort by indices
|
||||
std::map<Nd4jLong, T> sortValsMap;
|
||||
//std::vector<std::pair<int, T>> data(topValues.lengthOf());
|
||||
for (size_t e = 0; e < topValues.lengthOf(); ++e) {
|
||||
for (Nd4jLong e = 0; e < topValues.lengthOf(); ++e) {
|
||||
sortValsMap[topIndices.t<Nd4jLong>(e)] = topValues.t<T>(e);
|
||||
}
|
||||
|
||||
|
@ -152,7 +152,7 @@ namespace helpers {
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
bool found = false;
|
||||
for (int j = 0; j < k; j++) {
|
||||
for (uint j = 0; j < k; j++) {
|
||||
if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {
|
||||
found = true;
|
||||
break;
|
||||
|
|
|
@ -597,7 +597,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
|
|||
zCoordStart[yRank - 1] = coordToRestore;
|
||||
|
||||
// construct coordinates for x
|
||||
for (uint j = 0; j < yLastDim; ++j)
|
||||
for (int j = 0; j < yLastDim; ++j)
|
||||
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
|
||||
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
|
||||
|
@ -628,7 +628,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
|
|||
|
||||
if (indices != nullptr) {
|
||||
|
||||
for(int i = 0; i < indices->lengthOf(); ++i)
|
||||
for(Nd4jLong i = 0; i < indices->lengthOf(); ++i)
|
||||
if(indices->e<Nd4jLong>(i) >= input->sizeAt(axis))
|
||||
throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !");
|
||||
|
||||
|
@ -733,7 +733,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
|
|||
// increasing counter to skip numIndices
|
||||
e++;
|
||||
std::vector<int> indices;
|
||||
for (; e < intArgs->size(); e++)
|
||||
for (; e < static_cast<Nd4jLong>(intArgs->size()); e++)
|
||||
indices.push_back((*intArgs)[e]);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
@ -813,7 +813,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
|
|||
T max = -DataTypeUtils::max<T>();
|
||||
Nd4jLong idx = 0;
|
||||
|
||||
for (int i = 0; i < numArgs; i++) {
|
||||
for (Nd4jLong i = 0; i < numArgs; i++) {
|
||||
T v = inArrs[i]->e<T>(e);
|
||||
if (v > max) {
|
||||
max = v;
|
||||
|
@ -841,7 +841,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
T max = -DataTypeUtils::max<T>();
|
||||
for (int i = 0; i < numArgs; i++) {
|
||||
for (Nd4jLong i = 0; i < numArgs; i++) {
|
||||
T v = inArrs[i]->e<T>(e);
|
||||
if (v > max)
|
||||
max = v;
|
||||
|
@ -867,7 +867,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
T sum = 0.;
|
||||
for (int i = 0; i < numArgs; i++) {
|
||||
for (Nd4jLong i = 0; i < numArgs; i++) {
|
||||
T v = inArrs[i]->e<T>(e);
|
||||
sum += v;
|
||||
}
|
||||
|
@ -893,7 +893,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
|
|||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
T sum = (T) 0.f;
|
||||
for (int i = 0; i < numArgs; i++)
|
||||
for (Nd4jLong i = 0; i < numArgs; i++)
|
||||
sum += inArrs[i]->e<T>(e);
|
||||
|
||||
output.p(e, sum);
|
||||
|
@ -1242,7 +1242,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c
|
|||
memset(gradIBuff, 0, gradILen * sizeof(T));
|
||||
else {
|
||||
//PRAGMA_OMP_PARALLEL_FOR_SIMD
|
||||
for (int i = 0; i < gradILen * gradIEWS; i += gradIEWS)
|
||||
for (Nd4jLong i = 0; i < gradILen * gradIEWS; i += gradIEWS)
|
||||
gradIBuff[i] = static_cast<T>(0.f);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,10 +43,10 @@ namespace helpers {
|
|||
auto rows = leftInput->rows();
|
||||
auto cols = rightInput->columns();
|
||||
//output->t<T>(0,0) = rightInput->t<T>(0,0) / leftInput->t<T>(0,0);
|
||||
for (auto r = 0; r < rows; r++) {
|
||||
for (auto j = 0; j < cols; j++) {
|
||||
for (Nd4jLong r = 0; r < rows; r++) {
|
||||
for (Nd4jLong j = 0; j < cols; j++) {
|
||||
auto sum = rightInput->t<T>(r, j);
|
||||
for (auto c = 0; c < r; c++) {
|
||||
for (Nd4jLong c = 0; c < r; c++) {
|
||||
sum -= leftInput->t<T>(r, c) * output->t<T>(c, j);
|
||||
}
|
||||
output->t<T>(r, j) = sum / leftInput->t<T>(r, r);
|
||||
|
@ -72,10 +72,10 @@ namespace helpers {
|
|||
static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) {
|
||||
auto rows = leftInput->rows();
|
||||
auto cols = rightInput->columns();
|
||||
for (auto r = rows; r > 0; r--) {
|
||||
for (auto j = 0; j < cols; j++) {
|
||||
for (Nd4jLong r = rows; r > 0; r--) {
|
||||
for (Nd4jLong j = 0; j < cols; j++) {
|
||||
auto sum = rightInput->t<T>(r - 1, j);
|
||||
for (auto c = r; c < rows; c++) {
|
||||
for (Nd4jLong c = r; c < rows; c++) {
|
||||
sum -= leftInput->t<T>(r - 1, c) * output->t<T>(c, j);
|
||||
}
|
||||
output->t<T>(r - 1, j) = sum / leftInput->t<T>(r - 1, r - 1);
|
||||
|
@ -114,14 +114,14 @@ namespace helpers {
|
|||
auto batchLoop = PRAGMA_THREADS_FOR {
|
||||
for (auto batch = start; batch < stop; batch++) {
|
||||
if (!lower) {
|
||||
for (auto r = 0; r < rows; r++) {
|
||||
for (auto c = 0; c <= r; c++) {
|
||||
for (Nd4jLong r = 0; r < rows; r++) {
|
||||
for (Nd4jLong c = 0; c <= r; c++) {
|
||||
outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto r = 0; r < rows; r++) {
|
||||
for (auto c = r; c < cols; c++) {
|
||||
for (Nd4jLong r = 0; r < rows; r++) {
|
||||
for (Nd4jLong c = r; c < cols; c++) {
|
||||
outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ namespace helpers {
|
|||
|
||||
template <typename T>
|
||||
static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) {
|
||||
for (int e = 0; e < input->lengthOf(); e++) {
|
||||
for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
|
||||
int val = input->e<int>(e);
|
||||
if (val < maxLength) {
|
||||
if (weights != nullptr)
|
||||
|
|
Loading…
Reference in New Issue