Loops auto-vectorization problem fix (#274)

* libnd4j cast loop types

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j more type casting added to loops

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j sync casting types of iterated variable in loops

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j more loops reviewed for vectorization problem fix

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j fixed several typos

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j several more files reviewed to fix auto-vectorization problem in loops

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j merged master and reviewed more files to fix auto-vectorization problem in loops

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j added several type casts in broadcasting that were missed, fixed mac builds

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j double check all files and fix several more places in loops

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j fixed builds

Signed-off-by: Oleg <oleg.semeniv@gmail.com>

* libnd4j revert changes for lup.cpp

Signed-off-by: Oleg <oleg.semeniv@gmail.com>
master
Oleh 2020-02-26 20:12:19 +02:00 committed by GitHub
parent 5c806d2fb5
commit b4575d11e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
48 changed files with 1084 additions and 1084 deletions

View File

@ -324,7 +324,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint j = 0; j < tadLen; j++) for (Nd4jLong j = 0; j < tadLen; j++)
s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -338,7 +338,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint j = 0; j < tadLen; j++) for (Nd4jLong j = 0; j < tadLen; j++)
s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
@ -352,7 +352,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint i0 = 0; i0 < tadLen; ++i0) for (Nd4jLong i0 = 0; i0 < tadLen; ++i0)
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -366,8 +366,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -381,9 +381,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -397,10 +397,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
for (uint i3 = 0; i3 < tadShape[3]; ++i3) for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -414,11 +414,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0)
for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1)
for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2)
for (uint i3 = 0; i3 < tadShape[3]; ++i3) for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3)
for (uint i4 = 0; i4 < tadShape[4]; ++i4) for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4)
s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -435,7 +435,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint j = 0; j < tadLen; j++) for (Nd4jLong j = 0; j < tadLen; j++)
s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams);
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
@ -453,7 +453,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint j = 0; j < tadLen; j++) { for (Nd4jLong j = 0; j < tadLen; j++) {
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams);
} }
@ -475,7 +475,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto tad = x + tadOffsets[i]; auto tad = x + tadOffsets[i];
auto s = OpType::startingValue(tad); auto s = OpType::startingValue(tad);
for (uint j = 0; j < tadLen; j++) for (Nd4jLong j = 0; j < tadLen; j++)
s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
@ -546,7 +546,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX);
z[i * zEws] = OpType::op(x[xOffset], extraParams); z[i * zEws] = OpType::op(x[xOffset], extraParams);
} }
} else { }
else {
for (auto i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX);
z[i] = OpType::op(x[xOffset], extraParams); z[i] = OpType::op(x[xOffset], extraParams);
@ -576,7 +577,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto z0 = i0 * zStride[0]; auto z0 = i0 * zStride[0];
auto x0 = i0 * xStride[0]; auto x0 = i0 * xStride[0];
for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) for (auto i1 = span.startY(); i1 < span.stopY(); ++i1)
z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams);
} }
} }
@ -584,9 +585,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK3: { case LoopKind::RANK3: {
auto uXShape0 = static_cast<uint>(xShape[0]); auto uXShape0 = xShape[0];
auto uXShape1 = static_cast<uint>(xShape[1]); auto uXShape1 = xShape[1];
auto uXShape2 = static_cast<uint>(xShape[2]); auto uXShape2 = xShape[2];
auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1);
auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1);
@ -597,7 +598,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto z0 = i0 * zStride[0] + i1 * zStride[1]; auto z0 = i0 * zStride[0] + i1 * zStride[1];
auto x0 = i0 * xStride[0] + i1 * xStride[1]; auto x0 = i0 * xStride[0] + i1 * xStride[1];
for (uint i2 = 0; i2 < uXShape2; ++i2) for (Nd4jLong i2 = 0; i2 < uXShape2; ++i2)
z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams);
} }
} }
@ -605,10 +606,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK4: { case LoopKind::RANK4: {
auto uXShape0 = static_cast<uint>(xShape[0]); auto uXShape0 = xShape[0];
auto uXShape1 = static_cast<uint>(xShape[1]); auto uXShape1 = xShape[1];
auto uXShape2 = static_cast<uint>(xShape[2]); auto uXShape2 = xShape[2];
auto uXShape3 = static_cast<uint>(xShape[3]); auto uXShape3 = xShape[3];
auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2);
auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1);
@ -619,7 +620,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2];
auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2];
for (uint i3 = 0; i3 < uXShape3; ++i3) for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3)
z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams);
} }
} }
@ -627,11 +628,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK5: { case LoopKind::RANK5: {
auto uXShape0 = static_cast<uint>(xShape[0]); auto uXShape0 = xShape[0];
auto uXShape1 = static_cast<uint>(xShape[1]); auto uXShape1 = xShape[1];
auto uXShape2 = static_cast<uint>(xShape[2]); auto uXShape2 = xShape[2];
auto uXShape3 = static_cast<uint>(xShape[3]); auto uXShape3 = xShape[3];
auto uXShape4 = static_cast<uint>(xShape[4]); auto uXShape4 = xShape[4];
auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2);
auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1);
@ -643,12 +644,12 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2];
auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2];
for (uint i3 = 0; i3 < uXShape3; ++i3) { for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) {
auto z1 = z0 + i3 * zStride[3]; auto z1 = z0 + i3 * zStride[3];
auto x1 = x0 + i3 * xStride[3]; auto x1 = x0 + i3 * xStride[3];
for (uint i4 = 0; i4 < uXShape4; ++i4) for (Nd4jLong i4 = 0; i4 < uXShape4; ++i4)
z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams);
} }
@ -749,7 +750,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint j = 0; j < tadLen; ++j) for (Nd4jLong j = 0; j < tadLen; ++j)
s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams);
z[i] = OpType::postProcess(s, tadLen, extraParams); z[i] = OpType::postProcess(s, tadLen, extraParams);
@ -769,7 +770,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint j = 0; j < tadLen; ++j) for (Nd4jLong j = 0; j < tadLen; ++j)
s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams);
z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); z[i * zEws] = OpType::postProcess(s, tadLen, extraParams);
@ -789,7 +790,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint i0 = 0; i0 < tadLen; ++i0) { for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) {
const auto xTadOffset = i0 * xTadStride[0]; const auto xTadOffset = i0 * xTadStride[0];
const auto yTadOffset = i0 * yTadStride[0]; const auto yTadOffset = i0 * yTadStride[0];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -812,8 +813,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -836,9 +837,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -862,10 +863,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
for (uint i3 = 0; i3 < tadShape[3]; ++i3) { for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -890,11 +891,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
for (uint i3 = 0; i3 < tadShape[3]; ++i3) { for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
for (uint i4 = 0; i4 < tadShape[4]; ++i4) { for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -924,7 +925,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
} }
@ -946,7 +947,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y;
auto s = OpType::startingValue(xTad); auto s = OpType::startingValue(xTad);
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -996,8 +997,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::EWS1: { case LoopKind::EWS1: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1007,7 +1008,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint j = 0; j < tadLen; ++j) for (Nd4jLong j = 0; j < tadLen; ++j)
s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams);
z[zInd] = OpType::postProcess(s, tadLen, extraParams); z[zInd] = OpType::postProcess(s, tadLen, extraParams);
@ -1019,8 +1020,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::EWSNONZERO: { case LoopKind::EWSNONZERO: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1030,7 +1031,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint j = 0; j < tadLen; ++j) for (Nd4jLong j = 0; j < tadLen; ++j)
s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams);
z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams);
@ -1042,8 +1043,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK1: { case LoopKind::RANK1: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1053,7 +1054,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint i0 = 0; i0 < tadLen; ++i0) { for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) {
const auto xTadOffset = i0 * xTadStride[0]; const auto xTadOffset = i0 * xTadStride[0];
const auto yTadOffset = i0 * yTadStride[0]; const auto yTadOffset = i0 * yTadStride[0];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -1067,8 +1068,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK2: { case LoopKind::RANK2: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1078,8 +1079,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -1094,8 +1095,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK3: { case LoopKind::RANK3: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1105,9 +1106,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -1123,8 +1124,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK4: { case LoopKind::RANK4: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1134,10 +1135,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
for (uint i3 = 0; i3 < tadShape[3]; ++i3) { for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -1154,8 +1155,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
//*********************************************// //*********************************************//
case LoopKind::RANK5: { case LoopKind::RANK5: {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1165,11 +1166,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
for (uint i2 = 0; i2 < tadShape[2]; ++i2) { for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
for (uint i3 = 0; i3 < tadShape[3]; ++i3) { for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
for (uint i4 = 0; i4 < tadShape[4]; ++i4) { for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) {
const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4];
const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4];
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
@ -1191,8 +1192,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) {
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1202,7 +1203,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
} }
@ -1215,8 +1216,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo<uint>(yTadShapeInfo, castYTadShapeInfo); const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo<uint>(yTadShapeInfo, castYTadShapeInfo);
Z extraParams[3]; Z extraParams[3];
for (auto ix = 0; ix < numXTads; ix++) { for (Nd4jLong ix = 0; ix < numXTads; ix++) {
for (auto iy = 0; iy < numYTads; iy++) { for (Nd4jLong iy = 0; iy < numYTads; iy++) {
extraParams[0] = param0; extraParams[0] = param0;
extraParams[1] = param1; extraParams[1] = param1;
extraParams[2] = param2; extraParams[2] = param2;
@ -1226,7 +1227,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
const auto zInd = ix * numYTads + iy; const auto zInd = ix * numYTads + iy;
auto s = startVal; auto s = startVal;
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);

View File

@ -50,12 +50,12 @@ namespace nd4j {
1 == zArr.ews() && 'c' == zArr.ordering()); 1 == zArr.ews() && 'c' == zArr.ordering());
if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) { if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) {
auto yLen = (uint32_t)yArr.lengthOf(); auto yLen = yArr.lengthOf();
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (uint32_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto rZ = z + (i * yLen); auto rZ = z + (i * yLen);
auto v = x[i]; auto v = x[i];
for (uint32_t j = 0; j < yLen; j++) { for (Nd4jLong j = 0; j < yLen; j++) {
rZ[j] = OpType::op(v, y[j]); rZ[j] = OpType::op(v, y[j]);
} }
} }
@ -74,13 +74,13 @@ namespace nd4j {
if (bSpecialCase && bSpecialCase2) { if (bSpecialCase && bSpecialCase2) {
int zDim1 = zArr.sizeAt(-2); uint32_t zDim1 = zArr.sizeAt(-2);
int zDim2 = zArr.sizeAt(-1); uint32_t zDim2 = zArr.sizeAt(-1);
int nLen = zArr.lengthOf() / yArr.sizeAt(-1); uint32_t nLen = zArr.lengthOf() / yArr.sizeAt(-1);
auto func = PRAGMA_THREADS_FOR{ auto func = PRAGMA_THREADS_FOR{
for (uint32_t total = start; total < stop; total++) { for (auto total = start; total < stop; total++) {
uint32_t i = total / zDim1; uint32_t i = total / zDim1;
uint32_t j = total % zDim1; uint32_t j = total % zDim1;

View File

@ -184,7 +184,7 @@ namespace functions {
const auto oX = x[i]; const auto oX = x[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < loopLength; f++) for (Nd4jLong f = 0; f < loopLength; f++)
oZ[f] = OpType::op(oX, oY[f]); oZ[f] = OpType::op(oX, oY[f]);
} }
} else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){
@ -198,7 +198,7 @@ namespace functions {
const auto oY = y[i]; const auto oY = y[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < loopLength; f++) for (Nd4jLong f = 0; f < loopLength; f++)
oZ[f] = OpType::op(oX[f], oY); oZ[f] = OpType::op(oX[f], oY);
} }
} }
@ -213,14 +213,14 @@ namespace functions {
Nd4jLong yStrides[3] = { 0,0,0 }; Nd4jLong yStrides[3] = { 0,0,0 };
nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
for (uint32_t index0 = start; index0 < stop; index0++) { for (auto index0 = start; index0 < stop; index0++) {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint32_t index1 = 0; index1 < nSize1; index1++) { for (uint64_t index1 = 0; index1 < nSize1; index1++) {
for (uint32_t index2 = 0; index2 < nSize2; index2++) { for (uint64_t index2 = 0; index2 < nSize2; index2++) {
auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2); auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2);
auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2);
auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2);
@ -242,18 +242,18 @@ namespace functions {
Nd4jLong yStrides[4] = { 0,0,0,0 }; Nd4jLong yStrides[4] = { 0,0,0,0 };
nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); uint64_t nSize3 = shape::sizeAt(zShapeInfo, 3);
for (uint32_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
uint32_t index0 = i / nSize1; uint64_t index0 = i / nSize1;
uint32_t index1 = i % nSize1; uint64_t index1 = i % nSize1;
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint32_t index2 = 0; index2 < nSize2; index2++) { for (uint64_t index2 = 0; index2 < nSize2; index2++) {
for (uint32_t index3 = 0; index3 < nSize3; index3++) { for (uint64_t index3 = 0; index3 < nSize3; index3++) {
auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3); auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3);
auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3);
auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3);
@ -279,7 +279,7 @@ namespace functions {
uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3);
uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4); uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4);
for (uint32_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
uint32_t index0 = i / nSize1; uint32_t index0 = i / nSize1;
uint32_t index1 = i % nSize1; uint32_t index1 = i % nSize1;
@ -326,7 +326,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(oX[offset], y[offset]); oZ[zOffset] = OpType::op(oX[offset], y[offset]);
@ -344,7 +344,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[offset], y[yOffset]); oZ[offset] = OpType::op(oX[offset], y[yOffset]);
@ -362,7 +362,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[xOffset], y[offset]); oZ[offset] = OpType::op(oX[xOffset], y[offset]);
@ -382,7 +382,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -497,7 +497,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(x[offset], oY[offset]); oZ[zOffset] = OpType::op(x[offset], oY[offset]);
@ -515,7 +515,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[xOffset], oY[offset]); oZ[offset] = OpType::op(x[xOffset], oY[offset]);
@ -533,7 +533,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[offset], oY[yOffset]); oZ[offset] = OpType::op(x[offset], oY[yOffset]);
@ -553,7 +553,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);

View File

@ -183,7 +183,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
oZ[offset] = OpType::op(oX[offset], y[offset], extraParams); oZ[offset] = OpType::op(oX[offset], y[offset], extraParams);
} }
@ -200,7 +200,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams); oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams);
@ -218,7 +218,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams); oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams);
@ -237,7 +237,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams); oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams);
@ -257,7 +257,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -357,7 +357,7 @@ namespace functions {
auto oZ = z + zTadOffset[i]; auto oZ = z + zTadOffset[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
oZ[offset] = OpType::op(x[offset], oY[offset], extraParams); oZ[offset] = OpType::op(x[offset], oY[offset], extraParams);
} }
@ -375,7 +375,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams); oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams);
@ -394,7 +394,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams); oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams);
@ -413,7 +413,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams); oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams);
@ -434,7 +434,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);

View File

@ -177,7 +177,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
oZ[offset] = OpType::op(oX[offset], y[offset]); oZ[offset] = OpType::op(oX[offset], y[offset]);
} }
@ -194,7 +194,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(oX[offset], y[offset]); oZ[zOffset] = OpType::op(oX[offset], y[offset]);
@ -212,7 +212,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[offset], y[yOffset]); oZ[offset] = OpType::op(oX[offset], y[yOffset]);
@ -230,7 +230,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
oZ[offset] = OpType::op(oX[xOffset], y[offset]); oZ[offset] = OpType::op(oX[xOffset], y[offset]);
@ -250,7 +250,7 @@ namespace functions {
auto oX = x + tadOffsets[i]; auto oX = x + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (unsigned int f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -347,7 +347,7 @@ namespace functions {
auto oZ = z + zTadOffset[i]; auto oZ = z + zTadOffset[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (uint f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
oZ[offset] = OpType::op(x[offset], oY[offset]); oZ[offset] = OpType::op(x[offset], oY[offset]);
} }
@ -364,7 +364,7 @@ namespace functions {
auto oZ = z + zTadOffset[i]; auto oZ = z + zTadOffset[i];
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
for (int f = 0; f < tadLength; f++) { for (uint f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
oZ[zOffset] = OpType::op(x[offset], oY[offset]); oZ[zOffset] = OpType::op(x[offset], oY[offset]);
@ -382,7 +382,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (uint f = 0; f < tadLength; f++) {
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[xOffset], oY[offset]); oZ[offset] = OpType::op(x[xOffset], oY[offset]);
@ -400,7 +400,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (uint f = 0; f < tadLength; f++) {
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
oZ[offset] = OpType::op(x[offset], oY[yOffset]); oZ[offset] = OpType::op(x[offset], oY[yOffset]);
@ -420,7 +420,7 @@ namespace functions {
auto oY = y + tadOffsets[i]; auto oY = y + tadOffsets[i];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int f = 0; f < tadLength; f++) { for (uint f = 0; f < tadLength; f++) {
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);

View File

@ -124,7 +124,7 @@ void IndexReduce<X, Z>::exec(void *vx, Nd4jLong *xShapeInfo,
return; return;
const auto indexValue = OpType::startingIndexValue(x); const auto indexValue = OpType::startingIndexValue(x);
for (uint i = 0; i < zLen; i++) for (Nd4jLong i = 0; i < zLen; i++)
z[i] = (Z) indexValue.index; z[i] = (Z) indexValue.index;
return; return;

View File

@ -93,7 +93,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@ -111,7 +111,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@ -129,7 +129,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@ -149,7 +149,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@ -197,7 +197,7 @@ namespace functions {
else{ else{
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
} }
@ -213,7 +213,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@ -255,7 +255,7 @@ namespace functions {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint64_t i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
z[offset] = OpClass::op(i, length, rng, extraArguments); z[offset] = OpClass::op(i, length, rng, extraArguments);
} }

View File

@ -55,7 +55,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }
@ -68,7 +68,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
for (auto i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
z[0] = OpType::postProcess(startingValue, length, extraParams); z[0] = OpType::postProcess(startingValue, length, extraParams);
@ -94,7 +94,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
for (auto i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
return OpType::postProcess(startingValue, length, extraParams); return OpType::postProcess(startingValue, length, extraParams);
@ -156,7 +156,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < resultLength; i++) for (Nd4jLong i = 0; i < resultLength; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }

View File

@ -59,7 +59,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
@ -113,7 +113,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
for (auto i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
return OpType::postProcess(startingValue, length, extraParams); return OpType::postProcess(startingValue, length, extraParams);
@ -184,7 +184,7 @@ namespace functions {
return; return;
const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x)); const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x));
for (uint i = 0; i < resultLength; i++) for (Nd4jLong i = 0; i < resultLength; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }

View File

@ -55,7 +55,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }
@ -110,7 +110,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
for (auto i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
return OpType::postProcess(startingValue, length, extraParams); return OpType::postProcess(startingValue, length, extraParams);
@ -173,7 +173,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < resultLength; i++) for (Nd4jLong i = 0; i < resultLength; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }

View File

@ -57,7 +57,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }
@ -111,7 +111,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
for (auto i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
return OpType::postProcess(startingValue, length, extraParams); return OpType::postProcess(startingValue, length, extraParams);
@ -182,7 +182,7 @@ namespace functions {
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < zLength; i++) for (Nd4jLong i = 0; i < zLength; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;
} }

View File

@ -53,7 +53,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
return; return;
const auto startingVal = OpType::startingValue(x); const auto startingVal = OpType::startingValue(x);
for (uint i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
z[i] = startingVal; z[i] = startingVal;
return; return;

View File

@ -73,7 +73,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f] = OpType::op(oX[f], scalars[r], extraParams); oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
}; };
} }
@ -83,7 +83,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
}; };
} }

View File

@ -74,7 +74,7 @@ namespace functions {
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f] = OpType::op(oX[f], scalars[r], extraParams); oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
}; };
} }
@ -84,7 +84,7 @@ namespace functions {
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
}; };
} }

View File

@ -74,7 +74,7 @@ namespace functions {
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f] = OpType::op(oX[f], scalars[r], extraParams); oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
}; };
} }
@ -84,7 +84,7 @@ namespace functions {
auto oX = x + xTadOffsets[r]; auto oX = x + xTadOffsets[r];
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (unsigned int f = 0; f < tadLength; f++) for (int f = 0; f < tadLength; f++)
oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
}; };
} }

View File

@ -91,7 +91,7 @@ namespace functions {
uint xShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK];
const bool canCast = nd4j::DataTypeUtils::castShapeInfo<uint>(xShapeInfo, xShapeInfoCast); const bool canCast = nd4j::DataTypeUtils::castShapeInfo<uint>(xShapeInfo, xShapeInfoCast);
for (uint64_t i = 0; i < length; i++) { for (Nd4jLong i = 0; i < length; i++) {
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast);
SummaryStatsData<X> curr; SummaryStatsData<X> curr;
@ -116,7 +116,7 @@ namespace functions {
auto x = reinterpret_cast<X *>(vx); auto x = reinterpret_cast<X *>(vx);
auto z = reinterpret_cast<Z *>(vz); auto z = reinterpret_cast<Z *>(vz);
auto extraParams = reinterpret_cast<Z *>(vextraParams); auto extraParams = reinterpret_cast<Z *>(vextraParams);
int resultLength = shape::length(zShapeInfo); auto resultLength = shape::length(zShapeInfo);
if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) {
if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
@ -124,7 +124,7 @@ namespace functions {
SummaryStatsData<X> comp; SummaryStatsData<X> comp;
comp.initWithValue(x[0]); comp.initWithValue(x[0]);
for (uint i = 0; i < resultLength; i++) for (Nd4jLong i = 0; i < resultLength; i++)
z[i] = OpType::getValue(biasCorrected, comp); z[i] = OpType::getValue(biasCorrected, comp);
return; return;
} }
@ -166,14 +166,14 @@ namespace functions {
comp.initWithValue(tx[0]); comp.initWithValue(tx[0]);
if (tadEWS == 1 && tadOrder == 'c') { if (tadEWS == 1 && tadOrder == 'c') {
for (int i = 1; i < tadLength; i++) { for (Nd4jLong i = 1; i < tadLength; i++) {
SummaryStatsData <X> indexVal2; SummaryStatsData <X> indexVal2;
indexVal2.initWithValue(tx[i]); indexVal2.initWithValue(tx[i]);
comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); comp = update(comp, OpType::op(indexVal2, extraParams), extraParams);
} }
} else { } else {
for (int i = 1; i < tadLength; i++) { for (Nd4jLong i = 1; i < tadLength; i++) {
auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast);
SummaryStatsData <X> indexVal2; SummaryStatsData <X> indexVal2;

View File

@ -61,7 +61,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
else else
axes.push_back(inRank-1); // default dimension to reduce along is last dimension axes.push_back(inRank-1); // default dimension to reduce along is last dimension
const int numOfAxes = axes.size(); const uint numOfAxes = axes.size();
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str()); REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
// types of all input arrays should be the same // types of all input arrays should be the same
for(int i = 1; i < block.width(); ++i) for(unsigned long i = 1; i < block.width(); ++i)
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !"); REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !");
nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0);
@ -167,7 +167,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
else else
axes.push_back(inRank-1); // default dimension to reduce along is last dimension axes.push_back(inRank-1); // default dimension to reduce along is last dimension
const int numOfAxes = axes.size(); const uint numOfAxes = axes.size();
REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
// evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str());
// types of all input arrays should be the same (except dLdO) // types of all input arrays should be the same (except dLdO)
for(int i = 1; i < block.width() - 2; ++i) for(unsigned long i = 1; i < block.width() - 2; ++i)
REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !");
// ***** calculations ***** // // ***** calculations ***** //

View File

@ -30,7 +30,7 @@ namespace helpers {
int* pRowCounts = reinterpret_cast<int*>(rowCounts.buffer()); int* pRowCounts = reinterpret_cast<int*>(rowCounts.buffer());
int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer()); int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
int const* pCols = reinterpret_cast<int const*>(colP->getBuffer()); int const* pCols = reinterpret_cast<int const*>(colP->getBuffer());
for (int n = 0; n < N; n++) { for (Nd4jLong n = 0; n < N; n++) {
int begin = pRows[n];//->e<int>(n); int begin = pRows[n];//->e<int>(n);
int end = pRows[n + 1];//rowP->e<int>(n + 1); int end = pRows[n + 1];//rowP->e<int>(n + 1);
for (int i = begin; i < end; i++) { for (int i = begin; i < end; i++) {
@ -72,7 +72,7 @@ namespace helpers {
int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer()); int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
int* symRowP = reinterpret_cast<int*>(outputRows->buffer()); int* symRowP = reinterpret_cast<int*>(outputRows->buffer());
symRowP[0] = 0; symRowP[0] = 0;
for (int n = 0; n < N; n++) for (Nd4jLong n = 0; n < N; n++)
symRowP[n + 1] = symRowP[n] + rowCounts->e<int>(n); symRowP[n + 1] = symRowP[n] + rowCounts->e<int>(n);
// outputRows->printBuffer("output rows"); // outputRows->printBuffer("output rows");
@ -86,7 +86,7 @@ namespace helpers {
std::vector<int> offset(N);// = NDArrayFactory::create<int>('c', {N}); std::vector<int> offset(N);// = NDArrayFactory::create<int>('c', {N});
//PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset)) //PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset))
for (int n = 0; n < N; n++) { for (Nd4jLong n = 0; n < N; n++) {
int begin = pRows[n]; int begin = pRows[n];
int bound = pRows[n + 1]; int bound = pRows[n + 1];

View File

@ -146,17 +146,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
auto length = shape::length(inShapeInfo); auto length = shape::length(inShapeInfo);
if (inEWS == 1) { if (inEWS == 1) {
for (int i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
max = nd4j::math::nd4j_max<T>(max, inBuff[i]); max = nd4j::math::nd4j_max<T>(max, inBuff[i]);
PRAGMA_OMP_SIMD_SUM(sum) PRAGMA_OMP_SIMD_SUM(sum)
for (int i = 0; i < length; i++) { for (Nd4jLong i = 0; i < length; i++) {
outBuff[i] = nd4j::math::nd4j_exp<T,T>(inBuff[i] - max); outBuff[i] = nd4j::math::nd4j_exp<T,T>(inBuff[i] - max);
sum += outBuff[i]; sum += outBuff[i];
} }
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int i = 0; i < length; i++) { for (Nd4jLong i = 0; i < length; i++) {
outBuff[i] /= sum; outBuff[i] /= sum;
outBuff[i] = nd4j::math::nd4j_log<T,T>(outBuff[i]); outBuff[i] = nd4j::math::nd4j_log<T,T>(outBuff[i]);
} }
@ -164,17 +164,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
else if (inEWS > 1) { else if (inEWS > 1) {
PRAGMA_OMP_SIMD_MAX(max) PRAGMA_OMP_SIMD_MAX(max)
for (int i = 0; i < length; i++) for (Nd4jLong i = 0; i < length; i++)
max = nd4j::math::nd4j_max<T>(max, inBuff[i * inEWS]); max = nd4j::math::nd4j_max<T>(max, inBuff[i * inEWS]);
PRAGMA_OMP_SIMD_SUM(sum) PRAGMA_OMP_SIMD_SUM(sum)
for (int i = 0; i < length; i++) { for (Nd4jLong i = 0; i < length; i++) {
outBuff[i * inEWS] = nd4j::math::nd4j_exp<T,T>(inBuff[i * inEWS] - max); outBuff[i * inEWS] = nd4j::math::nd4j_exp<T,T>(inBuff[i * inEWS] - max);
sum += outBuff[i * inEWS]; sum += outBuff[i * inEWS];
} }
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int i = 0; i < length; i++) { for (Nd4jLong i = 0; i < length; i++) {
outBuff[i * inEWS] /= sum; outBuff[i * inEWS] /= sum;
outBuff[i * inEWS] = nd4j::math::nd4j_log<T, T>(outBuff[i * inEWS]); outBuff[i * inEWS] = nd4j::math::nd4j_log<T, T>(outBuff[i * inEWS]);
} }

View File

@ -443,7 +443,7 @@ namespace nd4j {
const X* bias_new; const X* bias_new;
X* bias_extra = nullptr; X* bias_extra = nullptr;
size_t total_num = 1; size_t total_num = 1;
for (size_t i = 0; i < rank; i++) { for (Nd4jLong i = 0; i < rank; i++) {
total_num *= bases[i]; total_num *= bases[i];
} }
Nd4jLong inc; Nd4jLong inc;
@ -574,7 +574,7 @@ namespace nd4j {
for (size_t i = 0; i < 2; i++) { for (size_t i = 0; i < 2; i++) {
numNC *= bases[i]; numNC *= bases[i];
} }
for (size_t i = 2; i < rank; i++) { for (Nd4jLong i = 2; i < rank; i++) {
numHW *= bases[i]; numHW *= bases[i];
} }
Nd4jLong total_num = numNC * numHW; Nd4jLong total_num = numNC * numHW;

View File

@ -27,7 +27,7 @@ namespace helpers {
void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) { void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) {
output.resize(axisVector->lengthOf()); output.resize(axisVector->lengthOf());
for (int e = 0; e < axisVector->lengthOf(); e++) { for (Nd4jLong e = 0; e < axisVector->lengthOf(); e++) {
auto ca = axisVector->e<int>(e); auto ca = axisVector->e<int>(e);
if (ca < 0) if (ca < 0)
ca += rank; ca += rank;
@ -37,7 +37,7 @@ namespace helpers {
} }
void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) { void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) {
for (int e = 0; e < axisVector.size(); e++) { for (size_t e = 0; e < axisVector.size(); e++) {
auto a = axisVector[e]; auto a = axisVector[e];
if (a < 0) if (a < 0)
axisVector[e] = a + rank; axisVector[e] = a + rank;

View File

@ -66,7 +66,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps]; Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()]; Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
for (int j = 0; j < lenSmall; ++j) { for (Nd4jLong j = 0; j < lenSmall; ++j) {
const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads); const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads);
@ -96,7 +96,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data()); shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data());
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (uint i = 0; i < steps; ++i) for (Nd4jLong i = 0; i < steps; ++i)
z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal; z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal;
} }

View File

@ -65,8 +65,8 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
T *col, *im; T *col, *im;
int imRow, imCol; int imRow, imCol;
for (uint b = start_x; b < stop_x; b += inc_x) { for (auto b = start_x; b < stop_x; b += inc_x) {
for (uint c = start_y; c < stop_y; c += inc_y) { for (auto c = start_y; c < stop_y; c += inc_y) {
for (int kRow = 0; kRow < kH; ++kRow) { for (int kRow = 0; kRow < kH; ++kRow) {
for (int kCol = 0; kCol < kW; ++kCol) { for (int kCol = 0; kCol < kW; ++kCol) {
for (int colH = 0; colH < oH; ++colH) { for (int colH = 0; colH < oH; ++colH) {
@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
T *col, *im; T *col, *im;
for (uint b = start; b < stop; b++) { for (auto b = start; b < stop; b++) {
T *im0 = imBuff + b * imStride0; T *im0 = imBuff + b * imStride0;
T *col4 = colBuff + b * colStride0; T *col4 = colBuff + b * colStride0;
for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {

View File

@ -55,8 +55,8 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
auto func = PRAGMA_THREADS_FOR_2D { auto func = PRAGMA_THREADS_FOR_2D {
for (uint b = start_x; b < stop_x; b += inc_x) { for (auto b = start_x; b < stop_x; b += inc_x) {
for (uint oh = start_y; oh < stop_y; oh += inc_y) { for (auto oh = start_y; oh < stop_y; oh += inc_y) {
for (uint ow = 0; ow < oW; ++ow) { for (uint ow = 0; ow < oW; ++ow) {
for (uint c = 0; c < iC; ++c) { for (uint c = 0; c < iC; ++c) {
@ -70,7 +70,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
const int iw = ow * sW - pW + kw * dW; const int iw = ow * sW - pW + kw * dW;
if (iw < 0 || iw >= iW) continue; if (iw < 0 || iw >= iW) continue;
uint xCoords[4] = {b, (uint)ih, (uint)iw, c}; uint xCoords[4] = { static_cast<uint>(b), static_cast<uint>(ih), static_cast<uint>(iw), c};
uint yCoords[3] = {kh, kw, c}; uint yCoords[3] = {kh, kw, c};
const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)]; const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)];
@ -79,7 +79,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
} }
} }
uint zCoords[4] = {b, oh, ow, c}; uint zCoords[4] = { static_cast<uint>(b), static_cast<uint>(oh), ow, c};
z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max); z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max);
} }
} }

View File

@ -63,7 +63,7 @@ namespace helpers {
std::vector<Nd4jLong> dims(reduceShape->lengthOf()); std::vector<Nd4jLong> dims(reduceShape->lengthOf());
bool fit = true; bool fit = true;
for( int i = 0; i < dims.size(); i++ ) { for(auto i = 0; i < dims.size(); i++ ) {
if (fit) { if (fit) {
dims[i] = reduceShape->e<Nd4jLong>(i); dims[i] = reduceShape->e<Nd4jLong>(i);
for (int e = 0; e < input->rankOf(); ++e) for (int e = 0; e < input->rankOf(); ++e)

View File

@ -53,7 +53,7 @@ namespace nd4j {
outputs[i].second = 0; outputs[i].second = 0;
//PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold()) //PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold())
for (int e = 0; e < indices->lengthOf(); ++e) for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
if ((*indices).e<Nd4jLong>(e) == i) if ((*indices).e<Nd4jLong>(e) == i)
listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e)); listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e));
} }
@ -65,7 +65,7 @@ namespace nd4j {
for (auto i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
outputs[i].first = outputList[i]; outputs[i].first = outputList[i];
outputs[i].second = 0; outputs[i].second = 0;
for (int e = 0; e < indices->lengthOf(); ++e) for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
if (indices->e<Nd4jLong>(e) == i) if (indices->e<Nd4jLong>(e) == i)
outputs[i].first->p(outputs[i].second++, input->e<T>(e)); outputs[i].first->p(outputs[i].second++, input->e<T>(e));
} }
@ -83,7 +83,7 @@ namespace nd4j {
for (int e = 0; e < numOfData; e++) { for (int e = 0; e < numOfData; e++) {
auto data = inputs[e]; auto data = inputs[e];
auto index = indices[e]; auto index = indices[e];
for (int i = 0; i < index->lengthOf(); i++) { for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
Nd4jLong pos = index->e<Nd4jLong>(i); Nd4jLong pos = index->e<Nd4jLong>(i);
if (pos < 0) { if (pos < 0) {
nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos); nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
@ -100,7 +100,7 @@ namespace nd4j {
} }
else { else {
std::vector<int> restDims(output->rankOf() - 1); std::vector<int> restDims(output->rankOf() - 1);
for (int i = restDims.size(); i > 0; i--) for (auto i = restDims.size(); i > 0; i--)
restDims[restDims.size() - i] = output->rankOf() - i; restDims[restDims.size() - i] = output->rankOf() - i;
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
@ -109,12 +109,12 @@ namespace nd4j {
auto data = inputs[e]; auto data = inputs[e];
auto index = indices[e]; auto index = indices[e];
std::vector<int> sourceDims(data->rankOf() - index->rankOf()); std::vector<int> sourceDims(data->rankOf() - index->rankOf());
for (int i = sourceDims.size(); i > 0; i--) for (auto i = sourceDims.size(); i > 0; i--)
sourceDims[sourceDims.size() - i] = data->rankOf() - i; sourceDims[sourceDims.size() - i] = data->rankOf() - i;
ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims) ; ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims) ;
for (int i = 0; i < index->lengthOf(); i++) { for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
auto pos = index->e<Nd4jLong>(i); auto pos = index->e<Nd4jLong>(i);
if (pos < 0) { if (pos < 0) {
nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos); nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
@ -146,7 +146,7 @@ namespace nd4j {
ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims); ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims);
for (unsigned int i = 0; i < inputGradientList.size(); i++) { for (auto i = 0; i < inputGradientList.size(); i++) {
outputs[i].first = inputGradientList[i]; outputs[i].first = inputGradientList[i];
if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs
std::vector<int> outDims(outputs[i].first->rankOf() - 1); std::vector<int> outDims(outputs[i].first->rankOf() - 1);
@ -158,7 +158,7 @@ namespace nd4j {
outputs[i].second = 0; outputs[i].second = 0;
for (int e = 0; e < indices->lengthOf(); ++e) for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
if (indices->e<Nd4jLong>(e) == i) if (indices->e<Nd4jLong>(e) == i)
listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++)); listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++));
} }
@ -171,7 +171,7 @@ namespace nd4j {
for (auto i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
outputs[i].first = inputGradientList[i]; outputs[i].first = inputGradientList[i];
outputs[i].second = 0; outputs[i].second = 0;
for (int e = 0; e < indices->lengthOf(); ++e) for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
if (indices->e<Nd4jLong>(e) == i) if (indices->e<Nd4jLong>(e) == i)
output->p<T>(e, outputs[i].first->e<T>(outputs[i].second++)); output->p<T>(e, outputs[i].first->e<T>(outputs[i].second++));
} }

View File

@ -45,7 +45,7 @@ namespace nd4j {
auto xShapeInfo = inputs[e]->shapeInfo(); auto xShapeInfo = inputs[e]->shapeInfo();
auto xLength = inputs[e]->lengthOf(); auto xLength = inputs[e]->lengthOf();
for (uint i = 0; i < xLength; i++) for (Nd4jLong i = 0; i < xLength; i++)
z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)];
} }
} }

View File

@ -26,7 +26,7 @@ namespace nd4j {
namespace helpers { namespace helpers {
template <typename T> template <typename T>
static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) { static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) {
auto blockSize = 32; Nd4jLong blockSize = 32;
auto length = array.lengthOf(); auto length = array.lengthOf();
int numBlocks = length / blockSize + ((length % blockSize == 0) ? 0 : 1); int numBlocks = length / blockSize + ((length % blockSize == 0) ? 0 : 1);
auto tempA = NDArrayFactory::create<Nd4jLong>('c', {numBlocks}, context); auto tempA = NDArrayFactory::create<Nd4jLong>('c', {numBlocks}, context);
@ -42,11 +42,11 @@ namespace nd4j {
// we divide array into 32 element chunks, and store intermediate results once // we divide array into 32 element chunks, and store intermediate results once
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto b = 0; b < stop; b++) { for (auto b = start; b < stop; b++) {
auto blockBuffer = buffer + b * numBlocks; auto blockBuffer = buffer + b * numBlocks;
Nd4jLong r = 1; Nd4jLong r = 1;
for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < length; e++) {
auto v = longBytes<T>(blockBuffer[e]); auto v = longBytes<T>(blockBuffer[e]);
r = 31 * r + v; r = 31 * r + v;
} }
@ -68,7 +68,7 @@ namespace nd4j {
auto blockBuffer = tempBuffer + b * numBlocks; auto blockBuffer = tempBuffer + b * numBlocks;
Nd4jLong r = 1; Nd4jLong r = 1;
for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) {
auto v = longBytes<T>(blockBuffer[e]); auto v = longBytes<T>(blockBuffer[e]);
r = 31 * r + v; r = 31 * r + v;
} }
@ -103,4 +103,3 @@ namespace nd4j {
} }
} }
} }

View File

@ -49,7 +49,7 @@ namespace nd4j {
} }
PRAGMA_OMP_SIMD PRAGMA_OMP_SIMD
for (int x = 0; x < numBins; x++) { for (Nd4jLong x = 0; x < numBins; x++) {
result[x] += bins[x]; result[x] += bins[x];
} }

View File

@ -64,8 +64,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
auto func = PRAGMA_THREADS_FOR_2D { auto func = PRAGMA_THREADS_FOR_2D {
for (int b = start_x; b < stop_x; b++) { for (auto b = start_x; b < stop_x; b++) {
for (int c = start_y; c < stop_y; c++) { for (auto c = start_y; c < stop_y; c++) {
for (int kRow = 0; kRow < kH; ++kRow) { for (int kRow = 0; kRow < kH; ++kRow) {
for (int kCol = 0; kCol < kW; ++kCol) { for (int kCol = 0; kCol < kW; ++kCol) {
for (int colH = 0; colH < oH; ++colH) { for (int colH = 0; colH < oH; ++colH) {
@ -98,8 +98,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra
T *col, *im; T *col, *im;
int imRow, imCol; int imRow, imCol;
for (int b = start_x; b < stop_x; b += inc_x) { for (auto b = start_x; b < stop_x; b += inc_x) {
for (int colH = start_y; colH < stop_y; colH += inc_y) { for (auto colH = start_y; colH < stop_y; colH += inc_y) {
for (int colW = 0; colW < oW; ++colW) { for (int colW = 0; colW < oW; ++colW) {
for (int c = 0; c < iC; ++c) { for (int c = 0; c < iC; ++c) {
for (int kRow = 0; kRow < kH; ++kRow) { for (int kRow = 0; kRow < kH; ++kRow) {

View File

@ -219,16 +219,16 @@ namespace helpers {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; ++batch) { for (auto batch = start; batch < stop; ++batch) {
auto pInput = pInputBuf + batch * inBatchNumValues; auto pInput = pInputBuf + batch * inBatchNumValues;
for (auto y = 0; y < outHeight; ++y) { for (Nd4jLong y = 0; y < outHeight; ++y) {
auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize; auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize;
const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize; const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize;
const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize; const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize;
double yVal = ys[y]._interpolarValue; double yVal = ys[y]._interpolarValue;
for (auto x = 0; x < outWidth; ++x) { for (Nd4jLong x = 0; x < outWidth; ++x) {
auto xsBottom = xsPtr[x]._bottomIndex; auto xsBottom = xsPtr[x]._bottomIndex;
auto xsTop = xsPtr[x]._topIndex; auto xsTop = xsPtr[x]._topIndex;
auto xVal = xsPtr[x]._interpolarValue; auto xVal = xsPtr[x]._interpolarValue;
for (auto c = 0; c < channels; ++c) { for (Nd4jLong c = 0; c < channels; ++c) {
double topLeft(ysInputLowerPtr[xsBottom + c]); double topLeft(ysInputLowerPtr[xsBottom + c]);
double topRight(ysInputLowerPtr[xsTop + c]); double topRight(ysInputLowerPtr[xsTop + c]);
double bottomLeft(ysInputUpperPtr[xsBottom + c]); double bottomLeft(ysInputUpperPtr[xsBottom + c]);
@ -310,14 +310,14 @@ namespace helpers {
if (halfPixelCenter) { if (halfPixelCenter) {
inY = nd4j::math::nd4j_max(0LL, inY); inY = nd4j::math::nd4j_max(0LL, inY);
} }
for (auto x = 0; x < outWidth; ++x) { for (Nd4jLong x = 0; x < outWidth; ++x) {
auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round<float>(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor<float>(scaler(x, st.widthScale))); auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round<float>(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor<float>(scaler(x, st.widthScale)));
Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1); Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1);
if (halfPixelCenter) { if (halfPixelCenter) {
inX = nd4j::math::nd4j_max(0LL, inX); inX = nd4j::math::nd4j_max(0LL, inX);
} }
// copy pixel over all channels // copy pixel over all channels
for (auto e = 0; e < channels; e++) for (Nd4jLong e = 0; e < channels; e++)
output->t<T>(b, y, x, e) = images->t<T>(b, inY, inX, e); output->t<T>(b, y, x, e) = images->t<T>(b, inY, inX, e);
} }
} }
@ -613,7 +613,7 @@ namespace helpers {
for (auto b = start; b < stop; ++b) { for (auto b = start; b < stop; ++b) {
auto pInput = inputPtr + b * inBatchWidth; auto pInput = inputPtr + b * inBatchWidth;
for (auto y = 0; y < outHeight; ++y) { for (Nd4jLong y = 0; y < outHeight; ++y) {
auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels]; auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels];
WeightsAndIndices yWai; WeightsAndIndices yWai;
@ -635,7 +635,7 @@ namespace helpers {
F cached_value_0[4] = {0}; F cached_value_0[4] = {0};
F cached_value_1[4] = {0}; F cached_value_1[4] = {0};
F cached_value_2[4] = {0}; F cached_value_2[4] = {0};
for (auto x = 0; x < resizerState.outWidth; ++x) { for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
const WeightsAndIndices &xWai = xWais[x]; const WeightsAndIndices &xWai = xWais[x];
// Shift values in cached_value_* to fill first '_advance' values. // Shift values in cached_value_* to fill first '_advance' values.
switch (xWai._advance) { switch (xWai._advance) {
@ -712,7 +712,7 @@ namespace helpers {
xWai._weight2, xWai._weight3); xWai._weight2, xWai._weight3);
} }
} else { } else {
for (auto x = 0; x < resizerState.outWidth; ++x) { for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
const WeightsAndIndices &xWai = xWais[x]; const WeightsAndIndices &xWai = xWais[x];
// Shift values in cachedValue to fill first '_advance' values. // Shift values in cachedValue to fill first '_advance' values.
switch (xWai._advance) { switch (xWai._advance) {
@ -828,7 +828,7 @@ namespace helpers {
float sum_0 = 0; float sum_0 = 0;
float sum_1 = 0; float sum_1 = 0;
float sum_2 = 0; float sum_2 = 0;
for (int i = 0; i < yPtrs.size(); ++i) { for (size_t i = 0; i < yPtrs.size(); ++i) {
const T* ptr = yPtrs[i].yPtr; const T* ptr = yPtrs[i].yPtr;
float scaleX = xCache.startScale; float scaleX = xCache.startScale;
Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth); Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth);
@ -879,7 +879,7 @@ namespace helpers {
const auto numChannels = st.channels; const auto numChannels = st.channels;
for (Nd4jLong c = 0; c < numChannels; ++c) { for (Nd4jLong c = 0; c < numChannels; ++c) {
float sum = 0; float sum = 0;
for (int i = 0; i < yPtrs.size(); ++i) { for (size_t i = 0; i < yPtrs.size(); ++i) {
T const* ptr = yPtrs[i].yPtr; T const* ptr = yPtrs[i].yPtr;
float scaleX = xCache.startScale; float scaleX = xCache.startScale;
float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX; float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX;

View File

@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
if(inTadEws == 1 && outTadEws == 1) { if(inTadEws == 1 && outTadEws == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
const T *x = inBuff + inTadOffsets[i]; const T *x = inBuff + inTadOffsets[i];
T *y = outBuff + outTadOffsets[i]; T *y = outBuff + outTadOffsets[i];
@ -70,7 +70,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
// calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
// we store each squared sum in corresponding element of y array // we store each squared sum in corresponding element of y array
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -100,7 +100,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
} }
else { else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = 0; i < numOfTads; ++i) { for (Nd4jLong i = 0; i < numOfTads; ++i) {
const T *x = inBuff + inTadOffsets[i]; const T *x = inBuff + inTadOffsets[i];
T *y = outBuff + outTadOffsets[i]; T *y = outBuff + outTadOffsets[i];
@ -108,7 +108,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
// calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
// we store each squared sum in corresponding element of y array // we store each squared sum in corresponding element of y array
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -179,13 +179,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
if(inTadEws == 1 && gradITadEws == 1) { if(inTadEws == 1 && gradITadEws == 1) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
const X *x = inBuff + inTadOffsets[i]; const X *x = inBuff + inTadOffsets[i];
Y *y = gradIBuff + gradITadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i];
// this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
// we store each squared sum in corresponding element of y array // we store each squared sum in corresponding element of y array
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -208,7 +208,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
Y prev = 0; Y prev = 0;
// second loop calculates derivatives using information gained in first loop above // second loop calculates derivatives using information gained in first loop above
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -247,13 +247,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
else { else {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint i = start; i < stop; i++) { for (auto i = start; i < stop; i++) {
const X *x = inBuff + inTadOffsets[i]; const X *x = inBuff + inTadOffsets[i];
Y *y = gradIBuff + gradITadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i];
// this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
// we store each squared sum in corresponding element of y array // we store each squared sum in corresponding element of y array
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -280,7 +280,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
Y prev = 0; Y prev = 0;
// second loop calculates derivatives using information gained in first loop above // second loop calculates derivatives using information gained in first loop above
for (uint j = 0; j < tadLen; ++j) { for (Nd4jLong j = 0; j < tadLen; ++j) {
const uint begin = nd4j::math::nd4j_max<int>(0, j - depth); const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
const uint last = depth + j + 1; const uint last = depth + j + 1;
const uint end = nd4j::math::nd4j_min<int>(last, tadLen); const uint end = nd4j::math::nd4j_min<int>(last, tadLen);

View File

@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
auto h_ = h->bufferAsT<T>(); auto h_ = h->bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (uint e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]); h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
} }

View File

@ -32,7 +32,7 @@ namespace helpers {
Nd4jLong preLastDim = input->rankOf() - 2; Nd4jLong preLastDim = input->rankOf() - 2;
ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
for (Nd4jLong e = 0; e < listOut.size(); ++e) { for (Nd4jLong e = 0; e < static_cast<Nd4jLong>(listOut.size()); ++e) {
NDArray* inputMatrix = listDiag.at(e); NDArray* inputMatrix = listDiag.at(e);
NDArray* outputMatrix = listOut.at(e); NDArray* outputMatrix = listOut.at(e);
if (outputMatrix != inputMatrix) // if not inplace if (outputMatrix != inputMatrix) // if not inplace

View File

@ -68,7 +68,7 @@ namespace nd4j {
if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 && if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 &&
shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') { shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') {
for (int e = 0; e < length; e++) { for (Nd4jLong e = 0; e < length; e++) {
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[e]) : simdOps::Multiply<T, T, T>::op(sum, x[e]); sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[e]) : simdOps::Multiply<T, T, T>::op(sum, x[e]);
if (!exclusive) if (!exclusive)
@ -81,7 +81,7 @@ namespace nd4j {
} }
else { else {
for (int e = 0; e < length; e++) { for (Nd4jLong e = 0; e < length; e++) {
auto xOffset = shape::getIndexOffset(e, xShapeInfo); auto xOffset = shape::getIndexOffset(e, xShapeInfo);
auto zOffset = shape::getIndexOffset(e, zShapeInfo); auto zOffset = shape::getIndexOffset(e, zShapeInfo);

View File

@ -43,8 +43,8 @@ namespace helpers {
T const* vBuf = v.getDataBuffer()->primaryAsT<T>(); T const* vBuf = v.getDataBuffer()->primaryAsT<T>();
T* resBuf = res.dataBuffer()->primaryAsT<T>(); T* resBuf = res.dataBuffer()->primaryAsT<T>();
auto interloop = PRAGMA_THREADS_FOR_2D { auto interloop = PRAGMA_THREADS_FOR_2D {
for (int i = start_x; i < n; i += inc_x) for (auto i = start_x; i < n; i += inc_x)
for (int j = start_y; j < n; j += inc_y) for (auto j = start_y; j < n; j += inc_y)
resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0)); resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0));
}; };
@ -63,7 +63,7 @@ namespace helpers {
NDArray z = *matrix; NDArray z = *matrix;
NDArray e('c', {M}, DataTypeUtils::fromT<T>()); // two internal buffers and scalar for squared norm NDArray e('c', {M}, DataTypeUtils::fromT<T>()); // two internal buffers and scalar for squared norm
for (auto k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number for (Nd4jLong k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number
e.nullify(); e.nullify();
z = matrixMinor<T>(z, k); // minor computing for current column with given matrix z (initally is a input matrix) z = matrixMinor<T>(z, k); // minor computing for current column with given matrix z (initally is a input matrix)
// z.printIndexedBuffer("Minor!!!"); // z.printIndexedBuffer("Minor!!!");
@ -87,7 +87,7 @@ namespace helpers {
} }
resQ.assign(q[0]); // resQ.assign(q[0]); //
// MmulHelper::matmul(&q[0], matrix, &resR, false, false); // MmulHelper::matmul(&q[0], matrix, &resR, false, false);
for (int i = 1; i < N && i < M - 1; i++) { for (Nd4jLong i = 1; i < N && i < M - 1; i++) {
auto tempResQ = resQ; auto tempResQ = resQ;
MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM? MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM?
resQ = std::move(tempResQ); resQ = std::move(tempResQ);

View File

@ -57,10 +57,10 @@ namespace helpers {
T* outputBuf = output->dataBuffer()->primaryAsT<T>(); T* outputBuf = output->dataBuffer()->primaryAsT<T>();
PRAGMA_OMP_PARALLEL_FOR PRAGMA_OMP_PARALLEL_FOR
for (auto k = 0; k < shift; k++) { for (Nd4jLong k = 0; k < shift; k++) {
auto pos = k * step; auto pos = k * step;
auto u = rng.relativeT<T>(k, 0., 1.); auto u = rng.relativeT<T>(k, 0., 1.);
for (auto e = 0; e < step; e++) for (Nd4jLong e = 0; e < step; e++)
if (directOutput) { if (directOutput) {
outputBuf[pos + e] = math::nd4j_igamma<T, T, T>(copyAlpha->t<T>(e), outputBuf[pos + e] = math::nd4j_igamma<T, T, T>(copyAlpha->t<T>(e),
beta != nullptr ? copyBeta->t<T>(e) * u : u); beta != nullptr ? copyBeta->t<T>(e) * u : u);
@ -104,10 +104,10 @@ namespace helpers {
bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c'; bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c';
bool directOut = output->ews() == 1 && output->ordering() == 'c'; bool directOut = output->ews() == 1 && output->ordering() == 'c';
PRAGMA_OMP_PARALLEL_FOR PRAGMA_OMP_PARALLEL_FOR
for (auto k = 0; k < shift; k++) { for (Nd4jLong k = 0; k < shift; k++) {
auto pos = k * step; auto pos = k * step;
auto u = rng.relativeT<T>(k, 0., 1.); auto u = rng.relativeT<T>(k, 0., 1.);
for (auto e = 0; e < step; e++) { for (Nd4jLong e = 0; e < step; e++) {
auto p = math::nd4j_exp<T, T>(-lambda->t<T>(e)); auto p = math::nd4j_exp<T, T>(-lambda->t<T>(e));
auto s = p; auto s = p;
auto x = T(0.f); auto x = T(0.f);
@ -143,7 +143,7 @@ namespace helpers {
RandomLauncher::fillUniform(context, rng, output, minVal, maxVal); RandomLauncher::fillUniform(context, rng, output, minVal, maxVal);
else { else {
PRAGMA_OMP_PARALLEL_FOR PRAGMA_OMP_PARALLEL_FOR
for (auto i = 0; i < output->lengthOf(); i++) { for (Nd4jLong i = 0; i < output->lengthOf(); i++) {
output->t<T>(i) = rng.relativeT<T>(i, minVal, maxVal); output->t<T>(i) = rng.relativeT<T>(i, minVal, maxVal);
} }
} }
@ -184,7 +184,7 @@ namespace helpers {
auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples; auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples;
auto nClassesPerSample = nSampleIndexInBatch * numOfClassX; auto nClassesPerSample = nSampleIndexInBatch * numOfClassX;
for (auto nClass = 0; nClass < numOfClassX; nClass += 1) { for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) {
auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass; auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass;
auto unifornLog = nd4j::math::nd4j_log<Tx, Tx>(-nd4j::math::nd4j_log<Tx, Tx>(rng.relativeT<Tx>(nIndex, minVal, maxVal))); auto unifornLog = nd4j::math::nd4j_log<Tx, Tx>(-nd4j::math::nd4j_log<Tx, Tx>(rng.relativeT<Tx>(nIndex, minVal, maxVal)));
Tx tValue = (xTad[nClass * xDimAstride] - unifornLog); Tx tValue = (xTad[nClass * xDimAstride] - unifornLog);

View File

@ -50,7 +50,7 @@ namespace helpers {
width = lastDim; width = lastDim;
} }
for (int i = 0; i < input->lengthOf(); i += lastDim) { for (Nd4jLong i = 0; i < input->lengthOf(); i += lastDim) {
for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) { for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) {
output->p(pos++, input->e<T>(i + k)); output->p(pos++, input->e<T>(i + k));
} }

View File

@ -110,7 +110,7 @@ namespace helpers {
} }
else { else {
std::vector<int> dims(source->rankOf() - axe - 1); std::vector<int> dims(source->rankOf() - axe - 1);
for (int i = 0; i < dims.size(); ++i) for (size_t i = 0; i < dims.size(); ++i)
dims[i] = axe + 1 + i; dims[i] = axe + 1 + i;
ResultSet listOfTensors = source->allTensorsAlongDimension({dims}); ResultSet listOfTensors = source->allTensorsAlongDimension({dims});

View File

@ -55,9 +55,9 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop
// loop through output array // loop through output array
auto func = PRAGMA_THREADS_FOR_3D { auto func = PRAGMA_THREADS_FOR_3D {
for (uint b = start_x; b < stop_x; b += inc_x) { for (auto b = start_x; b < stop_x; b += inc_x) {
for (uint h = start_y; h < stop_y; h += inc_y) { for (auto h = start_y; h < stop_y; h += inc_y) {
for (uint w = start_z; w < stop_z; w += inc_z) { for (auto w = start_z; w < stop_z; w += inc_z) {
for (uint c = 0; c < iC; ++c) { for (uint c = 0; c < iC; ++c) {
const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8];
const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8];
@ -146,11 +146,11 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
std::vector<Nd4jLong> temp(numOfSpatialDims + rank); std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
int i; uint i;
for(i = 0; i < numOfSpatialDims; ++i) for(i = 0; i < numOfSpatialDims; ++i)
temp[i] = blockShape.e<Nd4jLong>(i); temp[i] = blockShape.e<Nd4jLong>(i);
temp[i++] = output.sizeAt(0); temp[i++] = output.sizeAt(0);
for(int j = 1; j < rank; ++i, ++j) for(uint j = 1; j < rank; ++i, ++j)
temp[i] = input.sizeAt(j); temp[i] = input.sizeAt(j);
NDArray inputRearranged0 = input.reshape(input.ordering(), temp); NDArray inputRearranged0 = input.reshape(input.ordering(), temp);
@ -163,7 +163,7 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
temp[2*i - 1] = numOfSpatialDims + i; temp[2*i - 1] = numOfSpatialDims + i;
temp[2*i] = i - 1; temp[2*i] = i - 1;
} }
for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) for(i = 2 * numOfSpatialDims + 1; i < static_cast<uint>(temp.size()); ++i)
temp[i] = i; temp[i] = i;
inputRearranged0.permutei(temp); inputRearranged0.permutei(temp);
@ -216,8 +216,8 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB
// loop through output array // loop through output array
auto func = PRAGMA_THREADS_FOR_2D { auto func = PRAGMA_THREADS_FOR_2D {
for (uint b = start_x; b < stop_x; b += inc_x) { for (auto b = start_x; b < stop_x; b += inc_x) {
for (uint h = start_y; h < stop_y; h += inc_y) { for (auto h = start_y; h < stop_y; h += inc_y) {
for (uint w = 0; w < oW; ++w) { for (uint w = 0; w < oW; ++w) {
for (uint c = 0; c < iC; ++c) { for (uint c = 0; c < iC; ++c) {

View File

@ -87,7 +87,7 @@ namespace helpers {
if (input->isVector()) { if (input->isVector()) {
T val = input->e<T>(0); T val = input->e<T>(0);
for (int e = 1; e < indices->lengthOf(); e++) { for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
if (idx == indices->e<Nd4jLong>(e)) { if (idx == indices->e<Nd4jLong>(e)) {
// min // min
val = nd4j::math::nd4j_min<T>(val, input->t<T>(e)); val = nd4j::math::nd4j_min<T>(val, input->t<T>(e));
@ -115,7 +115,7 @@ namespace helpers {
for (Nd4jLong i = 1; i < indices->lengthOf(); i++) { for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
if (indices->e<Nd4jLong>(i) == idx) { if (indices->e<Nd4jLong>(i) == idx) {
for (int e = 0; e < minT->lengthOf(); e++) { for (Nd4jLong e = 0; e < minT->lengthOf(); e++) {
minT->p(e, nd4j::math::nd4j_min(minT->e<T>(e), listOfTensors.at(i)->e<T>(e))); minT->p(e, nd4j::math::nd4j_min(minT->e<T>(e), listOfTensors.at(i)->e<T>(e)));
} }
} }
@ -138,7 +138,7 @@ namespace helpers {
T val = T(0.f); T val = T(0.f);
int count = 0; int count = 0;
for (int e = 0; e < indices->lengthOf(); e++) { for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
if (idx == indices->e<int>(e)) { if (idx == indices->e<int>(e)) {
// mean // mean
val += input->e<T>(e); val += input->e<T>(e);
@ -166,7 +166,7 @@ namespace helpers {
auto meanV = meanT->dup(); auto meanV = meanT->dup();
meanV.assign(listOfTensors.at(0)); meanV.assign(listOfTensors.at(0));
for (int i = 1; i < indices->lengthOf(); i++) { for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
@ -198,7 +198,7 @@ namespace helpers {
if (input->isVector()) { if (input->isVector()) {
T val = T(0.f); T val = T(0.f);
int count = 0; int count = 0;
for (int e = 0; e < indices->lengthOf(); e++) { for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
if (idx == indices->e<int>(e)) { if (idx == indices->e<int>(e)) {
// sum // sum
val += input->t<T>(e); val += input->t<T>(e);
@ -220,7 +220,7 @@ namespace helpers {
std::vector<std::pair<NDArray*, int>> outputs(numOfClasses); std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
auto sumT = listOfOutTensors.at(idx); auto sumT = listOfOutTensors.at(idx);
for (int i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
@ -248,7 +248,7 @@ namespace helpers {
T val = input->e<T>(0); T val = input->e<T>(0);
int count = 0; int count = 0;
for (int e = 1; e < indices->lengthOf(); e++) { for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
if (idx == indices->e<int>(e)) { if (idx == indices->e<int>(e)) {
// sum // sum
val *= input->e<T>(e); val *= input->e<T>(e);
@ -269,7 +269,7 @@ namespace helpers {
int numOfClasses = output->sizeAt(0); // number of classes int numOfClasses = output->sizeAt(0); // number of classes
auto sumT = listOfOutTensors.at(idx); auto sumT = listOfOutTensors.at(idx);
sumT->assign(listOfTensors.at(0)); sumT->assign(listOfTensors.at(0));
for (int i = 1; i < indices->lengthOf(); i++) { for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
if (indices->e<int>(i) == idx) { if (indices->e<int>(i) == idx) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
@ -313,7 +313,7 @@ namespace helpers {
bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) { bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) {
auto val = indices->e(0); auto val = indices->e(0);
for (int e = 1; e < indices->lengthOf(); e++) { for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
output = indices->e(e); output = indices->e(e);
if (val.e<Nd4jLong>(0) > output.e<Nd4jLong>(0)) if (val.e<Nd4jLong>(0) > output.e<Nd4jLong>(0))
return false; return false;
@ -362,7 +362,7 @@ namespace helpers {
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
T val = input->e<T>(fi->second.at(0)); T val = input->e<T>(fi->second.at(0));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
val = nd4j::math::nd4j_max(val, input->e<T>(fi->second.at(idx))); val = nd4j::math::nd4j_max(val, input->e<T>(fi->second.at(idx)));
} }
output->p(fi->first, val); output->p(fi->first, val);
@ -380,7 +380,7 @@ namespace helpers {
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
auto outputT = listOfOutTensors.at(fi->first); auto outputT = listOfOutTensors.at(fi->first);
outputT->assign(listOfTensors.at(fi->second.at(0))); outputT->assign(listOfTensors.at(fi->second.at(0)));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
auto maxT = listOfTensors.at(fi->second.at(idx)); auto maxT = listOfTensors.at(fi->second.at(idx));
for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
T val = nd4j::math::nd4j_max(maxT->e<T>(e), outputT->e<T>(e)); T val = nd4j::math::nd4j_max(maxT->e<T>(e), outputT->e<T>(e));
@ -432,7 +432,7 @@ namespace helpers {
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
auto outputT = listOfOutTensors.at(fi->first); auto outputT = listOfOutTensors.at(fi->first);
outputT->assign(listOfTensors.at(fi->second.at(0))); outputT->assign(listOfTensors.at(fi->second.at(0)));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (size_t idx = 1; idx < fi->second.size(); ++idx) {
auto minT = listOfTensors.at(fi->second.at(idx)); auto minT = listOfTensors.at(fi->second.at(idx));
for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
@ -560,7 +560,7 @@ namespace helpers {
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
auto outputT = listOfOutTensors.at(fi->first); auto outputT = listOfOutTensors.at(fi->first);
outputT->assign(listOfTensors.at(fi->second.at(0))); outputT->assign(listOfTensors.at(fi->second.at(0)));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (size_t idx = 1; idx < fi->second.size(); ++idx) {
auto current = listOfTensors.at(fi->second.at(idx)); auto current = listOfTensors.at(fi->second.at(idx));
*outputT *= *current; *outputT *= *current;
@ -584,7 +584,7 @@ namespace helpers {
if (input->isVector()) { // 1D case if (input->isVector()) { // 1D case
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
double sumValue = input->e<double>(fi->second.at(0)); double sumValue = input->e<double>(fi->second.at(0));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (size_t idx = 1; idx < fi->second.size(); ++idx) {
sumValue += input->e<double>(fi->second.at(idx)); sumValue += input->e<double>(fi->second.at(idx));
} }
output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt<Nd4jLong, double>(fi->second.size())); output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt<Nd4jLong, double>(fi->second.size()));
@ -599,7 +599,7 @@ namespace helpers {
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
auto outputT = listOfOutTensors.at(fi->first); auto outputT = listOfOutTensors.at(fi->first);
outputT->assign(listOfTensors.at(fi->second.at(0))); outputT->assign(listOfTensors.at(fi->second.at(0)));
for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { for (size_t idx = 1; idx < fi->second.size(); ++idx) {
auto current = listOfTensors.at(fi->second.at(idx)); auto current = listOfTensors.at(fi->second.at(idx));
*outputT += *current; *outputT += *current;
} }
@ -651,7 +651,7 @@ namespace helpers {
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
auto currentGradOut = listOfGradOuts.at(classNum); auto currentGradOut = listOfGradOuts.at(classNum);
for (uint64_t e = 0; e < current->lengthOf(); e++) { for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<T>(e) - current->e<T>(e)) <= T(1.e-6)) if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<T>(e) - current->e<T>(e)) <= T(1.e-6))
currentOut->p(e, currentGradOut->e<T>(e)); currentOut->p(e, currentGradOut->e<T>(e));
} }
@ -703,7 +703,7 @@ namespace helpers {
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
auto currentGradOut = listOfGradOuts.at(classNum); auto currentGradOut = listOfGradOuts.at(classNum);
for (int e = 0; e < current->lengthOf(); e++) { for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<double>(e) - current->e<double>(e)) < if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<double>(e) - current->e<double>(e)) <
1.e-5) 1.e-5)
currentOut->p(e, currentGradOut->e<double>(e)); currentOut->p(e, currentGradOut->e<double>(e));
@ -746,13 +746,13 @@ namespace helpers {
int pos = 0; int pos = 0;
//auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
auto currentGradOut = listOfGradOuts.at(classNum); auto currentGradOut = listOfGradOuts.at(classNum);
for (int e = 0; e < current->lengthOf(); e++) { for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
currentOut->p(e, currentGradOut->e<double>(e) / classCount.at(classNum)); currentOut->p(e, currentGradOut->e<double>(e) / classCount.at(classNum));
} }
} }
@ -781,7 +781,7 @@ namespace helpers {
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
@ -817,7 +817,7 @@ namespace helpers {
//std::vector<std::pair<NDArray*, int>> outputs(numOfClasses); //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
@ -860,7 +860,7 @@ namespace helpers {
ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
for (int i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
Nd4jLong classNum = indices->e<Nd4jLong>(i); Nd4jLong classNum = indices->e<Nd4jLong>(i);
NDArray* current = listOfTensors.at(i); NDArray* current = listOfTensors.at(i);
NDArray* currentOut = listOfOutTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i);
@ -905,13 +905,13 @@ namespace helpers {
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
auto currentGradOut = listOfGradOuts.at(classNum); auto currentGradOut = listOfGradOuts.at(classNum);
for (int e = 0; e < current->lengthOf(); e++) { for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t<T>(e) - current->t<T>(e)) < 1.e-6) if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t<T>(e) - current->t<T>(e)) < 1.e-6)
currentOut->t<T>(e) = currentGradOut->t<T>(e); currentOut->t<T>(e) = currentGradOut->t<T>(e);
} }
@ -955,7 +955,7 @@ namespace helpers {
ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
for (int i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
Nd4jLong classNum = indices->e<Nd4jLong>(i); Nd4jLong classNum = indices->e<Nd4jLong>(i);
NDArray* current = listOfTensors.at(i); NDArray* current = listOfTensors.at(i);
NDArray* currentOut = listOfOutTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i);
@ -984,7 +984,7 @@ namespace helpers {
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
auto currentGradOut = listOfGradOuts.at(classNum); auto currentGradOut = listOfGradOuts.at(classNum);
@ -1021,7 +1021,7 @@ namespace helpers {
ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);
@ -1053,7 +1053,7 @@ namespace helpers {
// if input is a vector: (as if in doc sample) // if input is a vector: (as if in doc sample)
if (input->isVector()) { if (input->isVector()) {
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto e = 0; e < indices->lengthOf(); e++) { for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
auto classNum = indices->e<Nd4jLong>(e); auto classNum = indices->e<Nd4jLong>(e);
output->p(e, gradOut->e<double>(classNum) / nd4j::math::nd4j_sqrt<double, double>(classCount[classNum])); output->p(e, gradOut->e<double>(classNum) / nd4j::math::nd4j_sqrt<double, double>(classCount[classNum]));
} }
@ -1069,7 +1069,7 @@ namespace helpers {
ResultSet listOfOutTensors =output->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors =output->allTensorsAlongDimension(restDims);
//auto func = PRAGMA_THREADS_FOR { //auto func = PRAGMA_THREADS_FOR {
for (auto i = 0; i < indices->lengthOf(); i++) { for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
auto classNum = indices->e<Nd4jLong>(i); auto classNum = indices->e<Nd4jLong>(i);
auto current = listOfTensors.at(i); auto current = listOfTensors.at(i);
auto currentOut = listOfOutTensors.at(i); auto currentOut = listOfOutTensors.at(i);

View File

@ -378,7 +378,7 @@ namespace nd4j {
int irow = 0; int irow = 0;
auto cShift = t * idxShift; auto cShift = t * idxShift;
for (int e = 0; e < hsRounds; e++) { for (Nd4jLong e = 0; e < hsRounds; e++) {
irow = bIndices[e + cShift]; irow = bIndices[e + cShift];
if (irow < 0 || irow >= vocabSize) if (irow < 0 || irow >= vocabSize)
continue; continue;
@ -457,7 +457,7 @@ namespace nd4j {
T sneu1[600]; T sneu1[600];
T sneu1e[600]; T sneu1e[600];
for (int e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
@ -500,7 +500,7 @@ namespace nd4j {
// hierarchic softmax step // hierarchic softmax step
if (!indices.isEmpty()) { if (!indices.isEmpty()) {
for (int i = 0; i < numIndices; i++) { for (Nd4jLong i = 0; i < numIndices; i++) {
const int cIndex = bIndices[(e * numIndices) + i]; const int cIndex = bIndices[(e * numIndices) + i];
const int cCode = bCodes[(e * numIndices) + i]; const int cCode = bCodes[(e * numIndices) + i];

View File

@ -41,8 +41,8 @@ namespace helpers {
auto batchLoop = PRAGMA_THREADS_FOR { auto batchLoop = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; batch++) { for (auto batch = start; batch < stop; batch++) {
for (auto r = 0; r < rows; r++) { for (Nd4jLong r = 0; r < rows; r++) {
for (auto c = 0; c < r; c++) { for (Nd4jLong c = 0; c < r; c++) {
math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r)); math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));
} }
} }
@ -66,7 +66,7 @@ namespace helpers {
auto permutationsPart = permutations.allTensorsAlongDimension({-1}); auto permutationsPart = permutations.allTensorsAlongDimension({-1});
for (auto batch = 0; batch < permutationsPart.size(); ++batch) { for (auto batch = 0; batch < permutationsPart.size(); ++batch) {
for (auto row = 0; row < PPart[batch]->rows(); ++row) { for (Nd4jLong row = 0; row < PPart[batch]->rows(); ++row) {
PPart[batch]->t<T>(row, permutationsPart[batch]->t<int>(row)) = T(1.f); PPart[batch]->t<T>(row, permutationsPart[batch]->t<int>(row)) = T(1.f);
} }
} }
@ -77,7 +77,7 @@ namespace helpers {
MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0); MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0);
ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1}); ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1});
for (auto i = 0; i < leftLowerPart.size(); i++) { for (auto i = 0; i < leftLowerPart.size(); i++) {
for (auto r = 0; r < leftLowerPart[i]->rows(); r++) for (Nd4jLong r = 0; r < leftLowerPart[i]->rows(); r++)
leftLowerPart[i]->t<T>(r,r) = (T)1.f; leftLowerPart[i]->t<T>(r,r) = (T)1.f;
} }
// stage 2: triangularSolveFunctor for Lower with given b // stage 2: triangularSolveFunctor for Lower with given b

View File

@ -29,7 +29,7 @@ namespace helpers {
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) { static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) {
int numSplits = outArrs.size(); uint numSplits = outArrs.size();
const auto sizeofT = input.sizeOfT(); const auto sizeofT = input.sizeOfT();
@ -73,9 +73,9 @@ namespace helpers {
if (luckCase2) { if (luckCase2) {
const uint xDim = input.sizeAt(axis); const auto xDim = input.sizeAt(axis);
for (uint i = 0; i < input.lengthOf() / xDim; ++i) { for (Nd4jLong i = 0; i < input.lengthOf() / xDim; ++i) {
T* x = xBuff + xDim * i; T* x = xBuff + xDim * i;

View File

@ -39,7 +39,7 @@ namespace helpers {
// } // }
// ----------------------------------------------------------------------------------------------- // // ----------------------------------------------------------------------------------------------- //
std::vector<int> dimsToExclude(input->rankOf() - 1); std::vector<int> dimsToExclude(input->rankOf() - 1);
for (int d = 0; d < dimsToExclude.size(); ++d) for (size_t d = 0; d < dimsToExclude.size(); ++d)
dimsToExclude[d] = d; dimsToExclude[d] = d;
const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude);
@ -72,7 +72,7 @@ namespace helpers {
NDArray topValues = NDArrayFactory::create<T>('c', {k}); NDArray topValues = NDArrayFactory::create<T>('c', {k});
NDArray sortedVals = NDArrayFactory::create<T>('c', {k}); NDArray sortedVals = NDArrayFactory::create<T>('c', {k});
NDArray topIndices = NDArrayFactory::create<Nd4jLong>('c', {k}); NDArray topIndices = NDArrayFactory::create<Nd4jLong>('c', {k});
for (Nd4jLong pos = 0; pos < k; ++pos) { for (uint pos = 0; pos < k; ++pos) {
topIndices.t<Nd4jLong>(pos) = pos; topIndices.t<Nd4jLong>(pos) = pos;
topValues.t<T>(pos) = trial.t<T>(pos); topValues.t<T>(pos) = trial.t<T>(pos);
} }
@ -80,7 +80,7 @@ namespace helpers {
sortedVals.assign(topValues);// = NDArrayFactory::create<T>('c', {k}); sortedVals.assign(topValues);// = NDArrayFactory::create<T>('c', {k});
//std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order //std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order
SpecialMethods<T>::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false); SpecialMethods<T>::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false);
for (int i = k; i < width; ++i) { for (Nd4jLong i = static_cast<Nd4jLong>(k); i < width; ++i) {
T val = trial.e<T>(i); T val = trial.e<T>(i);
T minTopVal = sortedVals.t<T>(0); T minTopVal = sortedVals.t<T>(0);
if (minTopVal < val) { // value should be inserted to top k if (minTopVal < val) { // value should be inserted to top k
@ -104,15 +104,15 @@ namespace helpers {
if (needSort) { if (needSort) {
SpecialMethods<T>::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true); SpecialMethods<T>::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true);
for (int j = 0; j < width; j++) for (Nd4jLong j = 0; j < width; j++)
for (int pos = 0; pos < k; ++pos) for (uint pos = 0; pos < k; ++pos)
if (topValues.t<T>(pos) == trial.t<T>(j)) if (topValues.t<T>(pos) == trial.t<T>(j))
topIndices.t<Nd4jLong>(pos) = j; topIndices.t<Nd4jLong>(pos) = j;
} }
else { // else sort by indices else { // else sort by indices
std::map<Nd4jLong, T> sortValsMap; std::map<Nd4jLong, T> sortValsMap;
//std::vector<std::pair<int, T>> data(topValues.lengthOf()); //std::vector<std::pair<int, T>> data(topValues.lengthOf());
for (size_t e = 0; e < topValues.lengthOf(); ++e) { for (Nd4jLong e = 0; e < topValues.lengthOf(); ++e) {
sortValsMap[topIndices.t<Nd4jLong>(e)] = topValues.t<T>(e); sortValsMap[topIndices.t<Nd4jLong>(e)] = topValues.t<T>(e);
} }
@ -152,7 +152,7 @@ namespace helpers {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
bool found = false; bool found = false;
for (int j = 0; j < k; j++) { for (uint j = 0; j < k; j++) {
if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) { if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {
found = true; found = true;
break; break;

View File

@ -597,7 +597,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
zCoordStart[yRank - 1] = coordToRestore; zCoordStart[yRank - 1] = coordToRestore;
// construct coordinates for x // construct coordinates for x
for (uint j = 0; j < yLastDim; ++j) for (int j = 0; j < yLastDim; ++j)
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
@ -628,7 +628,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
if (indices != nullptr) { if (indices != nullptr) {
for(int i = 0; i < indices->lengthOf(); ++i) for(Nd4jLong i = 0; i < indices->lengthOf(); ++i)
if(indices->e<Nd4jLong>(i) >= input->sizeAt(axis)) if(indices->e<Nd4jLong>(i) >= input->sizeAt(axis))
throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !"); throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !");
@ -733,7 +733,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
// increasing counter to skip numIndices // increasing counter to skip numIndices
e++; e++;
std::vector<int> indices; std::vector<int> indices;
for (; e < intArgs->size(); e++) for (; e < static_cast<Nd4jLong>(intArgs->size()); e++)
indices.push_back((*intArgs)[e]); indices.push_back((*intArgs)[e]);
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
@ -813,7 +813,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
T max = -DataTypeUtils::max<T>(); T max = -DataTypeUtils::max<T>();
Nd4jLong idx = 0; Nd4jLong idx = 0;
for (int i = 0; i < numArgs; i++) { for (Nd4jLong i = 0; i < numArgs; i++) {
T v = inArrs[i]->e<T>(e); T v = inArrs[i]->e<T>(e);
if (v > max) { if (v > max) {
max = v; max = v;
@ -841,7 +841,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
T max = -DataTypeUtils::max<T>(); T max = -DataTypeUtils::max<T>();
for (int i = 0; i < numArgs; i++) { for (Nd4jLong i = 0; i < numArgs; i++) {
T v = inArrs[i]->e<T>(e); T v = inArrs[i]->e<T>(e);
if (v > max) if (v > max)
max = v; max = v;
@ -867,7 +867,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
T sum = 0.; T sum = 0.;
for (int i = 0; i < numArgs; i++) { for (Nd4jLong i = 0; i < numArgs; i++) {
T v = inArrs[i]->e<T>(e); T v = inArrs[i]->e<T>(e);
sum += v; sum += v;
} }
@ -893,7 +893,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
auto func = PRAGMA_THREADS_FOR { auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) { for (auto e = start; e < stop; e++) {
T sum = (T) 0.f; T sum = (T) 0.f;
for (int i = 0; i < numArgs; i++) for (Nd4jLong i = 0; i < numArgs; i++)
sum += inArrs[i]->e<T>(e); sum += inArrs[i]->e<T>(e);
output.p(e, sum); output.p(e, sum);
@ -1242,7 +1242,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c
memset(gradIBuff, 0, gradILen * sizeof(T)); memset(gradIBuff, 0, gradILen * sizeof(T));
else { else {
//PRAGMA_OMP_PARALLEL_FOR_SIMD //PRAGMA_OMP_PARALLEL_FOR_SIMD
for (int i = 0; i < gradILen * gradIEWS; i += gradIEWS) for (Nd4jLong i = 0; i < gradILen * gradIEWS; i += gradIEWS)
gradIBuff[i] = static_cast<T>(0.f); gradIBuff[i] = static_cast<T>(0.f);
} }

View File

@ -43,10 +43,10 @@ namespace helpers {
auto rows = leftInput->rows(); auto rows = leftInput->rows();
auto cols = rightInput->columns(); auto cols = rightInput->columns();
//output->t<T>(0,0) = rightInput->t<T>(0,0) / leftInput->t<T>(0,0); //output->t<T>(0,0) = rightInput->t<T>(0,0) / leftInput->t<T>(0,0);
for (auto r = 0; r < rows; r++) { for (Nd4jLong r = 0; r < rows; r++) {
for (auto j = 0; j < cols; j++) { for (Nd4jLong j = 0; j < cols; j++) {
auto sum = rightInput->t<T>(r, j); auto sum = rightInput->t<T>(r, j);
for (auto c = 0; c < r; c++) { for (Nd4jLong c = 0; c < r; c++) {
sum -= leftInput->t<T>(r, c) * output->t<T>(c, j); sum -= leftInput->t<T>(r, c) * output->t<T>(c, j);
} }
output->t<T>(r, j) = sum / leftInput->t<T>(r, r); output->t<T>(r, j) = sum / leftInput->t<T>(r, r);
@ -72,10 +72,10 @@ namespace helpers {
static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) {
auto rows = leftInput->rows(); auto rows = leftInput->rows();
auto cols = rightInput->columns(); auto cols = rightInput->columns();
for (auto r = rows; r > 0; r--) { for (Nd4jLong r = rows; r > 0; r--) {
for (auto j = 0; j < cols; j++) { for (Nd4jLong j = 0; j < cols; j++) {
auto sum = rightInput->t<T>(r - 1, j); auto sum = rightInput->t<T>(r - 1, j);
for (auto c = r; c < rows; c++) { for (Nd4jLong c = r; c < rows; c++) {
sum -= leftInput->t<T>(r - 1, c) * output->t<T>(c, j); sum -= leftInput->t<T>(r - 1, c) * output->t<T>(c, j);
} }
output->t<T>(r - 1, j) = sum / leftInput->t<T>(r - 1, r - 1); output->t<T>(r - 1, j) = sum / leftInput->t<T>(r - 1, r - 1);
@ -114,14 +114,14 @@ namespace helpers {
auto batchLoop = PRAGMA_THREADS_FOR { auto batchLoop = PRAGMA_THREADS_FOR {
for (auto batch = start; batch < stop; batch++) { for (auto batch = start; batch < stop; batch++) {
if (!lower) { if (!lower) {
for (auto r = 0; r < rows; r++) { for (Nd4jLong r = 0; r < rows; r++) {
for (auto c = 0; c <= r; c++) { for (Nd4jLong c = 0; c <= r; c++) {
outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r); outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
} }
} }
} else { } else {
for (auto r = 0; r < rows; r++) { for (Nd4jLong r = 0; r < rows; r++) {
for (auto c = r; c < cols; c++) { for (Nd4jLong c = r; c < cols; c++) {
outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r); outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
} }
} }

View File

@ -26,7 +26,7 @@ namespace helpers {
template <typename T> template <typename T>
static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) {
for (int e = 0; e < input->lengthOf(); e++) { for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
int val = input->e<int>(e); int val = input->e<int>(e);
if (val < maxLength) { if (val < maxLength) {
if (weights != nullptr) if (weights != nullptr)