Loops auto-vectorization problem fix (#277)
* libnd4j cast loop types
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j more type casting added to loops
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j sync casting types of iterated variable in loops
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j more loops reviewed for vectorization problem fix
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j fixed several typos
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j several more files reviewed to fix auto-vectorization problem in loops
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j merge master and reviewed more files to fix auto-vectorization problem in loops
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j several type casting added in broadcasting that were missed, fixed mac builds
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j double check all files and fix several more places in loops
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j fixed builds
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j revert changes for lup.cpp
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j more files reviewed for auto-vectorization problem fix
  Signed-off-by: Oleg <oleg.semeniv@gmail.com>
master
parent
5332ace32b
commit
f116f53d61
|
@ -1702,7 +1702,7 @@ bool NDArray::isSameShape(const std::vector<Nd4jLong>& shape) const{
|
||||||
if (this->rankOf() != (int) shape.size())
|
if (this->rankOf() != (int) shape.size())
|
||||||
return false;
|
return false;
|
||||||
for (int e = 0; e < this->rankOf(); e++) {
|
for (int e = 0; e < this->rankOf(); e++) {
|
||||||
if (this->shapeOf()[e] != shape.at(e) && shape.at(e) != -1)
|
if (this->shapeOf()[e] != shape[e] && shape[e] != -1)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -980,7 +980,7 @@ std::string NDArray::asString(Nd4jLong limit) {
|
||||||
template<typename T>
|
template<typename T>
|
||||||
std::vector<T> NDArray::getBufferAsVector() {
|
std::vector<T> NDArray::getBufferAsVector() {
|
||||||
std::vector<T> vector(lengthOf());
|
std::vector<T> vector(lengthOf());
|
||||||
for (int e = 0; e < lengthOf(); e++)
|
for (Nd4jLong e = 0; e < lengthOf(); e++)
|
||||||
vector[e] = this->e<T>(e);
|
vector[e] = this->e<T>(e);
|
||||||
return vector;
|
return vector;
|
||||||
}
|
}
|
||||||
|
@ -2128,12 +2128,12 @@ bool NDArray::isIdentityMatrix() {
|
||||||
throw std::runtime_error("isIdentityMatrix method: matrix must be square and have rank = 2 !");
|
throw std::runtime_error("isIdentityMatrix method: matrix must be square and have rank = 2 !");
|
||||||
|
|
||||||
const double eps = 1e-5f;
|
const double eps = 1e-5f;
|
||||||
for(int i=0; i<rows(); ++i)
|
for(Nd4jLong i=0; i<rows(); ++i)
|
||||||
if(nd4j::math::nd4j_abs(e<double>(i,i) - 1.f) > eps)
|
if(nd4j::math::nd4j_abs(e<double>(i,i) - 1.f) > eps)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
for(int i=0; i<rows(); ++i) {
|
for(Nd4jLong i=0; i<rows(); ++i) {
|
||||||
for(int j=0; j<columns(); ++j) {
|
for(Nd4jLong j=0; j<columns(); ++j) {
|
||||||
if (i == j)
|
if (i == j)
|
||||||
continue;
|
continue;
|
||||||
if(nd4j::math::nd4j_abs(e<double>(i,j)) > eps)
|
if(nd4j::math::nd4j_abs(e<double>(i,j)) > eps)
|
||||||
|
@ -2335,7 +2335,7 @@ NDArray NDArray::asS() const {
|
||||||
Nd4jLong dataLength = 0;
|
Nd4jLong dataLength = 0;
|
||||||
|
|
||||||
auto data = bufferAsT<int8_t>() + offsetsLength;
|
auto data = bufferAsT<int8_t>() + offsetsLength;
|
||||||
for (int e = 0; e < lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < lengthOf(); e++) {
|
||||||
offsets[e] = dataLength;
|
offsets[e] = dataLength;
|
||||||
start = nInputoffsets[e];
|
start = nInputoffsets[e];
|
||||||
stop = nInputoffsets[e + 1];
|
stop = nInputoffsets[e + 1];
|
||||||
|
@ -3524,7 +3524,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const {
|
||||||
// string is special case, we'll compare them one by one, considering both arrays are guaranteed to have the same length
|
// string is special case, we'll compare them one by one, considering both arrays are guaranteed to have the same length
|
||||||
|
|
||||||
if (dataType() == DataType::UTF8) {
|
if (dataType() == DataType::UTF8) {
|
||||||
for (int e = 0; e < this->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < this->lengthOf(); e++) {
|
||||||
auto s1 = this->e<std::string>(e);
|
auto s1 = this->e<std::string>(e);
|
||||||
auto s2 = other->e<std::string>(e);
|
auto s2 = other->e<std::string>(e);
|
||||||
|
|
||||||
|
@ -3533,7 +3533,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (dataType() == DataType::UTF16) {
|
else if (dataType() == DataType::UTF16) {
|
||||||
for (int e = 0; e < this->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < this->lengthOf(); e++) {
|
||||||
auto s1 = this->e<std::u16string>(e);
|
auto s1 = this->e<std::u16string>(e);
|
||||||
auto s2 = other->e<std::u16string>(e);
|
auto s2 = other->e<std::u16string>(e);
|
||||||
|
|
||||||
|
@ -3542,7 +3542,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (int e = 0; e < this->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < this->lengthOf(); e++) {
|
||||||
auto s1 = this->e<std::u32string>(e);
|
auto s1 = this->e<std::u32string>(e);
|
||||||
auto s2 = other->e<std::u32string>(e);
|
auto s2 = other->e<std::u32string>(e);
|
||||||
|
|
||||||
|
@ -4801,7 +4801,7 @@ ResultSet NDArray::allTensorsAlongDimension(const std::vector<int> &dimensions)
|
||||||
auto pack = ConstantTadHelper::getInstance()->tadForDimensions(_shapeInfo, const_cast<int*>(dimensions.data()), dimensions.size());
|
auto pack = ConstantTadHelper::getInstance()->tadForDimensions(_shapeInfo, const_cast<int*>(dimensions.data()), dimensions.size());
|
||||||
auto numTads = pack.numberOfTads();
|
auto numTads = pack.numberOfTads();
|
||||||
|
|
||||||
for (int idx = 0; idx < numTads; idx++ ) {
|
for (Nd4jLong idx = 0; idx < numTads; idx++ ) {
|
||||||
auto array = new NDArray(_buffer, ShapeDescriptor(pack.primaryShapeInfo()), getContext(), pack.primaryOffsets()[idx] + getBufferOffset());
|
auto array = new NDArray(_buffer, ShapeDescriptor(pack.primaryShapeInfo()), getContext(), pack.primaryOffsets()[idx] + getBufferOffset());
|
||||||
array->_isView = true;
|
array->_isView = true;
|
||||||
result.push_back(array);
|
result.push_back(array);
|
||||||
|
@ -4872,7 +4872,7 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
|
||||||
|
|
||||||
std::vector<int> dimsWithUnities;
|
std::vector<int> dimsWithUnities;
|
||||||
|
|
||||||
for (uint d = 0; d < rank; ++d)
|
for (int d = 0; d < rank; ++d)
|
||||||
if(idx[n*d] != idx[n*d+1] && shapeOf[d] == 1)
|
if(idx[n*d] != idx[n*d+1] && shapeOf[d] == 1)
|
||||||
dimsWithUnities.push_back(d);
|
dimsWithUnities.push_back(d);
|
||||||
|
|
||||||
|
|
|
@ -308,7 +308,7 @@ void NDArray::tile(const std::vector<Nd4jLong>& reps, NDArray& target) const {
|
||||||
// fill newBuff, loop through all elements of newBuff
|
// fill newBuff, loop through all elements of newBuff
|
||||||
// looping through _buffer goes automatically by means of getSubArrayIndex applying
|
// looping through _buffer goes automatically by means of getSubArrayIndex applying
|
||||||
const int ews = target.ews();
|
const int ews = target.ews();
|
||||||
const int targetLen = target.lengthOf();
|
const auto targetLen = target.lengthOf();
|
||||||
if(target.ordering() == 'c' && ews == 1) { // ews == 1 always here
|
if(target.ordering() == 'c' && ews == 1) { // ews == 1 always here
|
||||||
//#pragma omp parallel for simd if(targetLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
//#pragma omp parallel for simd if(targetLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)
|
||||||
for(Nd4jLong i=0; i<targetLen; ++i) {
|
for(Nd4jLong i=0; i<targetLen; ++i) {
|
||||||
|
@ -372,7 +372,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
|
||||||
|
|
||||||
const int rank = input.rankOf(); // xRank = zRank
|
const int rank = input.rankOf(); // xRank = zRank
|
||||||
const int zLen = output.lengthOf(); // xLen <= zLen
|
const int zLen = output.lengthOf(); // xLen <= zLen
|
||||||
const int repSize = repeats.size();
|
const uint repSize = repeats.size();
|
||||||
|
|
||||||
// loop through input array
|
// loop through input array
|
||||||
auto func = PRAGMA_THREADS_FOR {
|
auto func = PRAGMA_THREADS_FOR {
|
||||||
|
|
|
@ -1300,16 +1300,16 @@ void pullRowsGeneric(void *vx,
|
||||||
|
|
||||||
if (xEWS == 1 && zEWS == 1) {
|
if (xEWS == 1 && zEWS == 1) {
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (Nd4jLong i = 0; i < tadLength; i++) {
|
||||||
rZ[i] = rX[i];
|
rZ[i] = rX[i];
|
||||||
}
|
}
|
||||||
} else if (xEWS >= 1 && zEWS >= 1) {
|
} else if (xEWS >= 1 && zEWS >= 1) {
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (Nd4jLong i = 0; i < tadLength; i++) {
|
||||||
rZ[i * zEWS] = rX[i * xEWS];
|
rZ[i * zEWS] = rX[i * xEWS];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (Nd4jLong i = 0; i < tadLength; i++) {
|
||||||
auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo);
|
auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo);
|
||||||
auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo);
|
auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo);
|
||||||
hZ[zOffset] = hX[xOffset];
|
hZ[zOffset] = hX[xOffset];
|
||||||
|
|
|
@ -78,7 +78,7 @@ static void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
|
||||||
|
|
||||||
T3 val = A[aOffset] * B[bOffset]; // first iteration
|
T3 val = A[aOffset] * B[bOffset]; // first iteration
|
||||||
|
|
||||||
for (uint j = 1; j < K; ++j) { // rest iterations
|
for (int j = 1; j < K; ++j) { // rest iterations
|
||||||
aOffset += shape::stride(aShapeInfo)[aKaxis];
|
aOffset += shape::stride(aShapeInfo)[aKaxis];
|
||||||
bOffset += shape::stride(bShapeInfo)[bKaxis];
|
bOffset += shape::stride(bShapeInfo)[bKaxis];
|
||||||
val = val + A[aOffset] * B[bOffset];
|
val = val + A[aOffset] * B[bOffset];
|
||||||
|
@ -131,7 +131,7 @@ static void usualGemv(const NDArray* vA, const NDArray* vX, NDArray* vY, const
|
||||||
|
|
||||||
T3 val = A[aOffset] * X[xOffset]; // first iteration
|
T3 val = A[aOffset] * X[xOffset]; // first iteration
|
||||||
|
|
||||||
for (uint j = 1; j < N; ++j) { // rest iterations
|
for (int j = 1; j < N; ++j) { // rest iterations
|
||||||
aOffset += aNstride;
|
aOffset += aNstride;
|
||||||
xOffset += incx;
|
xOffset += incx;
|
||||||
val = val + A[aOffset] * X[xOffset];
|
val = val + A[aOffset] * X[xOffset];
|
||||||
|
@ -163,7 +163,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX,
|
||||||
|
|
||||||
T3 sum = 0;
|
T3 sum = 0;
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum))
|
||||||
for(int i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
sum += X[i * incx] * Y[i * incy];
|
sum += X[i * incx] * Y[i * incy];
|
||||||
|
|
||||||
if(betaPersent)
|
if(betaPersent)
|
||||||
|
@ -462,7 +462,7 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
|
||||||
|
|
||||||
T3 val = A[aOffset] * B[bOffset]; // first iteration
|
T3 val = A[aOffset] * B[bOffset]; // first iteration
|
||||||
|
|
||||||
for (uint j = 1; j < K; ++j) { // rest iterations
|
for (int j = 1; j < K; ++j) { // rest iterations
|
||||||
aOffset += shape::stride(aShapeInfo)[aKaxis];
|
aOffset += shape::stride(aShapeInfo)[aKaxis];
|
||||||
bOffset += shape::stride(bShapeInfo)[bKaxis];
|
bOffset += shape::stride(bShapeInfo)[bKaxis];
|
||||||
val = val + A[aOffset] * B[bOffset];
|
val = val + A[aOffset] * B[bOffset];
|
||||||
|
|
|
@ -58,7 +58,7 @@ BiDiagonalUp::BiDiagonalUp(const NDArray& matrix): _HHmatrix(nd4j::NDArrayFactor
|
||||||
|
|
||||||
T _x, _y;
|
T _x, _y;
|
||||||
|
|
||||||
for(int i = 0; i < cols-1; ++i ) {
|
for(Nd4jLong i = 0; i < cols-1; ++i ) {
|
||||||
|
|
||||||
// evaluate Householder matrix nullifying columns
|
// evaluate Householder matrix nullifying columns
|
||||||
column = new NDArray(_HHmatrix({i,rows, i,i+1}, true));
|
column = new NDArray(_HHmatrix({i,rows, i,i+1}, true));
|
||||||
|
|
|
@ -53,7 +53,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[j], j);
|
functions::indexreduce::IndexValue<X> comp(tad[j], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -74,7 +74,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[j * tadEws], j);
|
functions::indexreduce::IndexValue<X> comp(tad[j * tadEws], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -95,7 +95,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint i0 = 0; i0 < tadLen; ++i0) {
|
for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) {
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[i0 * tadStride[0]], i0);
|
functions::indexreduce::IndexValue<X> comp(tad[i0 * tadStride[0]], i0);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -118,8 +118,8 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||||
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1];
|
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1];
|
||||||
const auto tadIndex = i0 * newStride[0] + i1;
|
const auto tadIndex = i0 * newStride[0] + i1;
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
||||||
|
@ -145,9 +145,9 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||||
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2];
|
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2];
|
||||||
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2;
|
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2;
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
||||||
|
@ -174,10 +174,10 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||||
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3];
|
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3];
|
||||||
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3;
|
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3;
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
||||||
|
@ -205,11 +205,11 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint i0 = 0; i0 < tadShape[0]; ++i0) {
|
for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) {
|
||||||
for (uint i1 = 0; i1 < tadShape[1]; ++i1) {
|
for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) {
|
||||||
for (uint i2 = 0; i2 < tadShape[2]; ++i2) {
|
for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) {
|
||||||
for (uint i3 = 0; i3 < tadShape[3]; ++i3) {
|
for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) {
|
||||||
for (uint i4 = 0; i4 < tadShape[4]; ++i4) {
|
for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) {
|
||||||
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4];
|
const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4];
|
||||||
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4;
|
const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4;
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], tadIndex);
|
||||||
|
@ -238,7 +238,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[j * tadEws], j);
|
functions::indexreduce::IndexValue<X> comp(tad[j * tadEws], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -262,7 +262,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
|
@ -288,7 +288,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
auto tad = const_cast<X *>(x) + tadOffsets[i];
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (Nd4jLong j = 0; j < tadLen; j++) {
|
||||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
|
|
|
@ -374,7 +374,7 @@ T SVD<T>::secularEq(const T diff, const NDArray& col0, const NDArray& diag, cons
|
||||||
auto len = permut.lengthOf();
|
auto len = permut.lengthOf();
|
||||||
T res = 1.;
|
T res = 1.;
|
||||||
T item;
|
T item;
|
||||||
for(int i=0; i<len; ++i) {
|
for(Nd4jLong i=0; i<len; ++i) {
|
||||||
auto j = permut.e<int>(i);
|
auto j = permut.e<int>(i);
|
||||||
item = col0.e<T>(j) / ((diagShifted.e<T>(j) - diff) * (diag.e<T>(j) + shift + diff));
|
item = col0.e<T>(j) / ((diagShifted.e<T>(j) - diff) * (diag.e<T>(j) + shift + diff));
|
||||||
res += item * col0.e<T>(j);
|
res += item * col0.e<T>(j);
|
||||||
|
@ -383,7 +383,6 @@ T SVD<T>::secularEq(const T diff, const NDArray& col0, const NDArray& diag, cons
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void SVD<T>::calcSingVals(const NDArray& col0, const NDArray& diag, const NDArray& permut, NDArray& singVals, NDArray& shifts, NDArray& mus) {
|
void SVD<T>::calcSingVals(const NDArray& col0, const NDArray& diag, const NDArray& permut, NDArray& singVals, NDArray& shifts, NDArray& mus) {
|
||||||
|
@ -394,7 +393,7 @@ void SVD<T>::calcSingVals(const NDArray& col0, const NDArray& diag, const NDArra
|
||||||
while(curLen > 1 && col0.e<T>(curLen-1) == (T)0.f)
|
while(curLen > 1 && col0.e<T>(curLen-1) == (T)0.f)
|
||||||
--curLen;
|
--curLen;
|
||||||
|
|
||||||
for (int k = 0; k < len; ++k) {
|
for (Nd4jLong k = 0; k < len; ++k) {
|
||||||
|
|
||||||
if (col0.e<T>(k) == (T)0.f || curLen==1) {
|
if (col0.e<T>(k) == (T)0.f || curLen==1) {
|
||||||
|
|
||||||
|
|
|
@ -232,13 +232,13 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
if(!h) { // seqLen and h are absent
|
if(!h) { // seqLen and h are absent
|
||||||
|
|
||||||
lstmLayerCell(xSet->at(0), Wx, Wr, b, h0, c0, Wp, params, ht, ct); // first time step
|
lstmLayerCell(xSet->at(0), Wx, Wr, b, h0, c0, Wp, params, ht, ct); // first time step
|
||||||
for (int t = 1; t < sL; ++t)
|
for (Nd4jLong t = 1; t < sL; ++t)
|
||||||
lstmLayerCell(xSet->at(t), Wx, Wr, b, ht, ct, Wp, params, ht, ct); // rest time steps
|
lstmLayerCell(xSet->at(t), Wx, Wr, b, ht, ct, Wp, params, ht, ct); // rest time steps
|
||||||
}
|
}
|
||||||
else { // seqLen is absent and h is present
|
else { // seqLen is absent and h is present
|
||||||
|
|
||||||
lstmLayerCell(xSet->at(0), Wx, Wr, b, h0, c0, Wp, params, hSet->at(0), ct); // first time step
|
lstmLayerCell(xSet->at(0), Wx, Wr, b, h0, c0, Wp, params, hSet->at(0), ct); // first time step
|
||||||
for (int t = 1; t < sL; ++t)
|
for (Nd4jLong t = 1; t < sL; ++t)
|
||||||
lstmLayerCell(xSet->at(t), Wx, Wr, b, hSet->at(t - 1), ct, Wp, params, hSet->at(t), ct); // rest time steps
|
lstmLayerCell(xSet->at(t), Wx, Wr, b, hSet->at(t - 1), ct, Wp, params, hSet->at(t), ct); // rest time steps
|
||||||
|
|
||||||
if(hL)
|
if(hL)
|
||||||
|
@ -249,7 +249,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
|
|
||||||
if(!h) { // seqLen is present and h is absent
|
if(!h) { // seqLen is present and h is absent
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
const int limit = seqLen->e<int>(e);
|
const int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
@ -272,7 +272,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
}
|
}
|
||||||
else { // seqLen and h are present
|
else { // seqLen and h are present
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
int limit = seqLen->e<int>(e);
|
int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
@ -312,13 +312,13 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
if(!h) { // seqLen and h are absent
|
if(!h) { // seqLen and h are absent
|
||||||
|
|
||||||
lstmLayerCell(xSet->at(sL - 1), Wx, Wr, b, h0, c0, Wp, params, ht, ct); // first time step
|
lstmLayerCell(xSet->at(sL - 1), Wx, Wr, b, h0, c0, Wp, params, ht, ct); // first time step
|
||||||
for (int t = sL - 2; t >= 0; --t)
|
for (Nd4jLong t = sL - 2; t >= 0; --t)
|
||||||
lstmLayerCell(xSet->at(t), Wx, Wr, b, ht, ct, Wp, params, ht, ct); // rest time steps
|
lstmLayerCell(xSet->at(t), Wx, Wr, b, ht, ct, Wp, params, ht, ct); // rest time steps
|
||||||
}
|
}
|
||||||
else { // seqLen is absent and h is present
|
else { // seqLen is absent and h is present
|
||||||
|
|
||||||
lstmLayerCell(xSet->at(sL - 1), Wx, Wr, b, h0, c0, Wp, params, hSet->at(sL - 1), ct); // first time step
|
lstmLayerCell(xSet->at(sL - 1), Wx, Wr, b, h0, c0, Wp, params, hSet->at(sL - 1), ct); // first time step
|
||||||
for (int t = sL - 2; t >= 0; --t)
|
for (Nd4jLong t = sL - 2; t >= 0; --t)
|
||||||
lstmLayerCell(xSet->at(t), Wx, Wr, b, hSet->at(t + 1), ct, Wp, params, hSet->at(t), ct); // rest time steps
|
lstmLayerCell(xSet->at(t), Wx, Wr, b, hSet->at(t + 1), ct, Wp, params, hSet->at(t), ct); // rest time steps
|
||||||
|
|
||||||
if(hL)
|
if(hL)
|
||||||
|
@ -329,7 +329,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
|
|
||||||
if(!h) { // h is absent and seqLen is present
|
if(!h) { // h is absent and seqLen is present
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
const int limit = seqLen->e<int>(e);
|
const int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
@ -344,7 +344,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
auto ind = getBatchTimeTotalIndex(dataFormat, sL, bS, sL - 1, e);
|
auto ind = getBatchTimeTotalIndex(dataFormat, sL, bS, sL - 1, e);
|
||||||
lstmLayerCell(xSet->at(ind), Wx, Wr, b, h0Set->at(e), c0Set->at(e), Wp, params, htSet->at(e), ctSet->at(e)); // first time step
|
lstmLayerCell(xSet->at(ind), Wx, Wr, b, h0Set->at(e), c0Set->at(e), Wp, params, htSet->at(e), ctSet->at(e)); // first time step
|
||||||
|
|
||||||
for (int t = sL - 2; t >= sL - limit; --t) {
|
for (Nd4jLong t = sL - 2; t >= sL - limit; --t) {
|
||||||
ind = getBatchTimeTotalIndex(dataFormat, sL, bS, t, e);
|
ind = getBatchTimeTotalIndex(dataFormat, sL, bS, t, e);
|
||||||
lstmLayerCell(xSet->at(ind), Wx, Wr, b, htSet->at(e), ctSet->at(e), Wp, params, htSet->at(e), ctSet->at(e)); // rest time steps
|
lstmLayerCell(xSet->at(ind), Wx, Wr, b, htSet->at(e), ctSet->at(e), Wp, params, htSet->at(e), ctSet->at(e)); // rest time steps
|
||||||
}
|
}
|
||||||
|
@ -352,7 +352,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
}
|
}
|
||||||
else { // seqLen and h are present
|
else { // seqLen and h are present
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
int limit = seqLen->e<int>(e);
|
int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
@ -371,7 +371,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
auto indPrev = getBatchTimeTotalIndex(dataFormat, sL, bS, sL - 1, e);
|
auto indPrev = getBatchTimeTotalIndex(dataFormat, sL, bS, sL - 1, e);
|
||||||
lstmLayerCell(xSet->at(indPrev), Wx, Wr, b, h0Set->at(e), c0Set->at(e), Wp, params, hSet->at(indPrev), ctSet->at(e)); // first time step
|
lstmLayerCell(xSet->at(indPrev), Wx, Wr, b, h0Set->at(e), c0Set->at(e), Wp, params, hSet->at(indPrev), ctSet->at(e)); // first time step
|
||||||
|
|
||||||
for (int t = sL - 2; t >= sL - limit; --t) {
|
for (Nd4jLong t = sL - 2; t >= sL - limit; --t) {
|
||||||
auto indCurr = getBatchTimeTotalIndex(dataFormat, sL, bS, t, e);
|
auto indCurr = getBatchTimeTotalIndex(dataFormat, sL, bS, t, e);
|
||||||
lstmLayerCell(xSet->at(indCurr), Wx, Wr, b, hSet->at(indPrev), ctSet->at(e), Wp, params, hSet->at(indCurr), ctSet->at(e)); // rest time steps
|
lstmLayerCell(xSet->at(indCurr), Wx, Wr, b, hSet->at(indPrev), ctSet->at(e), Wp, params, hSet->at(indCurr), ctSet->at(e)); // rest time steps
|
||||||
indPrev = indCurr;
|
indPrev = indCurr;
|
||||||
|
@ -388,7 +388,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
|
|
||||||
if(!h) { // h is absent and seqLen is present
|
if(!h) { // h is absent and seqLen is present
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
const int limit = seqLen->e<int>(e);
|
const int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
@ -411,7 +411,7 @@ void lstmLayerTimeLoop(const NDArray* x, const NDArray* Wx, const NDArray* Wr,
|
||||||
}
|
}
|
||||||
else { // seqLen and h are present
|
else { // seqLen and h are present
|
||||||
|
|
||||||
for (int e = 0; e < bS; ++e) {
|
for (Nd4jLong e = 0; e < bS; ++e) {
|
||||||
|
|
||||||
int limit = seqLen->e<int>(e);
|
int limit = seqLen->e<int>(e);
|
||||||
|
|
||||||
|
|
|
@ -80,7 +80,7 @@ namespace nd4j {
|
||||||
valueCoords[e] = indices.e<Nd4jLong>(e);
|
valueCoords[e] = indices.e<Nd4jLong>(e);
|
||||||
|
|
||||||
// write results individually
|
// write results individually
|
||||||
for (uint64_t e = 0; e < numElements; e++) {
|
for (Nd4jLong e = 0; e < numElements; e++) {
|
||||||
auto vIndex = shape::coords2index(output.shapeInfo(), valueCoords.data());
|
auto vIndex = shape::coords2index(output.shapeInfo(), valueCoords.data());
|
||||||
auto cLength = 0L;
|
auto cLength = 0L;
|
||||||
std::string str;
|
std::string str;
|
||||||
|
|
|
@ -33,7 +33,7 @@ namespace helpers {
|
||||||
|
|
||||||
std::vector<T> values;
|
std::vector<T> values;
|
||||||
|
|
||||||
for (int e = 0; e < input->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
|
||||||
T v = input->e<T>(e);
|
T v = input->e<T>(e);
|
||||||
if (std::find(values.begin(), values.end(), v) == values.end()) {
|
if (std::find(values.begin(), values.end(), v) == values.end()) {
|
||||||
values.push_back(v);
|
values.push_back(v);
|
||||||
|
@ -56,7 +56,7 @@ namespace helpers {
|
||||||
MAP_IMPL<T, int> indicesMap;
|
MAP_IMPL<T, int> indicesMap;
|
||||||
MAP_IMPL<T, int> countsMap;
|
MAP_IMPL<T, int> countsMap;
|
||||||
|
|
||||||
for (int e = 0; e < input->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
|
||||||
T v = input->e<T>(e);
|
T v = input->e<T>(e);
|
||||||
if (std::find(valuesVector.begin(), valuesVector.end(), v) == valuesVector.end()) {
|
if (std::find(valuesVector.begin(), valuesVector.end(), v) == valuesVector.end()) {
|
||||||
valuesVector.push_back(v);
|
valuesVector.push_back(v);
|
||||||
|
@ -77,7 +77,7 @@ namespace helpers {
|
||||||
};
|
};
|
||||||
samediff::Threads::parallel_for(func, 0, values->lengthOf());
|
samediff::Threads::parallel_for(func, 0, values->lengthOf());
|
||||||
|
|
||||||
for (int e = 0; e < indices->lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
|
||||||
auto posI = std::find(valuesVector.begin(), valuesVector.end(), input->e<T>(e));
|
auto posI = std::find(valuesVector.begin(), valuesVector.end(), input->e<T>(e));
|
||||||
auto dist = std::distance(valuesVector.begin(), posI);
|
auto dist = std::distance(valuesVector.begin(), posI);
|
||||||
indices->p(e, Nd4jLong(dist));//indicesMap[(*input)(e)];
|
indices->p(e, Nd4jLong(dist));//indicesMap[(*input)(e)];
|
||||||
|
|
|
@ -30,7 +30,7 @@ namespace nd4j {
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
|
||||||
Nd4jLong idx[MAX_RANK];
|
Nd4jLong idx[MAX_RANK];
|
||||||
for (int e = 0; e < condition.lengthOf(); e++) {
|
for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
|
||||||
shape::index2coords(e, condition.getShapeInfo(), idx);
|
shape::index2coords(e, condition.getShapeInfo(), idx);
|
||||||
|
|
||||||
auto offset = shape::getOffset(condition.getShapeInfo(), idx);
|
auto offset = shape::getOffset(condition.getShapeInfo(), idx);
|
||||||
|
|
Loading…
Reference in New Issue