[WIP] bunch of improvements (#257)
* - profiling bias_add op - add some documentation Signed-off-by: Yurii <yurii@skymind.io> * - minor change Signed-off-by: Yurii <yurii@skymind.io> * - provide addBias cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - improve shape::getIndexOffset and change its signature Signed-off-by: Yurii <yurii@skymind.io> * - same as previous Signed-off-by: Yurii <yurii@skymind.io> * - improve and change signature in some shape:: stuff which has to do with calculation of offsets for array elements Signed-off-by: Yurii <yurii@skymind.io> * - minor changes in flatten Signed-off-by: Yurii <shyrma@skymind.io> * - add function shape::getIndexOffsetOrdered Signed-off-by: Yurii <shyrma@skymind.io> * - correct shape::getIndexOffsetOrdered() Signed-off-by: Yurii <shyrma@skymind.io> * - move getIndexOffsetOrdered to flatten.h header in order to isolate this function Signed-off-by: Yurii <shyrma@skymind.io> master
parent
3e73e9b56e
commit
589401477d
|
@ -1770,7 +1770,7 @@ NDArray NDArray::operator()(const Nd4jLong i) const {
|
||||||
} else {
|
} else {
|
||||||
Nd4jLong idx[MAX_RANK];
|
Nd4jLong idx[MAX_RANK];
|
||||||
shape::ind2subC(rankOf(), shapeOf(), i, idx);
|
shape::ind2subC(rankOf(), shapeOf(), i, idx);
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), idx);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1801,7 +1801,7 @@ NDArray& NDArray::operator()(const Nd4jLong i) {
|
||||||
} else {
|
} else {
|
||||||
Nd4jLong idx[MAX_RANK];
|
Nd4jLong idx[MAX_RANK];
|
||||||
shape::ind2subC(rankOf(), shapeOf(), i, idx);
|
shape::ind2subC(rankOf(), shapeOf(), i, idx);
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), idx);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1818,7 +1818,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j) const {
|
||||||
throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !");
|
throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !");
|
||||||
|
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
// TODO: do we really want a view here?
|
// TODO: do we really want a view here?
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
|
@ -1834,7 +1834,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j) {
|
||||||
throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !");
|
throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !");
|
||||||
|
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1853,7 +1853,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k
|
||||||
throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
||||||
|
|
||||||
Nd4jLong coords[3] = {i, j, k};
|
Nd4jLong coords[3] = {i, j, k};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1870,7 +1870,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong
|
||||||
throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
||||||
|
|
||||||
Nd4jLong coords[3] = {i, j, k};
|
Nd4jLong coords[3] = {i, j, k};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1886,7 +1886,7 @@ NDArray NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong v
|
||||||
throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !");
|
throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !");
|
||||||
|
|
||||||
Nd4jLong coords[4] = {t, u, v, w};
|
Nd4jLong coords[4] = {t, u, v, w};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1900,7 +1900,7 @@ NDArray& NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong
|
||||||
throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !");
|
throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !");
|
||||||
|
|
||||||
Nd4jLong coords[4] = {t, u, v, w};
|
Nd4jLong coords[4] = {t, u, v, w};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
// FIXME
|
// FIXME
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
|
@ -1916,7 +1916,7 @@ NDArray NDArray::operator()(const Nd4jLong* idx) const {
|
||||||
if (idx[i] >= sizeAt(i))
|
if (idx[i] >= sizeAt(i))
|
||||||
throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !");
|
throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !");
|
||||||
|
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), idx);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -1931,7 +1931,7 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) {
|
||||||
if (idx[i] >= sizeAt(i))
|
if (idx[i] >= sizeAt(i))
|
||||||
throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !");
|
throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !");
|
||||||
|
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), idx);
|
||||||
|
|
||||||
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
auto cast = reinterpret_cast<int8_t *>(_buffer) + (xOffset * this->sizeOfT());
|
||||||
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace()));
|
||||||
|
@ -2067,7 +2067,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j) {
|
||||||
syncToHost();
|
syncToHost();
|
||||||
|
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto offset = shape::getOffset(getShapeInfo(), coords);
|
||||||
tickWriteHost();
|
tickWriteHost();
|
||||||
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
||||||
}
|
}
|
||||||
|
@ -2084,7 +2084,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) {
|
||||||
syncToHost();
|
syncToHost();
|
||||||
|
|
||||||
Nd4jLong coords[3] = {i, j, k};
|
Nd4jLong coords[3] = {i, j, k};
|
||||||
auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto offset = shape::getOffset(getShapeInfo(), coords);
|
||||||
tickWriteHost();
|
tickWriteHost();
|
||||||
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
||||||
}
|
}
|
||||||
|
@ -2118,7 +2118,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const {
|
||||||
syncToHost();
|
syncToHost();
|
||||||
|
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto offset = shape::getOffset(getShapeInfo(), coords);
|
||||||
tickReadHost();
|
tickReadHost();
|
||||||
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
||||||
}
|
}
|
||||||
|
@ -2135,7 +2135,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const {
|
||||||
syncToHost();
|
syncToHost();
|
||||||
|
|
||||||
Nd4jLong coords[3] = {i, j, k};
|
Nd4jLong coords[3] = {i, j, k};
|
||||||
auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto offset = shape::getOffset(getShapeInfo(), coords);
|
||||||
tickReadHost();
|
tickReadHost();
|
||||||
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
return *(reinterpret_cast<T*>(bufferWithOffset(offset)));
|
||||||
}
|
}
|
||||||
|
|
|
@ -808,7 +808,7 @@ void NDArray::templatedSet(void *buffer, const Nd4jLong *indices, const void *va
|
||||||
auto t = reinterpret_cast<T *>(buffer);
|
auto t = reinterpret_cast<T *>(buffer);
|
||||||
const auto y = *(reinterpret_cast<const Y *>(value));
|
const auto y = *(reinterpret_cast<const Y *>(value));
|
||||||
|
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), indices, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), indices);
|
||||||
t[xOffset] = static_cast<T>(y);
|
t[xOffset] = static_cast<T>(y);
|
||||||
}
|
}
|
||||||
BUILD_DOUBLE_TEMPLATE(template void NDArray::templatedSet, (void *buffer, const Nd4jLong *indices, const void *value), LIBND4J_TYPES, LIBND4J_TYPES);
|
BUILD_DOUBLE_TEMPLATE(template void NDArray::templatedSet, (void *buffer, const Nd4jLong *indices, const void *value), LIBND4J_TYPES, LIBND4J_TYPES);
|
||||||
|
@ -2462,14 +2462,13 @@ double NDArray::getTrace() const {
|
||||||
|
|
||||||
int rank = rankOf();
|
int rank = rankOf();
|
||||||
auto shape = shapeOf();
|
auto shape = shapeOf();
|
||||||
auto strides = stridesOf();
|
|
||||||
int minDim = 100000000;
|
int minDim = 100000000;
|
||||||
|
|
||||||
Nd4jLong indices[MAX_RANK];
|
Nd4jLong indices[MAX_RANK];
|
||||||
for(int j = 0; j < rank; ++j)
|
for(int j = 0; j < rank; ++j)
|
||||||
indices[j] = 1;
|
indices[j] = 1;
|
||||||
|
|
||||||
auto offset = shape::getOffset(0, shape, strides, indices, rank);
|
auto offset = shape::getOffset(getShapeInfo(), indices);
|
||||||
|
|
||||||
for(int i = 0; i < rank; ++i)
|
for(int i = 0; i < rank; ++i)
|
||||||
if(minDim > shape[i])
|
if(minDim > shape[i])
|
||||||
|
@ -3472,7 +3471,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j) const {
|
||||||
throw std::invalid_argument("NDArray::e(i,j): one of input indexes is out of array length or rank!=2 !");
|
throw std::invalid_argument("NDArray::e(i,j): one of input indexes is out of array length or rank!=2 !");
|
||||||
|
|
||||||
const Nd4jLong coords[2] = {i, j};
|
const Nd4jLong coords[2] = {i, j};
|
||||||
const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
const auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
NDArray::preparePrimaryUse({}, {this});
|
NDArray::preparePrimaryUse({}, {this});
|
||||||
NDArray::registerPrimaryUse({}, {this});
|
NDArray::registerPrimaryUse({}, {this});
|
||||||
|
@ -3492,7 +3491,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) const {
|
||||||
throw std::invalid_argument("NDArray::e(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
throw std::invalid_argument("NDArray::e(i,j,k): one of input indexes is out of array length or rank!=3 !");
|
||||||
|
|
||||||
const Nd4jLong coords[3] = {i, j, k};
|
const Nd4jLong coords[3] = {i, j, k};
|
||||||
const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
const auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
NDArray::preparePrimaryUse({}, {this});
|
NDArray::preparePrimaryUse({}, {this});
|
||||||
NDArray::registerPrimaryUse({}, {this});
|
NDArray::registerPrimaryUse({}, {this});
|
||||||
|
@ -3512,7 +3511,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLon
|
||||||
throw std::invalid_argument("NDArray::e(i,j,k,l): one of input indexes is out of array length or rank!=4 !");
|
throw std::invalid_argument("NDArray::e(i,j,k,l): one of input indexes is out of array length or rank!=4 !");
|
||||||
|
|
||||||
const Nd4jLong coords[4] = {i, j, k, l};
|
const Nd4jLong coords[4] = {i, j, k, l};
|
||||||
const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
const auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
NDArray::preparePrimaryUse({}, {this});
|
NDArray::preparePrimaryUse({}, {this});
|
||||||
NDArray::registerPrimaryUse({}, {this});
|
NDArray::registerPrimaryUse({}, {this});
|
||||||
|
@ -4095,7 +4094,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const T value) {
|
||||||
|
|
||||||
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
NDArray::preparePrimaryUse({this}, {}, true);
|
NDArray::preparePrimaryUse({this}, {}, true);
|
||||||
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
||||||
|
@ -4127,7 +4126,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const T va
|
||||||
|
|
||||||
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
||||||
Nd4jLong coords[3] = {i, j, k};
|
Nd4jLong coords[3] = {i, j, k};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
||||||
NDArray::registerPrimaryUse({this}, {});
|
NDArray::registerPrimaryUse({this}, {});
|
||||||
}
|
}
|
||||||
|
@ -4154,7 +4153,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4j
|
||||||
|
|
||||||
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
void *p = reinterpret_cast<void *>(const_cast<T *>(&value));
|
||||||
Nd4jLong coords[4] = {i, j, k, l};
|
Nd4jLong coords[4] = {i, j, k, l};
|
||||||
auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf());
|
auto xOffset = shape::getOffset(getShapeInfo(), coords);
|
||||||
|
|
||||||
NDArray::preparePrimaryUse({this}, {}, true);
|
NDArray::preparePrimaryUse({this}, {}, true);
|
||||||
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES);
|
||||||
|
@ -4409,7 +4408,7 @@ Nd4jLong NDArray::getOffset(const Nd4jLong i) const {
|
||||||
if (i >= lengthOf())
|
if (i >= lengthOf())
|
||||||
throw std::invalid_argument("NDArray::getOffset: input index is out of array length !");
|
throw std::invalid_argument("NDArray::getOffset: input index is out of array length !");
|
||||||
|
|
||||||
return shape::getIndexOffset(i, _shapeInfo, lengthOf());
|
return shape::getIndexOffset(i, _shapeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
NDArray NDArray::like() {
|
NDArray NDArray::like() {
|
||||||
|
@ -4455,7 +4454,7 @@ NDArray* NDArray::diagonal(const char type) const {
|
||||||
indices[i] = 1;
|
indices[i] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto step = shape::getOffset(0, shapeOf(), stridesOf(), indices, rank);
|
auto step = shape::getOffset(getShapeInfo(), indices);
|
||||||
|
|
||||||
if(type == 'c') {
|
if(type == 'c') {
|
||||||
outShapeInfo[1] = diagSize;
|
outShapeInfo[1] = diagSize;
|
||||||
|
|
|
@ -103,8 +103,8 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords))
|
||||||
for (Nd4jLong i = 0; i < zLen; ++i) {
|
for (Nd4jLong i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(zRank, target->shapeOf(), i, zLen, coords.data());
|
shape::index2coords(i, target->getShapeInfo(), coords.data());
|
||||||
const auto zOffset = shape::getOffset(0, target->shapeOf(), target->stridesOf(), coords.data(), zRank);
|
const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data());
|
||||||
|
|
||||||
// if( (row + upper < col) || (row + lower > col) )
|
// if( (row + upper < col) || (row + lower > col) )
|
||||||
if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
|
if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
|
||||||
|
@ -112,7 +112,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char
|
||||||
else if(this != target) { // when this and target are different arrays
|
else if(this != target) { // when this and target are different arrays
|
||||||
if(xRank != zRank)
|
if(xRank != zRank)
|
||||||
coords[0] = coords[1];
|
coords[0] = coords[1];
|
||||||
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shapeOf(), stridesOf(), coords.data(), xRank);
|
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords.data());
|
||||||
z[zOffset] = x[xOffset];
|
z[zOffset] = x[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -128,13 +128,12 @@ void NDArray::setIdentity() {
|
||||||
|
|
||||||
int rank = rankOf();
|
int rank = rankOf();
|
||||||
auto shape = shapeOf();
|
auto shape = shapeOf();
|
||||||
auto strides = stridesOf();
|
|
||||||
int minDim = MAX_INT;
|
int minDim = MAX_INT;
|
||||||
Nd4jLong indices[MAX_RANK];
|
Nd4jLong indices[MAX_RANK];
|
||||||
for(int j = 0; j < rank; ++j)
|
for(int j = 0; j < rank; ++j)
|
||||||
indices[j] = 1;
|
indices[j] = 1;
|
||||||
|
|
||||||
Nd4jLong offset = shape::getOffset(0, shape, strides, indices, rank);
|
Nd4jLong offset = shape::getOffset(getShapeInfo(), indices);
|
||||||
|
|
||||||
for(int i = 0; i < rank; ++i)
|
for(int i = 0; i < rank; ++i)
|
||||||
if(minDim > shape[i])
|
if(minDim > shape[i])
|
||||||
|
@ -380,9 +379,9 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
|
||||||
for (Nd4jLong i = 0; i < zLen; ++i) {
|
for (Nd4jLong i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());
|
shape::index2coords(i, output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
if(repSize > 1) {
|
if(repSize > 1) {
|
||||||
for (uint j = 0; j < repSize; ++j) {
|
for (uint j = 0; j < repSize; ++j) {
|
||||||
|
@ -396,7 +395,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
|
||||||
else
|
else
|
||||||
coords[axis] /= repeats[0];
|
coords[axis] /= repeats[0];
|
||||||
|
|
||||||
z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
|
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1385,8 +1385,8 @@ void pullRowsGeneric(void *vx,
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (int i = 0; i < tadLength; i++) {
|
||||||
auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo, tadLength);
|
auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo);
|
||||||
auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo, tadLength);
|
auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo);
|
||||||
hZ[zOffset] = hX[xOffset];
|
hZ[zOffset] = hX[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1450,7 +1450,7 @@ void tearGeneric(void *vx,
|
||||||
else {
|
else {
|
||||||
|
|
||||||
for (Nd4jLong j = 0; j < tadLength; j++)
|
for (Nd4jLong j = 0; j < tadLength; j++)
|
||||||
hZ[shape::getIndexOffset(j, hZShapeInfo, tadLength)] = s[shape::getIndexOffset(j, tadShapeInfo, tadLength)];
|
hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1597,7 +1597,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (Nd4jLong i = 0; i < tadLength; i++) {
|
for (Nd4jLong i = 0; i < tadLength; i++) {
|
||||||
auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f], tadLength);
|
auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]);
|
||||||
nd4j::math::nd4j_swap<T>(hX[offset + oldOffset], hX[offset + newOffset]);
|
nd4j::math::nd4j_swap<T>(hX[offset + oldOffset], hX[offset + newOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,8 +106,8 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(zRank, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), i, zLen, coords);
|
shape::index2coords(i, zShapeInfo, coords);
|
||||||
const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), coords, zRank);
|
const auto zOffset = shape::getOffset(zShapeInfo, coords);
|
||||||
|
|
||||||
// if( (row + upper < col) || (row + lower > col) )
|
// if( (row + upper < col) || (row + lower > col) )
|
||||||
if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
|
if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
|
||||||
|
@ -115,7 +115,7 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
|
||||||
else if(vx != vz) { // when x and z are different arrays
|
else if(vx != vz) { // when x and z are different arrays
|
||||||
if(xRank != zRank)
|
if(xRank != zRank)
|
||||||
coords[0] = coords[1];
|
coords[0] = coords[1];
|
||||||
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), coords, xRank);
|
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(xShapeInfo, coords);
|
||||||
z[zOffset] = x[xOffset];
|
z[zOffset] = x[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -177,8 +177,8 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < len; i += totalThreads) {
|
for (Nd4jLong i = tid; i < len; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(rank, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), i, len, coords);
|
shape::index2coords(i, xShapeInfo, coords);
|
||||||
const auto offset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), coords, rank);
|
const auto offset = shape::getOffset(xShapeInfo, coords);
|
||||||
|
|
||||||
if(coords[rank - 2] == coords[rank - 1]) // row == col -> on diagonal
|
if(coords[rank - 2] == coords[rank - 1]) // row == col -> on diagonal
|
||||||
x[offset] = val;
|
x[offset] = val;
|
||||||
|
@ -424,9 +424,9 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords);
|
shape::index2coords(i, zShapeInfo, coords);
|
||||||
|
|
||||||
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
|
const auto zOffset = shape::getOffset(zShapeInfo, coords);
|
||||||
|
|
||||||
if(repSize > 1) {
|
if(repSize > 1) {
|
||||||
for (uint j = 0; j < repSize; ++j) {
|
for (uint j = 0; j < repSize; ++j) {
|
||||||
|
@ -440,7 +440,7 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
||||||
else
|
else
|
||||||
coords[axis] /= repeats[0];
|
coords[axis] /= repeats[0];
|
||||||
|
|
||||||
z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)];
|
z[zOffset] = x[shape::getOffset(xShapeInfo, coords)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,8 @@
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) {
|
static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) {
|
||||||
return shape::getIndexOffset(index, shapeInfo, length);
|
return shape::getIndexOffset(index, shapeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Nd4jLong __device__ __noinline__ __length(Nd4jLong *shapeInfo) {
|
static Nd4jLong __device__ __noinline__ __length(Nd4jLong *shapeInfo) {
|
||||||
|
@ -103,8 +103,8 @@ static _CUDA_G void lambdaKernel(void* vx, Nd4jLong *xShapeInfo, void *vz, Nd4jL
|
||||||
z[e * zEws] = lambda(x[e * xEws]);
|
z[e * zEws] = lambda(x[e * xEws]);
|
||||||
} else {
|
} else {
|
||||||
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
||||||
auto xOffset = __getIndexOffset(e, xShapeInfo, zLength);
|
auto xOffset = __getIndexOffset(e, xShapeInfo);
|
||||||
auto zOffset = __getIndexOffset(e, zShapeInfo, zLength);
|
auto zOffset = __getIndexOffset(e, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = lambda(x[xOffset]);
|
z[zOffset] = lambda(x[xOffset]);
|
||||||
}
|
}
|
||||||
|
@ -132,8 +132,8 @@ static _CUDA_G void lambdaIndexedKernel(void* vx, Nd4jLong *xShapeInfo, void *vz
|
||||||
z[e * zEws] = lambda(e, x[e * xEws]);
|
z[e * zEws] = lambda(e, x[e * xEws]);
|
||||||
} else {
|
} else {
|
||||||
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
||||||
auto xOffset = __getIndexOffset(e, xShapeInfo, zLength);
|
auto xOffset = __getIndexOffset(e, xShapeInfo);
|
||||||
auto zOffset = __getIndexOffset(e, zShapeInfo, zLength);
|
auto zOffset = __getIndexOffset(e, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = lambda(e, x[xOffset]);
|
z[zOffset] = lambda(e, x[xOffset]);
|
||||||
}
|
}
|
||||||
|
@ -164,9 +164,9 @@ static _CUDA_G void lambdaIndexedPairwiseKernel(void* vx, Nd4jLong *xShapeInfo,
|
||||||
z[e * zEws] = lambda(e, x[e * xEws], y[e * yEws]);
|
z[e * zEws] = lambda(e, x[e * xEws], y[e * yEws]);
|
||||||
} else {
|
} else {
|
||||||
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
||||||
auto xOffset = __getIndexOffset(e, xShapeInfo, zLength);
|
auto xOffset = __getIndexOffset(e, xShapeInfo);
|
||||||
auto yOffset = __getIndexOffset(e, yShapeInfo, zLength);
|
auto yOffset = __getIndexOffset(e, yShapeInfo);
|
||||||
auto zOffset = __getIndexOffset(e, zShapeInfo, zLength);
|
auto zOffset = __getIndexOffset(e, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = lambda(e, x[xOffset], y[yOffset]);
|
z[zOffset] = lambda(e, x[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
|
@ -197,9 +197,9 @@ static _CUDA_G void lambdaPairwiseKernel(void* vx, Nd4jLong *xShapeInfo, void* v
|
||||||
z[e * zEws] = lambda(x[e * xEws], y[e * yEws]);
|
z[e * zEws] = lambda(x[e * xEws], y[e * yEws]);
|
||||||
} else {
|
} else {
|
||||||
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
||||||
auto xOffset = __getIndexOffset(e, xShapeInfo, zLength);
|
auto xOffset = __getIndexOffset(e, xShapeInfo);
|
||||||
auto yOffset = __getIndexOffset(e, yShapeInfo, zLength);
|
auto yOffset = __getIndexOffset(e, yShapeInfo);
|
||||||
auto zOffset = __getIndexOffset(e, zShapeInfo, zLength);
|
auto zOffset = __getIndexOffset(e, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = lambda(x[xOffset], y[yOffset]);
|
z[zOffset] = lambda(x[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
|
@ -233,10 +233,10 @@ static _CUDA_G void lambdaTriplewiseKernel(void* vw, Nd4jLong *wShapeInfo, void*
|
||||||
z[e * zEws] = lambda(w[e * wEws], x[e * xEws], y[e * yEws]);
|
z[e * zEws] = lambda(w[e * wEws], x[e * xEws], y[e * yEws]);
|
||||||
} else {
|
} else {
|
||||||
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) {
|
||||||
auto wOffset = __getIndexOffset(e, wShapeInfo, zLength);
|
auto wOffset = __getIndexOffset(e, wShapeInfo);
|
||||||
auto xOffset = __getIndexOffset(e, xShapeInfo, zLength);
|
auto xOffset = __getIndexOffset(e, xShapeInfo);
|
||||||
auto yOffset = __getIndexOffset(e, yShapeInfo, zLength);
|
auto yOffset = __getIndexOffset(e, yShapeInfo);
|
||||||
auto zOffset = __getIndexOffset(e, zShapeInfo, zLength);
|
auto zOffset = __getIndexOffset(e, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = lambda(w[wOffset], x[xOffset], y[yOffset]);
|
z[zOffset] = lambda(w[wOffset], x[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -3228,8 +3228,8 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr
|
||||||
|
|
||||||
for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
|
||||||
|
|
||||||
const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX);
|
const auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY);
|
const auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
|
|
||||||
switch (opCode) {
|
switch (opCode) {
|
||||||
case 0:
|
case 0:
|
||||||
|
|
|
@ -246,9 +246,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto lenPerThread = static_cast<uint>(threadsInfo.getItersPerThread(threadNum));
|
auto lenPerThread = static_cast<uint>(threadsInfo.getItersPerThread(threadNum));
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (uint i = 0; i < lenPerThread; i++) {
|
for (uint i = 0; i < lenPerThread; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, len, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -452,7 +452,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
for (uint j = 0; j < tadLen; j++)
|
for (uint j = 0; j < tadLen; j++)
|
||||||
start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams);
|
||||||
|
|
||||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ);
|
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||||
z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -469,7 +469,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto start = OpType::startingValue(tad);
|
auto start = OpType::startingValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (uint j = 0; j < tadLen; j++) {
|
||||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad);
|
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -491,11 +491,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
// auto start = OpType::startingValue(tad);
|
// auto start = OpType::startingValue(tad);
|
||||||
|
|
||||||
// for (uint j = 0; j < tadLen; j++) {
|
// for (uint j = 0; j < tadLen; j++) {
|
||||||
// auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad);
|
// auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
// start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams);
|
// start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ);
|
// auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||||
// z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
// z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
@ -517,7 +517,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
for (uint j = 0; j < tadLen; j++)
|
for (uint j = 0; j < tadLen; j++)
|
||||||
start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams);
|
||||||
|
|
||||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ);
|
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||||
z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
z[zOffset] = OpType::postProcess(start, tadLen, extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -658,13 +658,13 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (uint i = 0; i < lenPerThread; i++) {
|
for (uint i = 0; i < lenPerThread; i++) {
|
||||||
const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX);
|
const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX);
|
||||||
zi[i * zEws] = OpType::op(x[xOffset], extraParams);
|
zi[i * zEws] = OpType::op(x[xOffset], extraParams);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (uint i = 0; i < lenPerThread; i++) {
|
for (uint i = 0; i < lenPerThread; i++) {
|
||||||
const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX);
|
const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX);
|
||||||
zi[i] = OpType::op(x[xOffset], extraParams);
|
zi[i] = OpType::op(x[xOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -782,8 +782,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (uint i = 0; i < lenPerThread; i++) {
|
for (uint i = 0; i < lenPerThread; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1123,7 +1123,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto start = OpType::startingValue(xTad);
|
auto start = OpType::startingValue(xTad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; ++j) {
|
for (uint j = 0; j < tadLen; ++j) {
|
||||||
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad);
|
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||||
start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1147,8 +1147,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto start = OpType::startingValue(xTad);
|
auto start = OpType::startingValue(xTad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; ++j) {
|
for (uint j = 0; j < tadLen; ++j) {
|
||||||
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad);
|
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||||
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, tadLen, canCastYTad);
|
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
|
||||||
start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1423,7 +1423,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto start = startVal;
|
auto start = startVal;
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; ++j) {
|
for (uint j = 0; j < tadLen; ++j) {
|
||||||
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad);
|
const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||||
start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams);
|
z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams);
|
||||||
|
@ -1449,8 +1449,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo,
|
||||||
auto start = startVal;
|
auto start = startVal;
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; ++j) {
|
for (uint j = 0; j < tadLen; ++j) {
|
||||||
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad);
|
const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad);
|
||||||
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, tadLen, canCastYTad);
|
const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad);
|
||||||
start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// @author iuriish@yahoo.com
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#ifndef LIBND4J_SHAPEUTILS_H
|
#ifndef LIBND4J_SHAPEUTILS_H
|
||||||
|
|
|
@ -526,7 +526,7 @@ namespace shape {
|
||||||
/* int *sub = new int[leftOverIndexLen];
|
/* int *sub = new int[leftOverIndexLen];
|
||||||
shape::ind2subOrder(tadShape,index,len,sub);
|
shape::ind2subOrder(tadShape,index,len,sub);
|
||||||
*/
|
*/
|
||||||
shape::index2coords(leftOverIndexLen,tadShape, index,len, sub);
|
shape::index2coords(index, leftOverIndexLen,tadShape, sub);
|
||||||
|
|
||||||
|
|
||||||
for(int i = 0; i < leftOverIndexLen; i++) {
|
for(int i = 0; i < leftOverIndexLen; i++) {
|
||||||
|
@ -609,7 +609,7 @@ namespace shape {
|
||||||
if(dimensionLength > 1) {
|
if(dimensionLength > 1) {
|
||||||
Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager);
|
Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager);
|
||||||
|
|
||||||
Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo));
|
Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub);
|
||||||
|
|
||||||
if(ret < 0) {
|
if(ret < 0) {
|
||||||
if (ptrManager == nullptr)
|
if (ptrManager == nullptr)
|
||||||
|
@ -625,7 +625,7 @@ namespace shape {
|
||||||
else {
|
else {
|
||||||
Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager);
|
Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager);
|
||||||
|
|
||||||
Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo));
|
Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub);
|
||||||
|
|
||||||
if (ptrManager == nullptr)
|
if (ptrManager == nullptr)
|
||||||
delete[] tad2Sub;
|
delete[] tad2Sub;
|
||||||
|
@ -703,7 +703,7 @@ namespace shape {
|
||||||
/* int *sub = new int[leftOverIndexLen];
|
/* int *sub = new int[leftOverIndexLen];
|
||||||
shape::ind2subOrder(tadShape,index,len,sub);
|
shape::ind2subOrder(tadShape,index,len,sub);
|
||||||
*/
|
*/
|
||||||
shape::index2coords(leftOverIndexLen,tadShape,index,len, sub);
|
shape::index2coords(index, leftOverIndexLen,tadShape, sub);
|
||||||
|
|
||||||
for(int i = 0; i < leftOverIndexLen; i++) {
|
for(int i = 0; i < leftOverIndexLen; i++) {
|
||||||
ret[leftOverIndexes[i]] = sub[i];
|
ret[leftOverIndexes[i]] = sub[i];
|
||||||
|
|
|
@ -64,7 +64,7 @@ namespace nd4j {
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i < totalIterations; i++) {
|
for (int i = 0; i < totalIterations; i++) {
|
||||||
shape::index2coords(xRank, xShape, i, totalIterations, xCoords);
|
shape::index2coords(i, xRank, xShape, xCoords);
|
||||||
|
|
||||||
Parameters params;
|
Parameters params;
|
||||||
for (int j = 0; j < xRank; j++) {
|
for (int j = 0; j < xRank; j++) {
|
||||||
|
|
|
@ -226,7 +226,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ);
|
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||||
z[zOffset] = (Z) indexValue.index;
|
z[zOffset] = (Z) indexValue.index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -243,7 +243,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (uint j = 0; j < tadLen; j++) {
|
||||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad);
|
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -266,12 +266,12 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
|
||||||
auto indexValue = OpType::startingIndexValue(tad);
|
auto indexValue = OpType::startingIndexValue(tad);
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; j++) {
|
for (uint j = 0; j < tadLen; j++) {
|
||||||
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad);
|
auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad);
|
||||||
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
functions::indexreduce::IndexValue<X> comp(tad[tadOffset], j);
|
||||||
indexValue = OpType::update(indexValue, comp, extraParams);
|
indexValue = OpType::update(indexValue, comp, extraParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ);
|
auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ);
|
||||||
z[zOffset] = (Z) indexValue.index;
|
z[zOffset] = (Z) indexValue.index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// @author Yurii Shyrma
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -931,7 +931,7 @@ void ShapeUtils::evalIdxRangesForSubArr(const Nd4jLong subArrIdx, const Nd4jLon
|
||||||
for(int i = 0; i < subArrRank; ++i)
|
for(int i = 0; i < subArrRank; ++i)
|
||||||
shapeOfSubArr[i] = shapeInfo[dimsToExclude[i] + 1];
|
shapeOfSubArr[i] = shapeInfo[dimsToExclude[i] + 1];
|
||||||
|
|
||||||
shape::index2coords(subArrRank, shapeOfSubArr.data(), subArrIdx, indexes.data());
|
shape::index2coords(subArrIdx, subArrRank, shapeOfSubArr.data(), indexes.data());
|
||||||
|
|
||||||
memset(idxRanges, 0, 2 * rank * sizeof(Nd4jLong));
|
memset(idxRanges, 0, 2 * rank * sizeof(Nd4jLong));
|
||||||
|
|
||||||
|
|
|
@ -887,7 +887,7 @@ namespace shape {
|
||||||
* @param indices the indices to iterate over
|
* @param indices the indices to iterate over
|
||||||
* @return the double at the specified index
|
* @return the double at the specified index
|
||||||
*/
|
*/
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank);
|
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0);
|
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0);
|
||||||
ND4J_EXPORT Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector<uint>& indices);
|
ND4J_EXPORT Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector<uint>& indices);
|
||||||
|
|
||||||
|
@ -897,20 +897,19 @@ namespace shape {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a linear index to the corresponding coordinates
|
* Convert a linear index to the corresponding coordinates
|
||||||
* for example if shape is {2, 4}, then index 5 corresponds to following coordinates
|
* for example if shape is {2, 4}, then index 5 corresponds to coordinates [1, 1]
|
||||||
* -> [1, 1] in case of c order
|
|
||||||
* -> [1, 2] in case of f order
|
|
||||||
*/
|
*/
|
||||||
ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order = 'c');
|
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
|
||||||
ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order = 'c');
|
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert coordinates to the corresponding linear index (sequence number in other words)
|
* Convert coordinates to the corresponding linear index (sequence number in other words)
|
||||||
* for example if shape is {2, 4}, then:
|
* for example if shape is {2, 4} and coordinates [1, 1] then index 5 is returned
|
||||||
* in case of c order and coordinates [1, 1] index 5 is returned
|
|
||||||
* in case of f order and coordinates [1, 2] index 5 is returned
|
|
||||||
*/
|
*/
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords, const char order = 'c');
|
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords);
|
||||||
|
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* increment n-dimensional array by one iteration by changing coord appropriately
|
* increment n-dimensional array by one iteration by changing coord appropriately
|
||||||
|
@ -921,24 +920,10 @@ namespace shape {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* calculates an array buffer offset for given "index" using following formula: offset = coord_0*stride_0 + coord_1*stride_1 + ... + coord_{rank-1}*stride_{rank-1}
|
/* calculates an array buffer offset for given "index" using following formula: offset = coord_0*stride_0 + coord_1*stride_1 + ... + coord_{rank-1}*stride_{rank-1}
|
||||||
* arrLen - array length
|
|
||||||
*/
|
*/
|
||||||
ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen);
|
ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo);
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen);
|
ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo);
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order);
|
ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned);
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the real linear indices for the given shape and stride
|
|
||||||
*/
|
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the real linear indices for the
|
|
||||||
* given shape buffer. Shape,stride and rank are derived
|
|
||||||
* from the buffer
|
|
||||||
*/
|
|
||||||
ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices( Nd4jLong *shapeBuffer);
|
|
||||||
|
|
||||||
ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo);
|
ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo);
|
||||||
|
|
||||||
|
@ -1749,57 +1734,34 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) {
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the real linear indices for the given shape and stride
|
|
||||||
*/
|
|
||||||
INLINEDEF _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride) {
|
|
||||||
Nd4jLong length = shape::prodLong(shape,rank);
|
|
||||||
|
|
||||||
traceNew(13);
|
|
||||||
|
|
||||||
Nd4jLong *ret = new Nd4jLong[length];
|
|
||||||
for(int i = 0; i < length; i++) {
|
|
||||||
Nd4jLong *idx = new Nd4jLong[rank];
|
|
||||||
shape::index2coords(rank, shape, i, idx, 'f');
|
|
||||||
ret[i] = shape::getOffset(0, shape, stride, idx, rank);
|
|
||||||
delete[] idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the real linear indices for the given shape and stride
|
|
||||||
*/
|
|
||||||
INLINEDEF _CUDA_HD Nd4jLong *computeIndices(Nd4jLong *shapeBuffer) {
|
|
||||||
return computeIndices(shape::rank(shapeBuffer),shape::shapeOf(shapeBuffer),shape::stride(shapeBuffer));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices, const char order) {
|
INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *indices) {
|
||||||
|
|
||||||
Nd4jLong index, shift = 1;;
|
Nd4jLong index, shift = 1;;
|
||||||
|
|
||||||
if(order == 'c') {
|
index = indices[shapeInfo[0] - 1];
|
||||||
|
for(uint i = shapeInfo[0]; i > 1; --i) {
|
||||||
index = indices[rank - 1];
|
shift *= shapeInfo[i];
|
||||||
for(int i = rank - 2; i >= 0; --i) {
|
index += shift * indices[i - 2];
|
||||||
shift *= shape[i + 1];
|
|
||||||
index += shift * indices[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
index = indices[0];
|
|
||||||
for(int i = 1; i < rank; ++i) {
|
|
||||||
shift *= shape[i - 1];
|
|
||||||
index += shift * indices[i];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) {
|
||||||
|
|
||||||
|
Nd4jLong index, shift = 1;;
|
||||||
|
|
||||||
|
index = indices[rank - 1];
|
||||||
|
for(uint i = rank - 1; i >= 1; --i) {
|
||||||
|
shift *= shape[i];
|
||||||
|
index += shift * indices[i - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
INLINEDEF _CUDA_HD void fill(T* buffer, T value, Nd4jLong length) {
|
INLINEDEF _CUDA_HD void fill(T* buffer, T value, Nd4jLong length) {
|
||||||
|
|
||||||
|
@ -1809,84 +1771,109 @@ template <typename T>
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// //////////////////////////////////////////////////////////////////////
|
||||||
|
// INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) {
|
||||||
|
|
||||||
|
// const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2];
|
||||||
|
|
||||||
|
// if(ews > 0 && order(shapeInfo) == 'c')
|
||||||
|
// if (ews == 1)
|
||||||
|
// return index;
|
||||||
|
// else
|
||||||
|
// return ews * index;
|
||||||
|
|
||||||
|
// Nd4jLong offset = 0;
|
||||||
|
// Nd4jLong rank = shapeInfo[0];
|
||||||
|
// for(int i = 1; i <= shapeInfo[0]; ++i) {
|
||||||
|
// arrLen /= shapeInfo[i];
|
||||||
|
// if(arrLen > 0 && shapeInfo[i] > 1) {
|
||||||
|
// offset += (index / arrLen) * shapeInfo[i + rank];
|
||||||
|
// index %= arrLen;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return offset;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) {
|
||||||
|
|
||||||
|
// const uint rank = shapeInfo[0];
|
||||||
|
// const uint ews = shapeInfo[rank + rank + 2];
|
||||||
|
|
||||||
|
// if(ews > 0 && shapeInfo[rank + rank + 3] == 99)
|
||||||
|
// if (ews == 1)
|
||||||
|
// return index;
|
||||||
|
// else
|
||||||
|
// return ews * index;
|
||||||
|
|
||||||
|
// uint offset = 0;
|
||||||
|
|
||||||
|
// for(uint i = 1; i <= rank; ++i) {
|
||||||
|
// arrLen /= shapeInfo[i];
|
||||||
|
// if(arrLen > 0 && shapeInfo[i] > 1) {
|
||||||
|
// offset += (index / arrLen) * shapeInfo[i + rank];
|
||||||
|
// index %= arrLen;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return offset;
|
||||||
|
// }
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) {
|
INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo) {
|
||||||
|
|
||||||
const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2];
|
if (shapeInfo[2 * shapeInfo[0] + 3] == 99) {
|
||||||
|
|
||||||
if(ews > 0 && order(shapeInfo) == 'c')
|
const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2];
|
||||||
if (ews == 1)
|
if (ews == 1)
|
||||||
return index;
|
return index;
|
||||||
else
|
else if(ews > 1)
|
||||||
return ews * index;
|
return ews * index;
|
||||||
|
}
|
||||||
|
|
||||||
Nd4jLong offset = 0;
|
Nd4jLong offset = 0;
|
||||||
Nd4jLong rank = shapeInfo[0];
|
|
||||||
for(int i = 1; i <= shapeInfo[0]; ++i) {
|
for(uint i = shapeInfo[0]; i > 1; --i) {
|
||||||
arrLen /= shapeInfo[i];
|
offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]];
|
||||||
if(arrLen > 0 && shapeInfo[i] > 1) {
|
index /= shapeInfo[i];
|
||||||
offset += (index / arrLen) * shapeInfo[i + rank];
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration
|
||||||
|
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) {
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo) {
|
||||||
|
|
||||||
const uint rank = shapeInfo[0];
|
if (shapeInfo[2 * shapeInfo[0] + 3] == 99) {
|
||||||
const uint ews = shapeInfo[rank + rank + 2];
|
|
||||||
|
|
||||||
if(ews > 0 && shapeInfo[rank + rank + 3] == 99)
|
const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2];
|
||||||
if (ews == 1)
|
if (ews == 1)
|
||||||
return index;
|
return index;
|
||||||
else
|
else if(ews > 1)
|
||||||
return ews * index;
|
return ews * index;
|
||||||
|
}
|
||||||
|
|
||||||
uint offset = 0;
|
uint offset = 0;
|
||||||
|
|
||||||
for(uint i = 1; i <= rank; ++i) {
|
for(uint i = shapeInfo[0]; i > 1; --i) {
|
||||||
arrLen /= shapeInfo[i];
|
offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]];
|
||||||
if(arrLen > 0 && shapeInfo[i] > 1) {
|
index /= shapeInfo[i];
|
||||||
offset += (index / arrLen) * shapeInfo[i + rank];
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration
|
||||||
|
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned) {
|
|
||||||
|
|
||||||
if(useUnsigned)
|
|
||||||
return getIndexOffset(static_cast<uint>(index), uShapeInfo, static_cast<uint>(arrLen));
|
|
||||||
|
|
||||||
return getIndexOffset(index, lShapeInfo, arrLen);
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
INLINEDEF _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order) {
|
INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned) {
|
||||||
|
|
||||||
Nd4jLong offset = 0;
|
if(useUnsigned)
|
||||||
if(order == 'c') {
|
return getIndexOffset(static_cast<uint>(index), uShapeInfo);
|
||||||
for(int i = 1; i <= *shapeInfo; ++i) {
|
|
||||||
arrLen /= shapeInfo[i];
|
return getIndexOffset(index, lShapeInfo);
|
||||||
if(arrLen > 0 && shapeInfo[i] > 1) {
|
}
|
||||||
offset += (index / arrLen) * shapeInfo[i + *shapeInfo];
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
for(int i = *shapeInfo; i >= 1 ; --i) {
|
|
||||||
arrLen /= shapeInfo[i];
|
|
||||||
if(arrLen > 0 && shapeInfo[i] > 1) {
|
|
||||||
offset += (index / arrLen) * shapeInfo[i + *shapeInfo];
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -2394,7 +2381,7 @@ template <typename T>
|
||||||
auto indices = new Nd4jLong[rank];
|
auto indices = new Nd4jLong[rank];
|
||||||
memset((void *) indices,0,rank * sizeof(Nd4jLong));
|
memset((void *) indices,0,rank * sizeof(Nd4jLong));
|
||||||
indices[0] = sliceIdx;
|
indices[0] = sliceIdx;
|
||||||
Nd4jLong offset = shape::getOffset(0,newShape,newStride,indices,rank);
|
Nd4jLong offset = shape::getOffset(newShapeBuffer, indices);
|
||||||
newShapeBuffer[shape::shapeInfoLength(newRank) - 3] = offset;
|
newShapeBuffer[shape::shapeInfoLength(newRank) - 3] = offset;
|
||||||
|
|
||||||
// set current order and ews
|
// set current order and ews
|
||||||
|
@ -3201,30 +3188,30 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons
|
||||||
* @param indices the indices to iterate over
|
* @param indices the indices to iterate over
|
||||||
* @return the double at the specified index
|
* @return the double at the specified index
|
||||||
*/
|
*/
|
||||||
INLINEDEF _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank) {
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) {
|
||||||
|
|
||||||
Nd4jLong offset = baseOffset;
|
Nd4jLong offset = baseOffset;
|
||||||
for(int i = 0; i < rank; i++) {
|
|
||||||
if(shape[i] != 1)
|
for(uint i = 1; i <= shapeInfo[0]; ++i)
|
||||||
offset += indices[i] * stride[i];
|
if(shapeInfo[i] != 1)
|
||||||
}
|
offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i];
|
||||||
|
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) {
|
//////////////////////////////////////////////////////////////////////////
|
||||||
return shape::getOffset(baseOffset, shape::shapeOf(const_cast<Nd4jLong*>(shapeInfo)), shape::stride(const_cast<Nd4jLong*>(shapeInfo)), indices, shapeInfo[0]);
|
INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector<uint>& indices) {
|
||||||
}
|
|
||||||
|
|
||||||
INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector<uint>& indices) {
|
|
||||||
|
|
||||||
Nd4jLong offset = 0;
|
Nd4jLong offset = 0;
|
||||||
|
|
||||||
for(uint i = 0; i < shapeInfo[0]; ++i)
|
for(uint i = 1; i <= shapeInfo[0]; ++i)
|
||||||
if(shapeInfo[i + 1] != 1)
|
if(shapeInfo[i] != 1)
|
||||||
offset += indices[i] * shapeInfo[shapeInfo[0] + i + 1];
|
offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i];
|
||||||
|
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -4209,24 +4196,24 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
||||||
|
|
||||||
Nd4jLong maxIdxs[MAX_RANK];
|
Nd4jLong maxIdxs[MAX_RANK];
|
||||||
shape::index2coords(shape::rank(maxShapeInfo), const_cast<Nd4jLong *>(maxShapeInfo)+1, const_cast<Nd4jLong&>(maxIdx), maxIdxs, shape::order(maxShapeInfo));
|
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
|
||||||
|
|
||||||
Nd4jLong minIdxs[MAX_RANK];
|
Nd4jLong minIdxs[MAX_RANK];
|
||||||
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
||||||
|
|
||||||
return coords2index(shape::rank(minShapeInfo), minShapeInfo + 1, minIdxs);
|
return shape::coords2index(minShapeInfo, minIdxs);
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
||||||
|
|
||||||
Nd4jLong maxIdxs[MAX_RANK];
|
Nd4jLong maxIdxs[MAX_RANK];
|
||||||
shape::index2coords(shape::rank(maxShapeInfo), const_cast<Nd4jLong *>(maxShapeInfo)+1, const_cast<Nd4jLong&>(maxIdx), maxIdxs, shape::order(maxShapeInfo));
|
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
|
||||||
|
|
||||||
Nd4jLong minIdxs[MAX_RANK];
|
Nd4jLong minIdxs[MAX_RANK];
|
||||||
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
||||||
|
|
||||||
return getOffset(0, minShapeInfo + 1, minShapeInfo + shape::rank(minShapeInfo) + 1, minIdxs, shape::rank(minShapeInfo));
|
return getOffset(minShapeInfo, minIdxs);
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
@ -4246,7 +4233,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
int N, minI, maxI;
|
int N, minI, maxI;
|
||||||
|
|
||||||
// calculate min per-dim-indices which corresponds to absolute minIdx index
|
// calculate min per-dim-indices which corresponds to absolute minIdx index
|
||||||
shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo));
|
shape::index2coords(minIdx, minShapeInfo, indices);
|
||||||
|
|
||||||
// transform storage indices to contain per-dim max indices, purpose - memory saving
|
// transform storage indices to contain per-dim max indices, purpose - memory saving
|
||||||
// fill increment array as well
|
// fill increment array as well
|
||||||
|
@ -4277,7 +4264,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
maxI = rankMax-1;
|
maxI = rankMax-1;
|
||||||
N = 0;
|
N = 0;
|
||||||
int step;
|
int step;
|
||||||
maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax);
|
maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices);
|
||||||
|
|
||||||
// nested loops - producing of absolute indices for max array
|
// nested loops - producing of absolute indices for max array
|
||||||
while(maxI >= 0) {
|
while(maxI >= 0) {
|
||||||
|
@ -4290,7 +4277,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
step = -1;
|
step = -1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax);
|
maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices);
|
||||||
step = rankMax - 1 - maxI;
|
step = rankMax - 1 - maxI;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4322,7 +4309,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
int N, minI, maxI;
|
int N, minI, maxI;
|
||||||
|
|
||||||
// calculate min per-dim-indices which corresponds to absolute minIdx index
|
// calculate min per-dim-indices which corresponds to absolute minIdx index
|
||||||
shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo));
|
shape::index2coords(minIdx, minShapeInfo, indices);
|
||||||
|
|
||||||
// transform storage indices to contain per-dim max indices, purpose - memory saving
|
// transform storage indices to contain per-dim max indices, purpose - memory saving
|
||||||
// fill increment array as well
|
// fill increment array as well
|
||||||
|
@ -4353,7 +4340,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
maxI = rankMax-1;
|
maxI = rankMax-1;
|
||||||
N = 0;
|
N = 0;
|
||||||
int step;
|
int step;
|
||||||
maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices);
|
maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices);
|
||||||
|
|
||||||
// nested loops - producing of absolute indices for max array
|
// nested loops - producing of absolute indices for max array
|
||||||
while(maxI >= 0) {
|
while(maxI >= 0) {
|
||||||
|
@ -4366,7 +4353,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
||||||
step = -1;
|
step = -1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices);
|
maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices);
|
||||||
step = rankMax - 1 - maxI;
|
step = rankMax - 1 - maxI;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4693,37 +4680,23 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order) {
|
INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {
|
||||||
Nd4jLong arrLen = shape::prodLong(shape, rank);
|
|
||||||
shape::index2coords(rank, shape, index, arrLen, coords, order);
|
for(uint i = shapeInfo[0]; i > 1; --i) {
|
||||||
|
coords[i - 1] = index % shapeInfo[i];
|
||||||
|
index /= shapeInfo[i];
|
||||||
|
}
|
||||||
|
coords[0] = index; // last iteration
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order) {
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords) {
|
||||||
|
|
||||||
if(order == 'c') {
|
for(uint i = rank - 1; i > 0; --i) {
|
||||||
|
coords[i] = index % shape[i];
|
||||||
for(int i = 0; i < rank; i++) {
|
index /= shape[i];
|
||||||
arrLen /= shape[i];
|
|
||||||
if(arrLen > 0 && shape[i] > 1) {
|
|
||||||
coords[i] = index / arrLen;
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
coords[i] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
|
|
||||||
for(int i = rank - 1; i >= 0; i--) {
|
|
||||||
arrLen /= shape[i];
|
|
||||||
if(arrLen > 0 && shape[i] > 1) {
|
|
||||||
coords[i] = index / arrLen;
|
|
||||||
index %= arrLen;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
coords[i] = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
coords[0] = index; // last iteration
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -176,7 +176,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int f = 0; f < tadLength; f++) {
|
for (unsigned int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -196,8 +196,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -217,8 +217,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -238,8 +238,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -261,9 +261,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -362,7 +362,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int f = 0; f < tadLength; f++) {
|
for (unsigned int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -382,8 +382,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -403,8 +403,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -424,8 +424,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -447,9 +447,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -178,7 +178,7 @@ namespace functions {
|
||||||
// all this stuff already happens within thread
|
// all this stuff already happens within thread
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -198,8 +198,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -219,8 +219,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -240,8 +240,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -263,9 +263,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -365,7 +365,7 @@ namespace functions {
|
||||||
// all this stuff already happens within thread
|
// all this stuff already happens within thread
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -385,8 +385,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -406,8 +406,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -427,8 +427,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -450,9 +450,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -178,7 +178,7 @@ namespace functions {
|
||||||
// all this stuff already happens within thread
|
// all this stuff already happens within thread
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
oZ[offset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -198,8 +198,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
oZ[zOffset] = OpType::op(oX[offset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -219,8 +219,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
oZ[offset] = OpType::op(oX[offset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -240,8 +240,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
oZ[offset] = OpType::op(oX[xOffset], y[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -263,9 +263,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX);
|
auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY);
|
auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -365,7 +365,7 @@ namespace functions {
|
||||||
// all this stuff already happens within thread
|
// all this stuff already happens within thread
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
oZ[offset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -385,8 +385,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
oZ[zOffset] = OpType::op(x[offset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -406,8 +406,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
oZ[offset] = OpType::op(x[xOffset], oY[offset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -427,8 +427,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
oZ[offset] = OpType::op(x[offset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -450,9 +450,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int f = 0; f < tadLength; f++) {
|
for (int f = 0; f < tadLength; f++) {
|
||||||
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX);
|
auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY);
|
auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ);
|
auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
|
||||||
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,7 +92,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
|
||||||
auto ulen = info.getItersPerThread(threadNum);
|
auto ulen = info.getItersPerThread(threadNum);
|
||||||
|
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
IndexValue<X> curr(x[offset], threadOffset + i);
|
IndexValue<X> curr(x[offset], threadOffset + i);
|
||||||
local = OpType::update(local, curr, extraParams);
|
local = OpType::update(local, curr, extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -166,7 +166,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(unsigned int i = 0; i < ulen; i++) {
|
for(unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -183,8 +183,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(unsigned int i = 0; i < ulen; i++) {
|
for(unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -218,7 +218,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -238,8 +238,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -259,8 +259,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -280,8 +280,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -303,9 +303,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -158,7 +158,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(Nd4jLong i = 0; i < ulen; i++) {
|
for(Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -176,8 +176,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(Nd4jLong i = 0; i < ulen; i++) {
|
for(Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -209,7 +209,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -229,8 +229,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -250,8 +250,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -271,8 +271,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -294,9 +294,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -158,7 +158,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(Nd4jLong i = 0; i < ulen; i++) {
|
for(Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
z[offset] = OpType::op(x[offset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -176,8 +176,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for(Nd4jLong i = 0; i < ulen; i++) {
|
for(Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -209,7 +209,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
z[offset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -229,8 +229,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
z[zOffset] = OpType::op(x[offset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -250,8 +250,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
z[offset] = OpType::op(x[offset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -271,8 +271,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
z[offset] = OpType::op(x[xOffset], y[offset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -294,9 +294,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,7 +70,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
|
z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -90,8 +90,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
|
z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,8 +111,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
|
z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -132,8 +132,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) {
|
for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY);
|
auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
|
z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -155,9 +155,9 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY);
|
auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments);
|
z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -196,7 +196,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
|
z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -214,8 +214,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
|
z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -247,7 +247,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (Nd4jLong i = 0; i < ulen; i++) {
|
for (Nd4jLong i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ);
|
auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments);
|
z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,7 +77,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
|
|
||||||
for (int e = 0; e < maxThreads; e++)
|
for (int e = 0; e < maxThreads; e++)
|
||||||
|
@ -112,7 +112,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
for (int e = 0; e < omp_get_max_threads(); e++)
|
for (int e = 0; e < omp_get_max_threads(); e++)
|
||||||
start = OpType::update(start, intermediate[e], extraParams);
|
start = OpType::update(start, intermediate[e], extraParams);
|
||||||
|
|
|
@ -81,7 +81,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
|
|
||||||
for (int e = 0; e < maxThreads; e++)
|
for (int e = 0; e < maxThreads; e++)
|
||||||
|
@ -115,7 +115,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
for (int e = 0; e < omp_get_max_threads(); e++)
|
for (int e = 0; e < omp_get_max_threads(); e++)
|
||||||
start = OpType::update(start, intermediate[e], extraParams);
|
start = OpType::update(start, intermediate[e], extraParams);
|
||||||
|
|
|
@ -77,7 +77,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
|
|
||||||
for (int e = 0; e < maxThreads; e++)
|
for (int e = 0; e < maxThreads; e++)
|
||||||
|
@ -113,7 +113,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
PRAGMA_OMP_PARALLEL_FOR_SIMD
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
for (int e = 0; e < omp_get_max_threads(); e++)
|
for (int e = 0; e < omp_get_max_threads(); e++)
|
||||||
start = OpType::update(start, intermediate[e], extraParams);
|
start = OpType::update(start, intermediate[e], extraParams);
|
||||||
|
|
|
@ -79,7 +79,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
|
|
||||||
for (int e = 0; e < maxThreads; e++)
|
for (int e = 0; e < maxThreads; e++)
|
||||||
|
@ -117,7 +117,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads)
|
||||||
for(Nd4jLong i = 0; i < length; ++i)
|
for(Nd4jLong i = 0; i < length; ++i)
|
||||||
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams);
|
intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
|
||||||
|
|
||||||
for (int e = 0; e < maxThreads; e++)
|
for (int e = 0; e < maxThreads; e++)
|
||||||
start = OpType::update(start, intermediate[e], extraParams);
|
start = OpType::update(start, intermediate[e], extraParams);
|
||||||
|
|
|
@ -95,7 +95,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads)
|
||||||
for(unsigned int i = 0; i < length; i++) {
|
for(unsigned int i = 0; i < length; i++) {
|
||||||
const auto threadNum = omp_get_thread_num();
|
const auto threadNum = omp_get_thread_num();
|
||||||
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum);
|
intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -105,8 +105,8 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads)
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads)
|
||||||
for(unsigned int i = 0; i < length; i++) {
|
for(unsigned int i = 0; i < length; i++) {
|
||||||
const auto threadNum = omp_get_thread_num();
|
const auto threadNum = omp_get_thread_num();
|
||||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX);
|
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, length, canCastY);
|
auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
|
||||||
intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum);
|
intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -165,7 +165,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -183,8 +183,8 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -173,7 +173,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -191,8 +191,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -173,7 +173,7 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
z[offset] = OpType::op(x[offset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -191,8 +191,8 @@ namespace functions {
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int i = 0; i < ulen; i++) {
|
for (unsigned int i = 0; i < ulen; i++) {
|
||||||
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX);
|
auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX);
|
||||||
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ);
|
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ);
|
||||||
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
z[zOffset] = OpType::op(x[xOffset], scalar, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,7 +92,7 @@ namespace functions {
|
||||||
|
|
||||||
for (Nd4jLong i = 0; i < length; i++) {
|
for (Nd4jLong i = 0; i < length; i++) {
|
||||||
|
|
||||||
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCast);
|
auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast);
|
||||||
|
|
||||||
SummaryStatsData<X> curr;
|
SummaryStatsData<X> curr;
|
||||||
curr.initWithValue(x[xOffset]);
|
curr.initWithValue(x[xOffset]);
|
||||||
|
@ -175,7 +175,7 @@ namespace functions {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (int i = 1; i < tadLength; i ++) {
|
for (int i = 1; i < tadLength; i ++) {
|
||||||
auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCast);
|
auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast);
|
||||||
|
|
||||||
SummaryStatsData <X> indexVal2;
|
SummaryStatsData <X> indexVal2;
|
||||||
indexVal2.initWithValue(tx[xOffset]);
|
indexVal2.initWithValue(tx[xOffset]);
|
||||||
|
|
|
@ -64,8 +64,8 @@ static __global__ void broadcastInverseSimple(
|
||||||
namespace functions {
|
namespace functions {
|
||||||
namespace broadcast {
|
namespace broadcast {
|
||||||
|
|
||||||
static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) {
|
static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) {
|
||||||
return shape::getIndexOffset(index, shapeInfo, length);
|
return shape::getIndexOffset(index, shapeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Nd4jLong __device__ __noinline__ _length(Nd4jLong *shapeInfo) {
|
static Nd4jLong __device__ __noinline__ _length(Nd4jLong *shapeInfo) {
|
||||||
|
@ -154,9 +154,9 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
auto xOffset = _getIndexOffset(i, xShapeInfo, tadLength);
|
auto xOffset = _getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -219,9 +219,9 @@ namespace functions {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
|
|
||||||
auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto yOffset = _getIndexOffset(i, yShapeInfo, tadLength);
|
auto yOffset = _getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -145,9 +145,9 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
|
|
||||||
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
||||||
}
|
}
|
||||||
|
@ -213,9 +213,9 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
|
|
||||||
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -139,9 +139,9 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
|
|
||||||
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]);
|
||||||
}
|
}
|
||||||
|
@ -207,9 +207,9 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
// it is expected that x and z tads and y array all have the same length
|
// it is expected that x and z tads and y array all have the same length
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength);
|
auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ);
|
||||||
|
|
||||||
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -251,7 +251,7 @@ namespace functions {
|
||||||
sPartials[threadIdx.x] = OpType::startingIndexValue(dx);
|
sPartials[threadIdx.x] = OpType::startingIndexValue(dx);
|
||||||
|
|
||||||
for(int i = threadIdx.x;i < tadLength; i += blockDim.x) {
|
for(int i = threadIdx.x;i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
IndexValue<X> comp {dx[xOffset], i};
|
IndexValue<X> comp {dx[xOffset], i};
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], comp, extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], comp, extraParams);
|
||||||
}
|
}
|
||||||
|
@ -299,7 +299,7 @@ namespace functions {
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) {
|
for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) {
|
||||||
auto offset = shape::getIndexOffset(i, xShapeInfo, n);
|
auto offset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
IndexValue<X> indexVal = {dx[offset], i};
|
IndexValue<X> indexVal = {dx[offset], i};
|
||||||
reduction = OpType::update(reduction, indexVal, extraParams);
|
reduction = OpType::update(reduction, indexVal, extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -115,7 +115,7 @@ namespace functions {
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
||||||
else
|
else
|
||||||
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
||||||
|
|
|
@ -73,7 +73,7 @@ namespace functions {
|
||||||
|
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, length)], scalar, params);
|
z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,8 +72,8 @@ namespace functions {
|
||||||
|
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||||
auto xOffset2 = shape::getIndexOffset(i, shapeInfo, length);
|
auto xOffset2 = shape::getIndexOffset(i, shapeInfo);
|
||||||
auto zOffset2 = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset2 = shape::getIndexOffset(i, zShapeInfo);
|
||||||
result[zOffset2] = OpType::op(dy[xOffset2], params);
|
result[zOffset2] = OpType::op(dy[xOffset2], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -294,7 +294,7 @@ namespace functions {
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
||||||
auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank);
|
auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord);
|
||||||
|
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
@ -358,7 +358,7 @@ namespace functions {
|
||||||
for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
|
for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
|
||||||
shape::ind2subC(rank, xShape, i, n, ind2sub);
|
shape::ind2subC(rank, xShape, i, n, ind2sub);
|
||||||
|
|
||||||
auto offset = shape::getOffset(0, xShape, xStride, ind2sub, rank);
|
auto offset = shape::getOffset(xShapeInfo, ind2sub);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[offset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[offset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -461,7 +461,7 @@ namespace functions {
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
||||||
auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank);
|
auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord);
|
||||||
|
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
@ -526,7 +526,7 @@ namespace functions {
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord);
|
||||||
auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank);
|
auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord);
|
||||||
|
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,8 +88,8 @@ static inline __device__ void transformCuda(T scalar, T *dy, int *shapeInfo, T *
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
shape::ind2sub(xRank, xShape, i, length, xIdx);
|
shape::ind2sub(xRank, xShape, i, length, xIdx);
|
||||||
int xOffset2 = shape::getOffset(0, xShape, xStride, xIdx, xRank);
|
int xOffset2 = shape::getOffset(shapeInfo, xIdx);
|
||||||
int resultOffset = shape::getOffset(0, zShape, zStride, xIdx, zRank);
|
int resultOffset = shape::getOffset(resultShapeInfo, xIdx);
|
||||||
result[resultOffset] = OpType::op(dy[xOffset2],scalar, params);
|
result[resultOffset] = OpType::op(dy[xOffset2],scalar, params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -230,8 +230,8 @@ namespace functions {
|
||||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||||
shape::ind2sub(xRank,shape::shapeOf(shapeInfo),i, length, xCoord);
|
shape::ind2sub(xRank,shape::shapeOf(shapeInfo),i, length, xCoord);
|
||||||
|
|
||||||
auto xOffset2 = shape::getOffset(0, xShape, xStride, xCoord, xRank);
|
auto xOffset2 = shape::getOffset(shapeInfo, xCoord);
|
||||||
auto resultOffset2 = shape::getOffset(0,xShape,shape::stride(resultShapeInfo),xCoord,xRank);
|
auto resultOffset2 = shape::getOffset(resultShapeInfo, xCoord);
|
||||||
|
|
||||||
result[resultOffset2] = OpType::op(dy[xOffset2], params);
|
result[resultOffset2] = OpType::op(dy[xOffset2], params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
|
||||||
}
|
}
|
||||||
else if (vx == vz) {
|
else if (vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
|
|
||||||
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, len);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
|
||||||
}
|
}
|
||||||
else if (vx == vz) {
|
else if (vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
|
|
||||||
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, len);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
|
||||||
}
|
}
|
||||||
else if (vx == vz) {
|
else if (vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
|
|
||||||
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, len);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, len);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams);
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,9 +167,9 @@ namespace functions {
|
||||||
} else {
|
} else {
|
||||||
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) {
|
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) {
|
||||||
|
|
||||||
auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length);
|
auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer);
|
||||||
auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer, length);
|
auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer);
|
||||||
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length);
|
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer);
|
||||||
|
|
||||||
z[zOffset2] = OpClass::op(x[xOffset2], y[yOffset2], i, length, buffer, extraArguments);
|
z[zOffset2] = OpClass::op(x[xOffset2], y[yOffset2], i, length, buffer, extraArguments);
|
||||||
}
|
}
|
||||||
|
@ -227,8 +227,8 @@ namespace functions {
|
||||||
|
|
||||||
for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < length; i += blockDim.x * gridDim.x) {
|
for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < length; i += blockDim.x * gridDim.x) {
|
||||||
|
|
||||||
auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length);
|
auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer);
|
||||||
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length);
|
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer);
|
||||||
|
|
||||||
z[zOffset2] = OpClass::op(x[xOffset2], i, length, buffer, extraArguments);
|
z[zOffset2] = OpClass::op(x[xOffset2], i, length, buffer, extraArguments);
|
||||||
}
|
}
|
||||||
|
@ -276,7 +276,7 @@ namespace functions {
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) {
|
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) {
|
||||||
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length);
|
auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer);
|
||||||
z[zOffset2] = OpClass::op(i, length, buffer, extraArguments);
|
z[zOffset2] = OpClass::op(i, length, buffer, extraArguments);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -129,7 +129,7 @@ __device__ void ReduceBoolFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *xS
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
|
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -140,7 +140,7 @@ __device__ void ReduceBoolFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *xS
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -180,7 +180,7 @@ __device__ void ReduceBoolFunction<X,Z>::execScalarCuda(void *vx, Nd4jLong *xSha
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
||||||
else
|
else
|
||||||
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
||||||
|
|
|
@ -129,7 +129,7 @@ __device__ void ReduceFloatFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *x
|
||||||
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -139,7 +139,7 @@ __device__ void ReduceFloatFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *x
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -179,7 +179,7 @@ __device__ void ReduceFloatFunction<X,Z>::execScalarCuda(void *vx, Nd4jLong *xSh
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
||||||
else
|
else
|
||||||
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
||||||
|
|
|
@ -150,7 +150,7 @@ __device__ void ReduceLongFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *xS
|
||||||
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -160,7 +160,7 @@ __device__ void ReduceLongFunction<X,Z>::transformCudaXD( void *vx, Nd4jLong *xS
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -200,7 +200,7 @@ __device__ void ReduceLongFunction<X,Z>::execScalarCuda(void *vx, Nd4jLong *xSha
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
||||||
else
|
else
|
||||||
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
||||||
|
|
|
@ -139,7 +139,7 @@ __device__ void ReduceSameFunction<X>::transformCudaXD( void *vx, Nd4jLong *xSha
|
||||||
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -149,7 +149,7 @@ __device__ void ReduceSameFunction<X>::transformCudaXD( void *vx, Nd4jLong *xSha
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
if (threadIdx.x == 0)
|
||||||
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -197,7 +197,7 @@ __device__ void ReduceSameFunction<X>::execScalarCuda(void *vx, Nd4jLong *xShape
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams);
|
||||||
else
|
else
|
||||||
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
for (int i = tid; i < len; i += blockDim.x * gridDim.x)
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
aggregatePartials<OpType>(sPartials, threadIdx.x, nd4j::math::nd4j_min<int>(blockDim.x, len), extraParams);
|
||||||
|
|
|
@ -161,8 +161,8 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
|
||||||
sPartials[threadIdx.x] = OpType::startingValue(x);
|
sPartials[threadIdx.x] = OpType::startingValue(x);
|
||||||
auto threadCount = gridDim.x * blockDim.x;
|
auto threadCount = gridDim.x * blockDim.x;
|
||||||
for(Nd4jLong i = tid; i < length; i += threadCount) {
|
for(Nd4jLong i = tid; i < length; i += threadCount) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto yOffset = shape::getIndexOffset(i, yShapeInfo, length);
|
auto yOffset = shape::getIndexOffset(i, yShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset], y[yOffset], extraZ), extraZ);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset], y[yOffset], extraZ), extraZ);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -290,7 +290,7 @@ __device__ void Reduce3<X,Z>::transformAll( void *vx, Nd4jLong *xShapeInfo,
|
||||||
X *x = dx + xOffsets[r];
|
X *x = dx + xOffsets[r];
|
||||||
|
|
||||||
if (threadIdx.x < xTadLength && threadIdx.x < maxBlock) {
|
if (threadIdx.x < xTadLength && threadIdx.x < maxBlock) {
|
||||||
auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo, shape::length(xTadShapeInfo));
|
auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo);
|
||||||
tempX[threadIdx.x] = x[x0];
|
tempX[threadIdx.x] = x[x0];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -311,12 +311,12 @@ __device__ void Reduce3<X,Z>::transformAll( void *vx, Nd4jLong *xShapeInfo,
|
||||||
// we reset tempX IF we have >1 tiles
|
// we reset tempX IF we have >1 tiles
|
||||||
if (t >= 1 || (limit > 1 && g > 0))
|
if (t >= 1 || (limit > 1 && g > 0))
|
||||||
if (threadIdx.x + (t * maxBlock) < xTadLength) {
|
if (threadIdx.x + (t * maxBlock) < xTadLength) {
|
||||||
auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo, xTadLength);
|
auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo);
|
||||||
tempX[threadIdx.x] = x[x0];
|
tempX[threadIdx.x] = x[x0];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int f = threadIdx.x + (t * maxBlock); f < xTadLength && f < threadIdx.x + ((t + 1) * maxBlock); f += blockDim.x * gridDim.x) {
|
for (int f = threadIdx.x + (t * maxBlock); f < xTadLength && f < threadIdx.x + ((t + 1) * maxBlock); f += blockDim.x * gridDim.x) {
|
||||||
auto y0 = shape::getIndexOffset(f, yTadShapeInfo, yTadLength);
|
auto y0 = shape::getIndexOffset(f, yTadShapeInfo);
|
||||||
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(tempX[threadIdx.x], y[y0], extraZ), extraZ);
|
sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(tempX[threadIdx.x], y[y0], extraZ), extraZ);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -433,8 +433,8 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
for (int j = threadIdx.x; j < tadLen; j += blockDim.x) {
|
for (int j = threadIdx.x; j < tadLen; j += blockDim.x) {
|
||||||
|
|
||||||
Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo, tadLen);
|
Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo);
|
||||||
Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo, tadLen);
|
Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo);
|
||||||
sPartials[threadIdx.x] = j < blockDim.x ? OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ) : OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ), extraZ);
|
sPartials[threadIdx.x] = j < blockDim.x ? OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ) : OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ), extraZ);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,7 +61,7 @@ __global__ static void scalarSimpleShaped(void* vx, void *vscalar, Nd4jLong *xSh
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (Nd4jLong i = tid; i < length; i += totalThreads) {
|
for (Nd4jLong i = tid; i < length; i += totalThreads) {
|
||||||
z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo, length)], scalar, params);
|
z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], scalar, params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,7 +114,7 @@ __global__ static void scalarAlongDimension(void *vx, Nd4jLong *xShapeInfo,
|
||||||
auto s = scalars[r];
|
auto s = scalars[r];
|
||||||
|
|
||||||
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
||||||
oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams);
|
oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -93,7 +93,7 @@ __device__ void ScalarBoolTransform<X, Z>::transformCuda(void* vscalar,
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < len; i+= totalThreads)
|
for (Nd4jLong i = tid; i < len; i+= totalThreads)
|
||||||
z[shape::getIndexOffset(i, zShapeInfo, len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params);
|
z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,7 +174,7 @@ __device__ void ScalarBoolTransform<X, Z>::transformCuda(void *vx, Nd4jLong *xS
|
||||||
auto s = scalars[r];
|
auto s = scalars[r];
|
||||||
|
|
||||||
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
||||||
oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams);
|
oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -93,7 +93,7 @@ __device__ void ScalarIntTransform<X>::transformCuda(void* vscalar,
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < len; i+= totalThreads)
|
for (Nd4jLong i = tid; i < len; i+= totalThreads)
|
||||||
z[shape::getIndexOffset(i, zShapeInfo, len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params);
|
z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,7 +174,7 @@ __device__ void ScalarIntTransform<X>::transformCuda(void *vx, Nd4jLong *xShape
|
||||||
auto s = scalars[r];
|
auto s = scalars[r];
|
||||||
|
|
||||||
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
for (int f = threadIdx.x; f < tadLength; f += blockDim.x)
|
||||||
oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams);
|
oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,8 +80,8 @@ __global__ void bitonicArbitraryStepKernelKey(void *vx, Nd4jLong *xShapeInfo, vo
|
||||||
int it = (reverse) ? i + j + half : i + window - j - 1;
|
int it = (reverse) ? i + j + half : i + window - j - 1;
|
||||||
int ij = i+j;
|
int ij = i+j;
|
||||||
if (it < length && ij < length ) {
|
if (it < length && ij < length ) {
|
||||||
int posIT = shape::getIndexOffset(it, xShapeInfo, xLength);
|
int posIT = shape::getIndexOffset(it, xShapeInfo);
|
||||||
int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength);
|
int posIJ = shape::getIndexOffset(ij, xShapeInfo);
|
||||||
|
|
||||||
X v0 = x[posIJ];
|
X v0 = x[posIJ];
|
||||||
X v1 = x[posIT];
|
X v1 = x[posIT];
|
||||||
|
@ -160,8 +160,8 @@ __global__ void execBitonicArbitraryStepKernel(void *vx, Nd4jLong *xShapeInfo, i
|
||||||
int it = (reverse) ? i + j + half : i + window - j - 1;
|
int it = (reverse) ? i + j + half : i + window - j - 1;
|
||||||
int ij = i+j;
|
int ij = i+j;
|
||||||
if (it < length && ij < length ) {
|
if (it < length && ij < length ) {
|
||||||
int posIT = shape::getIndexOffset(it, xShapeInfo, xLength);
|
int posIT = shape::getIndexOffset(it, xShapeInfo);
|
||||||
int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength);
|
int posIJ = shape::getIndexOffset(ij, xShapeInfo);
|
||||||
|
|
||||||
shmem[threadIdx.x] = x[posIJ];
|
shmem[threadIdx.x] = x[posIJ];
|
||||||
shmem[threadIdx.x + blockDim.x] = x[posIT];
|
shmem[threadIdx.x + blockDim.x] = x[posIT];
|
||||||
|
|
|
@ -46,8 +46,8 @@ __global__ void bitonicSortStepKernelKey(void *vx, Nd4jLong *xShapeInfo, void *v
|
||||||
|
|
||||||
/* The threads with the lowest ids sort the array. */
|
/* The threads with the lowest ids sort the array. */
|
||||||
if ((ixj)>i) {
|
if ((ixj)>i) {
|
||||||
int posI = shape::getIndexOffset(i, xShapeInfo, xLength);
|
int posI = shape::getIndexOffset(i, xShapeInfo);
|
||||||
int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength);
|
int posIXJ = shape::getIndexOffset(ixj, xShapeInfo);
|
||||||
|
|
||||||
if ((i&k)==0) {
|
if ((i&k)==0) {
|
||||||
/* Sort ascending */
|
/* Sort ascending */
|
||||||
|
@ -100,8 +100,8 @@ __global__ void bitonicSortStepKernel(void *vx, Nd4jLong *xShapeInfo, int j, int
|
||||||
|
|
||||||
/* The threads with the lowest ids sort the array. */
|
/* The threads with the lowest ids sort the array. */
|
||||||
if ((ixj)>i) {
|
if ((ixj)>i) {
|
||||||
int posI = shape::getIndexOffset(i, xShapeInfo, xLength);
|
int posI = shape::getIndexOffset(i, xShapeInfo);
|
||||||
int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength);
|
int posIXJ = shape::getIndexOffset(ixj, xShapeInfo);
|
||||||
|
|
||||||
if ((i&k)==0) {
|
if ((i&k)==0) {
|
||||||
/* Sort ascending */
|
/* Sort ascending */
|
||||||
|
|
|
@ -139,19 +139,19 @@ namespace nd4j {
|
||||||
|
|
||||||
Nd4jLong sub[MAX_RANK];
|
Nd4jLong sub[MAX_RANK];
|
||||||
|
|
||||||
shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub, shape::order(zTadShape));
|
shape::index2coords(arrOffset, zTadShape, sub);
|
||||||
|
|
||||||
Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape));
|
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
|
||||||
|
|
||||||
resultTAD += baseOffset;
|
resultTAD += baseOffset;
|
||||||
|
|
||||||
auto yRank = shape::rank(currentTad);
|
auto yRank = shape::rank(currentTad);
|
||||||
auto tadRank = shape::rank(zTadShape);
|
auto tadRank = shape::rank(zTadShape);
|
||||||
|
|
||||||
shape::index2coords(yRank, shape::shapeOf(currentTad), 0, sub);
|
shape::index2coords(0, currentTad, sub);
|
||||||
|
|
||||||
auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), sub, yRank);
|
auto yOffset = shape::getOffset(currentTad, sub);
|
||||||
resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), sub, tadRank);
|
resultOffset = shape::getOffset(zTadShape, sub);
|
||||||
|
|
||||||
resultTAD[resultOffset] = dataTAD[yOffset];
|
resultTAD[resultOffset] = dataTAD[yOffset];
|
||||||
}
|
}
|
||||||
|
@ -168,8 +168,8 @@ namespace nd4j {
|
||||||
|
|
||||||
Nd4jLong sub[MAX_RANK];
|
Nd4jLong sub[MAX_RANK];
|
||||||
|
|
||||||
shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub);
|
shape::index2coords(arrOffset, zTadShape, sub);
|
||||||
Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape));
|
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
|
||||||
|
|
||||||
resultTAD += baseOffset;
|
resultTAD += baseOffset;
|
||||||
|
|
||||||
|
@ -203,8 +203,8 @@ namespace nd4j {
|
||||||
auto yRank = shape::rank(currentTad);
|
auto yRank = shape::rank(currentTad);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
|
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
|
||||||
shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx);
|
shape::index2coords(i, currentTad, yIdx);
|
||||||
auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank);
|
auto yOffset = shape::getOffset(currentTad, yIdx);
|
||||||
|
|
||||||
resultTAD[baseIdx + i * tadEWS] = dataTAD[yOffset];
|
resultTAD[baseIdx + i * tadEWS] = dataTAD[yOffset];
|
||||||
}
|
}
|
||||||
|
@ -220,11 +220,11 @@ namespace nd4j {
|
||||||
auto tadRank = shape::rank(zTadShape);
|
auto tadRank = shape::rank(zTadShape);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
|
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
|
||||||
shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx);
|
shape::index2coords(i, currentTad, yIdx);
|
||||||
shape::index2coords(tadRank, shape::shapeOf(zTadShape), i, zIdx);
|
shape::index2coords(i, zTadShape, zIdx);
|
||||||
|
|
||||||
auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank);
|
auto yOffset = shape::getOffset(currentTad, yIdx);
|
||||||
auto resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), zIdx, tadRank);
|
auto resultOffset = shape::getOffset(zTadShape, zIdx);
|
||||||
|
|
||||||
resultTAD[resultOffset] = dataTAD[yOffset];
|
resultTAD[resultOffset] = dataTAD[yOffset];
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,7 +53,7 @@ namespace nd4j {
|
||||||
if (dimensionLength > 1 || tadEWS < 1) {
|
if (dimensionLength > 1 || tadEWS < 1) {
|
||||||
|
|
||||||
for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
|
for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo);
|
||||||
dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0);
|
dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -30,7 +30,7 @@ namespace nd4j {
|
||||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x)
|
for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x)
|
||||||
dz[shape::getIndexOffset(i, xShapeInfo, length)] = (i == idx ? (T) 1 : (T) 0);
|
dz[shape::getIndexOffset(i, xShapeInfo)] = (i == idx ? (T) 1 : (T) 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <loops/special_kernels.h>
|
#include <loops/special_kernels.h>
|
||||||
|
#include <ops/declarable/helpers/flatten.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
|
|
||||||
|
@ -47,16 +48,8 @@ __global__ void flattenKernel(
|
||||||
|
|
||||||
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
|
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
if (zEWS >= 1 && yEWS >= 1 && yOrder == order) {
|
|
||||||
|
|
||||||
for (int i = tid; i < lenY; i += gridDim.x * blockDim.x)
|
|
||||||
z[i * zEWS + dOffset] = y[i * yEWS];
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
|
|
||||||
for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x)
|
for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x)
|
||||||
z[i * zEWS + dOffset] = y[shape::getIndexOrderOffset(i, yShapeInfo, lenY, order)];
|
z[i * zEWS + dOffset] = y[ops::helpers::getIndexOffsetOrdered(i, yShapeInfo, order)];
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -54,8 +54,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo,
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto top = 2 * tid + 1;
|
auto top = 2 * tid + 1;
|
||||||
if (top < xTadLength) {
|
if (top < xTadLength) {
|
||||||
auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength);
|
auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo);
|
||||||
auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength);
|
auto t1 = shape::getIndexOffset(top, tadShapeInfo);
|
||||||
|
|
||||||
if (!descending == (dx[t0] > dx[t1])) {
|
if (!descending == (dx[t0] > dx[t1])) {
|
||||||
X dt0 = dx[t0];
|
X dt0 = dx[t0];
|
||||||
|
@ -72,8 +72,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo,
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto top = 2 * tid + 2;
|
auto top = 2 * tid + 2;
|
||||||
if (top < xTadLength) {
|
if (top < xTadLength) {
|
||||||
auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength);
|
auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo);
|
||||||
auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength);
|
auto t1 = shape::getIndexOffset(top, tadShapeInfo);
|
||||||
|
|
||||||
if (!descending == (dx[t0] > dx[t1])) {
|
if (!descending == (dx[t0] > dx[t1])) {
|
||||||
X dt0 = dx[t0];
|
X dt0 = dx[t0];
|
||||||
|
@ -126,7 +126,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo,
|
||||||
int iterations = xTadLength;
|
int iterations = xTadLength;
|
||||||
if (cached) {
|
if (cached) {
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength);
|
auto t0 = shape::getIndexOffset(tid, tadShapeInfo);
|
||||||
shmem[tid] = dx[t0];
|
shmem[tid] = dx[t0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -140,8 +140,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo,
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto top = 2 * tid + 1;
|
auto top = 2 * tid + 1;
|
||||||
if (top < xTadLength) {
|
if (top < xTadLength) {
|
||||||
auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength);
|
auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo);
|
||||||
auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength);
|
auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo);
|
||||||
|
|
||||||
if (!descending == (dx[t0] > dx[t1])) {
|
if (!descending == (dx[t0] > dx[t1])) {
|
||||||
T dt0 = dx[t0];
|
T dt0 = dx[t0];
|
||||||
|
@ -154,8 +154,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo,
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto top = 2 * tid + 2;
|
auto top = 2 * tid + 2;
|
||||||
if (top < xTadLength) {
|
if (top < xTadLength) {
|
||||||
auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength);
|
auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo);
|
||||||
auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength);
|
auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo);
|
||||||
|
|
||||||
if (!descending == (dx[t0] > dx[t1])) {
|
if (!descending == (dx[t0] > dx[t1])) {
|
||||||
T dt0 = dx[t0];
|
T dt0 = dx[t0];
|
||||||
|
@ -172,7 +172,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo,
|
||||||
if (cached) {
|
if (cached) {
|
||||||
dx = x + tadOffsets[r];
|
dx = x + tadOffsets[r];
|
||||||
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) {
|
||||||
auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength);
|
auto t0 = shape::getIndexOffset(tid, tadShapeInfo);
|
||||||
dx[t0] = shmem[tid];
|
dx[t0] = shmem[tid];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,8 +53,8 @@ namespace nd4j {
|
||||||
T *rZ = z + zTadOffsets[idx];
|
T *rZ = z + zTadOffsets[idx];
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(i, tadShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, tadShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, tadLength);
|
auto zOffset = shape::getIndexOffset(i, zTadShapeInfo);
|
||||||
rZ[zOffset] = rX[xOffset];
|
rZ[zOffset] = rX[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,7 +33,7 @@ namespace nd4j {
|
||||||
for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
|
for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
|
||||||
for (int j = threadIdx.x; j < cols; j += totalThreads) {
|
for (int j = threadIdx.x; j < cols; j += totalThreads) {
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
Nd4jLong xOffset = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), coords, rank);
|
Nd4jLong xOffset = shape::getOffset(shape, coords);
|
||||||
if (i + diagonal <= j)
|
if (i + diagonal <= j)
|
||||||
array[xOffset] = value;
|
array[xOffset] = value;
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,7 @@ namespace nd4j {
|
||||||
for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
|
for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
|
||||||
for (int j = threadIdx.x; j < cols; j += totalThreads) {
|
for (int j = threadIdx.x; j < cols; j += totalThreads) {
|
||||||
Nd4jLong coords[2] = {i, j};
|
Nd4jLong coords[2] = {i, j};
|
||||||
auto xOffset = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), coords, rank);
|
auto xOffset = shape::getOffset(shape, coords);
|
||||||
if (i + diagonal >= j)
|
if (i + diagonal >= j)
|
||||||
*(reinterpret_cast<T*>(buffer) + xOffset) = value;
|
*(reinterpret_cast<T*>(buffer) + xOffset) = value;
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,7 +92,7 @@ namespace nd4j {
|
||||||
} else {
|
} else {
|
||||||
for (Nd4jLong i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (Nd4jLong i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
|
|
||||||
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f], tadLength);
|
auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]);
|
||||||
auto yOffset = newOffset + xOffset;
|
auto yOffset = newOffset + xOffset;
|
||||||
xOffset += oldOffset;
|
xOffset += oldOffset;
|
||||||
|
|
||||||
|
|
|
@ -34,8 +34,8 @@ namespace nd4j {
|
||||||
auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1;
|
auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1;
|
||||||
auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1;
|
auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1;
|
||||||
//if (shape::order(theFirstShape) ==)
|
//if (shape::order(theFirstShape) ==)
|
||||||
auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape, resultLength);
|
auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape);
|
||||||
auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape, resultLength);
|
auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape);
|
||||||
T temp = *(reinterpret_cast<T*>(theFirstBuffer) + xOffset);
|
T temp = *(reinterpret_cast<T*>(theFirstBuffer) + xOffset);
|
||||||
*(reinterpret_cast<T*>(theFirstBuffer) + xOffset) = *(reinterpret_cast<T*>(theSecondBuffer) + yOffset);
|
*(reinterpret_cast<T*>(theFirstBuffer) + xOffset) = *(reinterpret_cast<T*>(theSecondBuffer) + yOffset);
|
||||||
*(reinterpret_cast<T*>(theSecondBuffer) + yOffset) = temp;
|
*(reinterpret_cast<T*>(theSecondBuffer) + yOffset) = temp;
|
||||||
|
|
|
@ -61,8 +61,8 @@ namespace nd4j {
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (Nd4jLong j = threadIdx.x; j < tadLength; j += blockDim.x) {
|
for (Nd4jLong j = threadIdx.x; j < tadLength; j += blockDim.x) {
|
||||||
auto xOffset = shape::getIndexOffset(j, tadShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(j, tadShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(j, zShapeInfo, tadLength);
|
auto zOffset = shape::getIndexOffset(j, zShapeInfo);
|
||||||
|
|
||||||
z[zOffset] = s[xOffset];
|
z[zOffset] = s[xOffset];
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,8 +21,8 @@
|
||||||
#include <loops/special_kernels.h>
|
#include <loops/special_kernels.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) {
|
static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) {
|
||||||
return shape::getIndexOffset(index, shapeInfo, length);
|
return shape::getIndexOffset(index, shapeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Nd4jLong __device__ __noinline__ _subArrayOffset(Nd4jLong index, Nd4jLong *shapeInfoA, Nd4jLong *shapeInfoB) {
|
static Nd4jLong __device__ __noinline__ _subArrayOffset(Nd4jLong index, Nd4jLong *shapeInfoA, Nd4jLong *shapeInfoB) {
|
||||||
|
@ -50,7 +50,7 @@ namespace nd4j {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = tid; i < resultLength; i += totalThreads) {
|
for (int i = tid; i < resultLength; i += totalThreads) {
|
||||||
auto xOffset = _getIndexOffset(i, outputShape, resultLength);
|
auto xOffset = _getIndexOffset(i, outputShape);
|
||||||
auto yOffset = _subArrayOffset(i, outputShape, inputShape);
|
auto yOffset = _subArrayOffset(i, outputShape, inputShape);
|
||||||
*(reinterpret_cast<T *>(outputBuffer) + xOffset) = *(reinterpret_cast<T const *>(inputBuffer) + yOffset);
|
*(reinterpret_cast<T *>(outputBuffer) + xOffset) = *(reinterpret_cast<T const *>(inputBuffer) + yOffset);
|
||||||
}
|
}
|
||||||
|
@ -89,7 +89,7 @@ namespace nd4j {
|
||||||
|
|
||||||
for (int i = tid; i < resultLength; i += totalThreads) {
|
for (int i = tid; i < resultLength; i += totalThreads) {
|
||||||
|
|
||||||
auto xOffset = _getIndexOffset(i, outputShape, resultLength);
|
auto xOffset = _getIndexOffset(i, outputShape);
|
||||||
auto yOffset = _subArrayOffset(i, outputShape, inputShape);
|
auto yOffset = _subArrayOffset(i, outputShape, inputShape);
|
||||||
*(reinterpret_cast<X *>(outputBuffer) + xOffset) = static_cast<X>(*(reinterpret_cast<Y const *>(inputBuffer) + yOffset));
|
*(reinterpret_cast<X *>(outputBuffer) + xOffset) = static_cast<X>(*(reinterpret_cast<Y const *>(inputBuffer) + yOffset));
|
||||||
}
|
}
|
||||||
|
|
|
@ -204,7 +204,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
|
||||||
sPartials[threadIdx.x] = val;
|
sPartials[threadIdx.x] = val;
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
for (int i = threadIdx.x; i < tadLength; i += blockDim.x) {
|
||||||
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength);
|
auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo);
|
||||||
SummaryStatsData<X> indexVal2;
|
SummaryStatsData<X> indexVal2;
|
||||||
indexVal2.initWithValue(dx[xOffset]);
|
indexVal2.initWithValue(dx[xOffset]);
|
||||||
|
|
||||||
|
@ -265,7 +265,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
|
||||||
|
|
||||||
for (Nd4jLong i = tid; i < n; i += blockDim.x * gridDim.x) {
|
for (Nd4jLong i = tid; i < n; i += blockDim.x * gridDim.x) {
|
||||||
|
|
||||||
auto offset = shape::getIndexOffset(i, xShapeInfo, n);
|
auto offset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
SummaryStatsData<X> indexVal2;
|
SummaryStatsData<X> indexVal2;
|
||||||
indexVal2.initWithValue(dx[offset]);
|
indexVal2.initWithValue(dx[offset]);
|
||||||
reduction = update(reduction, indexVal2, extraParams);
|
reduction = update(reduction, indexVal2, extraParams);
|
||||||
|
|
|
@ -92,14 +92,14 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
if(vx == vz) {
|
if(vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
z[xOffset] = OpType::op(x[xOffset], params);
|
z[xOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
z[zOffset] = OpType::op(x[xOffset], params);
|
z[zOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,14 +97,14 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
if(vx == vz) {
|
if(vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
z[xOffset] = OpType::op(x[xOffset], params);
|
z[xOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
z[zOffset] = OpType::op(x[xOffset], params);
|
z[zOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,14 +105,14 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
if(vx == vz) {
|
if(vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
z[xOffset] = OpType::op(x[xOffset], params);
|
z[xOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
z[zOffset] = OpType::op(x[xOffset], params);
|
z[zOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -95,14 +95,14 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
if(vx == vz) {
|
if(vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
z[xOffset] = OpType::op(x[xOffset], params);
|
z[xOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
z[zOffset] = OpType::op(x[xOffset], params);
|
z[zOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,14 +97,14 @@ namespace functions {
|
||||||
else {
|
else {
|
||||||
if(vx == vz) {
|
if(vx == vz) {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
z[xOffset] = OpType::op(x[xOffset], params);
|
z[xOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(i, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(i, zShapeInfo);
|
||||||
z[zOffset] = OpType::op(x[xOffset], params);
|
z[zOffset] = OpType::op(x[xOffset], params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
|
|
||||||
#include <ops/declarable/CustomOperations.h>
|
#include <ops/declarable/CustomOperations.h>
|
||||||
#include <ops/declarable/helpers/convolutions.h>
|
#include <ops/declarable/helpers/convolutions.h>
|
||||||
|
#include <ops/declarable/helpers/addBias.h>
|
||||||
#include <MmulHelper.h>
|
#include <MmulHelper.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
|
@ -162,7 +163,8 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) {
|
||||||
MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, {3,0,1,2}, permutForOutput);
|
MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, {3,0,1,2}, permutForOutput);
|
||||||
|
|
||||||
if(bias)
|
if(bias)
|
||||||
output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
// output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
||||||
|
helpers::addBias(block, *output, *bias, *output, isNCDHW);
|
||||||
|
|
||||||
if(!isNCDHW)
|
if(!isNCDHW)
|
||||||
delete input;
|
delete input;
|
||||||
|
|
|
@ -27,7 +27,7 @@
|
||||||
#include <declarable/helpers/convolutions.h>
|
#include <declarable/helpers/convolutions.h>
|
||||||
#include <ops/declarable/helpers/im2col.h>
|
#include <ops/declarable/helpers/im2col.h>
|
||||||
#include <ops/declarable/helpers/col2im.h>
|
#include <ops/declarable/helpers/col2im.h>
|
||||||
|
#include <ops/declarable/helpers/addBias.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
|
@ -80,7 +80,8 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) {
|
||||||
|
|
||||||
//----- add biases if required -----//
|
//----- add biases if required -----//
|
||||||
if(bias)
|
if(bias)
|
||||||
output->applyBroadcast(broadcast::Add, {1}, bias);
|
// output->applyBroadcast(broadcast::Add, {1}, bias);
|
||||||
|
helpers::addBias(block, *output, *bias, *output, true);
|
||||||
|
|
||||||
if(!isNCHW)
|
if(!isNCHW)
|
||||||
delete output;
|
delete output;
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
#include <ops/declarable/CustomOperations.h>
|
#include <ops/declarable/CustomOperations.h>
|
||||||
#include <ops/declarable/helpers/convolutions.h>
|
#include <ops/declarable/helpers/convolutions.h>
|
||||||
|
#include <ops/declarable/helpers/addBias.h>
|
||||||
#include <MmulHelper.h>
|
#include <MmulHelper.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
|
@ -79,7 +80,8 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) {
|
||||||
|
|
||||||
//----- add biases if required -----//
|
//----- add biases if required -----//
|
||||||
if(bias)
|
if(bias)
|
||||||
output->applyBroadcast(broadcast::Add,{1}, bias);
|
// output->applyBroadcast(broadcast::Add,{1}, bias);
|
||||||
|
helpers::addBias(block, *output, *bias, *output, true);
|
||||||
|
|
||||||
if(!isNCDHW)
|
if(!isNCDHW)
|
||||||
delete output;
|
delete output;
|
||||||
|
|
|
@ -16,62 +16,59 @@
|
||||||
|
|
||||||
//
|
//
|
||||||
// @author raver119@gmail.com
|
// @author raver119@gmail.com
|
||||||
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <op_boilerplate.h>
|
#include <op_boilerplate.h>
|
||||||
#if NOT_EXCLUDED(OP_biasadd)
|
#if NOT_EXCLUDED(OP_biasadd)
|
||||||
|
|
||||||
#include <ops/declarable/CustomOperations.h>
|
#include <ops/declarable/CustomOperations.h>
|
||||||
|
#include<ops/declarable/helpers/addBias.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
DECLARE_TYPES(biasadd) {
|
|
||||||
getOpDescriptor()
|
////////////////////////////////////////////////////////////////////
|
||||||
->setAllowedInputTypes(nd4j::DataType::ANY)
|
CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) {
|
||||||
->setAllowedOutputTypes({ALL_FLOATS});
|
|
||||||
}
|
|
||||||
|
|
||||||
CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) {
|
|
||||||
//REQUIRE_OK(this->validateInput2D(block));
|
|
||||||
auto input = INPUT_VARIABLE(0);
|
auto input = INPUT_VARIABLE(0);
|
||||||
auto bias = INPUT_VARIABLE(1);
|
auto bias = INPUT_VARIABLE(1);
|
||||||
|
|
||||||
REQUIRE_TRUE(bias->isRowVector(), 0, "Bias array should be a vector");
|
auto output = OUTPUT_VARIABLE(0);
|
||||||
|
|
||||||
auto z = OUTPUT_VARIABLE(0);
|
const bool isNCHW = !block.getBArguments()->empty() ? B_ARG(0) : false;
|
||||||
|
const int channelDim = isNCHW ? 1 : input->rankOf() - 1; // second or last
|
||||||
|
|
||||||
if (input->isMatrix())
|
REQUIRE_TRUE(bias->rankOf() == 1, 0, "BIASADD CUSTOM_OP: bias array should have rank = 1, but got %i instead !", bias->rankOf());
|
||||||
input->addRowVector(bias, z);
|
|
||||||
else {
|
|
||||||
// TODO: we might want to use NDArray::applyTrueBroadcast here, like AddOp does
|
|
||||||
std::vector<Nd4jLong> shape({-1, bias->lengthOf()});
|
|
||||||
//nd4j_debug("Reshaping to: [%i, %i]\n", -1, (int) bias->lengthOf());
|
|
||||||
auto tArr = input->reshape(input->ordering(), shape);
|
|
||||||
auto zArr = z->reshape(z->ordering(), shape);
|
|
||||||
tArr.addRowVector(bias, &zArr);
|
|
||||||
}
|
|
||||||
|
|
||||||
STORE_RESULT(*z);
|
REQUIRE_TRUE(bias->sizeAt(0) == input->sizeAt(channelDim), 0, "BIASADD CUSTOM_OP: shapes of bias %s and input %s arrays are not suitable for broadcast operation along channel dimension %i !", ShapeUtils::shapeAsString(bias).c_str(), ShapeUtils::shapeAsString(input).c_str(), channelDim);
|
||||||
|
|
||||||
|
REQUIRE_TRUE(output->isSameShape(input), 0, "BIASADD CUSTOM_OP: wrong shape of output array, expected is %s but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(output).c_str());
|
||||||
|
|
||||||
|
helpers::addBias(block, *input, *bias, *output, isNCHW);
|
||||||
|
// input->applyBroadcast(nd4j::broadcast::Add, {channelDim}, bias, output);
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
DECLARE_SYN(bias_add, biasadd);
|
DECLARE_SYN(bias_add, biasadd);
|
||||||
|
|
||||||
DECLARE_SHAPE_FN(biasadd) {
|
////////////////////////////////////////////////////////////////////
|
||||||
|
DECLARE_SHAPE_FN(biasadd) {
|
||||||
auto xShape = inputShape->at(0);
|
auto xShape = inputShape->at(0);
|
||||||
auto yShape = inputShape->at(1);
|
auto yShape = inputShape->at(1);
|
||||||
|
|
||||||
auto dtype = ArrayOptions::dataType(yShape);
|
auto dtype = ArrayOptions::dataType(yShape);
|
||||||
return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype)));
|
return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype)));
|
||||||
}
|
}
|
||||||
|
|
||||||
DECLARE_TYPES(biasadd_bp) {
|
DECLARE_TYPES(biasadd) {
|
||||||
getOpDescriptor()
|
getOpDescriptor()
|
||||||
->setAllowedInputTypes(nd4j::DataType::ANY)
|
->setAllowedInputTypes(nd4j::DataType::ANY)
|
||||||
->setAllowedOutputTypes({ALL_FLOATS});
|
->setAllowedOutputTypes({ALL_FLOATS});
|
||||||
}
|
}
|
||||||
|
|
||||||
CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) {
|
////////////////////////////////////////////////////////////////////
|
||||||
|
CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) {
|
||||||
auto input = INPUT_VARIABLE(0);
|
auto input = INPUT_VARIABLE(0);
|
||||||
auto bias = INPUT_VARIABLE(1);
|
auto bias = INPUT_VARIABLE(1);
|
||||||
auto epsilonNext = INPUT_VARIABLE(2);
|
auto epsilonNext = INPUT_VARIABLE(2);
|
||||||
|
@ -99,10 +96,10 @@ namespace nd4j {
|
||||||
}
|
}
|
||||||
|
|
||||||
return ND4J_STATUS_OK;
|
return ND4J_STATUS_OK;
|
||||||
}
|
}
|
||||||
DECLARE_SYN(BiasAddGrad, biasadd_bp);
|
DECLARE_SYN(BiasAddGrad, biasadd_bp);
|
||||||
|
|
||||||
DECLARE_SHAPE_FN(biasadd_bp) {
|
DECLARE_SHAPE_FN(biasadd_bp) {
|
||||||
auto input = inputShape->at(0);
|
auto input = inputShape->at(0);
|
||||||
auto bias = inputShape->at(1);
|
auto bias = inputShape->at(1);
|
||||||
|
|
||||||
|
@ -113,9 +110,16 @@ namespace nd4j {
|
||||||
COPY_SHAPE(bias, gradShape);
|
COPY_SHAPE(bias, gradShape);
|
||||||
|
|
||||||
return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape));
|
return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
DECLARE_TYPES(biasadd_bp) {
|
||||||
}
|
getOpDescriptor()
|
||||||
|
->setAllowedInputTypes(nd4j::DataType::ANY)
|
||||||
|
->setAllowedOutputTypes({ALL_FLOATS});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
|
@ -43,14 +43,15 @@ DECLARE_SHAPE_FN(matrix_diag) {
|
||||||
auto in = inputShape->at(0);
|
auto in = inputShape->at(0);
|
||||||
int inRank = shape::rank(in);
|
int inRank = shape::rank(in);
|
||||||
|
|
||||||
|
// if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C]
|
||||||
|
|
||||||
int outRank = inRank + 1;
|
int outRank = inRank + 1;
|
||||||
auto lastDimension = shape::sizeAt(in, -1);
|
|
||||||
|
|
||||||
ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
|
ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
|
||||||
outShapeInfo[0] = outRank;
|
outShapeInfo[0] = outRank;
|
||||||
for(int i = 0; i < inRank; ++i)
|
for(int i = 0; i < inRank; ++i)
|
||||||
outShapeInfo[i + 1] = shape::sizeAt(in, i);
|
outShapeInfo[i + 1] = shape::sizeAt(in, i);
|
||||||
outShapeInfo[outRank] = lastDimension;
|
outShapeInfo[outRank] = shape::sizeAt(in, -1);
|
||||||
|
|
||||||
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
|
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
|
|
||||||
#include <ops/declarable/CustomOperations.h>
|
#include <ops/declarable/CustomOperations.h>
|
||||||
#include <ops/declarable/helpers/reverse.h>
|
#include <ops/declarable/helpers/reverse.h>
|
||||||
|
#include <ops/declarable/helpers/addBias.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
|
@ -59,7 +59,8 @@ namespace ops {
|
||||||
output->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, gain);
|
output->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, gain);
|
||||||
if(bias != nullptr) {
|
if(bias != nullptr) {
|
||||||
// output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), bias, output);
|
// output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), bias, output);
|
||||||
output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias);
|
// output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias);
|
||||||
|
helpers::addBias(block, *output, *bias, *output, isNCHW);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
|
|
@ -79,8 +79,8 @@ namespace nd4j {
|
||||||
* Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
|
* Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
|
||||||
*
|
*
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* input: input array, considered as batch of matrices
|
* 0: input array, considered as batch of matrices
|
||||||
* diagonal: array containing elements to be inserted into input array,
|
* 1: diagonal array containing elements to be inserted into input array,
|
||||||
* following rank condition should be satisfied: diagonal_rank = input_rank - 1,
|
* following rank condition should be satisfied: diagonal_rank = input_rank - 1,
|
||||||
* the shapes of diagonal and input arrays must be equal except last dimension of input array,
|
* the shapes of diagonal and input arrays must be equal except last dimension of input array,
|
||||||
* for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
|
* for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
|
||||||
|
@ -88,27 +88,35 @@ namespace nd4j {
|
||||||
* that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
|
* that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
|
||||||
*
|
*
|
||||||
* Output array:
|
* Output array:
|
||||||
* has the same shape as input, corresponding diagonal elements are substituted
|
* 0: has the same shape as input, corresponding diagonal elements are substituted
|
||||||
*/
|
*/
|
||||||
#if NOT_EXCLUDED(OP_matrix_set_diag)
|
#if NOT_EXCLUDED(OP_matrix_set_diag)
|
||||||
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
|
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a batched matrix tensor with diagonal values given (as TF.matrix_diag).
|
* Inserts elements provided by diagonal array into the main diagonal of innermost matrices of output array,
|
||||||
|
* the remaining output elements are set to zeros
|
||||||
|
*
|
||||||
|
* Input array:
|
||||||
|
* diagonal: array containing elements to be inserted into output array,
|
||||||
|
* following rank condition is present: diagonal_rank = output_rank - 1
|
||||||
|
*
|
||||||
|
* Output array:
|
||||||
|
* 0: is considered as batch of matrices, if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C]
|
||||||
*/
|
*/
|
||||||
DECLARE_CUSTOM_OP(matrix_diag, 1, 1, false, 0, 0);
|
DECLARE_CUSTOM_OP(matrix_diag, 1, 1, false, 0, 0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This op calculates regularized incomplete beta integral Ix(a, b).
|
* This op calculates regularized incomplete beta integral Ix(a, b).
|
||||||
* Implementation is based on two algorithms depending on input values of a and b:
|
* Implementation is based on two algorithms depending on input values of a and b:
|
||||||
* - when a and b are both > maxValue (3000.), then apply Gauss-Legendre quadrature method
|
* - when a and b are both > maxValue (3000.), then Gauss-Legendre quadrature method is applied
|
||||||
* - when a and b are both <= maxValue (3000.), then apply modified Lentz’s algorithm for continued fractions
|
* - when a and b are both <= maxValue (3000.), then modified Lentz’s algorithm for continued fractions is applied
|
||||||
*
|
*
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* a: define power t^{a-1}, must be > 0, type float.
|
* a: defines power t^{a-1}, must be > 0, type float.
|
||||||
* b: define power (1-t)^{b-1}, must be > 0, type float.
|
* b: defines power (1-t)^{b-1}, must be > 0, type float.
|
||||||
* x: define upper limit of integration, must be within (0 <= x <= 1) range, type float.
|
* x: defines upper limit of integration, must be within (0 <= x <= 1) range, type float.
|
||||||
*
|
*
|
||||||
* Output array:
|
* Output array:
|
||||||
* 0: values of regularized incomplete beta integral that corresponds to variable upper limit x, type float
|
* 0: values of regularized incomplete beta integral that corresponds to variable upper limit x, type float
|
||||||
|
|
|
@ -22,13 +22,14 @@
|
||||||
#define LIBND4J_ADDBIAS_H
|
#define LIBND4J_ADDBIAS_H
|
||||||
|
|
||||||
#include <ops/declarable/helpers/helpers.h>
|
#include <ops/declarable/helpers/helpers.h>
|
||||||
|
#include <graph/Context.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
namespace helpers {
|
namespace helpers {
|
||||||
|
|
||||||
|
|
||||||
void addBias(NDArray& input, const NDArray& bias, const bool isNCHW);
|
void addBias(graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -91,19 +91,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output,
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max))
|
PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max))
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo);
|
||||||
max = nd4j::math::nd4j_max<T>(max, inBuff[offset]);
|
max = nd4j::math::nd4j_max<T>(max, inBuff[offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum))
|
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum))
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo);
|
||||||
outBuff[offset] = nd4j::math::nd4j_exp<T, T>(inBuff[offset] - max);
|
outBuff[offset] = nd4j::math::nd4j_exp<T, T>(inBuff[offset] - max);
|
||||||
sum += outBuff[offset];
|
sum += outBuff[offset];
|
||||||
}
|
}
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length);
|
const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo);
|
||||||
outBuff[offset] /= sum;
|
outBuff[offset] /= sum;
|
||||||
outBuff[offset] *= (1.f - outBuff[offset]); // derivative
|
outBuff[offset] *= (1.f - outBuff[offset]); // derivative
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,70 +28,116 @@ namespace helpers {
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
template <typename X, typename Y>
|
template <typename X, typename Y>
|
||||||
static void addBias_(NDArray& input, const NDArray& bias, const bool isNCHW) {
|
static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const bool isNCHW) {
|
||||||
|
|
||||||
// input [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
|
|
||||||
// bias [oC]
|
// bias [oC]
|
||||||
|
|
||||||
X* inBuff = input.bufferAsT<X>();
|
// if(input_rank == 4)
|
||||||
const Y* biasBuff = bias.bufferAsT<Y>();
|
// input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
|
||||||
|
// if(input_rank == 5)
|
||||||
|
// input and output have same shapes: [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW)
|
||||||
|
// else
|
||||||
|
// fall back to applyBroadcast
|
||||||
|
|
||||||
int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width;
|
|
||||||
bS = input.sizeAt(0);
|
|
||||||
const Nd4jLong stride0 = input.stridesOf()[0];
|
|
||||||
const Nd4jLong stride1 = input.stridesOf()[1];
|
|
||||||
const Nd4jLong stride2 = input.stridesOf()[2];
|
|
||||||
|
|
||||||
uint biasShapeInfoCast[MAX_RANK];
|
const X* x = input.bufferAsT<X>();
|
||||||
bool canCastBias = nd4j::DataTypeUtils::castShapeInfo(bias.getShapeInfo(), biasShapeInfoCast);
|
const Y* y = bias.bufferAsT<Y>();
|
||||||
|
X* z = output.bufferAsT<X>();
|
||||||
|
|
||||||
if(isNCHW) {
|
const bool inOutAreSame = x == z;
|
||||||
|
|
||||||
oC = input.sizeAt(1);
|
const uint bS = output.sizeAt(0); // batch size
|
||||||
oH = input.sizeAt(2);
|
const Nd4jLong yStrideC = bias.stridesOf()[0];
|
||||||
oW = input.sizeAt(3);
|
const Nd4jLong zStrideB = output.stridesOf()[0];
|
||||||
|
|
||||||
const int oHoW = oH*oW;
|
if(output.rankOf() == 4) {
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
|
const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(3); // channels
|
||||||
for (int i = 0; i < bS; ++i) {
|
const uint oH = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // height
|
||||||
for (int c = 0; c < oC; ++c) {
|
const uint oW = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // width
|
||||||
|
|
||||||
auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias);
|
const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[3];
|
||||||
auto inOffset = i * stride0 + c * stride1;
|
const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1];
|
||||||
|
const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2];
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
if(inOutAreSame) {
|
||||||
for (uint k = 0; k < oHoW; ++k)
|
|
||||||
inBuff[inOffset + k] += static_cast<X>(biasBuff[biasOffset]);
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4))
|
||||||
}
|
for(uint b = 0; b < bS; ++b)
|
||||||
}
|
for(uint c = 0; c < C; ++c)
|
||||||
|
for(uint h = 0; h < oH ; ++h)
|
||||||
|
for(uint w = 0; w < oW ; ++w)
|
||||||
|
z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast<X>(y[c*yStrideC]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
oC = input.sizeAt(3);
|
const Nd4jLong xStrideB = input.stridesOf()[0];
|
||||||
oH = input.sizeAt(1);
|
const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[3];
|
||||||
oW = input.sizeAt(2);
|
const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1];
|
||||||
|
const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2];
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4))
|
||||||
for (int i = 0; i < bS*oH*oW; ++i) {
|
for(uint b = 0; b < bS; ++b)
|
||||||
|
for(uint c = 0; c < C; ++c)
|
||||||
PRAGMA_OMP_SIMD
|
for(uint h = 0; h < oH ; ++h)
|
||||||
for (int c = 0; c < oC; ++c) {
|
for(uint w = 0; w < oW ; ++w)
|
||||||
auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias);
|
z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast<X>(y[c*yStrideC]);
|
||||||
inBuff[i * oC + c] += static_cast<X>(biasBuff[biasOffset]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if(output.rankOf() == 5) {
|
||||||
|
|
||||||
|
const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(4); // channels
|
||||||
|
const uint oD = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // depth
|
||||||
|
const uint oH = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // height
|
||||||
|
const uint oW = isNCHW ? output.sizeAt(4) : output.sizeAt(3); // width
|
||||||
|
|
||||||
|
const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[4];
|
||||||
|
const Nd4jLong zStrideD = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1];
|
||||||
|
const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2];
|
||||||
|
const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[4] : output.stridesOf()[3];
|
||||||
|
|
||||||
|
if(inOutAreSame) {
|
||||||
|
|
||||||
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5))
|
||||||
|
for(uint b = 0; b < bS; ++b)
|
||||||
|
for(uint c = 0; c < C; ++c)
|
||||||
|
for(uint d = 0; d < oD ; ++d)
|
||||||
|
for(uint h = 0; h < oH ; ++h)
|
||||||
|
for(uint w = 0; w < oW ; ++w)
|
||||||
|
z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast<X>(y[c*yStrideC]);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
const Nd4jLong xStrideB = input.stridesOf()[0];
|
||||||
|
const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[4];
|
||||||
|
const Nd4jLong xStrideD = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1];
|
||||||
|
const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2];
|
||||||
|
const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[4] : input.stridesOf()[3];
|
||||||
|
|
||||||
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5))
|
||||||
|
for(uint b = 0; b < bS; ++b)
|
||||||
|
for(uint c = 0; c < C; ++c)
|
||||||
|
for(uint d = 0; d < oD ; ++d)
|
||||||
|
for(uint h = 0; h < oH ; ++h)
|
||||||
|
for(uint w = 0; w < oW ; ++w)
|
||||||
|
z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast<X>(y[c*yStrideC]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const int channelDim = isNCHW ? 1 : input.rankOf() - 1; // second or last
|
||||||
|
const_cast<NDArray&>(input).applyBroadcast(nd4j::broadcast::Add, {channelDim}, &bias, &output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
void addBias(NDArray& input, const NDArray& bias, const bool isNCHW) {
|
void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) {
|
||||||
|
|
||||||
BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, isNCHW), FLOAT_TYPES, FLOAT_TYPES);
|
// bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()})
|
||||||
|
BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BUILD_DOUBLE_TEMPLATE(template void addBias_, (NDArray& input, const NDArray& bias, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES);
|
BUILD_DOUBLE_TEMPLATE(template void addBias_, (const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -84,7 +84,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
|
||||||
const Nd4jLong end = start + step;
|
const Nd4jLong end = start + step;
|
||||||
|
|
||||||
// calculate offset for mean, variance, gamma, beta (all of them have the same shape)
|
// calculate offset for mean, variance, gamma, beta (all of them have the same shape)
|
||||||
auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean);
|
auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean);
|
||||||
// calculate offset for input and output (all of them have the same shape)
|
// calculate offset for input and output (all of them have the same shape)
|
||||||
shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data());
|
shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data());
|
||||||
|
|
||||||
|
@ -114,7 +114,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
|
||||||
const Nd4jLong end = start + step;
|
const Nd4jLong end = start + step;
|
||||||
|
|
||||||
// calculate offset for mean, variance, gamma, beta (all of them have the same shape)
|
// calculate offset for mean, variance, gamma, beta (all of them have the same shape)
|
||||||
auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean);
|
auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean);
|
||||||
// calculate offset for input and output (all of them have the same shape)
|
// calculate offset for input and output (all of them have the same shape)
|
||||||
shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data());
|
shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data());
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ namespace helpers {
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// modified Lentz’s algorithm for continued fractions,
|
// modified Lentz’s algorithm for continued fractions,
|
||||||
// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions,”
|
// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions”
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static T continuedFraction(const T a, const T b, const T x) {
|
static T continuedFraction(const T a, const T b, const T x) {
|
||||||
|
|
||||||
|
@ -122,9 +122,8 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
|
||||||
int xLen = x.lengthOf();
|
int xLen = x.lengthOf();
|
||||||
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold())
|
PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold())
|
||||||
for(int i = 0; i < xLen; ++i) {
|
for(int i = 0; i < xLen; ++i)
|
||||||
output.p(i, betaIncCore<T>(a.e<T>(i), b.e<T>(i), x.e<T>(i)));
|
output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -648,7 +648,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
//----- add biases if required -----//
|
//----- add biases if required -----//
|
||||||
if(bias)
|
if(bias)
|
||||||
// output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
// output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
||||||
helpers::addBias(*output, *bias, isNCHW);
|
helpers::addBias(block, *output, *bias, *output, isNCHW);
|
||||||
|
|
||||||
if(!isNCHW)
|
if(!isNCHW)
|
||||||
delete input;
|
delete input;
|
||||||
|
@ -875,7 +875,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
template <typename X, typename Y>
|
template <typename X, typename Y>
|
||||||
static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
||||||
|
|
||||||
// input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
|
// input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
|
||||||
// weights [kH, kW, iC, mC] always
|
// weights [kH, kW, iC, mC] always
|
||||||
|
@ -922,7 +922,8 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
|
MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
|
||||||
|
|
||||||
if(bias)
|
if(bias)
|
||||||
output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
// output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
|
||||||
|
helpers::addBias(block, *output, *bias, *output, isNCHW);
|
||||||
|
|
||||||
if(!isNCHW)
|
if(!isNCHW)
|
||||||
delete input;
|
delete input;
|
||||||
|
@ -2451,7 +2452,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
||||||
}
|
}
|
||||||
void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
||||||
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
||||||
}
|
}
|
||||||
void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) {
|
||||||
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES);
|
||||||
|
|
|
@ -37,24 +37,16 @@ namespace nd4j {
|
||||||
cOffset += inputs[e]->lengthOf();
|
cOffset += inputs[e]->lengthOf();
|
||||||
}
|
}
|
||||||
|
|
||||||
Nd4jLong xCoord[MAX_RANK];
|
|
||||||
|
|
||||||
// actually transferring data
|
// actually transferring data
|
||||||
for (int e = 0; e < numArrays; e++) {
|
for (int e = 0; e < numArrays; e++) {
|
||||||
auto z = reinterpret_cast<T *>(output->bufferWithOffset(offsets[e]));
|
auto z = reinterpret_cast<T *>(output->bufferWithOffset(offsets[e]));
|
||||||
|
|
||||||
auto xBuffer = inputs[e]->bufferAsT<T>();
|
auto xBuffer = inputs[e]->bufferAsT<T>();
|
||||||
auto xShapeInfo = inputs[e]->shapeInfo();
|
auto xShapeInfo = inputs[e]->shapeInfo();
|
||||||
auto xShape = shape::shapeOf(xShapeInfo);
|
|
||||||
auto xStride = shape::stride(xShapeInfo);
|
|
||||||
auto xRank = shape::rank(xShapeInfo);
|
|
||||||
auto xLength = inputs[e]->lengthOf();
|
auto xLength = inputs[e]->lengthOf();
|
||||||
|
|
||||||
for (uint i = 0; i < xLength; i++) {
|
for (uint i = 0; i < xLength; i++)
|
||||||
shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
|
z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)];
|
||||||
auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
|
|
||||||
z[i] = xBuffer[xOffset];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -184,7 +184,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int>
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (int i = 0; i < tadLength; i++) {
|
||||||
auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo, tadLength);
|
auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo);
|
||||||
if (rX[xOffset] > maxValue) {
|
if (rX[xOffset] > maxValue) {
|
||||||
maxIdx = i;
|
maxIdx = i;
|
||||||
maxValue = rX[xOffset];
|
maxValue = rX[xOffset];
|
||||||
|
@ -193,7 +193,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int>
|
||||||
|
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (int i = 0; i < tadLength; i++) {
|
for (int i = 0; i < tadLength; i++) {
|
||||||
auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo(), tadLength);
|
auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo());
|
||||||
rZ[zOffset] = maxIdx == i ? (Z) 1 : (Z) 0;
|
rZ[zOffset] = maxIdx == i ? (Z) 1 : (Z) 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,14 +52,14 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
||||||
for (Nd4jLong i = 0; i < xLen; ++i) {
|
for (Nd4jLong i = 0; i < xLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
|
shape::index2coords(i, xShapeInfo, coords.data());
|
||||||
|
|
||||||
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
|
const auto xOffset = shape::getOffset(xShapeInfo, coords.data());
|
||||||
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
|
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data());
|
||||||
|
|
||||||
// condition to be on diagonal of innermost matrix
|
// condition to be on diagonal of innermost matrix
|
||||||
if(coords[xRank - 2] == coords[xRank - 1])
|
if(coords[xRank - 2] == coords[xRank - 1])
|
||||||
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
|
z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())];
|
||||||
else
|
else
|
||||||
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,12 +73,12 @@ namespace nd4j {
|
||||||
if (idx < 0 || idx >= tLen) {
|
if (idx < 0 || idx >= tLen) {
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int t = 0; t < tLen; t++) {
|
for (unsigned int t = 0; t < tLen; t++) {
|
||||||
cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = zero;
|
cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PRAGMA_OMP_SIMD
|
PRAGMA_OMP_SIMD
|
||||||
for (unsigned int t = 0; t < tLen; t++) {
|
for (unsigned int t = 0; t < tLen; t++) {
|
||||||
cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = idx == t ? one : zero;
|
cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,8 +53,8 @@ namespace nd4j {
|
||||||
|
|
||||||
for (Nd4jLong e = length - 1; e >= 0; --e) {
|
for (Nd4jLong e = length - 1; e >= 0; --e) {
|
||||||
|
|
||||||
auto xOffset = shape::getIndexOffset(e, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(e, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(e, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(e, zShapeInfo);
|
||||||
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[xOffset]) : simdOps::Multiply<T, T, T>::op(sum, x[xOffset]);
|
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[xOffset]) : simdOps::Multiply<T, T, T>::op(sum, x[xOffset]);
|
||||||
|
|
||||||
if (!exclusive)
|
if (!exclusive)
|
||||||
|
@ -83,8 +83,8 @@ namespace nd4j {
|
||||||
|
|
||||||
for (int e = 0; e < length; e++) {
|
for (int e = 0; e < length; e++) {
|
||||||
|
|
||||||
auto xOffset = shape::getIndexOffset(e, xShapeInfo, length);
|
auto xOffset = shape::getIndexOffset(e, xShapeInfo);
|
||||||
auto zOffset = shape::getIndexOffset(e, zShapeInfo, length);
|
auto zOffset = shape::getIndexOffset(e, zShapeInfo);
|
||||||
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[xOffset]) : simdOps::Multiply<T, T, T>::op(sum, x[xOffset]);
|
sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[xOffset]) : simdOps::Multiply<T, T, T>::op(sum, x[xOffset]);
|
||||||
|
|
||||||
if (!exclusive)
|
if (!exclusive)
|
||||||
|
|
|
@ -77,8 +77,8 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
|
||||||
PRAGMA_OMP_PARALLEL_FOR
|
PRAGMA_OMP_PARALLEL_FOR
|
||||||
for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) {
|
for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) {
|
||||||
|
|
||||||
auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength);
|
auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
|
||||||
auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer, inLength);
|
auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer);
|
||||||
//outArr[outOffset] = inArr[inOffset];
|
//outArr[outOffset] = inArr[inOffset];
|
||||||
swap(outArr, inOffset, outOffset);
|
swap(outArr, inOffset, outOffset);
|
||||||
}
|
}
|
||||||
|
@ -118,8 +118,8 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
|
||||||
PRAGMA_OMP_PARALLEL_FOR
|
PRAGMA_OMP_PARALLEL_FOR
|
||||||
for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) {
|
for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) {
|
||||||
|
|
||||||
auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength);
|
auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
|
||||||
auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer, outLength);
|
auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer);
|
||||||
outArr[outOffset] = inArr[inOffset];
|
outArr[outOffset] = inArr[inOffset];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,8 +128,8 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
|
||||||
PRAGMA_OMP_PARALLEL_FOR
|
PRAGMA_OMP_PARALLEL_FOR
|
||||||
for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) {
|
for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) {
|
||||||
|
|
||||||
auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength);
|
auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
|
||||||
auto outOffset = shape::getIndexOffset(e, outShapeBuffer, outLength);
|
auto outOffset = shape::getIndexOffset(e, outShapeBuffer);
|
||||||
outArr[outOffset] = inArr[inOffset];
|
outArr[outOffset] = inArr[inOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -116,15 +116,15 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
|
||||||
|
|
||||||
for (Nd4jLong i = 0; i < zLen; ++i) {
|
for (Nd4jLong i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());
|
shape::index2coords(i, output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
// evaluate spatial coordinates for x
|
// evaluate spatial coordinates for x
|
||||||
for(uint j = 1; j <= numOfSpatialDims; ++j)
|
for(uint j = 1; j <= numOfSpatialDims; ++j)
|
||||||
coords[j] += crop.e<uint>(j - 1, 0); // add crop left
|
coords[j] += crop.e<uint>(j - 1, 0); // add crop left
|
||||||
|
|
||||||
z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
|
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -298,9 +298,9 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
|
||||||
for (Nd4jLong i = 0; i < zLen; ++i) {
|
for (Nd4jLong i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());
|
shape::index2coords(i, output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
bool within = true;
|
bool within = true;
|
||||||
|
|
||||||
|
@ -318,7 +318,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
|
||||||
}
|
}
|
||||||
|
|
||||||
if(within)
|
if(within)
|
||||||
z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
|
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())];
|
||||||
else
|
else
|
||||||
z[zOffset] = 0.f;
|
z[zOffset] = 0.f;
|
||||||
}
|
}
|
||||||
|
|
|
@ -178,8 +178,6 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
||||||
|
|
||||||
const Nd4jLong* xShape = input.shapeOf();
|
const Nd4jLong* xShape = input.shapeOf();
|
||||||
const Nd4jLong* zShape = output.shapeOf();
|
const Nd4jLong* zShape = output.shapeOf();
|
||||||
const Nd4jLong* xStride = input.stridesOf();
|
|
||||||
const Nd4jLong* zStride = output.stridesOf();
|
|
||||||
|
|
||||||
const int rank = input.rankOf(); // both input and output have the same rank
|
const int rank = input.rankOf(); // both input and output have the same rank
|
||||||
const int rankMinusOne = rank - 1;
|
const int rankMinusOne = rank - 1;
|
||||||
|
@ -195,8 +193,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
||||||
for(uint i = 0; i < zLen; ++i) {
|
for(uint i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, zShape, i, zLen, coords.data());
|
shape::index2coords(i, output.getShapeInfo(), coords.data());
|
||||||
const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
bool within = true;
|
bool within = true;
|
||||||
for(int j = rankMinusOne; j >= 0; --j) {
|
for(int j = rankMinusOne; j >= 0; --j) {
|
||||||
|
@ -207,7 +205,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
||||||
}
|
}
|
||||||
|
|
||||||
if(within)
|
if(within)
|
||||||
z[zOffset] = x[shape::getOffset(0, xShape, xStride, coords.data(), rank)];
|
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())];
|
||||||
else
|
else
|
||||||
z[zOffset] = padVal;
|
z[zOffset] = padVal;
|
||||||
}
|
}
|
||||||
|
@ -220,8 +218,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
||||||
for(uint i = 0; i < zLen; ++i) {
|
for(uint i = 0; i < zLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, zShape, i, zLen, coords.data());
|
shape::index2coords(i, output.getShapeInfo(), coords.data());
|
||||||
const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data());
|
||||||
|
|
||||||
for(int j = rankMinusOne; j >= 0; --j) {
|
for(int j = rankMinusOne; j >= 0; --j) {
|
||||||
|
|
||||||
|
@ -231,7 +229,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
||||||
else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
|
else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto xOffset = shape::getOffset(0, xShape, xStride, coords.data(), rank);
|
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data());
|
||||||
z[zOffset] = x[xOffset];
|
z[zOffset] = x[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -580,9 +578,9 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
|
||||||
xCoordStart = coords.data();
|
xCoordStart = coords.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
shape::index2coords(zRank, output.shapeOf(), i, zLen, zCoordStart);
|
shape::index2coords(i, output.getShapeInfo(), zCoordStart);
|
||||||
|
|
||||||
const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), zCoordStart, zRank);
|
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
|
||||||
|
|
||||||
// last y coordinate
|
// last y coordinate
|
||||||
uint coordToRestore;
|
uint coordToRestore;
|
||||||
|
@ -590,7 +588,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
|
||||||
coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
|
coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
|
||||||
|
|
||||||
zCoordStart[yRank - 1] = 0;
|
zCoordStart[yRank - 1] = 0;
|
||||||
const auto yOffset = shape::getOffset(0, indices.shapeOf(), indices.stridesOf(), zCoordStart, yRank);
|
const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
|
||||||
|
|
||||||
//restore z coordinate
|
//restore z coordinate
|
||||||
if(yLastDim != xRank)
|
if(yLastDim != xRank)
|
||||||
|
@ -600,7 +598,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
|
||||||
for(uint j = 0; j < yLastDim; ++j)
|
for(uint j = 0; j < yLastDim; ++j)
|
||||||
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
|
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
|
||||||
|
|
||||||
const auto xOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), xCoordStart, xRank);
|
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
|
||||||
|
|
||||||
z[zOffset] = x[xOffset];
|
z[zOffset] = x[xOffset];
|
||||||
}
|
}
|
||||||
|
@ -1172,7 +1170,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx))
|
||||||
for(int i = 0; i < outLen; ++i) {
|
for(int i = 0; i < outLen; ++i) {
|
||||||
|
|
||||||
shape::index2coords(rank, output.shapeOf(), i, outIdx.data());
|
shape::index2coords(i, output.getShapeInfo(), outIdx.data());
|
||||||
|
|
||||||
for(int j = 0; j < rank; ++j) {
|
for(int j = 0; j < rank; ++j) {
|
||||||
|
|
||||||
|
@ -1191,8 +1189,8 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
|
||||||
inIdx[j] = len - outIdx[j];
|
inIdx[j] = len - outIdx[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
auto outOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), outIdx.data(), rank);
|
auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data());
|
||||||
auto inOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), inIdx.data(), rank);
|
auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data());
|
||||||
reinterpret_cast<T*>(output.buffer())[outOffset] = reinterpret_cast<T*>(input.getBuffer())[inOffset];
|
reinterpret_cast<T*>(output.buffer())[outOffset] = reinterpret_cast<T*>(input.getBuffer())[inOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1259,7 +1257,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c
|
||||||
for(Nd4jLong i=0; i<gradOLen; ++i) {
|
for(Nd4jLong i=0; i<gradOLen; ++i) {
|
||||||
|
|
||||||
auto fidx = shape::subArrayIndex(i, gradO.getShapeInfo(), gradI.getShapeInfo());
|
auto fidx = shape::subArrayIndex(i, gradO.getShapeInfo(), gradI.getShapeInfo());
|
||||||
gradI.p(fidx, gradI.e<T>(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo(), gradOLen)]);
|
gradI.p(fidx, gradI.e<T>(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo())]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,9 +60,9 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
for (int i = tid; i < xzLen; i += totalThreads) {
|
for (int i = tid; i < xzLen; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(xzRank, xShapeInfo + 1, i, xzLen, coords);
|
shape::index2coords(i, xShapeInfo, coords);
|
||||||
|
|
||||||
const auto xzOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xzRank + 1, coords, xzRank);
|
const auto xzOffset = shape::getOffset(xShapeInfo, coords);
|
||||||
|
|
||||||
const auto xVal = x[xzOffset];
|
const auto xVal = x[xzOffset];
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
||||||
if(yShapeInfo[j + 1] == 1)
|
if(yShapeInfo[j + 1] == 1)
|
||||||
coords[j + 1] = 0;
|
coords[j + 1] = 0;
|
||||||
|
|
||||||
z[xzOffset] = xVal * y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + yRank + 1, coords + 1, yRank)];
|
z[xzOffset] = xVal * y[shape::getOffset(yShapeInfo, coords + 1)];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
z[xzOffset] = xVal;
|
z[xzOffset] = xVal;
|
||||||
|
@ -139,11 +139,11 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
|
||||||
|
|
||||||
for (int i = tid; i < inLen; i += totalThreads) {
|
for (int i = tid; i < inLen; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(inRank, inShapeInfo + 1, i, inLen, coords);
|
shape::index2coords(i, inShapeInfo, coords);
|
||||||
|
|
||||||
const auto inOffset = shape::getOffset(0, inShapeInfo + 1, inShapeInfo + inRank + 1, coords, inRank);
|
const auto inOffset = shape::getOffset(inShapeInfo, coords);
|
||||||
const auto dLdOOffset = shape::getOffset(0, dLdOShapeInfo + 1, dLdOShapeInfo + inRank + 1, coords, inRank);
|
const auto dLdOOffset = shape::getOffset(dLdOShapeInfo, coords);
|
||||||
const auto dLdIOffset = shape::getOffset(0, dLdIShapeInfo + 1, dLdIShapeInfo + inRank + 1, coords, inRank);
|
const auto dLdIOffset = shape::getOffset(dLdIShapeInfo, coords);
|
||||||
|
|
||||||
const auto xVal = in[inOffset];
|
const auto xVal = in[inOffset];
|
||||||
const auto grO = dLdO[dLdOOffset];
|
const auto grO = dLdO[dLdOOffset];
|
||||||
|
@ -154,8 +154,8 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
|
||||||
if(alphaShapeInfo[j + 1] == 1)
|
if(alphaShapeInfo[j + 1] == 1)
|
||||||
coords[j + 1] = 0;
|
coords[j + 1] = 0;
|
||||||
|
|
||||||
const auto alphaOffset = shape::getOffset(0, alphaShapeInfo + 1, alphaShapeInfo + alphaRank + 1, coords + 1, alphaRank);
|
const auto alphaOffset = shape::getOffset(alphaShapeInfo, coords + 1);
|
||||||
const auto dLdAOffset = shape::getOffset(0, dLdAShapeInfo + 1, dLdAShapeInfo + alphaRank + 1, coords + 1, alphaRank);
|
const auto dLdAOffset = shape::getOffset(dLdAShapeInfo, coords + 1);
|
||||||
|
|
||||||
dLdI[dLdIOffset] = grO * alpha[alphaOffset];
|
dLdI[dLdIOffset] = grO * alpha[alphaOffset];
|
||||||
|
|
||||||
|
@ -223,7 +223,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len);
|
const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : nd4j::math::nd4j_max<T>(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : nd4j::math::nd4j_max<T>(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -249,8 +249,8 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len);
|
const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo);
|
||||||
const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len);
|
const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo);
|
||||||
z[zOffset] = nd4j::math::nd4j_exp<T, T>(x[xOffset] - max);
|
z[zOffset] = nd4j::math::nd4j_exp<T, T>(x[xOffset] - max);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[zOffset] : (z[zOffset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[zOffset] : (z[zOffset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
|
@ -272,7 +272,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
||||||
for (int i = 0; i < numOfIters; ++i) {
|
for (int i = 0; i < numOfIters; ++i) {
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx >= len) continue;
|
if(elemIdx >= len) continue;
|
||||||
const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len);
|
const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo);
|
||||||
z[zOffset] /= shmem[0];
|
z[zOffset] /= shmem[0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -386,7 +386,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max<T>(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max<T>(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -412,7 +412,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
z[offset] = nd4j::math::nd4j_exp<T, T>(x[offset] - max);
|
z[offset] = nd4j::math::nd4j_exp<T, T>(x[offset] - max);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
|
@ -434,7 +434,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape
|
||||||
for (int i = 0; i < numOfIters; ++i) {
|
for (int i = 0; i < numOfIters; ++i) {
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx >= len) continue;
|
if(elemIdx >= len) continue;
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
z[offset] = nd4j::math::nd4j_log<T,T>(z[offset] / shmem[0]);
|
z[offset] = nd4j::math::nd4j_log<T,T>(z[offset] / shmem[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -505,7 +505,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max<T>(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max<T>(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -531,7 +531,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
|
||||||
|
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx < len) {
|
if(elemIdx < len) {
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
z[offset] = nd4j::math::nd4j_exp<T, T>(x[offset] - max);
|
z[offset] = nd4j::math::nd4j_exp<T, T>(x[offset] - max);
|
||||||
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
|
||||||
}
|
}
|
||||||
|
@ -553,7 +553,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
|
||||||
for (int i = 0; i < numOfIters; ++i) {
|
for (int i = 0; i < numOfIters; ++i) {
|
||||||
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
|
||||||
if(elemIdx >= len) continue;
|
if(elemIdx >= len) continue;
|
||||||
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
|
const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
|
||||||
z[offset] /= shmem[0];
|
z[offset] /= shmem[0];
|
||||||
z[offset] *= (1.f - z[offset]); // derivative
|
z[offset] *= (1.f - z[offset]); // derivative
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,110 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Apache License, Version 2.0 which is available at
|
||||||
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
* License for the specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
//
|
||||||
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
|
#include<ops/declarable/helpers/addBias.h>
|
||||||
|
#include <PointersManager.h>
|
||||||
|
|
||||||
|
namespace nd4j {
|
||||||
|
namespace ops {
|
||||||
|
namespace helpers {
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename X, typename Y>
|
||||||
|
__global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
|
||||||
|
const void* vy, const Nd4jLong* yShapeInfo,
|
||||||
|
void* vz, const Nd4jLong* zShapeInfo,
|
||||||
|
const bool isNCHW) {
|
||||||
|
|
||||||
|
// bias [oC]
|
||||||
|
|
||||||
|
// if(input_rank == 4)
|
||||||
|
// input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
|
||||||
|
// if(input_rank == 5)
|
||||||
|
// input and output have same shapes: [bS, oD, oH, oW, oC] (NHWC) or [bS, oC, oD, oH, oW] (NCHW)
|
||||||
|
|
||||||
|
const X* x = reinterpret_cast<const X*>(vx);
|
||||||
|
const Y* y = reinterpret_cast<const Y*>(vy);
|
||||||
|
X* z = reinterpret_cast<X*>(vz);
|
||||||
|
|
||||||
|
__shared__ int rank, channelPosition;
|
||||||
|
__shared__ Nd4jLong *sharedMem, len;
|
||||||
|
__shared__ bool xzSameOffsets, xzAreSame;
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
|
||||||
|
extern __shared__ unsigned char shmem[];
|
||||||
|
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||||
|
|
||||||
|
rank = shape::rank(xShapeInfo); // xRank == zRank
|
||||||
|
xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||||
|
len = shape::length(xShapeInfo);
|
||||||
|
channelPosition = isNCHW ? 1 : rank - 1; // second or last
|
||||||
|
xzAreSame = x == z;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
auto coords = sharedMem + threadIdx.x * rank;
|
||||||
|
|
||||||
|
for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += blockDim.x * gridDim.x) {
|
||||||
|
|
||||||
|
shape::index2coords(i, xShapeInfo, coords);
|
||||||
|
|
||||||
|
const auto xOffsets = shape::getOffset(xShapeInfo, coords);
|
||||||
|
const auto zOffsets = xzSameOffsets ? xOffsets : shape::getOffset(zShapeInfo, coords);
|
||||||
|
const auto yOffsets = shape::getOffset(yShapeInfo, coords + channelPosition);
|
||||||
|
|
||||||
|
if(xzAreSame)
|
||||||
|
z[zOffsets] += static_cast<X>(y[yOffsets]);
|
||||||
|
else
|
||||||
|
z[zOffsets] = x[xOffsets] + static_cast<X>(y[yOffsets]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename X, typename Y>
|
||||||
|
static void addBiasCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
|
||||||
|
const void* vx, const Nd4jLong* xShapeInfo,
|
||||||
|
const void* vy, const Nd4jLong* yShapeInfo,
|
||||||
|
void* vz, const Nd4jLong* zShapeInfo,
|
||||||
|
const bool isNCHW) {
|
||||||
|
|
||||||
|
addBiasCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, isNCHW);
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) {
|
||||||
|
|
||||||
|
PointersManager manager(block.launchContext(), "addBias");
|
||||||
|
|
||||||
|
const int threadsPerBlock = MAX_NUM_THREADS;
|
||||||
|
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
|
const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||||
|
|
||||||
|
NDArray::prepareSpecialUse({&output}, {&input, &bias});
|
||||||
|
BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBiasCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), bias.getSpecialBuffer(), bias.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), isNCHW), FLOAT_TYPES, FLOAT_TYPES);
|
||||||
|
NDArray::registerSpecialUse({&output}, {&input, &bias});
|
||||||
|
|
||||||
|
manager.synchronize();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -143,13 +143,13 @@ static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShape
|
||||||
|
|
||||||
|
|
||||||
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
||||||
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
|
|
||||||
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
|
|
||||||
T h, v_min, v_max;
|
T h, v_min, v_max;
|
||||||
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
|
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
|
||||||
|
|
|
@ -139,13 +139,13 @@ static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xT
|
||||||
auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
|
auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
|
||||||
|
|
||||||
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
||||||
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
|
|
||||||
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo);
|
||||||
|
|
||||||
T h, s, v;
|
T h, s, v;
|
||||||
// Convert the RGB color to Hue/V-range.
|
// Convert the RGB color to Hue/V-range.
|
||||||
|
|
|
@ -64,25 +64,25 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
||||||
|
|
||||||
for (uint i = tid; i < minLen; i += totalThreads) {
|
for (uint i = tid; i < minLen; i += totalThreads) {
|
||||||
|
|
||||||
const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo, minLen);
|
const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo);
|
||||||
const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo, minLen);
|
const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo);
|
||||||
|
|
||||||
T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt<T, T>(variance[varianceOffset] + epsilon);
|
T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt<T, T>(variance[varianceOffset] + epsilon);
|
||||||
|
|
||||||
if(gamma != nullptr)
|
if(gamma != nullptr)
|
||||||
sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo, minLen)];
|
sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo)];
|
||||||
|
|
||||||
auto betaOffset = 0;
|
auto betaOffset = 0;
|
||||||
if(beta != nullptr)
|
if(beta != nullptr)
|
||||||
betaOffset = shape::getIndexOffset(i, betaShapeInfo, minLen);
|
betaOffset = shape::getIndexOffset(i, betaShapeInfo);
|
||||||
|
|
||||||
const auto xTad = x + xTadOffsets[i];
|
const auto xTad = x + xTadOffsets[i];
|
||||||
auto zTad = z + zTadOffsets[i];
|
auto zTad = z + zTadOffsets[i];
|
||||||
|
|
||||||
for (uint j = 0; j < tadLen; ++j) {
|
for (uint j = 0; j < tadLen; ++j) {
|
||||||
|
|
||||||
const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo, tadLen);
|
const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo);
|
||||||
const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo, tadLen);
|
const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo);
|
||||||
|
|
||||||
zTad[zTadOffset] = (xTad[xTadOffset] - mean[meanOffset]) * sigmaInvGam;
|
zTad[zTadOffset] = (xTad[xTadOffset] - mean[meanOffset]) * sigmaInvGam;
|
||||||
|
|
||||||
|
@ -130,10 +130,10 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
|
||||||
|
|
||||||
for (uint i = tid; i < xLen; i += totalThreads) {
|
for (uint i = tid; i < xLen; i += totalThreads) {
|
||||||
|
|
||||||
shape::index2coords(xRank, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), i, xLen, coords);
|
shape::index2coords(i, xShapeInfo, coords);
|
||||||
|
|
||||||
const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), coords, xRank);
|
const auto xOffset = shape::getOffset(xShapeInfo, coords);
|
||||||
const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), coords, xRank);
|
const auto zOffset = shape::getOffset(zShapeInfo, coords);
|
||||||
|
|
||||||
if(minRank == xRank) {
|
if(minRank == xRank) {
|
||||||
for (uint i = 0, j = 0; i < xRank; ++i) {
|
for (uint i = 0, j = 0; i < xRank; ++i) {
|
||||||
|
@ -146,20 +146,20 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
|
||||||
else // minRank = numDims = 1 in this case
|
else // minRank = numDims = 1 in this case
|
||||||
coords[0] = coords[dims[0]];
|
coords[0] = coords[dims[0]];
|
||||||
|
|
||||||
const auto meanOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(meanShapeInfo)), shape::stride(const_cast<Nd4jLong*>(meanShapeInfo)), coords, minRank);
|
const auto meanOffset = shape::getOffset(meanShapeInfo, coords);
|
||||||
const auto varianceOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(varianceShapeInfo)), shape::stride(const_cast<Nd4jLong*>(varianceShapeInfo)), coords, minRank);
|
const auto varianceOffset = shape::getOffset(varianceShapeInfo, coords);
|
||||||
|
|
||||||
T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt<T, T>(variance[varianceOffset] + epsilon);
|
T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt<T, T>(variance[varianceOffset] + epsilon);
|
||||||
|
|
||||||
if(gamma != nullptr) {
|
if(gamma != nullptr) {
|
||||||
const auto gammaOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(gammaShapeInfo)), shape::stride(const_cast<Nd4jLong*>(gammaShapeInfo)), coords, minRank);
|
const auto gammaOffset = shape::getOffset(gammaShapeInfo, coords);
|
||||||
sigmaInvGam *= gamma[gammaOffset];
|
sigmaInvGam *= gamma[gammaOffset];
|
||||||
}
|
}
|
||||||
|
|
||||||
z[zOffset] = (x[xOffset] - mean[meanOffset]) * sigmaInvGam;
|
z[zOffset] = (x[xOffset] - mean[meanOffset]) * sigmaInvGam;
|
||||||
|
|
||||||
if(beta != nullptr) {
|
if(beta != nullptr) {
|
||||||
const auto betaOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(betaShapeInfo)), shape::stride(const_cast<Nd4jLong*>(betaShapeInfo)), coords, minRank);
|
const auto betaOffset = shape::getOffset(betaShapeInfo, coords);
|
||||||
z[zOffset] += beta[betaOffset];
|
z[zOffset] += beta[betaOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// Created by Yurii Shyrma on 11.12.2017
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include<cmath>
|
#include<cmath>
|
||||||
|
@ -117,10 +117,10 @@ __global__ void betaIncForArrayCuda(const void* va, const Nd4jLong* aShapeInfo,
|
||||||
|
|
||||||
Nd4jLong len = shape::length(xShapeInfo);
|
Nd4jLong len = shape::length(xShapeInfo);
|
||||||
|
|
||||||
const T a = *(reinterpret_cast<const T*>(va) + shape::getIndexOffset(j, aShapeInfo, len));
|
const T a = *(reinterpret_cast<const T*>(va) + shape::getIndexOffset(j, aShapeInfo));
|
||||||
const T b = *(reinterpret_cast<const T*>(vb) + shape::getIndexOffset(j, bShapeInfo, len));
|
const T b = *(reinterpret_cast<const T*>(vb) + shape::getIndexOffset(j, bShapeInfo));
|
||||||
const T x = *(reinterpret_cast<const T*>(vx) + shape::getIndexOffset(j, xShapeInfo, len));
|
const T x = *(reinterpret_cast<const T*>(vx) + shape::getIndexOffset(j, xShapeInfo));
|
||||||
T& z = *(reinterpret_cast<T*>(vz) + shape::getIndexOffset(j, zShapeInfo, len));
|
T& z = *(reinterpret_cast<T*>(vz) + shape::getIndexOffset(j, zShapeInfo));
|
||||||
|
|
||||||
// t^{n-1} * (1 - t)^{n-1} is symmetric function with respect to x = 0.5
|
// t^{n-1} * (1 - t)^{n-1} is symmetric function with respect to x = 0.5
|
||||||
if(a == b && x == static_cast<T>(0.5)) {
|
if(a == b && x == static_cast<T>(0.5)) {
|
||||||
|
|
|
@ -68,7 +68,7 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
|
||||||
const auto len = shape::length(imShapeBuffer);
|
const auto len = shape::length(imShapeBuffer);
|
||||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close))
|
||||||
for (int i = 0; i < len; i++)
|
for (int i = 0; i < len; i++)
|
||||||
imBuff[shape::getIndexOffset(i, imShapeBuffer, len)] = static_cast<T>(0.f);
|
imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast<T>(0.f);
|
||||||
}
|
}
|
||||||
|
|
||||||
T *col, *im;
|
T *col, *im;
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue