[WIP] Shyrma coords (#305)

* - provide faster index2coords function for cpu

Signed-off-by: Yurii <iuriish@yahoo.com>

* - new faster index2coords function is introduced into cpu code

Signed-off-by: Yurii <iuriish@yahoo.com>

* - replace long long coordinates with int coordinates

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add missed reload of coords2index function

Signed-off-by: Yurii <iuriish@yahoo.com>

* - reststart  jenkins

Signed-off-by: Yurii <iuriish@yahoo.com>

* - rollback changes in convolutions.cu and addBias.cu

Signed-off-by: Yurii <iuriish@yahoo.com>
master
Yurii Shyrma 2020-03-11 15:21:59 +02:00 committed by GitHub
parent 50b7d82b96
commit 58550b7c98
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
43 changed files with 359 additions and 273 deletions

View File

@ -95,22 +95,29 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK], temp;
for (auto i = start; i < stop; i++) {
shape::index2coords(i, target.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, target.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
// if( (row + upper < col) || (row + lower > col) )
if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
z[zOffset] = value;
else if (this != &target) { // when this and target are different arrays
if (xRank != zRank)
if (xRank != zRank) {
temp = coords[0];
coords[0] = coords[1];
}
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords);
z[zOffset] = x[xOffset];
if (xRank != zRank) // restore first coordinate
coords[0] = temp;
}
}
};
@ -376,12 +383,16 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
// loop through input array
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
int coords[MAX_RANK], temp;
for (auto i = start; i < stop; i++) {
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
temp = coords[axis];
if (repSize > 1) {
for (uint j = 0; j < repSize; ++j) {
coords[axis] -= repeats[j];
@ -394,6 +405,8 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
coords[axis] /= repeats[0];
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
coords[axis] = temp;
}
};

View File

@ -85,12 +85,12 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
const auto x = reinterpret_cast<const T*>(vx);
auto z = reinterpret_cast<T*>(vz);
__shared__ int zRank, xRank, areSameOffsets; // xRank == zRank always, except when xRank = 1, in this case zRank = 2
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
__shared__ int zRank, xRank, areSameOffsets, *sharedMem; // xRank == zRank always, except when xRank = 1, in this case zRank = 2
__shared__ Nd4jLong zLen, totalThreads; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
xRank = shape::rank(xShapeInfo);
zRank = shape::rank(zShapeInfo);
@ -137,7 +137,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * target.rankOf() + 128;
PointersManager manager(getContext(), "NDArray::fillAsTriangular");
@ -155,12 +155,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
auto x = reinterpret_cast<T*>(vx);
__shared__ int rank;
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong len, totalThreads; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(xShapeInfo);
len = shape::length(xShapeInfo);
totalThreads = gridDim.x * blockDim.x;
@ -201,7 +201,7 @@ void NDArray::setIdentity() {
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(decltype(getShapeInfo())) * rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * rankOf() + 128;
PointersManager manager(getContext(), "NDArray::setIdentity");
@ -398,13 +398,13 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
const X* x = reinterpret_cast<const X*>(vx);
Z* z = reinterpret_cast<Z*>(vz);
__shared__ int rank;
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen = zLen
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong zLen, totalThreads; // xLen = zLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(zShapeInfo); // xRank = zRank
zLen = shape::length(zShapeInfo); // xLen <= zLen
@ -460,7 +460,7 @@ NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
const int sharedMem = output.rankOf() * sizeof(int) * threadsPerBlock + 128;
PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
@ -484,7 +484,7 @@ void NDArray::repeat(const int axis, const std::vector<int>& repeats, NDArray& t
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
const int sharedMem = target.rankOf() * sizeof(int) * threadsPerBlock + 128;
PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");

View File

@ -46,7 +46,7 @@ namespace sd {
int totalIterations = 1;
// hehe
Nd4jLong xCoords[MAX_RANK];
int xCoords[MAX_RANK];
Nd4jLong xShape[MAX_RANK];
int xRank = _spaces.size();

View File

@ -63,7 +63,7 @@ static void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
for (auto i = start; i < stop; ++i) {
// evaluate C coordinates
shape::index2coords(i, cShapeInfo, cCoords.data());
shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
// evaluate A coordinates
aCoords[aMaxis] = cCoords[cMaxis];
@ -433,12 +433,12 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
auto func = PRAGMA_THREADS_FOR {
std::vector<Nd4jLong> aCoords(aRank), bCoords(bRank), cCoords(cRank);
std::vector<int> aCoords(aRank), bCoords(bRank), cCoords(cRank);
for (auto i = start; i < stop; ++i) {
// evaluate C coordinates
shape::index2coords(i, cShapeInfo, cCoords.data());
shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
// calculate index of current batch
Nd4jLong batchInd;

View File

@ -40,15 +40,15 @@ static __global__ void usualCudaGemm(const void* vA, const Nd4jLong* aShapeInfo,
const T2* B = reinterpret_cast<const T2*>(vB);
T3* C = reinterpret_cast< T3*>(vC);
__shared__ int K;
__shared__ int K, *coords;
__shared__ bool betaPresent;
__shared__ Nd4jLong cLen, totalThreads, *coords;
__shared__ Nd4jLong cLen, totalThreads;
__shared__ T3 alphaZ, betaZ;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coords = reinterpret_cast<Nd4jLong*>(shmem);
coords = reinterpret_cast<int*>(shmem);
cLen = shape::length(cShapeInfo);
K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@ -263,7 +263,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 6 + 128; // 6 = aRank + bRank + cRank
const int sharedMem = threadsPerBlock * sizeof(int) * 6 + 128; // 6 = aRank + bRank + cRank
NDArray::prepareSpecialUse({C}, {A, B});
// BUILD_TRIPLE_SELECTOR(aType, bType, cType, usualGemm, (blocksPerGrid, threadsPerBlock, sharedMem, stream, A->getSpecialBuffer(), A->getSpecialShapeInfo(), B->getSpecialBuffer(), B->getSpecialShapeInfo(), C->getSpecialBuffer(), C->getSpecialShapeInfo(), 0, 1, 0, 1, 0, 1, alpha, beta), NUMERIC_TYPES, NUMERIC_TYPES, FLOAT_TYPES);
@ -529,14 +529,14 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
T3* C = reinterpret_cast< T3*>(vC);
__shared__ bool betaPresent;
__shared__ int aRank, bRank, cRank, K;
__shared__ Nd4jLong cLen, totalThreads, *coords;
__shared__ int aRank, bRank, cRank, K, *coords;
__shared__ Nd4jLong cLen, totalThreads;
__shared__ T3 alphaZ, betaZ;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coords = reinterpret_cast<Nd4jLong*>(shmem);
coords = reinterpret_cast<int*>(shmem);
cLen = shape::length(cShapeInfo);
K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@ -649,7 +649,7 @@ NDArray* MmulHelper::mmulNxN(const NDArray* A, const NDArray* B, NDArray* C, con
const int threadsPerBlock = MAX_NUM_THREADS / 8;
const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * (aRank + bRank + cRank) + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * (aRank + bRank + cRank) + 128;
PointersManager manager(A->getContext(), "MmulHelper::mmulNxN");

View File

@ -306,7 +306,6 @@ std::vector<Nd4jLong> ShapeUtils::evalRepeatShape(int axis, const std::vector<in
if(repeats.size() == 1)
outShape[axis] *= repeats[0];
else
outShape[axis] = std::accumulate(repeats.begin(), repeats.end(), 0);

View File

@ -915,12 +915,14 @@ namespace shape {
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, uint *coords);
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords);
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, int *coords);
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords);
/**
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
*/
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims);
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims);
/**
* Convert coordinates to the corresponding linear index (sequence number in other words)
@ -929,11 +931,11 @@ namespace shape {
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords);
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords);
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *coords);
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords);
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *coords);
/**
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
*/
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims);
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims);
/**
* increment n-dimensional array by one iteration by changing coord appropriately
@ -988,17 +990,17 @@ namespace shape {
// function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
// dimsToExclude - should be sorted in increasing order
// dimsLen - length of dimsToExclude, if not set (= -1), then it is calculated as maxRank - minRank
ND4J_EXPORT _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
ND4J_EXPORT _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
// calculate indexes of max-array, these output indexes correspond to one minIdx index of min-array which is sub-array of max-array
// dimsToExclude - should be sorted in increasing order
ND4J_EXPORT _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
ND4J_EXPORT _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
// calculate offsets of max-array, these offsets correspond to one minIdx index of min-array which is sub-array of max-array
// maxOffsets - will contain calculated offsets of max-array, buffer for maxOffsets should be allocated beforehand
// dimsToExclude - should be sorted in increasing order
// memBuff - auxiliary memory buffer (size = 2 * max_rank) for coordinates and increments storing, should be allocated beforehand
ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude = nullptr);
ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude = nullptr);
// calculates offsets for entities (elements or sub-arrays), shape in context of sub-array means dimensions excluded from outer array
// rank is equal to size of shape
@ -1064,7 +1066,7 @@ namespace shape {
* get stride over contiguous axis (contiguous axis must have stride = 1)
* for example when inShapeInfo is {4, 2,5,4,3, 60,1,5,20, 16384,0,99} then output is 5 (that is smallest stride in inShapeInfo except those equal to 1)
*/
INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
@ -1832,7 +1834,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *
}
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) {
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *indices) {
Nd4jLong index, shift = 1;;
@ -1845,7 +1847,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape,
return index;
}
INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims) {
INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims) {
Nd4jLong index, shift = 1;;
@ -4276,7 +4278,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap
// max array is outer for min array, min array is sub-array of max array
// function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
INLINEDEF _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
const auto maxRank = shape::rank(maxShapeInfo);
const auto minRank = shape::rank(minShapeInfo);
@ -4362,10 +4364,10 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
Nd4jLong maxIdxs[MAX_RANK];
int maxIdxs[MAX_RANK];
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
Nd4jLong minIdxs[MAX_RANK];
int minIdxs[MAX_RANK];
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
return shape::coords2index(minShapeInfo, minIdxs);
@ -4374,17 +4376,17 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
Nd4jLong maxIdxs[MAX_RANK];
int maxIdxs[MAX_RANK];
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
Nd4jLong minIdxs[MAX_RANK];
int minIdxs[MAX_RANK];
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
return getOffset(minShapeInfo, minIdxs);
}
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude) {
INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude) {
const auto rankMin = shape::rank(minShapeInfo);
const auto rankMax = shape::rank(maxShapeInfo);
@ -4394,8 +4396,8 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
const auto diff = rankMax - rankMin; // the size of dimsToExclude is equal to diff
Nd4jLong* indices = memBuff;
Nd4jLong* increment = memBuff + rankMax;
int* indices = memBuff;
int* increment = memBuff + rankMax;
int N, minI, maxI;
@ -4457,7 +4459,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
}
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
INLINEDEF _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
const auto rankMin = shape::rank(minShapeInfo);
const auto rankMax = shape::rank(maxShapeInfo);
@ -4469,9 +4471,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
const auto diff = rankMax - rankMin; // the size of dimsToExclude is equal to diff
Nd4jLong buffer[MAX_RANK];
Nd4jLong* indices = buffer;
Nd4jLong* increment = buffer + MAX_RANK/2;
int indices[MAX_RANK], increment[MAX_RANK];
int N, minI, maxI;
@ -4886,7 +4886,7 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
}
//////////////////////////////////////////////////////////////////////
INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {
for(uint i = dimsSize - 1; i > 0; --i) {
coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
@ -4895,6 +4895,34 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,
coords[tadDims[0]] = index; // last iteration
}
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {
if(startIndex == index) {
shape::index2coords(index, shapeInfo, coords);
}
else {
int axis = shapeInfo[0] - 1;
while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
coords[axis--] = 0;
++coords[axis];
}
}
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords) {
if(startIndex == index) {
shape::index2coords(index, shapeInfo, coords);
}
else {
int axis = shapeInfo[0] - 1;
while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
coords[axis--] = 0;
++coords[axis];
}
}
//////////////////////////////////////////////////////////////////////
// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
@ -5131,23 +5159,23 @@ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo,
//////////////////////////////////////////////////////////////////////
INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
Nd4jLong result = 9223372036854775807LL;
// Nd4jLong result = 9223372036854775807LL;
for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
// for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
const auto currentStride = shape::stride(inShapeInfo)[i];
// const auto currentStride = shape::stride(inShapeInfo)[i];
if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
continue;
// if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
// continue;
if(result > currentStride)
result = currentStride;
}
// if(result > currentStride)
// result = currentStride;
// }
return result == 9223372036854775807LL ? 1 : result;
}
// return result == 9223372036854775807LL ? 1 : result;
// }

View File

@ -739,11 +739,11 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
auto func = PRAGMA_THREADS_FOR{
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (auto i = start; i < stop; ++i) {
shape::index2coords(i, zShapeInfo, zCoords);
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
for (uint j = 0; j < rank; ++j) {
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

View File

@ -449,11 +449,11 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto func = PRAGMA_THREADS_FOR{
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (auto i = start; i < stop; ++i) {
shape::index2coords(i, zShapeInfo, zCoords);
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
for (uint j = 0; j < rank; ++j) {
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

View File

@ -609,11 +609,11 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
auto func = PRAGMA_THREADS_FOR{
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (auto i = start; i < stop; ++i) {
shape::index2coords(i, zShapeInfo, zCoords);
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
for (uint j = 0; j < rank; ++j) {
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

View File

@ -275,7 +275,7 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

View File

@ -291,7 +291,7 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

View File

@ -271,7 +271,7 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

View File

@ -137,7 +137,7 @@ namespace sd {
T *dataTAD = currentData + inputOffset;
T *resultTAD = result + resultOffset;
Nd4jLong sub[MAX_RANK];
int sub[MAX_RANK];
shape::index2coords(arrOffset, zTadShape, sub);
@ -166,7 +166,7 @@ namespace sd {
auto dataTAD = currentData + inputOffset;
auto resultTAD = result + resultOffset;
Nd4jLong sub[MAX_RANK];
int sub[MAX_RANK];
shape::index2coords(arrOffset, zTadShape, sub);
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
@ -199,7 +199,7 @@ namespace sd {
resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
}
} else {
Nd4jLong yIdx[MAX_RANK];
int yIdx[MAX_RANK];
auto yRank = shape::rank(currentTad);
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
@ -214,8 +214,8 @@ namespace sd {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch C; yLength: %i;\n", yLength);
Nd4jLong zIdx[MAX_RANK];
Nd4jLong yIdx[MAX_RANK];
int zIdx[MAX_RANK];
int yIdx[MAX_RANK];
auto yRank = shape::rank(currentTad);
auto tadRank = shape::rank(zTadShape);

View File

@ -39,8 +39,7 @@ namespace sd {
delim->syncToHost();
// output rank N+1 wrt input rank
std::vector<Nd4jLong> ocoords(input->rankOf() + 1);
std::vector<Nd4jLong> icoords(input->rankOf());
std::vector<int> icoords(input->rankOf());
// getting buffer lengths
// FIXME: it'll be bigger, since it'll include delimiters,
@ -54,7 +53,7 @@ namespace sd {
auto s = input->e<std::string>(e);
// getting base index
shape::index2coords(e, input->shapeInfo(), icoords.data());
shape::index2coordsCPU(0, e, input->shapeInfo(), icoords.data());
// getting number of substrings
auto cnt = StringUtils::countSubarrays(s.c_str(), s.length(), d.c_str(), d.length()) + 1;

View File

@ -64,7 +64,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
Nd4jLong* xOffsets = new Nd4jLong[steps];
Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
int* auxBuff = new int[2 * input->rankOf()];
for (Nd4jLong j = 0; j < lenSmall; ++j) {
@ -139,40 +139,42 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int xzCoords[MAX_RANK], minCoords[MAX_RANK];
for (uint i = 0, j = 0; i < xRank; ++i)
if(j < numAxes && i != axes[j])
minCoords[i] = 0;
else
++j;
for (auto i = start; i < stop; i++) {
shape::index2coords(i, input->getShapeInfo(), coords);
shape::index2coordsCPU(start, i, input->getShapeInfo(), xzCoords);
const auto xOffset = shape::getOffset(input->getShapeInfo(), coords);
const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), coords);
const auto xOffset = shape::getOffset(input->getShapeInfo(), xzCoords);
const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), xzCoords);
if(minRank == xRank) {
for (uint i = 0, j = 0; i < xRank; ++i) {
if(j < numAxes && i != axes[j])
coords[i] = 0;
else
++j;
}
for (uint j = 0; j < numAxes; ++j)
minCoords[axes[j]] = xzCoords[axes[j]];
}
else // minRank = numAxes = 1 in this case
coords[0] = coords[axes[0]];
minCoords[0] = xzCoords[axes[0]];
const auto meanOffset = shape::getOffset(mean->getShapeInfo(), coords);
const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords);
const auto meanOffset = shape::getOffset(mean->getShapeInfo(), minCoords);
const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), minCoords);
T sigmaInvGam = 1. / sd::math::nd4j_sqrt<T, T>(v[varianceOffset] + epsilon);
if(g != nullptr) {
const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), coords);
const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), minCoords);
sigmaInvGam *= g[gammaOffset];
}
z[zOffset] = (x[xOffset] - m[meanOffset]) * sigmaInvGam;
if(b != nullptr) {
const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), coords);
const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), minCoords);
z[zOffset] += b[betaOffset];
}
}
@ -184,7 +186,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
//////////////////////////////////////////////////////////////////////////
void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* variance, const NDArray* gamma, const NDArray* beta, NDArray* output, const std::vector<int>& axes, const double epsilon) {
// batchnorm2_ is slower
// batchnorm2_ is still slower ?
BUILD_SINGLE_SELECTOR(input->dataType(), batchnorm_, (input, mean, variance, gamma, beta, output, axes, epsilon), FLOAT_TYPES);
}

View File

@ -51,19 +51,19 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
auto func = PRAGMA_THREADS_FOR{
Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
const auto xOffset1 = xOffset0 + input.strideAt(dimC);
const auto xOffset2 = xOffset1 + input.strideAt(dimC);
z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
}
};
int coords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
const auto xOffset1 = xOffset0 + input.strideAt(dimC);
const auto xOffset2 = xOffset1 + input.strideAt(dimC);
z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
}
};
samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
return;
samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
return;
}
void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) {
@ -78,9 +78,9 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
const int rank = input.rankOf();
bool bSimple = (dimC == rank - 1 && 'c' == input.ordering() && 1 == input.ews() &&
'c' == output.ordering() && 1 == output.ews());
if (bSimple) {
auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) {
op(x[i], x[i + 1], x[i + 2], z[i], z[i + 1], z[i + 2]);
@ -177,12 +177,12 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
const T* x = input->bufferAsT<T>();
T* z = output->bufferAsT<T>();
// TODO: Use tensordot or other optimizied helpers to see if we can get better performance.
// TODO: Use tensordot or other optimizied helpers to see if we can get better performance.
if (dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') {
auto func = PRAGMA_THREADS_FOR{
for (auto i = start; i < stop; i += increment) {
for (auto i = start; i < stop; i += increment) {
//simple M*v //tr.T*v.T // v * tr //rule: (AB)' =B'A'
// v.shape (1,3) row vector
T x0, x1, x2;
@ -192,7 +192,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
z[i] = x0 * tr[0][0] + x1 * tr[1][0] + x2 * tr[2][0];
z[i+1] = x0 * tr[0][1] + x1 * tr[1][1] + x2 * tr[2][1];
z[i+2] = x0 * tr[0][2] + x1 * tr[1][2] + x2 * tr[2][2];
}
};

View File

@ -49,9 +49,12 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
const auto xLen = input.lengthOf();
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
for (Nd4jLong i = 0; i < xLen; ++i) {
shape::index2coords(i, xShapeInfo, coords);
shape::index2coordsCPU(start, i, xShapeInfo, coords);
const auto xOffset = shape::getOffset(xShapeInfo, coords);
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords);

View File

@ -113,18 +113,23 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
// loop through input array
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int zCoords[MAX_RANK], xCoords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
memcpy(xCoords, zCoords, rank * sizeof(int));
// evaluate spatial coordinates for x
for (uint j = 1; j <= numOfSpatialDims; ++j)
coords[j] += crop.e<uint>(j - 1, 0); // add crop left
xCoords[j] += crop.e<uint>(j - 1, 0); // add crop left
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
z[zOffset] = x[xOffset];
}
};
@ -299,11 +304,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
// loop through output array
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
int zCoords[MAX_RANK], xCoords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
memcpy(xCoords, zCoords, rank * sizeof(int));
bool within = true;
@ -312,16 +322,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
const auto padLeft = padding.e<uint>(j - 1, 0);
const auto padRight = padding.e<uint>(j - 1, 1);
within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
within &= zCoords[j] >= padLeft && zCoords[j] < output.sizeAt(j) - padRight;
if (!within)
break;
coords[j] -= padLeft; // get coordinates for x
xCoords[j] = zCoords[j] - padLeft; // get coordinates for x
}
if (within)
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
else
z[zOffset] = 0.f;
}

View File

@ -43,11 +43,11 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
auto func = PRAGMA_THREADS_FOR {
Nd4jLong xCoords[MAX_RANK];
int xCoords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, xShapeInfo, xCoords);
shape::index2coordsCPU(start, i, xShapeInfo, xCoords);
const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];

View File

@ -96,14 +96,17 @@ namespace helpers {
auto func = PRAGMA_THREADS_FOR{
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK], temp;
for (auto i = start; i < stop; i += increment) {
shape::index2coords(i, input.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
uint outArrIdx = 0;
temp = coords[axis];
while (coords[axis] >= zDim) {
coords[axis] -= zDim;
++outArrIdx;
@ -112,6 +115,8 @@ namespace helpers {
T* z = outArrs[outArrIdx]->bufferAsT<T>();
const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
z[zOffset] = xBuff[xOffset];
coords[axis] = temp;
}
};

View File

@ -188,24 +188,35 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
const T padVal = padValue.e<T>(0);
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int zCoords[MAX_RANK], xCoords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
memcpy(xCoords, zCoords, rank * sizeof(int));
bool within = true;
for (int j = rankMinusOne; j >= 0; --j) {
if (xShape[j] == zShape[j]) continue;
if (xShape[j] == zShape[j])
continue;
const auto left = paddings.e<Nd4jLong>(j, 0);
if (coords[j] < left || coords[j] >= left + xShape[j]) {
if (zCoords[j] < left || zCoords[j] >= left + xShape[j]) {
within = false;
break;
}
else { coords[j] = coords[j] - left; }
else
xCoords[j] = zCoords[j] - left;
}
if (within)
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
else
z[zOffset] = padVal;
}
@ -219,20 +230,30 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
const Nd4jLong shift2 = mode == 1 ? 2 : 1; // REFLECT : SYMMETRIC
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int zCoords[MAX_RANK], xCoords[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
memcpy(xCoords, zCoords, rank * sizeof(int));
for (int j = rankMinusOne; j >= 0; --j) {
if (xShape[j] == zShape[j]) continue;
coords[j] = coords[j] - paddings.e<Nd4jLong>(j, 0); // are ready to fill middle (within input dimension range)
if (coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left
else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
if (xShape[j] == zShape[j])
continue;
xCoords[j] = zCoords[j] - paddings.e<Nd4jLong>(j, 0); // are ready to fill middle (within input dimension range)
if (xCoords[j] < 0)
xCoords[j] = -xCoords[j] - shift1; // means fill from left
else if (xCoords[j] >= xShape[j])
xCoords[j] = 2 * xShape[j] - xCoords[j] - shift2; // means fill from right
}
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
z[zOffset] = x[xOffset];
}
};
@ -562,45 +583,37 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
const Nd4jLong zLen = output.lengthOf();
const int yLastDim = indices.sizeAt(-1);
const uint yLastDim = indices.sizeAt(-1);
const int diff = zRank - xRank;
const bool bEqual = yLastDim == xRank;
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK * 3];
int xCoords[MAX_RANK], zCoords[MAX_RANK], temp;
for (auto i = start; i < stop; i++) {
Nd4jLong *zCoordStart, *xCoordStart;
if (yLastDim == xRank) {
zCoordStart = coords;
xCoordStart = coords;
} else if (zRank >= xRank) {
zCoordStart = coords;
xCoordStart = coords + zRank - xRank;
} else {
zCoordStart = coords + xRank - zRank;
xCoordStart = coords;
}
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
shape::index2coords(i, output.getShapeInfo(), zCoordStart);
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
temp = zCoords[yRank - 1];
zCoords[yRank - 1] = 0;
const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoords);
zCoords[yRank - 1] = temp;
// last y coordinate
uint coordToRestore;
if (yLastDim != xRank)
coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
if(bEqual)
memcpy(xCoords, zCoords, zRank * sizeof(int));
else if(diff >= 0)
memcpy(xCoords, zCoords + diff, xRank * sizeof(int));
else
memcpy(xCoords - diff, zCoords, zRank * sizeof(int));
zCoordStart[yRank - 1] = 0;
const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
for (uint j = 0; j < yLastDim; ++j)
xCoords[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
//restore z coordinate
if (yLastDim != xRank)
zCoordStart[yRank - 1] = coordToRestore;
// construct coordinates for x
for (int j = 0; j < yLastDim; ++j)
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
z[zOffset] = x[xOffset];
}
@ -1188,10 +1201,12 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
else {
auto func = PRAGMA_THREADS_FOR {
Nd4jLong inIdx[MAX_RANK];
Nd4jLong outIdx[MAX_RANK];
int inIdx[MAX_RANK], outIdx[MAX_RANK];
for (auto i = start; i < stop; i++) {
shape::index2coords(i, output.getShapeInfo(), outIdx);
shape::index2coordsCPU(start, i, output.getShapeInfo(), outIdx);
for (int j = 0; j < rank; ++j) {
const Nd4jLong inLen = input.sizeAt(j);

View File

@ -52,7 +52,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
for (int i = tid; i < xzLen; i += blockDim.x * gridDim.x) {
shape::index2coords(i, xShapeInfo, coords);
@ -124,7 +124,7 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
for (int i = tid; i < inLen; i += totalThreads) {
shape::index2coords(i, inShapeInfo, coords);

View File

@ -45,7 +45,7 @@ __global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
X* z = reinterpret_cast<X*>(vz);
__shared__ int rank, channelPosition, posOfNonUnityDim;
__shared__ Nd4jLong *sharedMem, len;
__shared__ Nd4jLong len, *sharedMem;
__shared__ bool xzSameOffsets, xzAreSame;
if (threadIdx.x == 0) {
@ -130,7 +130,7 @@ void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bia
FLOAT_TYPES, FLOAT_TYPES);
} else {
// default case
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;

View File

@ -124,7 +124,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
}
__syncthreads();
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -51,7 +51,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
for (uint64_t i = tid; i < zLen; i += totalThreads) {
shape::index2coords(i, zShapeInfo, coords);

View File

@ -706,7 +706,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
T* z = reinterpret_cast<T*>(vz);
__shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -858,7 +858,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
Nd4jLong coord2, coord3;
__shared__ int rank, kHeff, kWeff, iH, iW, kProd;
__shared__ Nd4jLong *sharedMem, yLen;
__shared__ Nd4jLong yLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -1017,7 +1017,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
Nd4jLong coord2, coord3, coord4;
__shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
__shared__ Nd4jLong *sharedMem, yLen;
__shared__ Nd4jLong yLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -1342,7 +1342,7 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
T* z = reinterpret_cast<T*>(vz);
__shared__ int rank, dimIH;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -1410,7 +1410,7 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
T* z = reinterpret_cast<T*>(vz);
__shared__ int rank, dimID;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -1480,7 +1480,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
__shared__ int rank, dimIH;
__shared__ uint factorH, factorW;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
@ -1554,7 +1554,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
__shared__ int rank, dimID;
__shared__ uint factorD, factorH, factorW;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];

View File

@ -36,8 +36,8 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
__shared__ const T* x;
__shared__ const T* y;
__shared__ T* z;
__shared__ int rank;
__shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong lenWithoutLastDim, totalThreads;
if (threadIdx.x == 0) {
x = reinterpret_cast<const T*>(vx);
@ -45,7 +45,7 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
z = reinterpret_cast<T*>(vz);
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
totalThreads = gridDim.x * blockDim.x;
rank = shape::rank(xShapeInfo);
@ -106,7 +106,7 @@ void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = sizeof(Nd4jLong) * threadsPerBlock * x->rankOf() + 128;
const int sharedMem = sizeof(int) * threadsPerBlock * x->rankOf() + 128;
PointersManager manager(context, "cross");

View File

@ -43,13 +43,13 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
const X* y = reinterpret_cast<const X*>(vy);
Z* z = reinterpret_cast<Z*>(vz);
__shared__ int xzRank, yRank;
__shared__ int xzRank, yRank, *sharedMem;
__shared__ uint iH, iW, kH, kW;
__shared__ Nd4jLong *sharedMem, zLen;
__shared__ Nd4jLong zLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
zLen = shape::length(zShapeInfo);
@ -119,7 +119,7 @@ void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, ND
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(Nd4jLong) * threadsPerBlock + 128;
const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(int) * threadsPerBlock + 128;
NDArray::prepareSpecialUse({output}, {input, weights});
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2dCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), weights->getSpecialBuffer(), weights->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), sH, sW, pH, pW, dH, dW), FLOAT_TYPES);

View File

@ -27,7 +27,7 @@ namespace sd {
template <typename T>
void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {
Nd4jLong xCoord[MAX_RANK];
int xCoord[MAX_RANK];
// each block of threads works on 1 input array
for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {

View File

@ -40,12 +40,12 @@ __global__ static void im2colCuda(const void *image, void *columns,
const auto im = reinterpret_cast<const T*>(image);
auto col = reinterpret_cast<T*>(columns);
__shared__ Nd4jLong colLen, *sharedMem, iH, iW;
__shared__ int imRank, colRank;
__shared__ Nd4jLong colLen, iH, iW;
__shared__ int imRank, colRank, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
colRank = 6;
imRank = 4;
@ -81,7 +81,7 @@ __global__ static void im2colCuda(const void *image, void *columns,
//////////////////////////////////////////////////////////////////////////
template <typename T>
static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) {
im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(Nd4jLong) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
}
//////////////////////////////////////////////////////////////////////////

View File

@ -149,19 +149,19 @@ __global__ void rgbToGrsCuda(const void *vx, const Nd4jLong *xShapeInfo, void *v
const auto x = reinterpret_cast<const T*>(vx);
auto z = reinterpret_cast<T*>(vz);
__shared__ Nd4jLong zLen, *sharedMem;
__shared__ int rank; // xRank == zRank
__shared__ Nd4jLong zLen;
__shared__ int rank, *sharedMem; // xRank == zRank
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
zLen = shape::length(zShapeInfo);
rank = shape::rank(zShapeInfo);
}
__syncthreads();
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
auto coords = sharedMem + threadIdx.x * rank;
for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {
@ -197,7 +197,7 @@ void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray&
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
const int sharedMem = input.rankOf() * sizeof(int) * threadsPerBlock + 128;
NDArray::prepareSpecialUse({&output}, {&input});
BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrsCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), dimC), NUMERIC_TYPES);

View File

@ -39,14 +39,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
const auto y = reinterpret_cast<const T*>(vy);
auto z = reinterpret_cast<T*>(vz);
__shared__ int xRank; // xRank = zRank, xRank = yRank + 1
__shared__ Nd4jLong xLen, *sharedMem; // xLen = zLen
__shared__ int xRank, *sharedMem; // xRank = zRank, xRank = yRank + 1
__shared__ Nd4jLong xLen; // xLen = zLen
__shared__ bool areSameOffsets;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
@ -56,7 +56,7 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
__syncthreads();
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(int) * threadIdx.x) amount of shared memory per each thread
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
@ -86,7 +86,7 @@ void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArr
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * input.rankOf() + 128;
PointersManager manager(context, "matrixSetDiag");

View File

@ -43,12 +43,12 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
const auto x = reinterpret_cast<const T*>(vx);
auto z = reinterpret_cast<T*>(vz);
__shared__ int rank;
__shared__ Nd4jLong zLen, *sharedMem;
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong zLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(zShapeInfo);
zLen = shape::length(zShapeInfo);
@ -103,7 +103,7 @@ void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& out
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
PointersManager manager(context, "batchToSpace");
@ -138,13 +138,13 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape
const auto y = reinterpret_cast<const Y*>(vy);
auto z = reinterpret_cast<X*>(vz);
__shared__ int rank;
__shared__ Nd4jLong zLen, *sharedMem;
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong zLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(zShapeInfo);
zLen = shape::length(zShapeInfo);
@ -234,7 +234,7 @@ void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDAr
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
PointersManager manager(context, "batchToSpaceND");
@ -264,12 +264,12 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
const auto x = reinterpret_cast<const T*>(vx);
auto z = reinterpret_cast<T*>(vz);
__shared__ int rank;
__shared__ Nd4jLong zLen, *sharedMem;
__shared__ int rank, *sharedMem;
__shared__ Nd4jLong zLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(zShapeInfo);
zLen = shape::length(zShapeInfo);
@ -326,7 +326,7 @@ void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& out
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
PointersManager manager(context, "spaceToBatch");
@ -364,13 +364,13 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape
const auto y = reinterpret_cast<const Y*>(vy);
auto z = reinterpret_cast<X*>(vz);
__shared__ int rank; // xRank = zRank, yRank = 2;
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
__shared__ int rank, *sharedMem; // xRank = zRank, yRank = 2;
__shared__ Nd4jLong zLen, totalThreads;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
rank = shape::rank(zShapeInfo);
zLen = shape::length(zShapeInfo);
@ -473,7 +473,7 @@ void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDAr
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
PointersManager manager(context, "spaceToBatchND");

View File

@ -628,12 +628,12 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
auto y = reinterpret_cast<Z*>(vy);
auto z = reinterpret_cast<Z*>(vz);
__shared__ Nd4jLong xLen, *sharedMem;
__shared__ int xRank; // xRank = zRank, yRank = xRank + 1
__shared__ Nd4jLong xLen;
__shared__ int xRank, *sharedMem; // xRank = zRank, yRank = xRank + 1
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
xLen = shape::length(xShapeInfo);
xRank = shape::rank(xShapeInfo);
@ -678,7 +678,7 @@ void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray&
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
const int sharedMem = updates.rankOf() * sizeof(int) * threadsPerBlock + 128;
if(calcGrad) {
NDArray::prepareSpecialUse({&updates}, {&indices});

View File

@ -54,7 +54,7 @@ __global__ static void splitCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
for (uint64_t i = tid; i < xLen; i += totalThreads) {

View File

@ -135,13 +135,13 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
const int rank = 3;
__shared__ int time, K;
__shared__ Nd4jLong len, totalThreads, *sharedMem;
__shared__ int time, K, *sharedMem;
__shared__ Nd4jLong len, totalThreads;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
time = xShapeInfo[1];
K = xShapeInfo[3] / 2;
@ -152,7 +152,7 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
auto coords = sharedMem + threadIdx.x * rank;
if(tid >= len)
return;
@ -245,7 +245,7 @@ void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDAr
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
NDArray::prepareSpecialUse({ht, ct}, {x, &wi, b, c0, mask});
BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ht->specialBuffer(), ht->specialShapeInfo(), ct->specialBuffer(), ct->specialShapeInfo()), FLOAT_TYPES);
@ -340,13 +340,13 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
const int rank = 3;
__shared__ int time, K;
__shared__ Nd4jLong len, totalThreads, *sharedMem;
__shared__ int time, K, *sharedMem;
__shared__ Nd4jLong len, totalThreads;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
time = xShapeInfo[1];
K = xShapeInfo[3] / 2;
@ -358,7 +358,7 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
auto coords = sharedMem + threadIdx.x * rank;
if(tid >= len)
return;
@ -513,7 +513,7 @@ void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDA
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
NDArray::prepareSpecialUse({gradI, &gradWi, &gradBias, gradC0}, {x, &wi, b, c0, ct, gradCt, gradHt, mask});
BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ct->getSpecialBuffer(), ct->getSpecialShapeInfo(), gradHt->getSpecialBuffer(), gradHt->getSpecialShapeInfo(), gradCt->getSpecialBuffer(), gradCt->getSpecialShapeInfo(), gradI->specialBuffer(), gradI->specialShapeInfo(), gradWi.specialBuffer(), gradWi.specialShapeInfo(), gradBias.specialBuffer(), gradBias.specialShapeInfo(), gradC0->specialBuffer(), gradC0->specialShapeInfo()), FLOAT_TYPES);

View File

@ -93,13 +93,13 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
auto z = reinterpret_cast<T*>(vz);
__shared__ T* sharedMem;
__shared__ int xRank, zRank; // xRank = zRank + 2
__shared__ Nd4jLong xLen, zLen, *coordsMem;
__shared__ int xRank, zRank, *coordsMem; // xRank = zRank + 2
__shared__ Nd4jLong xLen, zLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<T*>(shmem);
coordsMem = reinterpret_cast<Nd4jLong*>(shmem + blockDim.x * sizeof(T));
coordsMem = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));
xRank = shape::rank(xShapeInfo);
zRank = shape::rank(zShapeInfo);
@ -109,7 +109,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
}
__syncthreads();
Nd4jLong* coords = coordsMem + threadIdx.x * xRank;
auto coords = coordsMem + threadIdx.x * xRank;
for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix
@ -160,7 +160,7 @@ void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) {
const uint diagLen = input.sizeAt(-1) < input.sizeAt(-2) ? input.sizeAt(-1) : input.sizeAt(-2);
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * (sizeof(Nd4jLong) * input.rankOf() + input.sizeOfT()) + 128;
const int sharedMem = threadsPerBlock * (sizeof(int) * input.rankOf() + input.sizeOfT()) + 128;
NDArray::prepareSpecialUse({&output}, {&input});
BUILD_SINGLE_SELECTOR(input.dataType(), traceCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), diagLen), LIBND4J_TYPES);
@ -177,13 +177,13 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
const auto x = reinterpret_cast<const T*>(vx); // gradO
auto z = reinterpret_cast<T*>(vz); // gradI
__shared__ int rank, areSameOffsets; // xRank = zRank
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen = zLen
__shared__ int rank, areSameOffsets, *sharedMem; // xRank = zRank
__shared__ Nd4jLong len, totalThreads; // xLen = zLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
rank = shape::rank(xShapeInfo);
len = shape::length(zShapeInfo);
@ -221,7 +221,7 @@ void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gra
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * gradO.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * gradO.rankOf() + 128;
PointersManager manager(context, "triuBP");
@ -240,13 +240,13 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
const auto x = reinterpret_cast<const T*>(vx); // gradO
auto z = reinterpret_cast<T*>(vz); // gradI
__shared__ int xRank, zRank; // xRank >= zRank
__shared__ Nd4jLong numOfXOffsets, zLen, totalThreads, *sharedMem; // xLen >= zLen
__shared__ int xRank, zRank, *sharedMem; // xRank >= zRank
__shared__ Nd4jLong numOfXOffsets, zLen, totalThreads; // xLen >= zLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
sharedMem = reinterpret_cast<int*>(shmem);
xRank = shape::rank(zShapeInfo);
zLen = shape::length(zShapeInfo);
@ -289,7 +289,7 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
const int threadsPerBlock = MAX_NUM_THREADS / 4;
const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 2 * gradO.rankOf() + 128;
const int sharedMem = threadsPerBlock * sizeof(int) * 2 * gradO.rankOf() + 128;
PointersManager manager(context, "tileBP");

View File

@ -32,7 +32,7 @@ namespace sd {
auto indices = reinterpret_cast<const I*>(vindices);
auto output = reinterpret_cast<X*>(voutput);
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK];
uint64_t pos = 0;
for (uint64_t e = 0L; e < length; e++) {
// indices come in blocks

View File

@ -29,11 +29,14 @@ namespace sd {
NDArrayList list(0, true);
int cnt = 0;
Nd4jLong idx[MAX_RANK];
int idx[MAX_RANK];
for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
shape::index2coords(e, condition.getShapeInfo(), idx);
shape::index2coordsCPU(0, e, condition.getShapeInfo(), idx);
auto offset = shape::getOffset(condition.getShapeInfo(), idx);
if (condition.e<bool>(offset)) {
auto array = NDArrayFactory::create_('c', {1, condition.rankOf()}, output.dataType(), output.getContext());
for (int f = 0; f < condition.rankOf(); f++)

View File

@ -178,16 +178,18 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
// general case
auto func = PRAGMA_THREADS_FOR {
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK], temp;
for (auto i = start; i < stop; i += increment) {
shape::index2coords(i, output.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
uint inArrIdx = 0;
uint xDim = inArrs[inArrIdx]->sizeAt(axis);
temp = coords[axis];
while (coords[axis] >= xDim) {
coords[axis] -= xDim;
xDim = inArrs[++inArrIdx]->sizeAt(axis);
@ -197,6 +199,8 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
zBuff[zOffset] = x[xOffset];
coords[axis] = temp;
}
};
@ -298,13 +302,15 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
auto func = PRAGMA_THREADS_FOR{
Nd4jLong coords[MAX_RANK];
int coords[MAX_RANK], temp;
for (auto i = start; i < stop; i += increment) {
shape::index2coords(i, input.getShapeInfo(), coords);
shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
uint outArrIdx = 0;
temp = coords[axis];
while (coords[axis] >= zDim) {
coords[axis] -= zDim;
@ -314,6 +320,8 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
T* z = outArrs[outArrIdx]->bufferAsT<T>();
const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
z[zOffset] = xBuff[xOffset];
coords[axis] = temp;
}
};

View File

@ -258,6 +258,7 @@ TEST_F(PlaygroundTests, test_bert_2) {
delete graph;
}
TEST_F(PlaygroundTests, test_one_off_ops_1) {
auto x = NDArrayFactory::create<float>('c', {4, 128, 768});
auto y = NDArrayFactory::create<float>('c', {4, 128, 1});

View File

@ -289,7 +289,7 @@ TEST_F(TadTests, calcOffsets_1) {
TEST_F(TadTests, outerArrayIndexes_1) {
NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32);
Nd4jLong maxIdxs[120];
int maxIdxs[120];
NDArray y1('c', {3,5}, sd::DataType::FLOAT32);
const std::vector<int> dimsToExclude1 = {0,2};