[WIP] Shyrma coords (#305)
* - provide faster index2coords function for CPU Signed-off-by: Yurii <iuriish@yahoo.com> * - new faster index2coords function is introduced into CPU code Signed-off-by: Yurii <iuriish@yahoo.com> * - replace long long coordinates with int coordinates Signed-off-by: Yurii <iuriish@yahoo.com> * - add missing overload of coords2index function Signed-off-by: Yurii <iuriish@yahoo.com> * - restart Jenkins Signed-off-by: Yurii <iuriish@yahoo.com> * - roll back changes in convolutions.cu and addBias.cu Signed-off-by: Yurii <iuriish@yahoo.com> master
parent
50b7d82b96
commit
58550b7c98
|
@ -95,22 +95,29 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
|
|||
|
||||
const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());
|
||||
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
int coords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, target.getShapeInfo(), coords);
|
||||
|
||||
shape::index2coordsCPU(start, i, target.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
|
||||
|
||||
// if( (row + upper < col) || (row + lower > col) )
|
||||
if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
|
||||
z[zOffset] = value;
|
||||
else if (this != &target) { // when this and target are different arrays
|
||||
if (xRank != zRank)
|
||||
if (xRank != zRank) {
|
||||
temp = coords[0];
|
||||
coords[0] = coords[1];
|
||||
}
|
||||
|
||||
const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords);
|
||||
z[zOffset] = x[xOffset];
|
||||
|
||||
if (xRank != zRank) // restore first coordinate
|
||||
coords[0] = temp;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -376,12 +383,16 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
|
|||
|
||||
// loop through input array
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
|
||||
int coords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
|
||||
temp = coords[axis];
|
||||
|
||||
if (repSize > 1) {
|
||||
for (uint j = 0; j < repSize; ++j) {
|
||||
coords[axis] -= repeats[j];
|
||||
|
@ -394,6 +405,8 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
|
|||
coords[axis] /= repeats[0];
|
||||
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
|
||||
|
||||
coords[axis] = temp;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -85,12 +85,12 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
|
|||
const auto x = reinterpret_cast<const T*>(vx);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int zRank, xRank, areSameOffsets; // xRank == zRank always, except when xRank = 1, in this case zRank = 2
|
||||
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
|
||||
__shared__ int zRank, xRank, areSameOffsets, *sharedMem; // xRank == zRank always, except when xRank = 1, in this case zRank = 2
|
||||
__shared__ Nd4jLong zLen, totalThreads; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
xRank = shape::rank(xShapeInfo);
|
||||
zRank = shape::rank(zShapeInfo);
|
||||
|
@ -137,7 +137,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * target.rankOf() + 128;
|
||||
|
||||
PointersManager manager(getContext(), "NDArray::fillAsTriangular");
|
||||
|
||||
|
@ -155,12 +155,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
|
|||
|
||||
auto x = reinterpret_cast<T*>(vx);
|
||||
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong len, totalThreads; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
rank = shape::rank(xShapeInfo);
|
||||
len = shape::length(xShapeInfo);
|
||||
totalThreads = gridDim.x * blockDim.x;
|
||||
|
@ -201,7 +201,7 @@ void NDArray::setIdentity() {
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(decltype(getShapeInfo())) * rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * rankOf() + 128;
|
||||
|
||||
PointersManager manager(getContext(), "NDArray::setIdentity");
|
||||
|
||||
|
@ -398,13 +398,13 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
const X* x = reinterpret_cast<const X*>(vx);
|
||||
Z* z = reinterpret_cast<Z*>(vz);
|
||||
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen = zLen
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong zLen, totalThreads; // xLen = zLen
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
rank = shape::rank(zShapeInfo); // xRank = zRank
|
||||
zLen = shape::length(zShapeInfo); // xLen <= zLen
|
||||
|
@ -460,7 +460,7 @@ NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
const int sharedMem = output.rankOf() * sizeof(int) * threadsPerBlock + 128;
|
||||
|
||||
PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
|
||||
|
||||
|
@ -484,7 +484,7 @@ void NDArray::repeat(const int axis, const std::vector<int>& repeats, NDArray& t
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
const int sharedMem = target.rankOf() * sizeof(int) * threadsPerBlock + 128;
|
||||
|
||||
PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ namespace sd {
|
|||
int totalIterations = 1;
|
||||
|
||||
// hehe
|
||||
Nd4jLong xCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK];
|
||||
Nd4jLong xShape[MAX_RANK];
|
||||
int xRank = _spaces.size();
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ static void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
|
|||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
// evaluate C coordinates
|
||||
shape::index2coords(i, cShapeInfo, cCoords.data());
|
||||
shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
|
||||
|
||||
// evaluate A coordinates
|
||||
aCoords[aMaxis] = cCoords[cMaxis];
|
||||
|
@ -433,12 +433,12 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
||||
std::vector<Nd4jLong> aCoords(aRank), bCoords(bRank), cCoords(cRank);
|
||||
std::vector<int> aCoords(aRank), bCoords(bRank), cCoords(cRank);
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
// evaluate C coordinates
|
||||
shape::index2coords(i, cShapeInfo, cCoords.data());
|
||||
shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
|
||||
|
||||
// calculate index of current batch
|
||||
Nd4jLong batchInd;
|
||||
|
|
|
@ -40,15 +40,15 @@ static __global__ void usualCudaGemm(const void* vA, const Nd4jLong* aShapeInfo,
|
|||
const T2* B = reinterpret_cast<const T2*>(vB);
|
||||
T3* C = reinterpret_cast< T3*>(vC);
|
||||
|
||||
__shared__ int K;
|
||||
__shared__ int K, *coords;
|
||||
__shared__ bool betaPresent;
|
||||
__shared__ Nd4jLong cLen, totalThreads, *coords;
|
||||
__shared__ Nd4jLong cLen, totalThreads;
|
||||
__shared__ T3 alphaZ, betaZ;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
coords = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
coords = reinterpret_cast<int*>(shmem);
|
||||
cLen = shape::length(cShapeInfo);
|
||||
|
||||
K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
|
||||
|
@ -263,7 +263,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 6 + 128; // 6 = aRank + bRank + cRank
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * 6 + 128; // 6 = aRank + bRank + cRank
|
||||
|
||||
NDArray::prepareSpecialUse({C}, {A, B});
|
||||
// BUILD_TRIPLE_SELECTOR(aType, bType, cType, usualGemm, (blocksPerGrid, threadsPerBlock, sharedMem, stream, A->getSpecialBuffer(), A->getSpecialShapeInfo(), B->getSpecialBuffer(), B->getSpecialShapeInfo(), C->getSpecialBuffer(), C->getSpecialShapeInfo(), 0, 1, 0, 1, 0, 1, alpha, beta), NUMERIC_TYPES, NUMERIC_TYPES, FLOAT_TYPES);
|
||||
|
@ -529,14 +529,14 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
|
|||
T3* C = reinterpret_cast< T3*>(vC);
|
||||
|
||||
__shared__ bool betaPresent;
|
||||
__shared__ int aRank, bRank, cRank, K;
|
||||
__shared__ Nd4jLong cLen, totalThreads, *coords;
|
||||
__shared__ int aRank, bRank, cRank, K, *coords;
|
||||
__shared__ Nd4jLong cLen, totalThreads;
|
||||
__shared__ T3 alphaZ, betaZ;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
coords = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
coords = reinterpret_cast<int*>(shmem);
|
||||
cLen = shape::length(cShapeInfo);
|
||||
|
||||
K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
|
||||
|
@ -649,7 +649,7 @@ NDArray* MmulHelper::mmulNxN(const NDArray* A, const NDArray* B, NDArray* C, con
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 8;
|
||||
const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * (aRank + bRank + cRank) + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * (aRank + bRank + cRank) + 128;
|
||||
|
||||
PointersManager manager(A->getContext(), "MmulHelper::mmulNxN");
|
||||
|
||||
|
|
|
@ -306,7 +306,6 @@ std::vector<Nd4jLong> ShapeUtils::evalRepeatShape(int axis, const std::vector<in
|
|||
|
||||
if(repeats.size() == 1)
|
||||
outShape[axis] *= repeats[0];
|
||||
|
||||
else
|
||||
outShape[axis] = std::accumulate(repeats.begin(), repeats.end(), 0);
|
||||
|
||||
|
|
|
@ -915,12 +915,14 @@ namespace shape {
|
|||
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, uint *coords);
|
||||
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords);
|
||||
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, int *coords);
|
||||
|
||||
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
|
||||
ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords);
|
||||
|
||||
/**
|
||||
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
|
||||
*/
|
||||
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims);
|
||||
|
||||
|
||||
ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims);
|
||||
|
||||
/**
|
||||
* Convert coordinates to the corresponding linear index (sequence number in other words)
|
||||
|
@ -929,11 +931,11 @@ namespace shape {
|
|||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *coords);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *coords);
|
||||
/**
|
||||
* take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
|
||||
*/
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims);
|
||||
|
||||
/**
|
||||
* increment n-dimensional array by one iteration by changing coord appropriately
|
||||
|
@ -988,17 +990,17 @@ namespace shape {
|
|||
// function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
|
||||
// dimsToExclude - should be sorted in increasing order
|
||||
// dimsLen - length of dimsToExclude, if not set (= -1), then it is calculated as maxRank - minRank
|
||||
ND4J_EXPORT _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
|
||||
ND4J_EXPORT _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
|
||||
|
||||
// calculate indexes of max-array, these output indexes correspond to one minIdx index of min-array which is sub-array of max-array
|
||||
// dimsToExclude - should be sorted in increasing order
|
||||
ND4J_EXPORT _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
|
||||
ND4J_EXPORT _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
|
||||
|
||||
// calculate offsets of max-array, these offsets correspond to one minIdx index of min-array which is sub-array of max-array
|
||||
// maxOffsets - will contain calculated offsets of max-array, buffer for maxOffsets should be allocated beforehand
|
||||
// dimsToExclude - should be sorted in increasing order
|
||||
// memBuff - auxiliary memory buffer (size = 2 * max_rank) for coordinates and increments storing, should be allocated beforehand
|
||||
ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude = nullptr);
|
||||
ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude = nullptr);
|
||||
|
||||
// calculates offsets for entities (elements or sub-arrays), shape in context of sub-array means dimensions excluded from outer array
|
||||
// rank is equal to size of shape
|
||||
|
@ -1064,7 +1066,7 @@ namespace shape {
|
|||
* get stride over contiguous axis (contiguous axis must have stride = 1)
|
||||
* for example when inShapeInfo is {4, 2,5,4,3, 60,1,5,20, 16384,0,99} then output is 5 (that is smallest stride in inShapeInfo except those equal to 1)
|
||||
*/
|
||||
INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
|
||||
// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
|
||||
|
||||
|
||||
|
||||
|
@ -1832,7 +1834,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) {
|
||||
INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *indices) {
|
||||
|
||||
Nd4jLong index, shift = 1;;
|
||||
|
||||
|
@ -1845,7 +1847,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape,
|
|||
return index;
|
||||
}
|
||||
|
||||
INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims) {
|
||||
INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims) {
|
||||
|
||||
Nd4jLong index, shift = 1;;
|
||||
|
||||
|
@ -4276,7 +4278,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap
|
|||
|
||||
// max array is outer for min array, min array is sub-array of max array
|
||||
// function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
|
||||
INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
|
||||
INLINEDEF _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
|
||||
|
||||
const auto maxRank = shape::rank(maxShapeInfo);
|
||||
const auto minRank = shape::rank(minShapeInfo);
|
||||
|
@ -4362,10 +4364,10 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
|||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
||||
|
||||
Nd4jLong maxIdxs[MAX_RANK];
|
||||
int maxIdxs[MAX_RANK];
|
||||
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
|
||||
|
||||
Nd4jLong minIdxs[MAX_RANK];
|
||||
int minIdxs[MAX_RANK];
|
||||
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
||||
|
||||
return shape::coords2index(minShapeInfo, minIdxs);
|
||||
|
@ -4374,17 +4376,17 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
|||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
|
||||
|
||||
Nd4jLong maxIdxs[MAX_RANK];
|
||||
int maxIdxs[MAX_RANK];
|
||||
shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
|
||||
|
||||
Nd4jLong minIdxs[MAX_RANK];
|
||||
int minIdxs[MAX_RANK];
|
||||
maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
|
||||
|
||||
return getOffset(minShapeInfo, minIdxs);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude) {
|
||||
INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude) {
|
||||
|
||||
const auto rankMin = shape::rank(minShapeInfo);
|
||||
const auto rankMax = shape::rank(maxShapeInfo);
|
||||
|
@ -4394,8 +4396,8 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
|||
|
||||
const auto diff = rankMax - rankMin; // the size of dimsToExclude is equal to diff
|
||||
|
||||
Nd4jLong* indices = memBuff;
|
||||
Nd4jLong* increment = memBuff + rankMax;
|
||||
int* indices = memBuff;
|
||||
int* increment = memBuff + rankMax;
|
||||
|
||||
int N, minI, maxI;
|
||||
|
||||
|
@ -4457,7 +4459,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
|
||||
INLINEDEF _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
|
||||
|
||||
const auto rankMin = shape::rank(minShapeInfo);
|
||||
const auto rankMax = shape::rank(maxShapeInfo);
|
||||
|
@ -4469,9 +4471,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
|
|||
|
||||
const auto diff = rankMax - rankMin; // the size of dimsToExclude is equal to diff
|
||||
|
||||
Nd4jLong buffer[MAX_RANK];
|
||||
Nd4jLong* indices = buffer;
|
||||
Nd4jLong* increment = buffer + MAX_RANK/2;
|
||||
int indices[MAX_RANK], increment[MAX_RANK];
|
||||
|
||||
int N, minI, maxI;
|
||||
|
||||
|
@ -4886,7 +4886,7 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
|
||||
INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {
|
||||
|
||||
for(uint i = dimsSize - 1; i > 0; --i) {
|
||||
coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
|
||||
|
@ -4895,6 +4895,34 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,
|
|||
coords[tadDims[0]] = index; // last iteration
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
/**
 * Incremental index -> coordinates conversion for sequential loops (Nd4jLong coordinates).
 *
 * When index == startIndex the coordinates are computed from scratch with shape::index2coords.
 * For every other call the value of index is NOT used: coords is simply advanced by one
 * position, last axis first, with carry into the preceding axes. This means that
 *   - the function must be called with consecutive indexes startIndex, startIndex + 1, ...;
 *   - coords must still contain the coordinates written by the previous call, so a caller that
 *     temporarily overwrites an element of coords has to restore it before the next call.
 *
 * @param startIndex first index of the range the caller iterates over
 * @param index      current linear index
 * @param shapeInfo  shape descriptor, shapeInfo[0] is read as the rank
 * @param coords     in/out buffer holding at least rank elements
 */
INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {

    if(startIndex == index) {
        shape::index2coords(index, shapeInfo, coords);
        return;
    }

    int axis = static_cast<int>(shapeInfo[0]) - 1;

    // reset every trailing axis that already sits on its last position (carry);
    // the carry stops at axis 0 so that stepping past the last element can never touch coords[-1]
    while(axis > 0 && coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
        coords[axis--] = 0;

    ++coords[axis];
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
/**
 * Incremental index -> coordinates conversion for sequential loops (int coordinates).
 *
 * When index == startIndex the coordinates are computed from scratch with shape::index2coords.
 * For every other call the value of index is NOT used: coords is simply advanced by one
 * position, last axis first, with carry into the preceding axes. This means that
 *   - the function must be called with consecutive indexes startIndex, startIndex + 1, ...;
 *   - coords must still contain the coordinates written by the previous call, so a caller that
 *     temporarily overwrites an element of coords has to restore it before the next call.
 *
 * @param startIndex first index of the range the caller iterates over
 * @param index      current linear index
 * @param shapeInfo  shape descriptor, shapeInfo[0] is read as the rank
 * @param coords     in/out buffer holding at least rank elements
 */
INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords) {

    if(startIndex == index) {
        shape::index2coords(index, shapeInfo, coords);
        return;
    }

    int axis = static_cast<int>(shapeInfo[0]) - 1;

    // reset every trailing axis that already sits on its last position (carry);
    // the carry stops at axis 0 so that stepping past the last element can never touch coords[-1]
    while(axis > 0 && coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
        coords[axis--] = 0;

    ++coords[axis];
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
|
||||
|
||||
|
@ -5131,23 +5159,23 @@ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo,
|
|||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
|
||||
// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
|
||||
|
||||
Nd4jLong result = 9223372036854775807LL;
|
||||
// Nd4jLong result = 9223372036854775807LL;
|
||||
|
||||
for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
|
||||
// for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
|
||||
|
||||
const auto currentStride = shape::stride(inShapeInfo)[i];
|
||||
// const auto currentStride = shape::stride(inShapeInfo)[i];
|
||||
|
||||
if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
|
||||
continue;
|
||||
// if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
|
||||
// continue;
|
||||
|
||||
if(result > currentStride)
|
||||
result = currentStride;
|
||||
}
|
||||
// if(result > currentStride)
|
||||
// result = currentStride;
|
||||
// }
|
||||
|
||||
return result == 9223372036854775807LL ? 1 : result;
|
||||
}
|
||||
// return result == 9223372036854775807LL ? 1 : result;
|
||||
// }
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -739,11 +739,11 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
shape::index2coords(i, zShapeInfo, zCoords);
|
||||
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
|
||||
|
||||
for (uint j = 0; j < rank; ++j) {
|
||||
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
|
|
|
@ -449,11 +449,11 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
shape::index2coords(i, zShapeInfo, zCoords);
|
||||
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
|
||||
|
||||
for (uint j = 0; j < rank; ++j) {
|
||||
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
|
|
|
@ -609,11 +609,11 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; ++i) {
|
||||
|
||||
shape::index2coords(i, zShapeInfo, zCoords);
|
||||
shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
|
||||
|
||||
for (uint j = 0; j < rank; ++j) {
|
||||
xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
|
||||
|
|
|
@ -275,7 +275,7 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
|
|||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
|
||||
|
||||
|
|
|
@ -291,7 +291,7 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
|
|||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
|
||||
|
||||
|
|
|
@ -271,7 +271,7 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
|
|||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
|
||||
|
||||
for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
|
||||
|
||||
|
|
|
@ -137,7 +137,7 @@ namespace sd {
|
|||
T *dataTAD = currentData + inputOffset;
|
||||
T *resultTAD = result + resultOffset;
|
||||
|
||||
Nd4jLong sub[MAX_RANK];
|
||||
int sub[MAX_RANK];
|
||||
|
||||
shape::index2coords(arrOffset, zTadShape, sub);
|
||||
|
||||
|
@ -166,7 +166,7 @@ namespace sd {
|
|||
auto dataTAD = currentData + inputOffset;
|
||||
auto resultTAD = result + resultOffset;
|
||||
|
||||
Nd4jLong sub[MAX_RANK];
|
||||
int sub[MAX_RANK];
|
||||
|
||||
shape::index2coords(arrOffset, zTadShape, sub);
|
||||
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
|
||||
|
@ -199,7 +199,7 @@ namespace sd {
|
|||
resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
|
||||
}
|
||||
} else {
|
||||
Nd4jLong yIdx[MAX_RANK];
|
||||
int yIdx[MAX_RANK];
|
||||
auto yRank = shape::rank(currentTad);
|
||||
|
||||
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
|
||||
|
@ -214,8 +214,8 @@ namespace sd {
|
|||
//if (threadIdx.x == 0 && blockIdx.x == 0)
|
||||
// printf("Branch C; yLength: %i;\n", yLength);
|
||||
|
||||
Nd4jLong zIdx[MAX_RANK];
|
||||
Nd4jLong yIdx[MAX_RANK];
|
||||
int zIdx[MAX_RANK];
|
||||
int yIdx[MAX_RANK];
|
||||
auto yRank = shape::rank(currentTad);
|
||||
auto tadRank = shape::rank(zTadShape);
|
||||
|
||||
|
|
|
@ -39,8 +39,7 @@ namespace sd {
|
|||
delim->syncToHost();
|
||||
|
||||
// output rank N+1 wrt input rank
|
||||
std::vector<Nd4jLong> ocoords(input->rankOf() + 1);
|
||||
std::vector<Nd4jLong> icoords(input->rankOf());
|
||||
std::vector<int> icoords(input->rankOf());
|
||||
|
||||
// getting buffer lengths
|
||||
// FIXME: it'll be bigger, since it'll include delimiters,
|
||||
|
@ -54,7 +53,7 @@ namespace sd {
|
|||
auto s = input->e<std::string>(e);
|
||||
|
||||
// getting base index
|
||||
shape::index2coords(e, input->shapeInfo(), icoords.data());
|
||||
shape::index2coordsCPU(0, e, input->shapeInfo(), icoords.data());
|
||||
|
||||
// getting number of substrings
|
||||
auto cnt = StringUtils::countSubarrays(s.c_str(), s.length(), d.c_str(), d.length()) + 1;
|
||||
|
|
|
@ -64,7 +64,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
|
|||
|
||||
Nd4jLong* xOffsets = new Nd4jLong[steps];
|
||||
Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
|
||||
Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
|
||||
int* auxBuff = new int[2 * input->rankOf()];
|
||||
|
||||
for (Nd4jLong j = 0; j < lenSmall; ++j) {
|
||||
|
||||
|
@ -139,40 +139,42 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int xzCoords[MAX_RANK], minCoords[MAX_RANK];
|
||||
|
||||
for (uint i = 0, j = 0; i < xRank; ++i)
|
||||
if(j < numAxes && i != axes[j])
|
||||
minCoords[i] = 0;
|
||||
else
|
||||
++j;
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
shape::index2coords(i, input->getShapeInfo(), coords);
|
||||
shape::index2coordsCPU(start, i, input->getShapeInfo(), xzCoords);
|
||||
|
||||
const auto xOffset = shape::getOffset(input->getShapeInfo(), coords);
|
||||
const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), coords);
|
||||
const auto xOffset = shape::getOffset(input->getShapeInfo(), xzCoords);
|
||||
const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), xzCoords);
|
||||
|
||||
if(minRank == xRank) {
|
||||
for (uint i = 0, j = 0; i < xRank; ++i) {
|
||||
if(j < numAxes && i != axes[j])
|
||||
coords[i] = 0;
|
||||
else
|
||||
++j;
|
||||
}
|
||||
for (uint j = 0; j < numAxes; ++j)
|
||||
minCoords[axes[j]] = xzCoords[axes[j]];
|
||||
}
|
||||
else // minRank = numAxes = 1 in this case
|
||||
coords[0] = coords[axes[0]];
|
||||
minCoords[0] = xzCoords[axes[0]];
|
||||
|
||||
const auto meanOffset = shape::getOffset(mean->getShapeInfo(), coords);
|
||||
const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords);
|
||||
const auto meanOffset = shape::getOffset(mean->getShapeInfo(), minCoords);
|
||||
const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), minCoords);
|
||||
|
||||
T sigmaInvGam = 1. / sd::math::nd4j_sqrt<T, T>(v[varianceOffset] + epsilon);
|
||||
|
||||
if(g != nullptr) {
|
||||
const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), coords);
|
||||
const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), minCoords);
|
||||
sigmaInvGam *= g[gammaOffset];
|
||||
}
|
||||
|
||||
z[zOffset] = (x[xOffset] - m[meanOffset]) * sigmaInvGam;
|
||||
|
||||
if(b != nullptr) {
|
||||
const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), coords);
|
||||
const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), minCoords);
|
||||
z[zOffset] += b[betaOffset];
|
||||
}
|
||||
}
|
||||
|
@ -184,7 +186,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
|
|||
//////////////////////////////////////////////////////////////////////////
|
||||
// Entry point for the batch-normalization helper: hands every argument (input, mean, variance, gamma, beta,
// output, axes, epsilon) to the templated batchnorm_ implementation via BUILD_SINGLE_SELECTOR, keyed on
// input->dataType() and restricted to FLOAT_TYPES.
// NOTE(review): BUILD_SINGLE_SELECTOR is defined outside this view; presumably it instantiates batchnorm_<T>
// for the matching element type - confirm against the macro definition.
// batchnorm2_ is not called here; the inline remark below gives speed as the reason.
void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* variance, const NDArray* gamma, const NDArray* beta, NDArray* output, const std::vector<int>& axes, const double epsilon) {
|
||||
|
||||
// batchnorm2_ is slower
|
||||
// batchnorm2_ is still slower ?
|
||||
BUILD_SINGLE_SELECTOR(input->dataType(), batchnorm_, (input, mean, variance, gamma, beta, output, axes, epsilon), FLOAT_TYPES);
|
||||
}
|
||||
|
||||
|
|
|
@ -51,19 +51,19 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
|
||||
const auto xOffset1 = xOffset0 + input.strideAt(dimC);
|
||||
const auto xOffset2 = xOffset1 + input.strideAt(dimC);
|
||||
z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
|
||||
}
|
||||
};
|
||||
int coords[MAX_RANK];
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
|
||||
const auto xOffset1 = xOffset0 + input.strideAt(dimC);
|
||||
const auto xOffset2 = xOffset1 + input.strideAt(dimC);
|
||||
z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
|
||||
}
|
||||
};
|
||||
|
||||
samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
|
||||
return;
|
||||
samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
|
||||
return;
|
||||
}
|
||||
|
||||
void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) {
|
||||
|
@ -78,9 +78,9 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
|
|||
const int rank = input.rankOf();
|
||||
bool bSimple = (dimC == rank - 1 && 'c' == input.ordering() && 1 == input.ews() &&
|
||||
'c' == output.ordering() && 1 == output.ews());
|
||||
|
||||
|
||||
if (bSimple) {
|
||||
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
op(x[i], x[i + 1], x[i + 2], z[i], z[i + 1], z[i + 2]);
|
||||
|
@ -177,12 +177,12 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
|
|||
|
||||
const T* x = input->bufferAsT<T>();
|
||||
T* z = output->bufferAsT<T>();
|
||||
// TODO: Use tensordot or other optimized helpers to see if we can get better performance.
|
||||
// TODO: Use tensordot or other optimized helpers to see if we can get better performance.
|
||||
|
||||
if (dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
//simple M*v //tr.T*v.T // v * tr //rule: (AB)' =B'A'
|
||||
// v.shape (1,3) row vector
|
||||
T x0, x1, x2;
|
||||
|
@ -192,7 +192,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
|
|||
z[i] = x0 * tr[0][0] + x1 * tr[1][0] + x2 * tr[2][0];
|
||||
z[i+1] = x0 * tr[0][1] + x1 * tr[1][1] + x2 * tr[2][1];
|
||||
z[i+2] = x0 * tr[0][2] + x1 * tr[1][2] + x2 * tr[2][2];
|
||||
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -49,9 +49,12 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
|
|||
const auto xLen = input.lengthOf();
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
int coords[MAX_RANK];
|
||||
|
||||
for (Nd4jLong i = 0; i < xLen; ++i) {
|
||||
shape::index2coords(i, xShapeInfo, coords);
|
||||
|
||||
shape::index2coordsCPU(start, i, xShapeInfo, coords);
|
||||
|
||||
const auto xOffset = shape::getOffset(xShapeInfo, coords);
|
||||
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords);
|
||||
|
|
|
@ -113,18 +113,23 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
|
|||
|
||||
// loop through input array
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
int zCoords[MAX_RANK], xCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
|
||||
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
memcpy(xCoords, zCoords, rank * sizeof(int));
|
||||
|
||||
// evaluate spatial coordinates for x
|
||||
for (uint j = 1; j <= numOfSpatialDims; ++j)
|
||||
coords[j] += crop.e<uint>(j - 1, 0); // add crop left
|
||||
xCoords[j] += crop.e<uint>(j - 1, 0); // add crop left
|
||||
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
|
||||
|
||||
z[zOffset] = x[xOffset];
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -299,11 +304,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
|
|||
|
||||
// loop through output array
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
int zCoords[MAX_RANK], xCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
|
||||
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
|
||||
|
||||
memcpy(xCoords, zCoords, rank * sizeof(int));
|
||||
|
||||
bool within = true;
|
||||
|
||||
|
@ -312,16 +322,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
|
|||
const auto padLeft = padding.e<uint>(j - 1, 0);
|
||||
const auto padRight = padding.e<uint>(j - 1, 1);
|
||||
|
||||
within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
|
||||
within &= zCoords[j] >= padLeft && zCoords[j] < output.sizeAt(j) - padRight;
|
||||
|
||||
if (!within)
|
||||
break;
|
||||
|
||||
coords[j] -= padLeft; // get coordinates for x
|
||||
xCoords[j] = zCoords[j] - padLeft; // get coordinates for x
|
||||
}
|
||||
|
||||
if (within)
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
|
||||
else
|
||||
z[zOffset] = 0.f;
|
||||
}
|
||||
|
|
|
@ -43,11 +43,11 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
||||
Nd4jLong xCoords[MAX_RANK];
|
||||
int xCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
|
||||
shape::index2coords(i, xShapeInfo, xCoords);
|
||||
shape::index2coordsCPU(start, i, xShapeInfo, xCoords);
|
||||
|
||||
const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];
|
||||
|
||||
|
|
|
@ -96,14 +96,17 @@ namespace helpers {
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
|
||||
shape::index2coords(i, input.getShapeInfo(), coords);
|
||||
shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
|
||||
|
||||
uint outArrIdx = 0;
|
||||
|
||||
temp = coords[axis];
|
||||
|
||||
while (coords[axis] >= zDim) {
|
||||
coords[axis] -= zDim;
|
||||
++outArrIdx;
|
||||
|
@ -112,6 +115,8 @@ namespace helpers {
|
|||
T* z = outArrs[outArrIdx]->bufferAsT<T>();
|
||||
const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
|
||||
z[zOffset] = xBuff[xOffset];
|
||||
|
||||
coords[axis] = temp;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -188,24 +188,35 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
|||
const T padVal = padValue.e<T>(0);
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
int zCoords[MAX_RANK], xCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
|
||||
|
||||
memcpy(xCoords, zCoords, rank * sizeof(int));
|
||||
|
||||
bool within = true;
|
||||
|
||||
for (int j = rankMinusOne; j >= 0; --j) {
|
||||
if (xShape[j] == zShape[j]) continue;
|
||||
|
||||
if (xShape[j] == zShape[j])
|
||||
continue;
|
||||
|
||||
const auto left = paddings.e<Nd4jLong>(j, 0);
|
||||
if (coords[j] < left || coords[j] >= left + xShape[j]) {
|
||||
|
||||
if (zCoords[j] < left || zCoords[j] >= left + xShape[j]) {
|
||||
within = false;
|
||||
break;
|
||||
}
|
||||
else { coords[j] = coords[j] - left; }
|
||||
else
|
||||
xCoords[j] = zCoords[j] - left;
|
||||
}
|
||||
|
||||
if (within)
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
|
||||
z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
|
||||
else
|
||||
z[zOffset] = padVal;
|
||||
}
|
||||
|
@ -219,20 +230,30 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
|
|||
const Nd4jLong shift2 = mode == 1 ? 2 : 1; // REFLECT : SYMMETRIC
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
int zCoords[MAX_RANK], xCoords[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
|
||||
|
||||
memcpy(xCoords, zCoords, rank * sizeof(int));
|
||||
|
||||
for (int j = rankMinusOne; j >= 0; --j) {
|
||||
|
||||
if (xShape[j] == zShape[j]) continue;
|
||||
coords[j] = coords[j] - paddings.e<Nd4jLong>(j, 0); // are ready to fill middle (within input dimension range)
|
||||
if (coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left
|
||||
else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
|
||||
if (xShape[j] == zShape[j])
|
||||
continue;
|
||||
|
||||
xCoords[j] = zCoords[j] - paddings.e<Nd4jLong>(j, 0); // are ready to fill middle (within input dimension range)
|
||||
|
||||
if (xCoords[j] < 0)
|
||||
xCoords[j] = -xCoords[j] - shift1; // means fill from left
|
||||
else if (xCoords[j] >= xShape[j])
|
||||
xCoords[j] = 2 * xShape[j] - xCoords[j] - shift2; // means fill from right
|
||||
}
|
||||
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
|
||||
z[zOffset] = x[xOffset];
|
||||
}
|
||||
};
|
||||
|
@ -562,45 +583,37 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
|
|||
|
||||
const Nd4jLong zLen = output.lengthOf();
|
||||
|
||||
const int yLastDim = indices.sizeAt(-1);
|
||||
const uint yLastDim = indices.sizeAt(-1);
|
||||
|
||||
const int diff = zRank - xRank;
|
||||
const bool bEqual = yLastDim == xRank;
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong coords[MAX_RANK * 3];
|
||||
|
||||
int xCoords[MAX_RANK], zCoords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
Nd4jLong *zCoordStart, *xCoordStart;
|
||||
|
||||
if (yLastDim == xRank) {
|
||||
zCoordStart = coords;
|
||||
xCoordStart = coords;
|
||||
} else if (zRank >= xRank) {
|
||||
zCoordStart = coords;
|
||||
xCoordStart = coords + zRank - xRank;
|
||||
} else {
|
||||
zCoordStart = coords + xRank - zRank;
|
||||
xCoordStart = coords;
|
||||
}
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
|
||||
|
||||
shape::index2coords(i, output.getShapeInfo(), zCoordStart);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
|
||||
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
|
||||
temp = zCoords[yRank - 1];
|
||||
zCoords[yRank - 1] = 0;
|
||||
const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoords);
|
||||
zCoords[yRank - 1] = temp;
|
||||
|
||||
// last y coordinate
|
||||
uint coordToRestore;
|
||||
if (yLastDim != xRank)
|
||||
coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
|
||||
if(bEqual)
|
||||
memcpy(xCoords, zCoords, zRank * sizeof(int));
|
||||
else if(diff >= 0)
|
||||
memcpy(xCoords, zCoords + diff, xRank * sizeof(int));
|
||||
else
|
||||
memcpy(xCoords - diff, zCoords, zRank * sizeof(int));
|
||||
|
||||
zCoordStart[yRank - 1] = 0;
|
||||
const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
|
||||
for (uint j = 0; j < yLastDim; ++j)
|
||||
xCoords[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
|
||||
|
||||
//restore z coordinate
|
||||
if (yLastDim != xRank)
|
||||
zCoordStart[yRank - 1] = coordToRestore;
|
||||
|
||||
// construct coordinates for x
|
||||
for (int j = 0; j < yLastDim; ++j)
|
||||
xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride
|
||||
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
|
||||
|
||||
z[zOffset] = x[xOffset];
|
||||
}
|
||||
|
@ -1188,10 +1201,12 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
|
|||
else {
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
Nd4jLong inIdx[MAX_RANK];
|
||||
Nd4jLong outIdx[MAX_RANK];
|
||||
|
||||
int inIdx[MAX_RANK], outIdx[MAX_RANK];
|
||||
|
||||
for (auto i = start; i < stop; i++) {
|
||||
shape::index2coords(i, output.getShapeInfo(), outIdx);
|
||||
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), outIdx);
|
||||
|
||||
for (int j = 0; j < rank; ++j) {
|
||||
const Nd4jLong inLen = input.sizeAt(j);
|
||||
|
|
|
@ -52,7 +52,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
|
||||
for (int i = tid; i < xzLen; i += blockDim.x * gridDim.x) {
|
||||
shape::index2coords(i, xShapeInfo, coords);
|
||||
|
@ -124,7 +124,7 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
|
|||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
|
||||
for (int i = tid; i < inLen; i += totalThreads) {
|
||||
shape::index2coords(i, inShapeInfo, coords);
|
||||
|
|
|
@ -45,7 +45,7 @@ __global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
|
|||
X* z = reinterpret_cast<X*>(vz);
|
||||
|
||||
__shared__ int rank, channelPosition, posOfNonUnityDim;
|
||||
__shared__ Nd4jLong *sharedMem, len;
|
||||
__shared__ Nd4jLong len, *sharedMem;
|
||||
__shared__ bool xzSameOffsets, xzAreSame;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
@ -130,7 +130,7 @@ void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bia
|
|||
FLOAT_TYPES, FLOAT_TYPES);
|
||||
} else {
|
||||
// default case
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
|
||||
|
|
|
@ -124,7 +124,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
|
|||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
|
||||
for (uint64_t i = tid; i < zLen; i += totalThreads) {
|
||||
shape::index2coords(i, zShapeInfo, coords);
|
||||
|
|
|
@ -706,7 +706,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
T* z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -858,7 +858,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
|||
|
||||
Nd4jLong coord2, coord3;
|
||||
__shared__ int rank, kHeff, kWeff, iH, iW, kProd;
|
||||
__shared__ Nd4jLong *sharedMem, yLen;
|
||||
__shared__ Nd4jLong yLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -1017,7 +1017,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
|||
|
||||
Nd4jLong coord2, coord3, coord4;
|
||||
__shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
|
||||
__shared__ Nd4jLong *sharedMem, yLen;
|
||||
__shared__ Nd4jLong yLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -1342,7 +1342,7 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
|
|||
T* z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int rank, dimIH;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -1410,7 +1410,7 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
|
|||
T* z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int rank, dimID;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -1480,7 +1480,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
|
|||
|
||||
__shared__ int rank, dimIH;
|
||||
__shared__ uint factorH, factorW;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
@ -1554,7 +1554,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
|
|||
|
||||
__shared__ int rank, dimID;
|
||||
__shared__ uint factorD, factorH, factorW;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
|
|
|
@ -36,8 +36,8 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
__shared__ const T* x;
|
||||
__shared__ const T* y;
|
||||
__shared__ T* z;
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong lenWithoutLastDim, totalThreads;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
x = reinterpret_cast<const T*>(vx);
|
||||
|
@ -45,7 +45,7 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
z = reinterpret_cast<T*>(vz);
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
totalThreads = gridDim.x * blockDim.x;
|
||||
|
||||
rank = shape::rank(xShapeInfo);
|
||||
|
@ -106,7 +106,7 @@ void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = sizeof(Nd4jLong) * threadsPerBlock * x->rankOf() + 128;
|
||||
const int sharedMem = sizeof(int) * threadsPerBlock * x->rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "cross");
|
||||
|
||||
|
|
|
@ -43,13 +43,13 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
|
|||
const X* y = reinterpret_cast<const X*>(vy);
|
||||
Z* z = reinterpret_cast<Z*>(vz);
|
||||
|
||||
__shared__ int xzRank, yRank;
|
||||
__shared__ int xzRank, yRank, *sharedMem;
|
||||
__shared__ uint iH, iW, kH, kW;
|
||||
__shared__ Nd4jLong *sharedMem, zLen;
|
||||
__shared__ Nd4jLong zLen;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
||||
|
@ -119,7 +119,7 @@ void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, ND
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(int) * threadsPerBlock + 128;
|
||||
|
||||
NDArray::prepareSpecialUse({output}, {input, weights});
|
||||
BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2dCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), weights->getSpecialBuffer(), weights->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), sH, sW, pH, pW, dH, dW), FLOAT_TYPES);
|
||||
|
|
|
@ -27,7 +27,7 @@ namespace sd {
|
|||
template <typename T>
|
||||
void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {
|
||||
|
||||
Nd4jLong xCoord[MAX_RANK];
|
||||
int xCoord[MAX_RANK];
|
||||
|
||||
// each block of threads works on 1 input array
|
||||
for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
|
||||
|
|
|
@ -40,12 +40,12 @@ __global__ static void im2colCuda(const void *image, void *columns,
|
|||
const auto im = reinterpret_cast<const T*>(image);
|
||||
auto col = reinterpret_cast<T*>(columns);
|
||||
|
||||
__shared__ Nd4jLong colLen, *sharedMem, iH, iW;
|
||||
__shared__ int imRank, colRank;
|
||||
__shared__ Nd4jLong colLen, iH, iW;
|
||||
__shared__ int imRank, colRank, *sharedMem;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
colRank = 6;
|
||||
imRank = 4;
|
||||
|
@ -81,7 +81,7 @@ __global__ static void im2colCuda(const void *image, void *columns,
|
|||
//////////////////////////////////////////////////////////////////////////
|
||||
// Launcher for the im2colCuda<T> kernel: starts it on the stream returned by context.getCudaStream() with a
// blocksPerGrid x threadsPerBlock configuration, forwarding the image/columns buffers, their shape infos,
// the sH/sW, pH/pW, dH/dW parameters and zeroPadVal unchanged.
// The third launch argument is the dynamic shared-memory request: threadsPerBlock * (element size) * 6,
// where 6 is the rank of the columns array (see the inline note). The element size used here has to agree
// with the pointer type the kernel casts its shared buffer to.
template <typename T>
|
||||
static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) {
|
||||
im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(Nd4jLong) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
|
||||
im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -149,19 +149,19 @@ __global__ void rgbToGrsCuda(const void *vx, const Nd4jLong *xShapeInfo, void *v
|
|||
const auto x = reinterpret_cast<const T*>(vx);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
__shared__ int rank; // xRank == zRank
|
||||
__shared__ Nd4jLong zLen;
|
||||
__shared__ int rank, *sharedMem; // xRank == zRank
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
zLen = shape::length(zShapeInfo);
|
||||
rank = shape::rank(zShapeInfo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
|
||||
auto coords = sharedMem + threadIdx.x * rank;
|
||||
|
||||
for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {
|
||||
|
||||
|
@ -197,7 +197,7 @@ void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray&
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
const int sharedMem = input.rankOf() * sizeof(int) * threadsPerBlock + 128;
|
||||
|
||||
NDArray::prepareSpecialUse({&output}, {&input});
|
||||
BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrsCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), dimC), NUMERIC_TYPES);
|
||||
|
|
|
@ -39,14 +39,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
|
|||
const auto y = reinterpret_cast<const T*>(vy);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int xRank; // xRank = zRank, xRank = yRank + 1
|
||||
__shared__ Nd4jLong xLen, *sharedMem; // xLen = zLen
|
||||
__shared__ int xRank, *sharedMem; // xRank = zRank, xRank = yRank + 1
|
||||
__shared__ Nd4jLong xLen; // xLen = zLen
|
||||
__shared__ bool areSameOffsets;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
|
||||
|
||||
|
@ -56,7 +56,7 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
|
|||
|
||||
__syncthreads();
|
||||
|
||||
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
|
||||
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(int) * threadIdx.x) amount of shared memory per each thread
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
|
||||
|
@ -86,7 +86,7 @@ void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArr
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * input.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "matrixSetDiag");
|
||||
|
||||
|
|
|
@ -43,12 +43,12 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
|
|||
const auto x = reinterpret_cast<const T*>(vx);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong zLen;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
rank = shape::rank(zShapeInfo);
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
@ -103,7 +103,7 @@ void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& out
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "batchToSpace");
|
||||
|
||||
|
@ -138,13 +138,13 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape
|
|||
const auto y = reinterpret_cast<const Y*>(vy);
|
||||
auto z = reinterpret_cast<X*>(vz);
|
||||
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong zLen;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
rank = shape::rank(zShapeInfo);
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
@ -234,7 +234,7 @@ void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDAr
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "batchToSpaceND");
|
||||
|
||||
|
@ -264,12 +264,12 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
|
|||
const auto x = reinterpret_cast<const T*>(vx);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ int rank;
|
||||
__shared__ Nd4jLong zLen, *sharedMem;
|
||||
__shared__ int rank, *sharedMem;
|
||||
__shared__ Nd4jLong zLen;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
rank = shape::rank(zShapeInfo);
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
@ -326,7 +326,7 @@ void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& out
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "spaceToBatch");
|
||||
|
||||
|
@ -364,13 +364,13 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape
|
|||
const auto y = reinterpret_cast<const Y*>(vy);
|
||||
auto z = reinterpret_cast<X*>(vz);
|
||||
|
||||
__shared__ int rank; // xRank = zRank, yRank = 2;
|
||||
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
|
||||
__shared__ int rank, *sharedMem; // xRank = zRank, yRank = 2;
|
||||
__shared__ Nd4jLong zLen, totalThreads;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
rank = shape::rank(zShapeInfo);
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
@ -473,7 +473,7 @@ void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDAr
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "spaceToBatchND");
|
||||
|
||||
|
|
|
@ -628,12 +628,12 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
|
|||
auto y = reinterpret_cast<Z*>(vy);
|
||||
auto z = reinterpret_cast<Z*>(vz);
|
||||
|
||||
__shared__ Nd4jLong xLen, *sharedMem;
|
||||
__shared__ int xRank; // xRank = zRank, yRank = xRank + 1
|
||||
__shared__ Nd4jLong xLen;
|
||||
__shared__ int xRank, *sharedMem; // xRank = zRank, yRank = xRank + 1
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
xLen = shape::length(xShapeInfo);
|
||||
xRank = shape::rank(xShapeInfo);
|
||||
|
@ -678,7 +678,7 @@ void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray&
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
|
||||
const int sharedMem = updates.rankOf() * sizeof(int) * threadsPerBlock + 128;
|
||||
|
||||
if(calcGrad) {
|
||||
NDArray::prepareSpecialUse({&updates}, {&indices});
|
||||
|
|
|
@ -54,7 +54,7 @@ __global__ static void splitCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
|
|||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
|
||||
for (uint64_t i = tid; i < xLen; i += totalThreads) {
|
||||
|
||||
|
|
|
@ -135,13 +135,13 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
|
||||
const int rank = 3;
|
||||
|
||||
__shared__ int time, K;
|
||||
__shared__ Nd4jLong len, totalThreads, *sharedMem;
|
||||
__shared__ int time, K, *sharedMem;
|
||||
__shared__ Nd4jLong len, totalThreads;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
time = xShapeInfo[1];
|
||||
K = xShapeInfo[3] / 2;
|
||||
|
@ -152,7 +152,7 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
|
|||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
|
||||
auto coords = sharedMem + threadIdx.x * rank;
|
||||
|
||||
if(tid >= len)
|
||||
return;
|
||||
|
@ -245,7 +245,7 @@ void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDAr
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
|
||||
|
||||
NDArray::prepareSpecialUse({ht, ct}, {x, &wi, b, c0, mask});
|
||||
BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ht->specialBuffer(), ht->specialShapeInfo(), ct->specialBuffer(), ct->specialShapeInfo()), FLOAT_TYPES);
|
||||
|
@ -340,13 +340,13 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
|
|||
|
||||
const int rank = 3;
|
||||
|
||||
__shared__ int time, K;
|
||||
__shared__ Nd4jLong len, totalThreads, *sharedMem;
|
||||
__shared__ int time, K, *sharedMem;
|
||||
__shared__ Nd4jLong len, totalThreads;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
time = xShapeInfo[1];
|
||||
K = xShapeInfo[3] / 2;
|
||||
|
@ -358,7 +358,7 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
|
|||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Nd4jLong* coords = sharedMem + threadIdx.x * rank;
|
||||
auto coords = sharedMem + threadIdx.x * rank;
|
||||
|
||||
if(tid >= len)
|
||||
return;
|
||||
|
@ -513,7 +513,7 @@ void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDA
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
|
||||
|
||||
NDArray::prepareSpecialUse({gradI, &gradWi, &gradBias, gradC0}, {x, &wi, b, c0, ct, gradCt, gradHt, mask});
|
||||
BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ct->getSpecialBuffer(), ct->getSpecialShapeInfo(), gradHt->getSpecialBuffer(), gradHt->getSpecialShapeInfo(), gradCt->getSpecialBuffer(), gradCt->getSpecialShapeInfo(), gradI->specialBuffer(), gradI->specialShapeInfo(), gradWi.specialBuffer(), gradWi.specialShapeInfo(), gradBias.specialBuffer(), gradBias.specialShapeInfo(), gradC0->specialBuffer(), gradC0->specialShapeInfo()), FLOAT_TYPES);
|
||||
|
|
|
@ -93,13 +93,13 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
|
|||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
__shared__ T* sharedMem;
|
||||
__shared__ int xRank, zRank; // xRank = zRank + 2
|
||||
__shared__ Nd4jLong xLen, zLen, *coordsMem;
|
||||
__shared__ int xRank, zRank, *coordsMem; // xRank = zRank + 2
|
||||
__shared__ Nd4jLong xLen, zLen;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<T*>(shmem);
|
||||
coordsMem = reinterpret_cast<Nd4jLong*>(shmem + blockDim.x * sizeof(T));
|
||||
coordsMem = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));
|
||||
|
||||
xRank = shape::rank(xShapeInfo);
|
||||
zRank = shape::rank(zShapeInfo);
|
||||
|
@ -109,7 +109,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
Nd4jLong* coords = coordsMem + threadIdx.x * xRank;
|
||||
auto coords = coordsMem + threadIdx.x * xRank;
|
||||
|
||||
for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix
|
||||
|
||||
|
@ -160,7 +160,7 @@ void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) {
|
|||
const uint diagLen = input.sizeAt(-1) < input.sizeAt(-2) ? input.sizeAt(-1) : input.sizeAt(-2);
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * (sizeof(Nd4jLong) * input.rankOf() + input.sizeOfT()) + 128;
|
||||
const int sharedMem = threadsPerBlock * (sizeof(int) * input.rankOf() + input.sizeOfT()) + 128;
|
||||
|
||||
NDArray::prepareSpecialUse({&output}, {&input});
|
||||
BUILD_SINGLE_SELECTOR(input.dataType(), traceCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), diagLen), LIBND4J_TYPES);
|
||||
|
@ -177,13 +177,13 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
|
|||
const auto x = reinterpret_cast<const T*>(vx); // gradO
|
||||
auto z = reinterpret_cast<T*>(vz); // gradI
|
||||
|
||||
__shared__ int rank, areSameOffsets; // xRank = zRank
|
||||
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen = zLen
|
||||
__shared__ int rank, areSameOffsets, *sharedMem; // xRank = zRank
|
||||
__shared__ Nd4jLong len, totalThreads; // xLen = zLen
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
|
||||
rank = shape::rank(xShapeInfo);
|
||||
len = shape::length(zShapeInfo);
|
||||
|
@ -221,7 +221,7 @@ void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gra
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * gradO.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * gradO.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "triuBP");
|
||||
|
||||
|
@ -240,13 +240,13 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
|
|||
const auto x = reinterpret_cast<const T*>(vx); // gradO
|
||||
auto z = reinterpret_cast<T*>(vz); // gradI
|
||||
|
||||
__shared__ int xRank, zRank; // xRank >= zRank
|
||||
__shared__ Nd4jLong numOfXOffsets, zLen, totalThreads, *sharedMem; // xLen >= zLen
|
||||
__shared__ int xRank, zRank, *sharedMem; // xRank >= zRank
|
||||
__shared__ Nd4jLong numOfXOffsets, zLen, totalThreads; // xLen >= zLen
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
sharedMem = reinterpret_cast<int*>(shmem);
|
||||
|
||||
xRank = shape::rank(zShapeInfo);
|
||||
zLen = shape::length(zShapeInfo);
|
||||
|
@ -289,7 +289,7 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
|
|||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||
const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 2 * gradO.rankOf() + 128;
|
||||
const int sharedMem = threadsPerBlock * sizeof(int) * 2 * gradO.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "tileBP");
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ namespace sd {
|
|||
auto indices = reinterpret_cast<const I*>(vindices);
|
||||
auto output = reinterpret_cast<X*>(voutput);
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK];
|
||||
uint64_t pos = 0;
|
||||
for (uint64_t e = 0L; e < length; e++) {
|
||||
// indices come in blocks
|
||||
|
|
|
@ -29,11 +29,14 @@ namespace sd {
|
|||
NDArrayList list(0, true);
|
||||
int cnt = 0;
|
||||
|
||||
Nd4jLong idx[MAX_RANK];
|
||||
int idx[MAX_RANK];
|
||||
|
||||
for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
|
||||
shape::index2coords(e, condition.getShapeInfo(), idx);
|
||||
|
||||
shape::index2coordsCPU(0, e, condition.getShapeInfo(), idx);
|
||||
|
||||
auto offset = shape::getOffset(condition.getShapeInfo(), idx);
|
||||
|
||||
if (condition.e<bool>(offset)) {
|
||||
auto array = NDArrayFactory::create_('c', {1, condition.rankOf()}, output.dataType(), output.getContext());
|
||||
for (int f = 0; f < condition.rankOf(); f++)
|
||||
|
|
|
@ -178,16 +178,18 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
|
|||
// general case
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
|
||||
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
|
||||
uint inArrIdx = 0;
|
||||
uint xDim = inArrs[inArrIdx]->sizeAt(axis);
|
||||
|
||||
temp = coords[axis];
|
||||
while (coords[axis] >= xDim) {
|
||||
coords[axis] -= xDim;
|
||||
xDim = inArrs[++inArrIdx]->sizeAt(axis);
|
||||
|
@ -197,6 +199,8 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
|
|||
const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
|
||||
|
||||
zBuff[zOffset] = x[xOffset];
|
||||
|
||||
coords[axis] = temp;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -298,13 +302,15 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
|
|||
|
||||
auto func = PRAGMA_THREADS_FOR{
|
||||
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
int coords[MAX_RANK], temp;
|
||||
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
|
||||
shape::index2coords(i, input.getShapeInfo(), coords);
|
||||
shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
|
||||
const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
|
||||
|
||||
uint outArrIdx = 0;
|
||||
temp = coords[axis];
|
||||
|
||||
while (coords[axis] >= zDim) {
|
||||
coords[axis] -= zDim;
|
||||
|
@ -314,6 +320,8 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
|
|||
T* z = outArrs[outArrIdx]->bufferAsT<T>();
|
||||
const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
|
||||
z[zOffset] = xBuff[xOffset];
|
||||
|
||||
coords[axis] = temp;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -258,6 +258,7 @@ TEST_F(PlaygroundTests, test_bert_2) {
|
|||
delete graph;
|
||||
}
|
||||
|
||||
|
||||
TEST_F(PlaygroundTests, test_one_off_ops_1) {
|
||||
auto x = NDArrayFactory::create<float>('c', {4, 128, 768});
|
||||
auto y = NDArrayFactory::create<float>('c', {4, 128, 1});
|
||||
|
|
|
@ -289,7 +289,7 @@ TEST_F(TadTests, calcOffsets_1) {
|
|||
TEST_F(TadTests, outerArrayIndexes_1) {
|
||||
|
||||
NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32);
|
||||
Nd4jLong maxIdxs[120];
|
||||
int maxIdxs[120];
|
||||
|
||||
NDArray y1('c', {3,5}, sd::DataType::FLOAT32);
|
||||
const std::vector<int> dimsToExclude1 = {0,2};
|
||||
|
|
Loading…
Reference in New Issue