parent 38310777ee
commit 23c8738d4a
@@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
  __shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
  areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
  zLen = shape::length(zShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  auto coords = sharedMem + threadIdx.x * zRank;
@@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
  __shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
  rank = shape::rank(xShapeInfo);
  len = shape::length(xShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  auto coords = sharedMem + threadIdx.x * rank;
@@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
  __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;
 
  if (threadIdx.x == 0) {
-
  blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs; // floor
  arrIdx = blockIdx.x / blocksPerArr;
  if (arrIdx >= numOfArrs)
@@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
  start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
  end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
  }
-
  __syncthreads();
+
  for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
  auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
  auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr
  arrLenX = shape::length(xShapeInfo);
  arrLenY = shape::length(yShapeInfo);
  }
-
  __syncthreads();
 
  if (arrLenX != arrLenY)
@@ -128,7 +128,6 @@ namespace functions {
 
 
  if (threadIdx.x == 0) {
-
  tadLength = _length(tadOnlyShapeInfo);
  tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
  numTads = _length(yShapeInfo) / tadLength;
@@ -194,7 +193,6 @@ namespace functions {
  __shared__ Nd4jLong zEWS;
 
  if (threadIdx.x == 0) {
-
  tadLength = _length(tadOnlyShapeInfo);
  tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
  numTads = _length(xShapeInfo) / tadLength;
@@ -185,7 +185,6 @@ namespace functions {
  __shared__ Nd4jLong zEWS;
 
  if (threadIdx.x == 0) {
-
  tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength);
  tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
  numTads = shape::length(xShapeInfo) / tadLength;
@@ -231,7 +231,6 @@ namespace functions {
 
  xLength = shape::length(xShapeInfo);
  }
-
  __syncthreads();
 
  if (!resultScalar) {
@@ -267,6 +266,7 @@ namespace functions {
  if (threadIdx.x == 0) {
  result[r] = sPartials[threadIdx.x].index;
  }
+ __syncthreads();
  }
  } else {
 
@@ -287,6 +287,7 @@ namespace functions {
  if (threadIdx.x == 0) {
  result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
  }
+ __syncthreads();
  }
  }
  } else {
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
  __shared__ Nd4jLong len;
 
  if (threadIdx.x == 0) {
-
  xEws = shape::elementWiseStride(xShapeInfo);
  yEws = shape::elementWiseStride(yShapeInfo);
  zEws = shape::elementWiseStride(zShapeInfo);
@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
  __shared__ Nd4jLong len;
 
  if (threadIdx.x == 0) {
-
  xEws = shape::elementWiseStride(xShapeInfo);
  yEws = shape::elementWiseStride(yShapeInfo);
  zEws = shape::elementWiseStride(zShapeInfo);
@@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
  __shared__ Z* sPartials;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sPartials = reinterpret_cast<Z*>(shmem);
 
@@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
  else
  extraZ[2] = (Z) 0.0f;
  }
-
  __syncthreads();
 
  sPartials[threadIdx.x] = OpType::startingValue(x);
@@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
  __shared__ char yTadOrder;
 
  if(threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sPartials = reinterpret_cast<Z*>(shmem);
 
@@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
  if (threadIdx.x == 0) {
  z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
  }
-
+ __syncthreads();
  }
  }
  else {
@@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
  SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
  pBuffer[blockIdx.x] = sPartials[0];
  }
- __syncthreads();
  __threadfence();
+ __syncthreads();
 
  if (tid == 0) {
  unsigned int ticket = atomicInc(&tc[16384], gridDim.x);
@@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
  xzRank = shape::rank(xShapeInfo);
  yRank = shape::rank(yShapeInfo);
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
  inRank = shape::rank(inShapeInfo);
  alphaRank = shape::rank(alphaShapeInfo);
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo, co
  xDimCstride = shape::stride(xShapeInfo)[dimC];
  zDimCstride = shape::stride(zShapeInfo)[dimC];
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeI
  xDimCstride = shape::stride(xShapeInfo)[dimC];
  zDimCstride = shape::stride(zShapeInfo)[dimC];
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
  __shared__ Nd4jLong minLen, tadLen, totalThreads;
 
  if (threadIdx.x == 0) {
-
  totalThreads = gridDim.x * blockDim.x;
 
  minLen = shape::length(meanShapeInfo);
@@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
 
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
  totalThreads = gridDim.x * blockDim.x;
@@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
  __shared__ Nd4jLong *sharedMem, imLen;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
 
  imLen = shape::length(imShapeInfo);
  }
-
  __syncthreads();
 
  const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
  rank = shape::rank(zShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
  __shared__ Nd4jLong *sharedMem;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
  iH = volShapeInfo[4];
  iW = volShapeInfo[5];
  }
-
  __syncthreads();
 
  const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
  __shared__ Nd4jLong *sharedMem, volLen;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
 
  volLen = shape::length(volShapeInfo);
  }
-
  __syncthreads();
 
  const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
  __shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
 
  if (threadIdx.x == 0) {
-
  bS = shape::sizeAt(xShapeInfo, 0);
  iC = shape::sizeAt(xShapeInfo, 1);
  oH = shape::sizeAt(zShapeInfo, 2);
@@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
  kHEff = kH + (kH-1)*(dH-1);
  kWEff = kW + (kW-1)*(dW-1);
  }
-
  __syncthreads();
 
  int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
  __shared__ bool fOrder;
 
  if (threadIdx.x == 0) {
-
  bS = shape::sizeAt(xShapeInfo, 0);
  iC = shape::sizeAt(xShapeInfo, 1);
  oH = shape::sizeAt(zShapeInfo, 2);
@@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
  kHEff = kH + (kH-1)*(dH-1);
  kWEff = kW + (kW-1)*(dW-1);
  }
-
  __syncthreads();
 
  int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
  __shared__ bool fOrder;
 
  if (threadIdx.x == 0) {
-
  bS = shape::sizeAt(xShapeInfo, 0);
  iC = shape::sizeAt(xShapeInfo, 1);
  oH = shape::sizeAt(zShapeInfo, 2);
@@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
  kHEff = kH + (kH-1)*(dH-1);
  kWEff = kW + (kW-1)*(dW-1);
  }
-
  __syncthreads();
 
  int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
 
  kProd = kD * kH * kW;
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
 
  kProd = kH * kW;
  }
-
  __syncthreads();
 
  const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
 
  kProd = kD * kH * kW;
  }
-
  __syncthreads();
 
  const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
  zLen = shape::length(zShapeInfo);
  rank = 4;
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
  zLen = shape::length(zShapeInfo);
  rank = 5;
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
  factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
  factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
  factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
  factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
  __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
 
  if (threadIdx.x == 0) {
-
  x = reinterpret_cast<const T*>(vx);
  y = reinterpret_cast<const T*>(vy);
  z = reinterpret_cast<T*>(vz);
@@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
  kH = yShapeInfo[1];
  kW = yShapeInfo[2];
  }
-
  __syncthreads();
 
  const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -32,13 +32,8 @@ namespace helpers {
  static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = blockDim.x * gridDim.x;
- __shared__ T const* input;
- __shared__ T* output;
-
- if (threadIdx.x == 0) {
- input = reinterpret_cast<T const*>(inputBuf);
- output = reinterpret_cast<T*>(outputBuf);
- }
+ T const* input = reinterpret_cast<T const*>(inputBuf);
+ T* output = reinterpret_cast<T*>(outputBuf);
 
  for (Nd4jLong e = 0; e < inLen; ++e) {
  T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@@ -134,6 +129,7 @@ namespace helpers {
  output = reinterpret_cast<T*>(outputBuf);
  input = reinterpret_cast<T*>(gradOutBuf);
  }
+ __syncthreads();
 
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = blockDim.x * gridDim.x;
@@ -159,13 +155,8 @@ namespace helpers {
  static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = blockDim.x * gridDim.x;
- __shared__ T const* input;
- __shared__ T* output;
-
- if (threadIdx.x == 0) {
- input = reinterpret_cast<T const*>(inputBuf);
- output = reinterpret_cast<T*>(outputBuf);
- }
+ T const* input = reinterpret_cast<T const*>(inputBuf);
+ T* output = reinterpret_cast<T*>(outputBuf);
 
  for (auto e = tid; e < inLen; e += step) {
  T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@@ -209,7 +200,7 @@ namespace helpers {
  std::vector<Nd4jLong> dims(reduceShape->lengthOf());
  reduceShape->syncToHost(); // to ensure that follows are actual
  bool fit = true;
- // PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+
  for( int i = 0; i < dims.size(); i++ ) {
  if (fit) {
  dims[i] = reduceShape->e<Nd4jLong>(i);
@@ -225,9 +216,9 @@ namespace helpers {
  REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
  std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
  chunk->assign(1.f);
- //chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
- //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
  alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
 
  // broadcast chunk to full matrix
  std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
  dropOutMultiplier->assign(1.f);
@@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
  for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {
 
  if (threadIdx.x == 0) {
-
  x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
  z = reinterpret_cast<X*>(vz) + zOffsets[i];
  }
@@ -47,7 +47,6 @@ namespace nd4j {
  __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -61,7 +60,6 @@ namespace nd4j {
 
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  auto coord = sharedMem + threadIdx.x * maxRank;
@@ -61,7 +61,6 @@ namespace nd4j {
 
  static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
  if (threadIdx.x == 0) {
-
  if (length <= blockSize)
  *resultBuf = *tempBufferA;
  else
@@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
  __shared__ int imRank, colRank;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
  iH = imShapeInfo[3];
  iW = imShapeInfo[4];
  }
-
  __syncthreads();
 
  const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -88,12 +88,9 @@ namespace helpers {
 
  template <typename I>
  static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
- __shared__ I* indexBuf;
- __shared__ Nd4jLong* srcBuf;
- if (threadIdx.x == 0) {
- indexBuf = reinterpret_cast<I*>(indices);
- srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
- }
+ I* indexBuf = reinterpret_cast<I*>(indices);
+ Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);;
  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
  auto step = blockDim.x * gridDim.x;
 
@@ -29,11 +29,7 @@ namespace helpers {
  template <typename T>
  static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
  extern __shared__ char sharedChar[];
- __shared__ T* shared;
- if (threadIdx.x == 0)
- shared = reinterpret_cast<T*>(sharedChar);
- __syncthreads();
-
+ T* shared = reinterpret_cast<T*>(sharedChar);
 
  auto xEws = shape::elementWiseStride(xTadShapeInfo);
  auto zEws = shape::elementWiseStride(zTadShapeInfo);
@@ -69,16 +65,8 @@ namespace helpers {
  template <typename X, typename Z>
  static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
  extern __shared__ char sharedChar[];
- __shared__ X* sharedX;
- __shared__ Z* sharedY;
-
- if (threadIdx.x == 0) {
- sharedX = reinterpret_cast<X*>(sharedChar);
- sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
- }
-
- __syncthreads();
-
+ X* sharedX = reinterpret_cast<X*>(sharedChar);
+ Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
 
  auto xEws = shape::elementWiseStride(xTadShapeInfo);
  auto zEws = shape::elementWiseStride(zTadShapeInfo);
@@ -57,14 +57,8 @@ namespace helpers {
  // }
  template <typename T>
  static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
- __shared__ T* input;
-
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
+ T* input = reinterpret_cast<T*>(inputBuf);
 
  auto start = threadIdx.x + blockIdx.x * blockDim.x;
  auto step = blockDim.x * gridDim.x;
@@ -84,14 +78,8 @@ namespace helpers {
 
  template <typename T>
  static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
- __shared__ T* input;
-
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
+ T* input = reinterpret_cast<T*>(inputBuf);
 
  auto start = threadIdx.x + blockIdx.x * blockDim.x;
  auto step = blockDim.x * gridDim.x;
@@ -107,14 +95,8 @@ namespace helpers {
 
  template <typename T>
  static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
- __shared__ T* input;
-
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
+ T* input = reinterpret_cast<T*>(inputBuf);
 
  auto start = threadIdx.x + blockIdx.x * blockDim.x;
  auto step = blockDim.x * gridDim.x;
@@ -135,17 +117,8 @@ namespace helpers {
 
  template <typename T>
  static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
- __shared__ T* input;
-
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
-
- // auto start = threadIdx.x + blockIdx.x * blockDim.x;
- // auto step = blockDim.x * gridDim.x;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
+ T* input = reinterpret_cast<T*>(inputBuf);
 
  for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
  for (int j = i - 2; j >= 0; --j)
@@ -166,17 +139,8 @@ namespace helpers {
 
  template <typename T>
  static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
- __shared__ T* input;
-
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
-
- // auto start = threadIdx.x + blockIdx.x * blockDim.x;
- // auto step = blockDim.x * gridDim.x;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);;
+ T* input = reinterpret_cast<T*>(inputBuf);
 
  for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
  for (int j = i + 2; j < n; j++)
@@ -366,11 +330,8 @@ namespace helpers {
 
  template <typename F>
  static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
- __shared__ F* permutation;
+ F* permutation = reinterpret_cast<F*>(output);
 
- if (threadIdx.x == 0) {
- permutation = reinterpret_cast<F*>(output);
- }
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = blockDim.x * gridDim.x;
  for (auto i = start; i < rowNum; i += step) {
@@ -709,13 +670,8 @@ namespace helpers {
  template <typename F>
  __global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
  //auto i = blockIdx.x * blockDim.x + threadIdx.x;
- __shared__ Nd4jLong* shapeOf;
- __shared__ Nd4jLong* strideOf;
- if (blockIdx.x == 0 && threadIdx.x == 0) {
- shapeOf = shape::shapeOf(shape);
- strideOf = shape::stride(shape);
- }
- __syncthreads();
+ Nd4jLong* shapeOf = shape::shapeOf(shape);
+ Nd4jLong* strideOf = shape::stride(shape);
 
  for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
  auto current = dArray + offsets[i];
@@ -37,6 +37,7 @@ namespace helpers {
  outLength = shape::length(outputShape);
  diagonalLen = shape::length(diagonalShape);
  }
+ __syncthreads();
 
  for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
  for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
@@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
  __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
  xRank = shape::rank(xShapeInfo);
@@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
  zLen = shape::length(zShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
+ __syncthreads();
 
  auto coord = sharedMem + threadIdx.x * zRank;
 
- __syncthreads();
-
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
 
  for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
@@ -51,7 +51,6 @@ namespace nd4j {
  __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  coords = reinterpret_cast<Nd4jLong*>(shmem);
  zLen = shape::length(zShapeInfo);
@@ -77,6 +77,7 @@ namespace helpers {
  // saving final value
  if (threadIdx.x == 0)
  z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
+ __syncthreads();
  }
  }
 
@@ -40,8 +40,7 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
 
  if (threadIdx.x == 0)
  len = shape::length(nShapeInfo);
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  const auto totalThreads = gridDim.x * blockDim.x;
@@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,
 
  if (threadIdx.x == 0)
  shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
+ __syncthreads();
 
  for (uint d = 1; d < blockDim2; d *= 2) {
 
@@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
  __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
  zLen = shape::length(zShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  auto coords = sharedMem + threadIdx.x * rank;
@@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
  __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
 
@@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
  zLen = shape::length(zShapeInfo);
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  auto coords = sharedMem + threadIdx.x * rank;
@@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
  __shared__ Nd4jLong yLen, totalThreads, *coord;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  coord = reinterpret_cast<Nd4jLong*>(shmem);
  yLen = shape::length(yShapeInfo);
@@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
  yRank = shape::rank(yShapeInfo);
  zRank = shape::rank(zShapeInfo);
  }
-
  __syncthreads();
 
  auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
  __shared__ int xLastDim;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
  xLastDim = xTadShapeInfo[1]; // xTad has rank = 1 always
  }
-
  __syncthreads();
 
  Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
  __shared__ Nd4jLong yLen, totalThreads, *coord;
 
  if (threadIdx.x == 0) {
-
  extern __shared__ unsigned char shmem[];
  coord = reinterpret_cast<Nd4jLong*>(shmem);
  yLen = shape::length(yShapeInfo);
@@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
  zRank = shape::rank(zShapeInfo);
  xLastDim = xShapeInfo[xRank];
  }
-
  __syncthreads();
 
  auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
  xLen = shape::length(xShapeInfo);
  xRank = shape::rank(xShapeInfo);
  }
-
  __syncthreads();
 
  const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -56,7 +56,6 @@ namespace nd4j {
  arrLenX = shape::length(xShapeInfo);
  arrLenY = shape::length(yShapeInfo);
  }
-
  __syncthreads();
 
  if (arrLenX != arrLenY)
@@ -266,6 +266,7 @@ namespace nd4j {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -311,6 +312,7 @@ namespace nd4j {
  gradLen = shape::length(epsShape);
  currentLen = shape::length(outTad);
  }
+ __syncthreads();
 
  for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
  auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
@@ -248,6 +248,7 @@ namespace helpers {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -257,6 +257,7 @@ namespace helpers {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -302,6 +303,7 @@ namespace helpers {
  gradLen = shape::length(epsShape);
  currentLen = shape::length(outTad);
  }
+ __syncthreads();
 
  for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
  auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
@@ -75,7 +75,7 @@ namespace helpers {
  if (threadIdx.x == 0) {
  z[zIndex] = val[segment];
  }
-
+ __syncthreads();
  }
  // -------------------------------------------------------------------------------------------------------------- //
  template <typename T, typename I>
@@ -256,6 +256,7 @@ namespace helpers {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -298,6 +299,7 @@ namespace helpers {
  gradLen = shape::length(epsShape);
  currentLen = shape::length(outTad);
  }
+ __syncthreads();
 
  for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
  auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
@@ -168,6 +168,7 @@ namespace helpers {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -256,6 +256,7 @@ namespace helpers {
  gradOut = reinterpret_cast<T*>(eps);
  gradLen = shape::length(epsShape);
  }
+ __syncthreads();
 
  auto start = blockIdx.x * blockDim.x + threadIdx.x;
  auto step = gridDim.x * blockDim.x;
@@ -292,6 +293,7 @@ namespace helpers {
  gradLen = shape::length(epsShape);
  currentLen = shape::length(outTad);
  }
+ __syncthreads();
 
  for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
  auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
@@ -36,6 +36,7 @@ namespace helpers {
  inputLen = shape::length(inputShape);
  outputLen = shape::length(outputShape);
  }
+ __syncthreads();
 
  for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
  for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)
@@ -375,8 +375,9 @@ namespace nd4j {
  }
  if (threadIdx.x == 0) {
  if (hasError)
  neu1[0] = DataTypeUtils::infOrMax<T>();
  }
+ __syncthreads();
  }
 
  template <typename T>
@@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
 
  totalThreads = gridDim.x * blockDim.x;
  }
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
  if (threadIdx.x == 0)
  z[zOffset] = *sharedMem;
+ __syncthreads();
  }
 
  }
@@ -39,8 +39,7 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
 
  if (threadIdx.x == 0)
  len = shape::length(xShapeInfo);
-
  __syncthreads();
 
  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  const auto totalThreads = gridDim.x * blockDim.x;
@@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
  }
  // #endif
 
+ // #if NOT_EXCLUDED(OP_space_to_batch_nd)
+ @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
+ static { Loader.load(); }
+ /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+ public space_to_batch_nd(Pointer p) { super(p); }
+ /** Native array allocator. Access with {@link Pointer#position(long)}. */
+ public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
+ private native void allocateArray(long size);
+ @Override public space_to_batch_nd position(long position) {
+ return (space_to_batch_nd)super.position(position);
+ }
+
+ public space_to_batch_nd() { super((Pointer)null); allocate(); }
+ private native void allocate();
+ public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+ }
+ // #endif
+
  /**
  *
  *
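Note on the recurring change above: almost every hunk either removes a stray blank line around an "if (threadIdx.x == 0) { ... }" block or adds the __syncthreads() that must follow it, so that values written to shared memory by thread 0 become visible to the whole block before any thread reads them. A minimal sketch of that pattern, with an invented kernel name and the libnd4j Nd4jLong/shape helpers assumed to be available (illustration only, not code from this commit):

    // Hypothetical kernel illustrating the thread-0 initialization / __syncthreads() pattern.
    __global__ static void exampleLengthKernel(const Nd4jLong* xShapeInfo, float* z) {
        __shared__ Nd4jLong len, totalThreads;          // written only by thread 0

        if (threadIdx.x == 0) {
            len          = shape::length(xShapeInfo);
            totalThreads = gridDim.x * blockDim.x;
        }
        __syncthreads();                                // publish before any thread reads len/totalThreads

        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        for (Nd4jLong i = tid; i < len; i += totalThreads)
            z[i] = 0.f;                                 // per-element work
    }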
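The dropout, LRN, LU-decomposition and copyIndices hunks apply a second simplification: a __shared__ pointer that thread 0 filled via reinterpret_cast is turned into an ordinary per-thread local, since every thread can compute the cast itself and no barrier is needed for it. A hedged before/after sketch (hypothetical kernel, not the repository's code):

    // Before: cast result kept in shared memory, requiring thread 0 plus a barrier.
    //     __shared__ T* output;
    //     if (threadIdx.x == 0)
    //         output = reinterpret_cast<T*>(outputBuf);
    //     __syncthreads();

    // After: each thread computes the cast into a register; no shared memory, no barrier.
    template <typename T>
    __global__ static void fillKernel(void* outputBuf, Nd4jLong len, T value) {
        T* output = reinterpret_cast<T*>(outputBuf);
        const auto tid  = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = blockDim.x * gridDim.x;
        for (Nd4jLong e = tid; e < len; e += step)
            output[e] = value;
    }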