syncthreads (#136)

Signed-off-by: raver119 <raver119@gmail.com>
master
raver119 2019-08-20 18:28:43 +03:00 committed by GitHub
parent 38310777ee
commit 23c8738d4a
47 changed files with 71 additions and 170 deletions

View File

@@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * zRank;
@@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
rank = shape::rank(xShapeInfo);
len = shape::length(xShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;
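The recurring fix in this commit is visible in the hunk above: values that thread 0 writes into __shared__ storage (sharedMem, zLen, totalThreads, ...) must be published to the rest of the block with __syncthreads() before any other thread reads them. A minimal sketch of that pattern, with an illustrative kernel and a stand-in for shape::length (not the library's actual code):

__global__ void scaleKernel(const float* x, float* z, const Nd4jLong* shapeInfo) {
    // shared values produced by thread 0 and consumed by every thread in the block
    __shared__ Nd4jLong len, totalThreads;

    if (threadIdx.x == 0) {
        len = shapeInfo[0];                              // stand-in for shape::length(shapeInfo)
        totalThreads = (Nd4jLong)gridDim.x * blockDim.x;
    }
    __syncthreads();   // without this barrier other threads may read len/totalThreads before thread 0 wrote them

    const Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (Nd4jLong i = tid; i < len; i += totalThreads)
        z[i] = 2.0f * x[i];
}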

View File

@@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
__shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;
if (threadIdx.x == 0) {
blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs; // floor
arrIdx = blockIdx.x / blocksPerArr;
if (arrIdx >= numOfArrs)
@@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
}
__syncthreads();
for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr
arrLenX = shape::length(xShapeInfo);
arrLenY = shape::length(yShapeInfo);
}
__syncthreads();
if (arrLenX != arrLenY)

View File

@@ -128,7 +128,6 @@ namespace functions {
if (threadIdx.x == 0) {
tadLength = _length(tadOnlyShapeInfo);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = _length(yShapeInfo) / tadLength;
@@ -194,7 +193,6 @@ namespace functions {
__shared__ Nd4jLong zEWS;
if (threadIdx.x == 0) {
tadLength = _length(tadOnlyShapeInfo);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = _length(xShapeInfo) / tadLength;

View File

@@ -185,7 +185,6 @@ namespace functions {
__shared__ Nd4jLong zEWS;
if (threadIdx.x == 0) {
tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = shape::length(xShapeInfo) / tadLength;

View File

@@ -231,7 +231,6 @@ namespace functions {
xLength = shape::length(xShapeInfo);
}
__syncthreads();
if (!resultScalar) {
@@ -267,6 +266,7 @@ namespace functions {
if (threadIdx.x == 0) {
result[r] = sPartials[threadIdx.x].index;
}
+ __syncthreads();
}
} else {
@@ -287,6 +287,7 @@ namespace functions {
if (threadIdx.x == 0) {
result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
}
+ __syncthreads();
}
}
} else {
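The two hunks above add a trailing __syncthreads() after thread 0 stores the per-TAD result, because sPartials lives in shared memory and is reused by the next loop iteration; without the barrier a fast thread could start overwriting sPartials for iteration r+1 while thread 0 is still reading it for iteration r. A hedged sketch of that loop shape (a plain block sum, not the actual IndexReduce code; assumes blockDim.x is a power of two):

__global__ void blockSumPerRow(const float* x, float* out, int rows, int cols) {
    extern __shared__ float sPartials[];               // one slot per thread, reused for every row

    for (int r = blockIdx.x; r < rows; r += gridDim.x) {
        float acc = 0.f;
        for (int c = threadIdx.x; c < cols; c += blockDim.x)
            acc += x[r * cols + c];
        sPartials[threadIdx.x] = acc;
        __syncthreads();

        for (int s = blockDim.x / 2; s > 0; s >>= 1) { // tree reduction in shared memory
            if (threadIdx.x < s)
                sPartials[threadIdx.x] += sPartials[threadIdx.x + s];
            __syncthreads();
        }

        if (threadIdx.x == 0)
            out[r] = sPartials[0];
        __syncthreads();   // the added barrier: keeps the next iteration from clobbering sPartials[0] too early
    }
}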

View File

@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
__shared__ Nd4jLong len;
if (threadIdx.x == 0) {
xEws = shape::elementWiseStride(xShapeInfo);
yEws = shape::elementWiseStride(yShapeInfo);
zEws = shape::elementWiseStride(zShapeInfo);

View File

@@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
__shared__ Nd4jLong len;
if (threadIdx.x == 0) {
xEws = shape::elementWiseStride(xShapeInfo);
yEws = shape::elementWiseStride(yShapeInfo);
zEws = shape::elementWiseStride(zShapeInfo);

View File

@@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
__shared__ Z* sPartials;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sPartials = reinterpret_cast<Z*>(shmem);
@@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
else
extraZ[2] = (Z) 0.0f;
}
__syncthreads();
sPartials[threadIdx.x] = OpType::startingValue(x);
@@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
__shared__ char yTadOrder;
if(threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sPartials = reinterpret_cast<Z*>(shmem);

View File

@@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
if (threadIdx.x == 0) {
z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
}
+ __syncthreads();
}
}
else {
@@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
pBuffer[blockIdx.x] = sPartials[0];
}
- __syncthreads();
__threadfence();
+ __syncthreads();
if (tid == 0) {
unsigned int ticket = atomicInc(&tc[16384], gridDim.x);
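The summary-stats hunk above also reorders the grid-wide completion handshake so that the block-level barrier comes after __threadfence(): each block first publishes its partial into the reduction buffer, makes that write visible to the whole grid, and only then takes a ticket to learn whether it is the last block to finish. A sketch of that standard "last block" pattern under those assumptions (illustrative names and buffers, not the library's own):

__device__ unsigned int ticketCounter = 0;

__global__ void gridSum(const float* partialPerBlock, float* reductionBuffer, float* result) {
    __shared__ bool amLast;

    if (threadIdx.x == 0)
        reductionBuffer[blockIdx.x] = partialPerBlock[blockIdx.x];  // publish this block's partial
    __threadfence();    // make the write visible grid-wide
    __syncthreads();    // then synchronize the block, matching the reordered code above

    if (threadIdx.x == 0) {
        unsigned int ticket = atomicInc(&ticketCounter, gridDim.x);
        amLast = (ticket == gridDim.x - 1);
    }
    __syncthreads();

    if (amLast && threadIdx.x == 0) {                  // the last block folds all partials
        float sum = 0.f;
        for (unsigned int i = 0; i < gridDim.x; ++i)
            sum += reductionBuffer[i];
        *result = sum;
        ticketCounter = 0;                             // reset for the next launch
    }
}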

View File

@@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
xzRank = shape::rank(xShapeInfo);
yRank = shape::rank(yShapeInfo);
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
inRank = shape::rank(inShapeInfo);
alphaRank = shape::rank(alphaShapeInfo);
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo, co
xDimCstride = shape::stride(xShapeInfo)[dimC];
zDimCstride = shape::stride(zShapeInfo)[dimC];
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeI
xDimCstride = shape::stride(xShapeInfo)[dimC];
zDimCstride = shape::stride(zShapeInfo)[dimC];
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong minLen, tadLen, totalThreads;
if (threadIdx.x == 0) {
totalThreads = gridDim.x * blockDim.x;
minLen = shape::length(meanShapeInfo);
@@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
totalThreads = gridDim.x * blockDim.x;

View File

@@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
__shared__ Nd4jLong *sharedMem, imLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
imLen = shape::length(imShapeInfo);
}
__syncthreads();
const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
rank = shape::rank(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
__shared__ Nd4jLong *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
iH = volShapeInfo[4];
iW = volShapeInfo[5];
}
__syncthreads();
const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
__shared__ Nd4jLong *sharedMem, volLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
volLen = shape::length(volShapeInfo);
}
__syncthreads();
const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
__shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
__shared__ bool fOrder;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
__shared__ bool fOrder;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
kProd = kD * kH * kW;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
kProd = kH * kW;
}
__syncthreads();
const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
kProd = kD * kH * kW;
}
__syncthreads();
const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
rank = 4;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
rank = 5;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
x = reinterpret_cast<const T*>(vx);
y = reinterpret_cast<const T*>(vy);
z = reinterpret_cast<T*>(vz);

View File

@@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
kH = yShapeInfo[1];
kW = yShapeInfo[2];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@@ -32,13 +32,8 @@ namespace helpers {
static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
- __shared__ T const* input;
+ T const* input = reinterpret_cast<T const*>(inputBuf);
- __shared__ T* output;
+ T* output = reinterpret_cast<T*>(outputBuf);
- if (threadIdx.x == 0) {
- input = reinterpret_cast<T const*>(inputBuf);
- output = reinterpret_cast<T*>(outputBuf);
- }
for (Nd4jLong e = 0; e < inLen; ++e) {
T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@@ -134,6 +129,7 @@ namespace helpers {
output = reinterpret_cast<T*>(outputBuf);
input = reinterpret_cast<T*>(gradOutBuf);
}
+ __syncthreads();
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
@@ -159,13 +155,8 @@ namespace helpers {
static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
- __shared__ T const* input;
+ T const* input = reinterpret_cast<T const*>(inputBuf);
- __shared__ T* output;
+ T* output = reinterpret_cast<T*>(outputBuf);
- if (threadIdx.x == 0) {
- input = reinterpret_cast<T const*>(inputBuf);
- output = reinterpret_cast<T*>(outputBuf);
- }
for (auto e = tid; e < inLen; e += step) {
T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@@ -209,7 +200,7 @@ namespace helpers {
std::vector<Nd4jLong> dims(reduceShape->lengthOf());
reduceShape->syncToHost(); // to ensure that follows are actual
bool fit = true;
// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
for( int i = 0; i < dims.size(); i++ ) {
if (fit) {
dims[i] = reduceShape->e<Nd4jLong>(i);
@@ -225,9 +216,9 @@ namespace helpers {
REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
chunk->assign(1.f);
//chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
//NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
// broadcast chunk to full matrix
std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
dropOutMultiplier->assign(1.f);
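In the dropout kernels above (and in the LRN and LU helpers later in this commit), the cached __shared__ pointer pattern is dropped entirely: a reinterpret_cast of a kernel argument yields the same address in every thread, so staging it through shared memory bought nothing and, with the barrier missing, was itself a read-after-write race. A before/after sketch with an illustrative copy kernel (not the library's actual code):

template <typename T>
__global__ void copyKernel(const void* vin, void* vout, Nd4jLong len) {
    // old style: a shared pointer written by thread 0 needs a __syncthreads()
    // before anyone dereferences it, otherwise the read races with the write
    //   __shared__ const T* in;
    //   if (threadIdx.x == 0) in = reinterpret_cast<const T*>(vin);
    //   __syncthreads();
    //
    // new style: the cast is uniform across threads, so a plain local suffices and no barrier is needed
    const T* in = reinterpret_cast<const T*>(vin);
    T* out = reinterpret_cast<T*>(vout);

    const Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
    const Nd4jLong step = (Nd4jLong)gridDim.x * blockDim.x;
    for (Nd4jLong i = tid; i < len; i += step)
        out[i] = in[i];
}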

View File

@@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {
if (threadIdx.x == 0) {
x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
z = reinterpret_cast<X*>(vz) + zOffsets[i];
}

View File

@@ -47,7 +47,6 @@ namespace nd4j {
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -61,7 +60,6 @@ namespace nd4j {
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coord = sharedMem + threadIdx.x * maxRank;

View File

@@ -61,7 +61,6 @@ namespace nd4j {
static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
if (threadIdx.x == 0) {
if (length <= blockSize)
*resultBuf = *tempBufferA;
else

View File

@@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
__shared__ int imRank, colRank;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
iH = imShapeInfo[3];
iW = imShapeInfo[4];
}
__syncthreads();
const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@@ -88,12 +88,9 @@ namespace helpers {
template <typename I>
static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
- __shared__ I* indexBuf;
+ I* indexBuf = reinterpret_cast<I*>(indices);
- __shared__ Nd4jLong* srcBuf;
+ Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);;
- if (threadIdx.x == 0) {
- indexBuf = reinterpret_cast<I*>(indices);
- srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
- }
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;

View File

@@ -29,11 +29,7 @@ namespace helpers {
template <typename T>
static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
extern __shared__ char sharedChar[];
- __shared__ T* shared;
+ T* shared = reinterpret_cast<T*>(sharedChar);
- if (threadIdx.x == 0)
- shared = reinterpret_cast<T*>(sharedChar);
- __syncthreads();
auto xEws = shape::elementWiseStride(xTadShapeInfo);
auto zEws = shape::elementWiseStride(zTadShapeInfo);
@@ -69,16 +65,8 @@ namespace helpers {
template <typename X, typename Z>
static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
extern __shared__ char sharedChar[];
- __shared__ X* sharedX;
+ X* sharedX = reinterpret_cast<X*>(sharedChar);
- __shared__ Z* sharedY;
+ Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
- if (threadIdx.x == 0) {
- sharedX = reinterpret_cast<X*>(sharedChar);
- sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
- }
- __syncthreads();
auto xEws = shape::elementWiseStride(xTadShapeInfo);
auto zEws = shape::elementWiseStride(zTadShapeInfo);

View File

@@ -57,14 +57,8 @@ namespace helpers {
// }
template <typename T>
static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
- __shared__ T* input;
+ T* input = reinterpret_cast<T*>(inputBuf);
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@@ -84,14 +78,8 @@ namespace helpers {
template <typename T>
static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
- __shared__ T* input;
+ T* input = reinterpret_cast<T*>(inputBuf);
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@@ -107,14 +95,8 @@ namespace helpers {
template <typename T>
static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
- __shared__ T* input;
+ T* input = reinterpret_cast<T*>(inputBuf);
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@@ -135,17 +117,8 @@ namespace helpers {
template <typename T>
static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);
- __shared__ T* input;
+ T* input = reinterpret_cast<T*>(inputBuf);
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
- // auto start = threadIdx.x + blockIdx.x * blockDim.x;
- // auto step = blockDim.x * gridDim.x;
for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
for (int j = i - 2; j >= 0; --j)
@@ -166,17 +139,8 @@ namespace helpers {
template <typename T>
static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
- __shared__ T* inverted;
+ T* inverted = reinterpret_cast<T*>(invertedBuf);;
- __shared__ T* input;
+ T* input = reinterpret_cast<T*>(inputBuf);
- if (threadIdx.x == 0) {
- inverted = reinterpret_cast<T*>(invertedBuf);
- input = reinterpret_cast<T*>(inputBuf);
- }
- __syncthreads();
- // auto start = threadIdx.x + blockIdx.x * blockDim.x;
- // auto step = blockDim.x * gridDim.x;
for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
for (int j = i + 2; j < n; j++)
@@ -366,11 +330,8 @@ namespace helpers {
template <typename F>
static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
- __shared__ F* permutation;
+ F* permutation = reinterpret_cast<F*>(output);
- if (threadIdx.x == 0) {
- permutation = reinterpret_cast<F*>(output);
- }
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
for (auto i = start; i < rowNum; i += step) {
@@ -709,13 +670,8 @@ namespace helpers {
template <typename F>
__global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
//auto i = blockIdx.x * blockDim.x + threadIdx.x;
- __shared__ Nd4jLong* shapeOf;
+ Nd4jLong* shapeOf = shape::shapeOf(shape);
- __shared__ Nd4jLong* strideOf;
+ Nd4jLong* strideOf = shape::stride(shape);
- if (blockIdx.x == 0 && threadIdx.x == 0) {
- shapeOf = shape::shapeOf(shape);
- strideOf = shape::stride(shape);
- }
- __syncthreads();
for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
auto current = dArray + offsets[i];

View File

@@ -37,6 +37,7 @@ namespace helpers {
outLength = shape::length(outputShape);
diagonalLen = shape::length(diagonalShape);
}
+ __syncthreads();
for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {

View File

@@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
xRank = shape::rank(xShapeInfo);
@@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
+ __syncthreads();
auto coord = sharedMem + threadIdx.x * zRank;
- __syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {

View File

@@ -51,7 +51,6 @@ namespace nd4j {
__shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coords = reinterpret_cast<Nd4jLong*>(shmem);
zLen = shape::length(zShapeInfo);

View File

@@ -77,6 +77,7 @@ namespace helpers {
// saving final value
if (threadIdx.x == 0)
z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
+ __syncthreads();
}
}

View File

@@ -40,7 +40,6 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
if (threadIdx.x == 0)
len = shape::length(nShapeInfo);
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,
if (threadIdx.x == 0)
shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
+ __syncthreads();
for (uint d = 1; d < blockDim2; d *= 2) {

View File

@@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;
@@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;

View File

@@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
__shared__ Nd4jLong yLen, totalThreads, *coord;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coord = reinterpret_cast<Nd4jLong*>(shmem);
yLen = shape::length(yShapeInfo);
@@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
yRank = shape::rank(yShapeInfo);
zRank = shape::rank(zShapeInfo);
}
__syncthreads();
auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
__shared__ int xLastDim;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
xLastDim = xTadShapeInfo[1]; // xTad has rank = 1 always
}
__syncthreads();
Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
__shared__ Nd4jLong yLen, totalThreads, *coord;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coord = reinterpret_cast<Nd4jLong*>(shmem);
yLen = shape::length(yShapeInfo);
@@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
zRank = shape::rank(zShapeInfo);
xLastDim = xShapeInfo[xRank];
}
__syncthreads();
auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
xLen = shape::length(xShapeInfo);
xRank = shape::rank(xShapeInfo);
}
__syncthreads();
const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@@ -56,7 +56,6 @@ namespace nd4j {
arrLenX = shape::length(xShapeInfo);
arrLenY = shape::length(yShapeInfo);
}
__syncthreads();
if (arrLenX != arrLenY)

View File

@@ -266,6 +266,7 @@ namespace nd4j {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@@ -311,6 +312,7 @@ namespace nd4j {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
+ __syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@@ -248,6 +248,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;

View File

@@ -257,6 +257,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@@ -302,6 +303,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
+ __syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@@ -75,7 +75,7 @@ namespace helpers {
if (threadIdx.x == 0) {
z[zIndex] = val[segment];
}
+ __syncthreads();
}
// -------------------------------------------------------------------------------------------------------------- //
template <typename T, typename I>
@@ -256,6 +256,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@@ -298,6 +299,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
+ __syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@@ -168,6 +168,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;

View File

@@ -256,6 +256,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
+ __syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@@ -292,6 +293,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
+ __syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@@ -36,6 +36,7 @@ namespace helpers {
inputLen = shape::length(inputShape);
outputLen = shape::length(outputShape);
}
+ __syncthreads();
for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)

View File

@@ -375,8 +375,9 @@ namespace nd4j {
}
if (threadIdx.x == 0) {
if (hasError)
neu1[0] = DataTypeUtils::infOrMax<T>();
}
+ __syncthreads();
}
template <typename T>

View File

@@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
if (threadIdx.x == 0)
z[zOffset] = *sharedMem;
+ __syncthreads();
}
}

View File

@@ -39,7 +39,6 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
if (threadIdx.x == 0)
len = shape::length(xShapeInfo);
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
}
// #endif
+ // #if NOT_EXCLUDED(OP_space_to_batch_nd)
+ @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
+ static { Loader.load(); }
+ /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+ public space_to_batch_nd(Pointer p) { super(p); }
+ /** Native array allocator. Access with {@link Pointer#position(long)}. */
+ public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
+ private native void allocateArray(long size);
+ @Override public space_to_batch_nd position(long position) {
+ return (space_to_batch_nd)super.position(position);
+ }
+ public space_to_batch_nd() { super((Pointer)null); allocate(); }
+ private native void allocate();
+ public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
+ }
+ // #endif
/**
*
*