syncthreads (#136)

Signed-off-by: raver119 <raver119@gmail.com>
Branch: master
Author: raver119, 2019-08-20 18:28:43 +03:00, committed by GitHub
Parent: 38310777ee
Commit: 23c8738d4a
47 changed files with 71 additions and 170 deletions
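
Across these 47 files the change follows two recurring patterns: block-level barriers are placed relative to the if (threadIdx.x == 0) blocks that initialize __shared__ state, and __shared__ pointer variables that every thread can derive on its own become plain locals. The kernel below is not taken from the diff; it is a minimal sketch, with made-up names, of the barrier placement the kernels converge on: thread 0 writes the shared values, a single __syncthreads() publishes them, and only then does the rest of the block read them.

// Hypothetical kernel, not from the diff; it only illustrates the barrier placement.
using Nd4jLong = long long;                  // stand-in for libnd4j's 64-bit index type

__global__ void scaleKernel(const float* x, float* z, Nd4jLong len, float factor) {
    __shared__ Nd4jLong totalThreads;        // written by thread 0 only
    if (threadIdx.x == 0)
        totalThreads = gridDim.x * blockDim.x;
    __syncthreads();                         // one barrier after the init branch publishes the value;
                                             // an extra barrier inside or before the branch adds nothing
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (Nd4jLong i = tid; i < len; i += totalThreads)
        z[i] = factor * x[i];
}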

View File

@ -78,7 +78,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
__shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
@ -87,7 +86,6 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * zRank;
@ -153,14 +151,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
rank = shape::rank(xShapeInfo);
len = shape::length(xShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;

View File

@ -1183,7 +1183,6 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
__shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenZ, arrLenPerBlock, start, end;
if (threadIdx.x == 0) {
blocksPerArr = (gridDim.x - gridDim.x % numOfArrs) / numOfArrs; // floor
arrIdx = blockIdx.x / blocksPerArr;
if (arrIdx >= numOfArrs)
@ -1200,8 +1199,8 @@ __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShape
start = arrLenPerBlock * (blockIdx.x % blocksPerArr);
end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
}
__syncthreads();
for (Nd4jLong i = threadIdx.x + start; i < end; i += blockDim.x) {
auto zOffset = shape::getIndexOffset(i, zShapeInfo, arrLenZ);
auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLen);
@ -3165,7 +3164,6 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr
arrLenX = shape::length(xShapeInfo);
arrLenY = shape::length(yShapeInfo);
}
__syncthreads();
if (arrLenX != arrLenY)

View File

@ -128,7 +128,6 @@ namespace functions {
if (threadIdx.x == 0) {
tadLength = _length(tadOnlyShapeInfo);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = _length(yShapeInfo) / tadLength;
@ -194,7 +193,6 @@ namespace functions {
__shared__ Nd4jLong zEWS;
if (threadIdx.x == 0) {
tadLength = _length(tadOnlyShapeInfo);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = _length(xShapeInfo) / tadLength;

View File

@ -185,7 +185,6 @@ namespace functions {
__shared__ Nd4jLong zEWS;
if (threadIdx.x == 0) {
tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength);
tadEWS = shape::elementWiseStride(tadOnlyShapeInfo);
numTads = shape::length(xShapeInfo) / tadLength;

View File

@ -231,7 +231,6 @@ namespace functions {
xLength = shape::length(xShapeInfo);
}
__syncthreads();
if (!resultScalar) {
@ -267,6 +266,7 @@ namespace functions {
if (threadIdx.x == 0) {
result[r] = sPartials[threadIdx.x].index;
}
__syncthreads();
}
} else {
@ -287,6 +287,7 @@ namespace functions {
if (threadIdx.x == 0) {
result[i] = sPartials[threadIdx.x].index; //postProcess(sPartials[0],tadLength ,extraParams);
}
__syncthreads();
}
}
} else {

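The hunks above add a __syncthreads() after the thread-0 write of result[...] inside the reduction loop: the value still sitting in the shared sPartials buffer must be read out before the next iteration starts refilling that buffer. A minimal sketch of the same hazard, with invented names and a plain max reduction standing in for the index reduction:

#include <cfloat>

// Hypothetical tiled reduction: sPartials is reused for every tile, so the trailing
// barrier keeps iteration t+1 from clobbering it while thread 0 still reads it.
__global__ void tiledMax(const float* x, float* out, int numTiles, int tileLen) {
    extern __shared__ float sPartials[];                // one slot per thread, reused per tile
    for (int t = blockIdx.x; t < numTiles; t += gridDim.x) {
        float acc = -FLT_MAX;
        for (int i = threadIdx.x; i < tileLen; i += blockDim.x)
            acc = fmaxf(acc, x[t * tileLen + i]);
        sPartials[threadIdx.x] = acc;
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction in shared memory
            if (threadIdx.x < s)
                sPartials[threadIdx.x] = fmaxf(sPartials[threadIdx.x], sPartials[threadIdx.x + s]);
            __syncthreads();
        }
        if (threadIdx.x == 0)
            out[t] = sPartials[0];
        __syncthreads();                                // analogous to the barrier added in the hunks above
    }
}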
View File

@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
__shared__ Nd4jLong len;
if (threadIdx.x == 0) {
xEws = shape::elementWiseStride(xShapeInfo);
yEws = shape::elementWiseStride(yShapeInfo);
zEws = shape::elementWiseStride(zShapeInfo);

View File

@ -49,7 +49,6 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo,
__shared__ Nd4jLong len;
if (threadIdx.x == 0) {
xEws = shape::elementWiseStride(xShapeInfo);
yEws = shape::elementWiseStride(yShapeInfo);
zEws = shape::elementWiseStride(zShapeInfo);

View File

@ -125,7 +125,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
__shared__ Z* sPartials;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sPartials = reinterpret_cast<Z*>(shmem);
@ -137,7 +136,6 @@ __device__ void Reduce3<X,Z>::execScalarCuda( void *vx, Nd4jLong *xShapeInfo,
else
extraZ[2] = (Z) 0.0f;
}
__syncthreads();
sPartials[threadIdx.x] = OpType::startingValue(x);
@ -377,7 +375,6 @@ __device__ void Reduce3<X,Z>::transform(void *vx, Nd4jLong *xShapeInfo,
__shared__ char yTadOrder;
if(threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sPartials = reinterpret_cast<Z*>(shmem);

View File

@ -217,7 +217,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
if (threadIdx.x == 0) {
z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]);
}
__syncthreads();
}
}
else {
@ -285,8 +285,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa
SummaryStatsData<X> *pBuffer = (SummaryStatsData<X>*) reductionBuffer;
pBuffer[blockIdx.x] = sPartials[0];
}
__syncthreads();
__threadfence();
__syncthreads();
if (tid == 0) {
unsigned int ticket = atomicInc(&tc[16384], gridDim.x);

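This kernel coordinates blocks through a global reduction buffer plus a ticket counter, which is why a __threadfence() sits between publishing pBuffer[blockIdx.x] and the atomicInc on the ticket: the partial result must be visible to whichever block turns out to be last. The sketch below shows the standard last-block pattern (cf. CUDA's threadFenceReduction sample); the names and the plain sum are illustrative, not the ones used in this file.

__device__ unsigned int retirementCount = 0;            // grid-wide ticket counter

__global__ void reduceLastBlock(const float* x, float* partials, float* result, int n) {
    extern __shared__ float sdata[];

    // per-block reduction of this block's slice into sdata[0]
    float acc = 0.f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        acc += x[i];
    sdata[threadIdx.x] = acc;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
    }

    __shared__ bool amLast;
    if (threadIdx.x == 0) {
        partials[blockIdx.x] = sdata[0];                // publish this block's partial result
        __threadfence();                                // make it visible to every other block
        unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
        amLast = (ticket == gridDim.x - 1);             // only the last block to arrive sees this
    }
    __syncthreads();                                    // broadcast amLast to the whole block

    if (amLast && threadIdx.x == 0) {
        float total = 0.f;
        for (unsigned int b = 0; b < gridDim.x; ++b)
            total += partials[b];
        *result = total;
        retirementCount = 0;                            // reset for the next launch
    }
}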
View File

@ -52,7 +52,6 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
xzRank = shape::rank(xShapeInfo);
yRank = shape::rank(yShapeInfo);
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -132,7 +131,6 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
inRank = shape::rank(inShapeInfo);
alphaRank = shape::rank(alphaShapeInfo);
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -45,7 +45,6 @@ static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo, co
xDimCstride = shape::stride(xShapeInfo)[dimC];
zDimCstride = shape::stride(zShapeInfo)[dimC];
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -47,7 +47,6 @@ static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeI
xDimCstride = shape::stride(xShapeInfo)[dimC];
zDimCstride = shape::stride(zShapeInfo)[dimC];
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -53,7 +53,6 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong minLen, tadLen, totalThreads;
if (threadIdx.x == 0) {
totalThreads = gridDim.x * blockDim.x;
minLen = shape::length(meanShapeInfo);
@ -116,7 +115,6 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
totalThreads = gridDim.x * blockDim.x;

View File

@ -38,7 +38,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
__shared__ Nd4jLong *sharedMem, imLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -53,7 +52,6 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI
imLen = shape::length(imShapeInfo);
}
__syncthreads();
const auto imInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -50,7 +50,6 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
rank = shape::rank(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -43,7 +43,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
__shared__ Nd4jLong *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -56,7 +55,6 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
iH = volShapeInfo[4];
iW = volShapeInfo[5];
}
__syncthreads();
const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -127,7 +125,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
__shared__ Nd4jLong *sharedMem, volLen;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -144,7 +141,6 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape
volLen = shape::length(volShapeInfo);
}
__syncthreads();
const auto volInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -404,7 +400,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
__shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@ -428,7 +423,6 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -501,7 +495,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
__shared__ bool fOrder;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@ -525,7 +518,6 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -594,7 +586,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
__shared__ bool fOrder;
if (threadIdx.x == 0) {
bS = shape::sizeAt(xShapeInfo, 0);
iC = shape::sizeAt(xShapeInfo, 1);
oH = shape::sizeAt(zShapeInfo, 2);
@ -618,7 +609,6 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
kHEff = kH + (kH-1)*(dH-1);
kWEff = kW + (kW-1)*(dW-1);
}
__syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x;
@ -737,7 +727,6 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
kProd = kD * kH * kW;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -888,7 +877,6 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
kProd = kH * kW;
}
__syncthreads();
const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1043,7 +1031,6 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
kProd = kD * kH * kW;
}
__syncthreads();
const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1356,7 +1343,6 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
rank = 4;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1425,7 +1411,6 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
rank = 5;
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1499,7 +1484,6 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
@ -1573,7 +1557,6 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -40,7 +40,6 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
__shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
x = reinterpret_cast<const T*>(vx);
y = reinterpret_cast<const T*>(vy);
z = reinterpret_cast<T*>(vz);

View File

@ -62,7 +62,6 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
kH = yShapeInfo[1];
kW = yShapeInfo[2];
}
__syncthreads();
const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -32,13 +32,8 @@ namespace helpers {
static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
__shared__ T const* input;
__shared__ T* output;
if (threadIdx.x == 0) {
input = reinterpret_cast<T const*>(inputBuf);
output = reinterpret_cast<T*>(outputBuf);
}
T const* input = reinterpret_cast<T const*>(inputBuf);
T* output = reinterpret_cast<T*>(outputBuf);
for (Nd4jLong e = 0; e < inLen; ++e) {
T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@ -134,6 +129,7 @@ namespace helpers {
output = reinterpret_cast<T*>(outputBuf);
input = reinterpret_cast<T*>(gradOutBuf);
}
__syncthreads();
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
@ -159,13 +155,8 @@ namespace helpers {
static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
__shared__ T const* input;
__shared__ T* output;
if (threadIdx.x == 0) {
input = reinterpret_cast<T const*>(inputBuf);
output = reinterpret_cast<T*>(outputBuf);
}
T const* input = reinterpret_cast<T const*>(inputBuf);
T* output = reinterpret_cast<T*>(outputBuf);
for (auto e = tid; e < inLen; e += step) {
T val = nodeRng->relativeT(e, T(0.f), T(1.f));
@ -209,7 +200,7 @@ namespace helpers {
std::vector<Nd4jLong> dims(reduceShape->lengthOf());
reduceShape->syncToHost(); // to ensure that follows are actual
bool fit = true;
// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
for( int i = 0; i < dims.size(); i++ ) {
if (fit) {
dims[i] = reduceShape->e<Nd4jLong>(i);
@ -225,9 +216,9 @@ namespace helpers {
REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank.");
std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
chunk->assign(1.f);
//chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
//NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
alphaDropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta);
// broadcast chunk to full matrix
std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
dropOutMultiplier->assign(1.f);

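The dropout kernels above, like the copyIndices and matrix-inverse helpers later in this commit, replace a __shared__ pointer that thread 0 had to initialize (followed by a barrier) with a plain per-thread cast of the kernel argument. Since that cast yields the same pointer in every thread, there is nothing to broadcast and nothing to synchronize before first use. A hedged sketch with hypothetical names:

// before: broadcast the pointer through shared memory (needs a __syncthreads() before use)
//   __shared__ float* out;
//   if (threadIdx.x == 0) out = reinterpret_cast<float*>(outputBuf);
//   __syncthreads();
// after: every thread derives the pointer locally, no barrier required
__global__ void castLocally(void* outputBuf, const void* inputBuf, int len) {
    const float* in  = reinterpret_cast<const float*>(inputBuf);
    float*       out = reinterpret_cast<float*>(outputBuf);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += gridDim.x * blockDim.x)
        out[i] = in[i];
}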
View File

@ -76,7 +76,6 @@ __global__ static void gatherCuda(const int numOfSubArrs,
for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) {
if (threadIdx.x == 0) {
x = reinterpret_cast<const X*>(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]];
z = reinterpret_cast<X*>(vz) + zOffsets[i];
}

View File

@ -47,7 +47,6 @@ namespace nd4j {
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -61,7 +60,6 @@ namespace nd4j {
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coord = sharedMem + threadIdx.x * maxRank;

View File

@ -61,7 +61,6 @@ namespace nd4j {
static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) {
if (threadIdx.x == 0) {
if (length <= blockSize)
*resultBuf = *tempBufferA;
else

View File

@ -44,7 +44,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
__shared__ int imRank, colRank;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -56,7 +55,6 @@ __global__ static void im2colCuda(const void *image, void *columns,
iH = imShapeInfo[3];
iW = imShapeInfo[4];
}
__syncthreads();
const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -88,12 +88,9 @@ namespace helpers {
template <typename I>
static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
__shared__ I* indexBuf;
__shared__ Nd4jLong* srcBuf;
if (threadIdx.x == 0) {
indexBuf = reinterpret_cast<I*>(indices);
srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
}
I* indexBuf = reinterpret_cast<I*>(indices);
Nd4jLong* srcBuf = reinterpret_cast<Nd4jLong*>(indicesLong);
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;

View File

@ -29,11 +29,7 @@ namespace helpers {
template <typename T>
static _CUDA_G void lrnKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
extern __shared__ char sharedChar[];
__shared__ T* shared;
if (threadIdx.x == 0)
shared = reinterpret_cast<T*>(sharedChar);
__syncthreads();
T* shared = reinterpret_cast<T*>(sharedChar);
auto xEws = shape::elementWiseStride(xTadShapeInfo);
auto zEws = shape::elementWiseStride(zTadShapeInfo);
@ -69,16 +65,8 @@ namespace helpers {
template <typename X, typename Z>
static _CUDA_G void lrnBPKernel(void *vx, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, void *vz, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, Nd4jLong numTads, Nd4jLong tadLength, int depth, double bias, double alpha, double beta) {
extern __shared__ char sharedChar[];
__shared__ X* sharedX;
__shared__ Z* sharedY;
if (threadIdx.x == 0) {
sharedX = reinterpret_cast<X*>(sharedChar);
sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
}
__syncthreads();
X* sharedX = reinterpret_cast<X*>(sharedChar);
Z* sharedY = reinterpret_cast<Z*>(sharedX + blockDim.x);
auto xEws = shape::elementWiseStride(xTadShapeInfo);
auto zEws = shape::elementWiseStride(zTadShapeInfo);

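The LRN kernels apply the same idea to dynamic shared memory: the split of sharedChar into an X region and a Z region is pure pointer arithmetic on values every thread already holds (blockDim.x), so only the buffer contents stay shared, not the pointers. A sketch under that assumption, with invented names and the same layout convention (the Z region starts right after blockDim.x elements of X, which assumes that offset keeps Z suitably aligned, as the original kernels do):

// Hypothetical kernel: each thread partitions the dynamic shared-memory block privately.
template <typename X, typename Z>
__global__ void twoSharedBuffers(const X* in, Z* out, int len) {
    extern __shared__ unsigned char sharedChar[];
    X* bufX = reinterpret_cast<X*>(sharedChar);          // first blockDim.x elements of type X
    Z* bufZ = reinterpret_cast<Z*>(bufX + blockDim.x);   // followed by blockDim.x elements of type Z

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < len) {
        bufX[threadIdx.x] = in[i];
        bufZ[threadIdx.x] = static_cast<Z>(bufX[threadIdx.x]) * Z(2);
        out[i] = bufZ[threadIdx.x];
    }
}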
View File

@ -57,14 +57,8 @@ namespace helpers {
// }
template <typename T>
static __global__ void invertKernelLow(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
__shared__ T* inverted;
__shared__ T* input;
if (threadIdx.x == 0) {
inverted = reinterpret_cast<T*>(invertedBuf);
input = reinterpret_cast<T*>(inputBuf);
}
__syncthreads();
T* inverted = reinterpret_cast<T*>(invertedBuf);
T* input = reinterpret_cast<T*>(inputBuf);
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@ -84,14 +78,8 @@ namespace helpers {
template <typename T>
static __global__ void upvertKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
__shared__ T* inverted;
__shared__ T* input;
if (threadIdx.x == 0) {
inverted = reinterpret_cast<T*>(invertedBuf);
input = reinterpret_cast<T*>(inputBuf);
}
__syncthreads();
T* inverted = reinterpret_cast<T*>(invertedBuf);
T* input = reinterpret_cast<T*>(inputBuf);
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@ -107,14 +95,8 @@ namespace helpers {
template <typename T>
static __global__ void upvertKernelUp(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
__shared__ T* inverted;
__shared__ T* input;
if (threadIdx.x == 0) {
inverted = reinterpret_cast<T*>(invertedBuf);
input = reinterpret_cast<T*>(inputBuf);
}
__syncthreads();
T* inverted = reinterpret_cast<T*>(invertedBuf);
T* input = reinterpret_cast<T*>(inputBuf);
auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x;
@ -135,17 +117,8 @@ namespace helpers {
template <typename T>
static __global__ void invertLowKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
__shared__ T* inverted;
__shared__ T* input;
if (threadIdx.x == 0) {
inverted = reinterpret_cast<T*>(invertedBuf);
input = reinterpret_cast<T*>(inputBuf);
}
__syncthreads();
// auto start = threadIdx.x + blockIdx.x * blockDim.x;
// auto step = blockDim.x * gridDim.x;
T* inverted = reinterpret_cast<T*>(invertedBuf);
T* input = reinterpret_cast<T*>(inputBuf);
for (int i = blockIdx.x + 2; i < n; i += gridDim.x) {
for (int j = i - 2; j >= 0; --j)
@ -166,17 +139,8 @@ namespace helpers {
template <typename T>
static __global__ void invertUpKernel(void* invertedBuf, Nd4jLong* invertedShape, void* inputBuf, Nd4jLong* inputShape, Nd4jLong n) {
__shared__ T* inverted;
__shared__ T* input;
if (threadIdx.x == 0) {
inverted = reinterpret_cast<T*>(invertedBuf);
input = reinterpret_cast<T*>(inputBuf);
}
__syncthreads();
// auto start = threadIdx.x + blockIdx.x * blockDim.x;
// auto step = blockDim.x * gridDim.x;
T* inverted = reinterpret_cast<T*>(invertedBuf);
T* input = reinterpret_cast<T*>(inputBuf);
for (int i = n - blockIdx.x - 2; i >= 0; i -= gridDim.x) {
for (int j = i + 2; j < n; j++)
@ -366,11 +330,8 @@ namespace helpers {
template <typename F>
static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) {
__shared__ F* permutation;
F* permutation = reinterpret_cast<F*>(output);
if (threadIdx.x == 0) {
permutation = reinterpret_cast<F*>(output);
}
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x;
for (auto i = start; i < rowNum; i += step) {
@ -709,13 +670,8 @@ namespace helpers {
template <typename F>
__global__ void adjustResultsKernel(F* dArray, Nd4jLong* shape, Nd4jLong* offsets, Nd4jLong batchSize, Nd4jLong n) {
//auto i = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ Nd4jLong* shapeOf;
__shared__ Nd4jLong* strideOf;
if (blockIdx.x == 0 && threadIdx.x == 0) {
shapeOf = shape::shapeOf(shape);
strideOf = shape::stride(shape);
}
__syncthreads();
Nd4jLong* shapeOf = shape::shapeOf(shape);
Nd4jLong* strideOf = shape::stride(shape);
for (auto i = blockIdx.x; i < batchSize; i+= gridDim.x) {
auto current = dArray + offsets[i];

View File

@ -37,6 +37,7 @@ namespace helpers {
outLength = shape::length(outputShape);
diagonalLen = shape::length(diagonalShape);
}
__syncthreads();
for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {

View File

@ -46,7 +46,6 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
xRank = shape::rank(xShapeInfo);
@ -54,11 +53,10 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coord = sharedMem + threadIdx.x * zRank;
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
for (Nd4jLong i = tid; i < zLen; i += totalThreads) {

View File

@ -51,7 +51,6 @@ namespace nd4j {
__shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coords = reinterpret_cast<Nd4jLong*>(shmem);
zLen = shape::length(zShapeInfo);

View File

@ -77,6 +77,7 @@ namespace helpers {
// saving final value
if (threadIdx.x == 0)
z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)];
__syncthreads();
}
}

View File

@ -40,7 +40,6 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo,
if (threadIdx.x == 0)
len = shape::length(nShapeInfo);
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -93,6 +93,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op,
if (threadIdx.x == 0)
shared[blockDim2 - 1] = (op == scalar::Add) ? 0 : 1;
__syncthreads();
for (uint d = 1; d < blockDim2; d *= 2) {

View File

@ -47,7 +47,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -55,7 +54,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;
@ -138,7 +136,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
__shared__ Nd4jLong zLen, totalThreads, *sharedMem;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
@ -146,7 +143,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
zLen = shape::length(zShapeInfo);
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
auto coords = sharedMem + threadIdx.x * rank;

View File

@ -308,7 +308,6 @@ __global__ static void scatterCuda(const int opCode,
__shared__ Nd4jLong yLen, totalThreads, *coord;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coord = reinterpret_cast<Nd4jLong*>(shmem);
yLen = shape::length(yShapeInfo);
@ -317,7 +316,6 @@ __global__ static void scatterCuda(const int opCode,
yRank = shape::rank(yShapeInfo);
zRank = shape::rank(zShapeInfo);
}
__syncthreads();
auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@ -455,12 +453,10 @@ __global__ static void scatterNDLockCuda(const int opCode,
__shared__ int xLastDim;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
zTadCoords = reinterpret_cast<Nd4jLong*>(shmem);
xLastDim = xTadShapeInfo[1]; // xTad has rank = 1 always
}
__syncthreads();
Nd4jLong* zTadCoordsPerThread = zTadCoords + threadIdx.x * xLastDim;
@ -598,7 +594,6 @@ __global__ static void scatterNDCuda(const int opCode,
__shared__ Nd4jLong yLen, totalThreads, *coord;
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
coord = reinterpret_cast<Nd4jLong*>(shmem);
yLen = shape::length(yShapeInfo);
@ -608,7 +603,6 @@ __global__ static void scatterNDCuda(const int opCode,
zRank = shape::rank(zShapeInfo);
xLastDim = xShapeInfo[xRank];
}
__syncthreads();
auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
@ -752,7 +746,6 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
xLen = shape::length(xShapeInfo);
xRank = shape::rank(xShapeInfo);
}
__syncthreads();
const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -56,7 +56,6 @@ namespace nd4j {
arrLenX = shape::length(xShapeInfo);
arrLenY = shape::length(yShapeInfo);
}
__syncthreads();
if (arrLenX != arrLenY)

View File

@ -266,6 +266,7 @@ namespace nd4j {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@ -311,6 +312,7 @@ namespace nd4j {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
__syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@ -248,6 +248,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;

View File

@ -257,6 +257,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@ -302,6 +303,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
__syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@ -75,7 +75,7 @@ namespace helpers {
if (threadIdx.x == 0) {
z[zIndex] = val[segment];
}
__syncthreads();
}
// -------------------------------------------------------------------------------------------------------------- //
template <typename T, typename I>
@ -256,6 +256,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@ -298,6 +299,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
__syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@ -168,6 +168,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;

View File

@ -256,6 +256,7 @@ namespace helpers {
gradOut = reinterpret_cast<T*>(eps);
gradLen = shape::length(epsShape);
}
__syncthreads();
auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = gridDim.x * blockDim.x;
@ -292,6 +293,7 @@ namespace helpers {
gradLen = shape::length(epsShape);
currentLen = shape::length(outTad);
}
__syncthreads();
for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);

View File

@ -36,6 +36,7 @@ namespace helpers {
inputLen = shape::length(inputShape);
outputLen = shape::length(outputShape);
}
__syncthreads();
for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)
for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)

View File

@ -377,6 +377,7 @@ namespace nd4j {
if (hasError)
neu1[0] = DataTypeUtils::infOrMax<T>();
}
__syncthreads();
}
template <typename T>

View File

@ -149,7 +149,6 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
totalThreads = gridDim.x * blockDim.x;
}
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -135,6 +135,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
if (threadIdx.x == 0)
z[zOffset] = *sharedMem;
__syncthreads();
}
}

View File

@ -39,7 +39,6 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo,
if (threadIdx.x == 0)
len = shape::length(xShapeInfo);
__syncthreads();
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

View File

@ -18208,6 +18208,24 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
}
// #endif
// #if NOT_EXCLUDED(OP_space_to_batch_nd)
@Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp {
static { Loader.load(); }
/** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
public space_to_batch_nd(Pointer p) { super(p); }
/** Native array allocator. Access with {@link Pointer#position(long)}. */
public space_to_batch_nd(long size) { super((Pointer)null); allocateArray(size); }
private native void allocateArray(long size);
@Override public space_to_batch_nd position(long position) {
return (space_to_batch_nd)super.position(position);
}
public space_to_batch_nd() { super((Pointer)null); allocate(); }
private native void allocate();
public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block);
}
// #endif
/**
*
*