[WIP] minor (#218)
* - initial docs commit - merge* cuda fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com>master
parent
65c9f2a888
commit
e42c34ca55
|
@ -273,9 +273,11 @@ namespace nd4j {
|
|||
* @param writeList
|
||||
* @param readList
|
||||
*/
|
||||
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
|
||||
static void registerSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
|
||||
static void prepareSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);
|
||||
|
||||
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
|
||||
static void registerPrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
|
||||
static void preparePrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);
|
||||
|
||||
|
|
|
@ -96,13 +96,13 @@ namespace functions {
|
|||
}
|
||||
else {
|
||||
if(vx == vz) {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
z[xOffset] = OpType::op(x[xOffset], params);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
||||
z[zOffset] = OpType::op(x[xOffset], params);
|
||||
|
|
|
@ -94,13 +94,13 @@ namespace functions {
|
|||
}
|
||||
else {
|
||||
if(vx == vz) {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
z[xOffset] = OpType::op(x[xOffset], params);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
||||
z[zOffset] = OpType::op(x[xOffset], params);
|
||||
|
|
|
@ -96,13 +96,13 @@ namespace functions {
|
|||
}
|
||||
else {
|
||||
if(vx == vz) {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
z[xOffset] = OpType::op(x[xOffset], params);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
|
||||
for (Nd4jLong i = tid; i < length; i+= totalThreads) {
|
||||
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
|
||||
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
|
||||
z[zOffset] = OpType::op(x[xOffset], params);
|
||||
|
|
|
@ -30,7 +30,7 @@ namespace nd4j {
|
|||
|
||||
shared[threadIdx.x] = 0;
|
||||
|
||||
|
||||
// each thread will compare 2 elements: E and E+1
|
||||
for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) {
|
||||
auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)];
|
||||
auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)];
|
||||
|
@ -41,11 +41,12 @@ namespace nd4j {
|
|||
else
|
||||
v = val1 >= val0;
|
||||
|
||||
// store comparison result in shared memory
|
||||
shared[threadIdx.x] += v ? 0 : 1;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// aggregate sum
|
||||
// aggregate sums in shared memory
|
||||
for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
|
||||
if (threadIdx.x < activeThreads)
|
||||
shared[threadIdx.x] += shared[threadIdx.x + activeThreads];
|
||||
|
@ -53,7 +54,7 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
|
||||
// store over the grid
|
||||
// store over the grid if we have more than 1 block
|
||||
if (gridDim.x > 1) {
|
||||
|
||||
auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);
|
||||
|
@ -96,7 +97,7 @@ namespace nd4j {
|
|||
}
|
||||
}
|
||||
else {
|
||||
|
||||
// if we have only 1 block, we just store results right away
|
||||
if (threadIdx.x == 0) {
|
||||
auto tc = reinterpret_cast<unsigned int*>(reductionBuffer);
|
||||
tc[16384] = 0;
|
||||
|
|
|
@ -424,7 +424,7 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
int tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
|
||||
|
||||
|
@ -519,7 +519,7 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
int tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
|
||||
|
||||
|
@ -610,7 +610,7 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
int tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
for (int t = tid; t < inputLength; t += step) {
|
||||
z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX];
|
||||
|
@ -59,7 +59,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
Nd4jLong i = threadIdx.x * (outputLength + 1);
|
||||
for (int t = tid; t < outputLength && i < inputLength; t += step) {
|
||||
|
|
|
@ -35,9 +35,11 @@ namespace helpers {
|
|||
T const* input = reinterpret_cast<T const*>(inputBuf);
|
||||
T* output = reinterpret_cast<T*>(outputBuf);
|
||||
|
||||
// trivial idea: loop through all elements, get independent probability for each element to be nullified
|
||||
for (Nd4jLong e = 0; e < inLen; ++e) {
|
||||
T val = nodeRng->relativeT(e, T(0.f), T(1.f));
|
||||
|
||||
// if probability is ok - we're saving scaled value
|
||||
if (double(val) < probVal)
|
||||
output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal);
|
||||
}
|
||||
|
@ -80,7 +82,7 @@ namespace helpers {
|
|||
std::vector<Nd4jLong> dims(reduceShape->lengthOf());
|
||||
reduceShape->syncToHost(); // to ensure that follows are actual
|
||||
bool fit = true;
|
||||
// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
|
||||
|
||||
for( int i = 0; i < dims.size(); i++ ) {
|
||||
if (fit) {
|
||||
dims[i] = reduceShape->e<Nd4jLong>(i);
|
||||
|
@ -96,8 +98,7 @@ namespace helpers {
|
|||
REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank.");
|
||||
std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
|
||||
chunk->assign(1.f);
|
||||
//chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
|
||||
//NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
|
||||
|
||||
dropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), probValue, seed);
|
||||
// broadcast chunk to full matrix
|
||||
std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
|
||||
|
@ -105,6 +106,7 @@ namespace helpers {
|
|||
|
||||
*dropOutMultiplier += *chunk;
|
||||
|
||||
// FIXME: we could do this in one step, aren't we?
|
||||
output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr);
|
||||
}
|
||||
|
||||
|
@ -113,8 +115,11 @@ namespace helpers {
|
|||
|
||||
int dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
|
||||
auto xType = input->dataType();
|
||||
NDArray::prepareSpecialUse({output}, {input});
|
||||
|
||||
BUILD_SINGLE_SELECTOR(xType, return _dropOutFunctor, (context, input, output, reduceShape, seed, probValue), FLOAT_TYPES);
|
||||
|
||||
NDArray::registerSpecialUse({output}, {input});
|
||||
}
|
||||
|
||||
/////////////////////////////////// backrpopagations ///////////////////////////////////////////////
|
||||
|
@ -136,6 +141,8 @@ namespace helpers {
|
|||
|
||||
for (int e = tid; e < len; e += step) {
|
||||
const auto zOffset = shape::getIndexOffset(e, outputShape, len);
|
||||
|
||||
// if probability was non-zero on FF step, we'll scale grads back
|
||||
if (output[zOffset] != T(0.))
|
||||
output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue);
|
||||
|
||||
|
@ -143,12 +150,17 @@ namespace helpers {
|
|||
}
|
||||
template <typename T>
|
||||
static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
|
||||
// we're making additional FF run to see how probabilities played out with given seeds
|
||||
int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue);
|
||||
auto stream = context.launchContext()->getCudaStream();
|
||||
|
||||
NDArray::prepareSpecialUse({output}, {input, gradOut});
|
||||
|
||||
if (ND4J_STATUS_OK == res)
|
||||
dropoutBPKernel<T><<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue);
|
||||
|
||||
NDArray::registerSpecialUse({output}, {input, gradOut});
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -239,6 +251,7 @@ namespace helpers {
|
|||
|
||||
int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta);
|
||||
if (res == ND4J_STATUS_OK) {
|
||||
// FIXME: can we make it single-loop?
|
||||
(*output) *= alpha;
|
||||
(*output) *= (*gradOut); //->applyPairwiseTransform<transform::Multiply>(gradOut, output, nullptr);
|
||||
}
|
||||
|
|
|
@ -43,7 +43,7 @@ namespace nd4j {
|
|||
}
|
||||
__syncthreads();
|
||||
|
||||
|
||||
// we run things in blocks, 1 partition per block of threads
|
||||
for (Nd4jLong o = blockIdx.x; o < numOutputs; o += gridDim.x) {
|
||||
auto z = reinterpret_cast<X*>(vz[o]);
|
||||
|
||||
|
@ -89,9 +89,11 @@ namespace nd4j {
|
|||
auto x = reinterpret_cast<X*>(vx);
|
||||
auto indices = reinterpret_cast<Y*>(vindices);
|
||||
|
||||
// we run things in blocks, 1 partition per block of threads
|
||||
for (int i = blockIdx.x; i < numOutputs; i += gridDim.x) {
|
||||
auto z = reinterpret_cast<X*>(vz[i]);
|
||||
|
||||
// each thread has own counter for partitions
|
||||
int outCnt = 0;
|
||||
|
||||
for (Nd4jLong e = 0; e < iLength; e++) {
|
||||
|
@ -145,6 +147,7 @@ namespace nd4j {
|
|||
tadOffsets[i] = packZ.platformOffsets();
|
||||
}
|
||||
|
||||
// we copy pointers to device
|
||||
auto dOutBuffers = reinterpret_cast<void **>(pm.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void *)));
|
||||
auto dOutTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadShapes.data(), tadShapes.size() * sizeof(Nd4jLong *)));
|
||||
auto dOutTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadOffsets.data(), tadOffsets.size() * sizeof(Nd4jLong *)));
|
||||
|
@ -248,6 +251,7 @@ namespace nd4j {
|
|||
indicesShapes[e] = indices.at(e)->getSpecialShapeInfo();
|
||||
}
|
||||
|
||||
// copying pointers to buffers to device
|
||||
auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
|
||||
auto dIndicesBuffers = reinterpret_cast<void **>(pm.replicatePointer(indicesBuffers.data(), inputSize * sizeof(void *)));
|
||||
auto dInputShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputShapes.data(), inputSize * sizeof(Nd4jLong *)));
|
||||
|
@ -283,6 +287,7 @@ namespace nd4j {
|
|||
inputTadOffsets[e] = packX.platformOffsets();
|
||||
}
|
||||
|
||||
// copying pointers to buffers to device
|
||||
auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
|
||||
auto dInputTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadShapes.data(), inputSize * sizeof(Nd4jLong *)));
|
||||
auto dInputTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadOffsets.data(), inputSize * sizeof(Nd4jLong *)));
|
||||
|
@ -313,6 +318,7 @@ namespace nd4j {
|
|||
|
||||
NDArray::registerSpecialUse({}, {indices, input});
|
||||
|
||||
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
|
||||
for (auto v:outputList) {
|
||||
v->tickWriteDevice();
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ namespace nd4j {
|
|||
|
||||
Nd4jLong xCoord[MAX_RANK];
|
||||
|
||||
// each block of threads works on 1 input array
|
||||
for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
|
||||
auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];
|
||||
|
||||
|
@ -39,6 +40,7 @@ namespace nd4j {
|
|||
auto xRank = shape::rank(xShapeInfo);
|
||||
auto xLength = shape::length(xShapeInfo);
|
||||
|
||||
// each element of this input array has own place within common output array
|
||||
for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
|
||||
shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
|
||||
auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
|
||||
|
@ -65,6 +67,7 @@ namespace nd4j {
|
|||
hdShapes[e] = inputs[e]->specialShapeInfo();
|
||||
}
|
||||
|
||||
// copying pointers to device
|
||||
auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
|
||||
auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
|
||||
auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
|
||||
|
@ -76,6 +79,7 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
|
||||
// FIXME: we want NDArrayFactory::prepareSpecialUse here eventually
|
||||
for (auto v:inputs)
|
||||
v->syncToDevice();
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@ namespace ops {
|
|||
namespace helpers {
|
||||
template <typename T>
|
||||
void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) {
|
||||
// classic one
|
||||
auto lambda = LAMBDA_TT(_x, _y, weight) {
|
||||
return _x - (_y * weight);
|
||||
};
|
||||
|
|
|
@ -44,6 +44,7 @@ namespace nd4j {
|
|||
|
||||
X binSize = X((*max_val - *min_val) / numBins);
|
||||
|
||||
// nullify bins
|
||||
for (int e = threadIdx.x; e < numBins; e += blockDim.x) {
|
||||
bins[e] = (Z) 0;
|
||||
}
|
||||
|
@ -53,14 +54,12 @@ namespace nd4j {
|
|||
int idx = int((dx[e] - *min_val) / binSize);
|
||||
idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0);
|
||||
idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1));
|
||||
nd4j::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1);
|
||||
// bins[idx]++;
|
||||
nd4j::math::atomics::nd4j_atomicAdd<Z>(&bins[idx], (Z)1);
|
||||
}
|
||||
__syncthreads();
|
||||
// at this point all bins in shared memory are calculated, so we aggregate them now via threadfence trick
|
||||
|
||||
// transfer shared memory to reduction memory
|
||||
|
||||
|
||||
if (gridDim.x > 1) {
|
||||
unsigned int *tc = (unsigned int *)reductionPointer;
|
||||
__shared__ bool amLast;
|
||||
|
|
|
@ -64,6 +64,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray*
|
|||
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());
|
||||
|
||||
// we launch legacy IndexMax op, to get indices of max values along dimension
|
||||
auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);
|
||||
|
||||
dim3 launchDims(256, 256, 16384);
|
||||
|
|
|
@ -41,12 +41,12 @@ namespace helpers {
|
|||
const T tbeta = static_cast<T>(beta);
|
||||
const T talpha = static_cast<T>(alpha);
|
||||
|
||||
|
||||
// one block of threads processes 1 example within batch
|
||||
for (uint i = blockIdx.x; i < numTads; i += gridDim.x) {
|
||||
auto x = reinterpret_cast<T*>(vx) + xTadOffsets[i];
|
||||
auto z = reinterpret_cast<T*>(vz) + zTadOffsets[i];
|
||||
|
||||
// load everything into shared memory
|
||||
// load everything into shared memory, so we'll operate on shared memory from now on
|
||||
shared[threadIdx.x] = x[threadIdx.x * xEws];
|
||||
__syncthreads();
|
||||
|
||||
|
@ -94,7 +94,7 @@ namespace helpers {
|
|||
sharedY[threadIdx.x] = 0.f;
|
||||
__syncthreads();
|
||||
|
||||
|
||||
// we're operating in shared memory
|
||||
for (int s = begin; s < end; s++)
|
||||
sharedY[threadIdx.x] = sharedY[threadIdx.x] + sharedX[s] * sharedX[s];
|
||||
__syncthreads();
|
||||
|
|
|
@ -37,7 +37,7 @@ namespace nd4j {
|
|||
static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
|
||||
auto output = reinterpret_cast<Z*>(voutput);
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
|
||||
for (Nd4jLong e = tid; e < length; e += step) {
|
||||
|
@ -81,7 +81,13 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
|
||||
NDArray::prepareSpecialUse({&output}, {});
|
||||
for (auto v:inArrs)
|
||||
v->syncToDevice();
|
||||
|
||||
BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);
|
||||
|
||||
NDArray::registerSpecialUse({&output}, {});
|
||||
}
|
||||
|
||||
|
||||
|
@ -90,7 +96,7 @@ namespace nd4j {
|
|||
static __global__ void global_mergeMax_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
|
||||
auto output = reinterpret_cast<T*>(voutput);
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
|
||||
for (Nd4jLong e = tid; e < length; e += step) {
|
||||
|
@ -131,7 +137,12 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
void mergeMax(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
|
||||
NDArray::prepareSpecialUse({&output}, {});
|
||||
for (auto v:inArrs)
|
||||
v->syncToDevice();
|
||||
|
||||
BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES);
|
||||
NDArray::registerSpecialUse({&output}, {});
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -139,7 +150,7 @@ namespace nd4j {
|
|||
static __global__ void global_mergeAvg_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
|
||||
auto output = reinterpret_cast<T*>(voutput);
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
|
||||
for (Nd4jLong e = tid; e < length; e += step) {
|
||||
|
@ -178,7 +189,13 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
void mergeAvg(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
|
||||
NDArray::prepareSpecialUse({&output}, {});
|
||||
for (auto v:inArrs)
|
||||
v->syncToDevice();
|
||||
|
||||
BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES);
|
||||
|
||||
NDArray::registerSpecialUse({&output}, {});
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -186,7 +203,7 @@ namespace nd4j {
|
|||
static __global__ void global_mergeAdd_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
|
||||
auto output = reinterpret_cast<T*>(voutput);
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
|
||||
for (Nd4jLong e = tid; e < length; e += step) {
|
||||
|
@ -226,7 +243,13 @@ namespace nd4j {
|
|||
BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output), NUMERIC_TYPES);
|
||||
|
||||
void mergeAdd(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
|
||||
NDArray::prepareSpecialUse({&output}, {});
|
||||
for (auto v:inArrs)
|
||||
v->syncToDevice();
|
||||
|
||||
BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES);
|
||||
|
||||
NDArray::registerSpecialUse({&output}, {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,18 +31,18 @@ namespace helpers {
|
|||
|
||||
template <typename T>
|
||||
static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) {
|
||||
__shared__ T *z, *x;
|
||||
__shared__ Nd4jLong bufferLength, arrLen;
|
||||
|
||||
auto z = reinterpret_cast<T*>(outputBuffer);
|
||||
auto x = reinterpret_cast<T*>(inputBuffer);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
z = reinterpret_cast<T*>(outputBuffer);
|
||||
x = reinterpret_cast<T*>(inputBuffer);
|
||||
arrLen = shape::length(pTadShape);
|
||||
bufferLength = shape::length(outputShapeInfo);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
for (int t = tid; t < bufferLength; t += step) {
|
||||
auto tX = x + pTadOffsets[t];
|
||||
|
@ -77,8 +77,6 @@ namespace helpers {
|
|||
// manager.synchronize();
|
||||
sortedVals.tickWriteDevice();
|
||||
sortedVals.syncToHost();
|
||||
sortedVals.printIndexedBuffer("Hello");
|
||||
sortedVals.printBuffer("Hello line");
|
||||
auto stream = context->getCudaStream();
|
||||
fillUpElementKernel<T><<<32, 64, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), pTadShape, pTadOffsets, n);
|
||||
}
|
||||
|
|
|
@ -74,17 +74,14 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB
|
|||
///////////////////////////////////////////////////////////////////
|
||||
void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) {
|
||||
|
||||
if(!n.isActualOnDeviceSide()) n.syncToDevice();
|
||||
if(!x.isActualOnDeviceSide()) x.syncToDevice();
|
||||
NDArray::prepareSpecialUse({&z}, {&n, &x});
|
||||
|
||||
int threadsPerBlock = MAX_NUM_THREADS;
|
||||
int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES);
|
||||
|
||||
n.tickReadHost();
|
||||
x.tickReadHost();
|
||||
z.tickWriteDevice();
|
||||
NDArray::registerSpecialUse({&z}, {&n, &x});
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void polyGammaCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void *vn, const Nd4jLong *nShapeInfo, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo), FLOAT_TYPES);
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace helpers {
|
|||
template <typename T>
|
||||
static __global__ void global_range(void *output, Nd4jLong length, T start, T delta) {
|
||||
auto buff = reinterpret_cast<T*>(output);
|
||||
const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = gridDim.x * blockDim.x;
|
||||
|
||||
for(Nd4jLong i = tid; i < length; i += step)
|
||||
|
@ -43,10 +43,11 @@ namespace helpers {
|
|||
}
|
||||
|
||||
void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) {
|
||||
NDArray::prepareSpecialUse({&outVector}, {&start, &delta});
|
||||
BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES);
|
||||
NDArray::registerSpecialUse({&outVector}, {&start, &delta});
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void _range, (nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector), NUMERIC_TYPES);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -26,13 +26,11 @@ namespace nd4j {
|
|||
namespace helpers {
|
||||
template<typename T>
|
||||
void toggle_bits__(NDArray &in, NDArray &out) {
|
||||
NDArray::prepareSpecialUse({&out}, {&in});
|
||||
auto lambda = LAMBDA_T(_x) {
|
||||
return ~_x;//eUtils::flip_bits(_x);
|
||||
};
|
||||
|
||||
in.applyLambda(lambda, &out);
|
||||
NDArray::registerSpecialUse({&out}, {&in});
|
||||
}
|
||||
BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES);
|
||||
|
||||
|
|
Loading…
Reference in New Issue