[WIP] minor (#218)

* - initial docs commit
- merge* cuda fix

Signed-off-by: raver119 <raver119@gmail.com>

* one more fix

Signed-off-by: raver119 <raver119@gmail.com>

* one more fix

Signed-off-by: raver119 <raver119@gmail.com>
master
raver119 2019-09-02 11:25:48 +03:00 committed by GitHub
parent 65c9f2a888
commit e42c34ca55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 89 additions and 45 deletions

View File

@ -273,9 +273,11 @@ namespace nd4j {
* @param writeList * @param writeList
* @param readList * @param readList
*/ */
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
static void registerSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList); static void registerSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
static void prepareSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false); static void prepareSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
static void registerPrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList); static void registerPrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
static void preparePrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false); static void preparePrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);

View File

@ -96,13 +96,13 @@ namespace functions {
} }
else { else {
if(vx == vz) { if(vx == vz) {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
z[xOffset] = OpType::op(x[xOffset], params); z[xOffset] = OpType::op(x[xOffset], params);
} }
} }
else { else {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
z[zOffset] = OpType::op(x[xOffset], params); z[zOffset] = OpType::op(x[xOffset], params);

View File

@ -94,13 +94,13 @@ namespace functions {
} }
else { else {
if(vx == vz) { if(vx == vz) {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
z[xOffset] = OpType::op(x[xOffset], params); z[xOffset] = OpType::op(x[xOffset], params);
} }
} }
else { else {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
z[zOffset] = OpType::op(x[xOffset], params); z[zOffset] = OpType::op(x[xOffset], params);

View File

@ -96,13 +96,13 @@ namespace functions {
} }
else { else {
if(vx == vz) { if(vx == vz) {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
z[xOffset] = OpType::op(x[xOffset], params); z[xOffset] = OpType::op(x[xOffset], params);
} }
} }
else { else {
for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { for (Nd4jLong i = tid; i < length; i+= totalThreads) {
auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
z[zOffset] = OpType::op(x[xOffset], params); z[zOffset] = OpType::op(x[xOffset], params);

View File

@ -30,7 +30,7 @@ namespace nd4j {
shared[threadIdx.x] = 0; shared[threadIdx.x] = 0;
// each thread will compare 2 elements: E and E+1
for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) { for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) {
auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)]; auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)];
auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)]; auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)];
@ -41,11 +41,12 @@ namespace nd4j {
else else
v = val1 >= val0; v = val1 >= val0;
// store comparison result in shared memory
shared[threadIdx.x] += v ? 0 : 1; shared[threadIdx.x] += v ? 0 : 1;
} }
__syncthreads(); __syncthreads();
// aggregate sum // aggregate sums in shared memory
for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) { for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
if (threadIdx.x < activeThreads) if (threadIdx.x < activeThreads)
shared[threadIdx.x] += shared[threadIdx.x + activeThreads]; shared[threadIdx.x] += shared[threadIdx.x + activeThreads];
@ -53,7 +54,7 @@ namespace nd4j {
} }
// store over the grid // store over the grid if we have more than 1 block
if (gridDim.x > 1) { if (gridDim.x > 1) {
auto tc = reinterpret_cast<unsigned int *>(reductionBuffer); auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);
@ -96,7 +97,7 @@ namespace nd4j {
} }
} }
else { else {
// if we have only 1 block, we just store results right away
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
auto tc = reinterpret_cast<unsigned int*>(reductionBuffer); auto tc = reinterpret_cast<unsigned int*>(reductionBuffer);
tc[16384] = 0; tc[16384] = 0;

View File

@ -424,7 +424,7 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
} }
__syncthreads(); __syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int index = tid; index < length; index += blockDim.x * gridDim.x) { for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
@ -519,7 +519,7 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
} }
__syncthreads(); __syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int index = tid; index < length; index += blockDim.x * gridDim.x) { for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
@ -610,7 +610,7 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
} }
__syncthreads(); __syncthreads();
int tid = blockIdx.x * gridDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (int index = tid; index < length; index += blockDim.x * gridDim.x) { for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

View File

@ -39,7 +39,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
} }
__syncthreads(); __syncthreads();
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (int t = tid; t < inputLength; t += step) { for (int t = tid; t < inputLength; t += step) {
z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX]; z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX];
@ -59,7 +59,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
} }
__syncthreads(); __syncthreads();
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
Nd4jLong i = threadIdx.x * (outputLength + 1); Nd4jLong i = threadIdx.x * (outputLength + 1);
for (int t = tid; t < outputLength && i < inputLength; t += step) { for (int t = tid; t < outputLength && i < inputLength; t += step) {

View File

@ -35,9 +35,11 @@ namespace helpers {
T const* input = reinterpret_cast<T const*>(inputBuf); T const* input = reinterpret_cast<T const*>(inputBuf);
T* output = reinterpret_cast<T*>(outputBuf); T* output = reinterpret_cast<T*>(outputBuf);
// trivial idea: loop through all elements, get independent probability for each element to be nullified
for (Nd4jLong e = 0; e < inLen; ++e) { for (Nd4jLong e = 0; e < inLen; ++e) {
T val = nodeRng->relativeT(e, T(0.f), T(1.f)); T val = nodeRng->relativeT(e, T(0.f), T(1.f));
// if probability is ok - we're saving scaled value
if (double(val) < probVal) if (double(val) < probVal)
output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal); output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal);
} }
@ -80,7 +82,7 @@ namespace helpers {
std::vector<Nd4jLong> dims(reduceShape->lengthOf()); std::vector<Nd4jLong> dims(reduceShape->lengthOf());
reduceShape->syncToHost(); // to ensure that follows are actual reduceShape->syncToHost(); // to ensure that follows are actual
bool fit = true; bool fit = true;
// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
for( int i = 0; i < dims.size(); i++ ) { for( int i = 0; i < dims.size(); i++ ) {
if (fit) { if (fit) {
dims[i] = reduceShape->e<Nd4jLong>(i); dims[i] = reduceShape->e<Nd4jLong>(i);
@ -96,8 +98,7 @@ namespace helpers {
REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank."); REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank.");
std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext())); std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
chunk->assign(1.f); chunk->assign(1.f);
//chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
//NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
dropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), probValue, seed); dropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), probValue, seed);
// broadcast chunk to full matrix // broadcast chunk to full matrix
std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input)); std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));
@ -105,6 +106,7 @@ namespace helpers {
*dropOutMultiplier += *chunk; *dropOutMultiplier += *chunk;
// FIXME: we could do this in one step, aren't we?
output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr); output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr);
} }
@ -113,8 +115,11 @@ namespace helpers {
int dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) { int dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
auto xType = input->dataType(); auto xType = input->dataType();
NDArray::prepareSpecialUse({output}, {input});
BUILD_SINGLE_SELECTOR(xType, return _dropOutFunctor, (context, input, output, reduceShape, seed, probValue), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(xType, return _dropOutFunctor, (context, input, output, reduceShape, seed, probValue), FLOAT_TYPES);
NDArray::registerSpecialUse({output}, {input});
} }
/////////////////////////////////// backrpopagations /////////////////////////////////////////////// /////////////////////////////////// backrpopagations ///////////////////////////////////////////////
@ -136,6 +141,8 @@ namespace helpers {
for (int e = tid; e < len; e += step) { for (int e = tid; e < len; e += step) {
const auto zOffset = shape::getIndexOffset(e, outputShape, len); const auto zOffset = shape::getIndexOffset(e, outputShape, len);
// if probability was non-zero on FF step, we'll scale grads back
if (output[zOffset] != T(0.)) if (output[zOffset] != T(0.))
output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue); output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue);
@ -143,12 +150,17 @@ namespace helpers {
} }
template <typename T> template <typename T>
static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) { static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
// we're making additional FF run to see how probabilities played out with given seeds
int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue); int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue);
auto stream = context.launchContext()->getCudaStream(); auto stream = context.launchContext()->getCudaStream();
NDArray::prepareSpecialUse({output}, {input, gradOut});
if (ND4J_STATUS_OK == res) if (ND4J_STATUS_OK == res)
dropoutBPKernel<T><<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue); dropoutBPKernel<T><<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue);
NDArray::registerSpecialUse({output}, {input, gradOut});
return res; return res;
} }
@ -239,6 +251,7 @@ namespace helpers {
int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta); int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta);
if (res == ND4J_STATUS_OK) { if (res == ND4J_STATUS_OK) {
// FIXME: can we make it single-loop?
(*output) *= alpha; (*output) *= alpha;
(*output) *= (*gradOut); //->applyPairwiseTransform<transform::Multiply>(gradOut, output, nullptr); (*output) *= (*gradOut); //->applyPairwiseTransform<transform::Multiply>(gradOut, output, nullptr);
} }

View File

@ -43,7 +43,7 @@ namespace nd4j {
} }
__syncthreads(); __syncthreads();
// we run things in blocks, 1 partition per block of threads
for (Nd4jLong o = blockIdx.x; o < numOutputs; o += gridDim.x) { for (Nd4jLong o = blockIdx.x; o < numOutputs; o += gridDim.x) {
auto z = reinterpret_cast<X*>(vz[o]); auto z = reinterpret_cast<X*>(vz[o]);
@ -89,9 +89,11 @@ namespace nd4j {
auto x = reinterpret_cast<X*>(vx); auto x = reinterpret_cast<X*>(vx);
auto indices = reinterpret_cast<Y*>(vindices); auto indices = reinterpret_cast<Y*>(vindices);
// we run things in blocks, 1 partition per block of threads
for (int i = blockIdx.x; i < numOutputs; i += gridDim.x) { for (int i = blockIdx.x; i < numOutputs; i += gridDim.x) {
auto z = reinterpret_cast<X*>(vz[i]); auto z = reinterpret_cast<X*>(vz[i]);
// each thread has own counter for partitions
int outCnt = 0; int outCnt = 0;
for (Nd4jLong e = 0; e < iLength; e++) { for (Nd4jLong e = 0; e < iLength; e++) {
@ -145,6 +147,7 @@ namespace nd4j {
tadOffsets[i] = packZ.platformOffsets(); tadOffsets[i] = packZ.platformOffsets();
} }
// we copy pointers to device
auto dOutBuffers = reinterpret_cast<void **>(pm.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void *))); auto dOutBuffers = reinterpret_cast<void **>(pm.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void *)));
auto dOutTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadShapes.data(), tadShapes.size() * sizeof(Nd4jLong *))); auto dOutTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadShapes.data(), tadShapes.size() * sizeof(Nd4jLong *)));
auto dOutTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadOffsets.data(), tadOffsets.size() * sizeof(Nd4jLong *))); auto dOutTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadOffsets.data(), tadOffsets.size() * sizeof(Nd4jLong *)));
@ -248,6 +251,7 @@ namespace nd4j {
indicesShapes[e] = indices.at(e)->getSpecialShapeInfo(); indicesShapes[e] = indices.at(e)->getSpecialShapeInfo();
} }
// copying pointers to buffers to device
auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *))); auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
auto dIndicesBuffers = reinterpret_cast<void **>(pm.replicatePointer(indicesBuffers.data(), inputSize * sizeof(void *))); auto dIndicesBuffers = reinterpret_cast<void **>(pm.replicatePointer(indicesBuffers.data(), inputSize * sizeof(void *)));
auto dInputShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputShapes.data(), inputSize * sizeof(Nd4jLong *))); auto dInputShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputShapes.data(), inputSize * sizeof(Nd4jLong *)));
@ -283,6 +287,7 @@ namespace nd4j {
inputTadOffsets[e] = packX.platformOffsets(); inputTadOffsets[e] = packX.platformOffsets();
} }
// copying pointers to buffers to device
auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *))); auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
auto dInputTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadShapes.data(), inputSize * sizeof(Nd4jLong *))); auto dInputTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadShapes.data(), inputSize * sizeof(Nd4jLong *)));
auto dInputTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadOffsets.data(), inputSize * sizeof(Nd4jLong *))); auto dInputTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadOffsets.data(), inputSize * sizeof(Nd4jLong *)));
@ -313,6 +318,7 @@ namespace nd4j {
NDArray::registerSpecialUse({}, {indices, input}); NDArray::registerSpecialUse({}, {indices, input});
// TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
for (auto v:outputList) { for (auto v:outputList) {
v->tickWriteDevice(); v->tickWriteDevice();
} }

View File

@ -29,6 +29,7 @@ namespace nd4j {
Nd4jLong xCoord[MAX_RANK]; Nd4jLong xCoord[MAX_RANK];
// each block of threads works on 1 input array
for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) { for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
auto z = reinterpret_cast<T*>(zBuffer) + offsets[e]; auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];
@ -39,6 +40,7 @@ namespace nd4j {
auto xRank = shape::rank(xShapeInfo); auto xRank = shape::rank(xShapeInfo);
auto xLength = shape::length(xShapeInfo); auto xLength = shape::length(xShapeInfo);
// each element of this input array has own place within common output array
for (uint i = threadIdx.x; i < xLength; i += blockDim.x) { for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
shape::index2coords(xRank, xShape, i, xLength, xCoord, order); shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank); auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
@ -65,6 +67,7 @@ namespace nd4j {
hdShapes[e] = inputs[e]->specialShapeInfo(); hdShapes[e] = inputs[e]->specialShapeInfo();
} }
// copying pointers to device
auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*)); auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*)); auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong)); auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
@ -76,6 +79,7 @@ namespace nd4j {
} }
void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) { void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
// FIXME: we want NDArrayFactory::prepareSpecialUse here eventually
for (auto v:inputs) for (auto v:inputs)
v->syncToDevice(); v->syncToDevice();

View File

@ -26,6 +26,7 @@ namespace ops {
namespace helpers { namespace helpers {
template <typename T> template <typename T>
void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) { void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) {
// classic one
auto lambda = LAMBDA_TT(_x, _y, weight) { auto lambda = LAMBDA_TT(_x, _y, weight) {
return _x - (_y * weight); return _x - (_y * weight);
}; };

View File

@ -44,6 +44,7 @@ namespace nd4j {
X binSize = X((*max_val - *min_val) / numBins); X binSize = X((*max_val - *min_val) / numBins);
// nullify bins
for (int e = threadIdx.x; e < numBins; e += blockDim.x) { for (int e = threadIdx.x; e < numBins; e += blockDim.x) {
bins[e] = (Z) 0; bins[e] = (Z) 0;
} }
@ -53,14 +54,12 @@ namespace nd4j {
int idx = int((dx[e] - *min_val) / binSize); int idx = int((dx[e] - *min_val) / binSize);
idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0); idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0);
idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1)); idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1));
nd4j::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1); nd4j::math::atomics::nd4j_atomicAdd<Z>(&bins[idx], (Z)1);
// bins[idx]++;
} }
__syncthreads(); __syncthreads();
// at this point all bins in shared memory are calculated, so we aggregate them now via threadfence trick
// transfer shared memory to reduction memory // transfer shared memory to reduction memory
if (gridDim.x > 1) { if (gridDim.x > 1) {
unsigned int *tc = (unsigned int *)reductionPointer; unsigned int *tc = (unsigned int *)reductionPointer;
__shared__ bool amLast; __shared__ bool amLast;

View File

@ -64,6 +64,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray*
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size()); auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());
// we launch legacy IndexMax op, to get indices of max values along dimension
auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions); auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);
dim3 launchDims(256, 256, 16384); dim3 launchDims(256, 256, 16384);

View File

@ -41,12 +41,12 @@ namespace helpers {
const T tbeta = static_cast<T>(beta); const T tbeta = static_cast<T>(beta);
const T talpha = static_cast<T>(alpha); const T talpha = static_cast<T>(alpha);
// one block of threads processes 1 example within batch
for (uint i = blockIdx.x; i < numTads; i += gridDim.x) { for (uint i = blockIdx.x; i < numTads; i += gridDim.x) {
auto x = reinterpret_cast<T*>(vx) + xTadOffsets[i]; auto x = reinterpret_cast<T*>(vx) + xTadOffsets[i];
auto z = reinterpret_cast<T*>(vz) + zTadOffsets[i]; auto z = reinterpret_cast<T*>(vz) + zTadOffsets[i];
// load everything into shared memory // load everything into shared memory, so we'll operate on shared memory from now on
shared[threadIdx.x] = x[threadIdx.x * xEws]; shared[threadIdx.x] = x[threadIdx.x * xEws];
__syncthreads(); __syncthreads();
@ -94,7 +94,7 @@ namespace helpers {
sharedY[threadIdx.x] = 0.f; sharedY[threadIdx.x] = 0.f;
__syncthreads(); __syncthreads();
// we're operating in shared memory
for (int s = begin; s < end; s++) for (int s = begin; s < end; s++)
sharedY[threadIdx.x] = sharedY[threadIdx.x] + sharedX[s] * sharedX[s]; sharedY[threadIdx.x] = sharedY[threadIdx.x] + sharedX[s] * sharedX[s];
__syncthreads(); __syncthreads();

View File

@ -37,7 +37,7 @@ namespace nd4j {
static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) { static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
auto output = reinterpret_cast<Z*>(voutput); auto output = reinterpret_cast<Z*>(voutput);
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (Nd4jLong e = tid; e < length; e += step) { for (Nd4jLong e = tid; e < length; e += step) {
@ -81,7 +81,13 @@ namespace nd4j {
} }
void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) { void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
NDArray::prepareSpecialUse({&output}, {});
for (auto v:inArrs)
v->syncToDevice();
BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES); BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);
NDArray::registerSpecialUse({&output}, {});
} }
@ -90,7 +96,7 @@ namespace nd4j {
static __global__ void global_mergeMax_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) { static __global__ void global_mergeMax_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
auto output = reinterpret_cast<T*>(voutput); auto output = reinterpret_cast<T*>(voutput);
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (Nd4jLong e = tid; e < length; e += step) { for (Nd4jLong e = tid; e < length; e += step) {
@ -131,7 +137,12 @@ namespace nd4j {
} }
void mergeMax(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) { void mergeMax(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
NDArray::prepareSpecialUse({&output}, {});
for (auto v:inArrs)
v->syncToDevice();
BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES);
NDArray::registerSpecialUse({&output}, {});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -139,7 +150,7 @@ namespace nd4j {
static __global__ void global_mergeAvg_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) { static __global__ void global_mergeAvg_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
auto output = reinterpret_cast<T*>(voutput); auto output = reinterpret_cast<T*>(voutput);
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (Nd4jLong e = tid; e < length; e += step) { for (Nd4jLong e = tid; e < length; e += step) {
@ -178,7 +189,13 @@ namespace nd4j {
} }
void mergeAvg(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) { void mergeAvg(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
NDArray::prepareSpecialUse({&output}, {});
for (auto v:inArrs)
v->syncToDevice();
BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES);
NDArray::registerSpecialUse({&output}, {});
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@ -186,7 +203,7 @@ namespace nd4j {
static __global__ void global_mergeAdd_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) { static __global__ void global_mergeAdd_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
auto output = reinterpret_cast<T*>(voutput); auto output = reinterpret_cast<T*>(voutput);
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (Nd4jLong e = tid; e < length; e += step) { for (Nd4jLong e = tid; e < length; e += step) {
@ -226,7 +243,13 @@ namespace nd4j {
BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output), NUMERIC_TYPES); BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output), NUMERIC_TYPES);
void mergeAdd(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) { void mergeAdd(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
NDArray::prepareSpecialUse({&output}, {});
for (auto v:inArrs)
v->syncToDevice();
BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES); BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES);
NDArray::registerSpecialUse({&output}, {});
} }
} }
} }

View File

@ -31,18 +31,18 @@ namespace helpers {
template <typename T> template <typename T>
static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) { static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) {
__shared__ T *z, *x;
__shared__ Nd4jLong bufferLength, arrLen; __shared__ Nd4jLong bufferLength, arrLen;
auto z = reinterpret_cast<T*>(outputBuffer);
auto x = reinterpret_cast<T*>(inputBuffer);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
z = reinterpret_cast<T*>(outputBuffer);
x = reinterpret_cast<T*>(inputBuffer);
arrLen = shape::length(pTadShape); arrLen = shape::length(pTadShape);
bufferLength = shape::length(outputShapeInfo); bufferLength = shape::length(outputShapeInfo);
} }
__syncthreads(); __syncthreads();
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (int t = tid; t < bufferLength; t += step) { for (int t = tid; t < bufferLength; t += step) {
auto tX = x + pTadOffsets[t]; auto tX = x + pTadOffsets[t];
@ -77,8 +77,6 @@ namespace helpers {
// manager.synchronize(); // manager.synchronize();
sortedVals.tickWriteDevice(); sortedVals.tickWriteDevice();
sortedVals.syncToHost(); sortedVals.syncToHost();
sortedVals.printIndexedBuffer("Hello");
sortedVals.printBuffer("Hello line");
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();
fillUpElementKernel<T><<<32, 64, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), pTadShape, pTadOffsets, n); fillUpElementKernel<T><<<32, 64, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), pTadShape, pTadOffsets, n);
} }

View File

@ -74,17 +74,14 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) { void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) {
if(!n.isActualOnDeviceSide()) n.syncToDevice(); NDArray::prepareSpecialUse({&z}, {&n, &x});
if(!x.isActualOnDeviceSide()) x.syncToDevice();
int threadsPerBlock = MAX_NUM_THREADS; int threadsPerBlock = MAX_NUM_THREADS;
int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES);
n.tickReadHost(); NDArray::registerSpecialUse({&z}, {&n, &x});
x.tickReadHost();
z.tickWriteDevice();
} }
BUILD_SINGLE_TEMPLATE(template void polyGammaCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void *vn, const Nd4jLong *nShapeInfo, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo), FLOAT_TYPES); BUILD_SINGLE_TEMPLATE(template void polyGammaCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void *vn, const Nd4jLong *nShapeInfo, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo), FLOAT_TYPES);

View File

@ -28,7 +28,7 @@ namespace helpers {
template <typename T> template <typename T>
static __global__ void global_range(void *output, Nd4jLong length, T start, T delta) { static __global__ void global_range(void *output, Nd4jLong length, T start, T delta) {
auto buff = reinterpret_cast<T*>(output); auto buff = reinterpret_cast<T*>(output);
const auto tid = blockIdx.x * gridDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for(Nd4jLong i = tid; i < length; i += step) for(Nd4jLong i = tid; i < length; i += step)
@ -43,10 +43,11 @@ namespace helpers {
} }
void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) {
NDArray::prepareSpecialUse({&outVector}, {&start, &delta});
BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES);
NDArray::registerSpecialUse({&outVector}, {&start, &delta});
} }
BUILD_SINGLE_TEMPLATE(template void _range, (nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector), NUMERIC_TYPES);
} }
} }
} }

View File

@ -26,13 +26,11 @@ namespace nd4j {
namespace helpers { namespace helpers {
template<typename T> template<typename T>
void toggle_bits__(NDArray &in, NDArray &out) { void toggle_bits__(NDArray &in, NDArray &out) {
NDArray::prepareSpecialUse({&out}, {&in});
auto lambda = LAMBDA_T(_x) { auto lambda = LAMBDA_T(_x) {
return ~_x;//eUtils::flip_bits(_x); return ~_x;//eUtils::flip_bits(_x);
}; };
in.applyLambda(lambda, &out); in.applyLambda(lambda, &out);
NDArray::registerSpecialUse({&out}, {&in});
} }
BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES); BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES);