[WIP] minor (#218)

* initial docs commit - merge
* cuda fix

Signed-off-by: raver119 <raver119@gmail.com>

* one more fix

Signed-off-by: raver119 <raver119@gmail.com>

* one more fix

Signed-off-by: raver119 <raver119@gmail.com>

branch: master
parent 65c9f2a888
commit e42c34ca55
@@ -273,9 +273,11 @@ namespace nd4j {
          * @param writeList
          * @param readList
          */
+        // TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
         static void registerSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
         static void prepareSpecialUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);

+        // TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
         static void registerPrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList);
         static void preparePrimaryUse(const std::initializer_list<const NDArray*>& writeList, const std::initializer_list<const NDArray*>& readList, bool synchronizeWritables = false);
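These declarations pair up: prepareSpecialUse() is called before device-side work to synchronize and flag the writable and readable buffers, and registerSpecialUse() is called afterwards to tick them, which is how the helpers later in this patch use them. A minimal usage sketch, where the kernel name and launch configuration are illustrative rather than taken from this commit:

    // hedged sketch: bracket device-side work with the special-use API
    static void scaleOnDevice(nd4j::LaunchContext* context, NDArray* input, NDArray* output, float factor) {
        // declare intent: output is written on device, input is read on device
        NDArray::prepareSpecialUse({output}, {input});

        auto stream = context->getCudaStream();
        // scaleKernel is a placeholder name, not a kernel from this patch
        scaleKernel<<<256, 256, 1024, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
                                                 output->specialBuffer(), output->specialShapeInfo(), factor);

        // mark the same buffers as used so host/device views stay consistent
        NDArray::registerSpecialUse({output}, {input});
    }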
@@ -96,13 +96,13 @@ namespace functions {
            }
            else {
                if(vx == vz) {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        z[xOffset] = OpType::op(x[xOffset], params);
                    }
                }
                else {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
                        z[zOffset] = OpType::op(x[xOffset], params);

@@ -94,13 +94,13 @@ namespace functions {
            }
            else {
                if(vx == vz) {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        z[xOffset] = OpType::op(x[xOffset], params);
                    }
                }
                else {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
                        z[zOffset] = OpType::op(x[xOffset], params);

@@ -96,13 +96,13 @@ namespace functions {
            }
            else {
                if(vx == vz) {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        z[xOffset] = OpType::op(x[xOffset], params);
                    }
                }
                else {
-                   for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) {
+                   for (Nd4jLong i = tid; i < length; i+= totalThreads) {
                        auto xOffset = shape::getIndexOffset(i, xShapeInfo, length);
                        auto zOffset = shape::getIndexOffset(i, zShapeInfo, length);
                        z[zOffset] = OpType::op(x[xOffset], params);
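All three hunks above apply the same change to different transform kernels: the loop stride gridDim.x * blockDim.x is replaced with a totalThreads value the surrounding kernel is assumed to compute once, so the multiplication is not redone on every iteration. A reduced sketch of the resulting grid-stride loop (names are illustrative):

    template <typename X, typename OpType>
    static __global__ void transformSketch(const X* x, X* z, Nd4jLong length, X* params) {
        // global thread id and total thread count, computed once per thread
        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto totalThreads = gridDim.x * blockDim.x;

        // grid-stride loop: thread `tid` handles elements tid, tid + totalThreads, ...
        for (Nd4jLong i = tid; i < length; i += totalThreads)
            z[i] = OpType::op(x[i], params);
    }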
@@ -30,7 +30,7 @@ namespace nd4j {

            shared[threadIdx.x] = 0;

+           // each thread will compare 2 elements: E and E+1
            for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) {
                auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)];
                auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)];

@@ -41,11 +41,12 @@ namespace nd4j {
                else
                    v = val1 >= val0;

+               // store comparison result in shared memory
                shared[threadIdx.x] += v ? 0 : 1;
            }
            __syncthreads();

-           // aggregate sum
+           // aggregate sums in shared memory
            for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
                if (threadIdx.x < activeThreads)
                    shared[threadIdx.x] += shared[threadIdx.x + activeThreads];

@@ -53,7 +54,7 @@ namespace nd4j {
            }


-           // store over the grid
+           // store over the grid if we have more than 1 block
            if (gridDim.x > 1) {

                auto tc = reinterpret_cast<unsigned int *>(reductionBuffer);

@@ -96,7 +97,7 @@ namespace nd4j {
                }
            }
            else {
+               // if we have only 1 block, we just store results right away
                if (threadIdx.x == 0) {
                    auto tc = reinterpret_cast<unsigned int*>(reductionBuffer);
                    tc[16384] = 0;
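The comments added in these hunks describe the usual two-stage reduction: each thread folds its share of comparisons into shared[threadIdx.x], the block then sums the shared array by repeatedly halving the number of active threads, and per-block results are combined through reductionBuffer (or written directly when there is only one block). A compact, self-contained sketch of the in-block part, assuming a power-of-two block size of 256 and an illustrative kernel name:

    __global__ void countDecreasesSketch(const float* x, Nd4jLong length, int* result) {
        __shared__ int shared[256];                      // assumes blockDim.x == 256
        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

        // each thread compares neighbouring elements E and E+1 and counts violations
        shared[threadIdx.x] = 0;
        for (Nd4jLong e = tid; e < length - 1; e += blockDim.x * gridDim.x)
            shared[threadIdx.x] += (x[e + 1] >= x[e]) ? 0 : 1;
        __syncthreads();

        // tree reduction over shared memory: halve the number of active threads each step
        for (unsigned int activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
            if (threadIdx.x < activeThreads)
                shared[threadIdx.x] += shared[threadIdx.x + activeThreads];
            __syncthreads();
        }

        // one thread per block publishes the block's partial sum
        if (threadIdx.x == 0)
            atomicAdd(result, shared[0]);
    }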
@@ -424,7 +424,7 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    }
    __syncthreads();

-   int tid = blockIdx.x * gridDim.x + threadIdx.x;
+   int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

@@ -519,7 +519,7 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape
    }
    __syncthreads();

-   int tid = blockIdx.x * gridDim.x + threadIdx.x;
+   int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {

@@ -610,7 +610,7 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn
    }
    __syncthreads();

-   int tid = blockIdx.x * gridDim.x + threadIdx.x;
+   int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
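The pooling hunks above fix the global thread index: blockIdx.x * gridDim.x scales the block index by the number of blocks, so with typical launch shapes different blocks land on overlapping element ranges while other elements are never visited; scaling by blockDim.x gives each block its own contiguous window. For reference (kernel name and buffers are illustrative):

    __global__ void poolingIndexSketch(const float* x, float* z, int length) {
        // wrong: blockIdx.x * gridDim.x scales the block index by the block COUNT,
        // so blocks collide on the same elements and skip others entirely
        // int tid = blockIdx.x * gridDim.x + threadIdx.x;

        // right: scale the block index by the block SIZE
        int tid = blockIdx.x * blockDim.x + threadIdx.x;

        for (int index = tid; index < length; index += blockDim.x * gridDim.x)
            z[index] = x[index];
    }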
@@ -39,7 +39,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
            }
            __syncthreads();

-           const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+           const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
            const auto step = gridDim.x * blockDim.x;
            for (int t = tid; t < inputLength; t += step) {
                z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX];

@@ -59,7 +59,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
            }
            __syncthreads();

-           const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+           const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
            const auto step = gridDim.x * blockDim.x;
            Nd4jLong i = threadIdx.x * (outputLength + 1);
            for (int t = tid; t < outputLength && i < inputLength; t += step) {

@@ -35,9 +35,11 @@ namespace helpers {
        T const* input = reinterpret_cast<T const*>(inputBuf);
        T* output = reinterpret_cast<T*>(outputBuf);

+       // trivial idea: loop through all elements, get independent probability for each element to be nullified
        for (Nd4jLong e = 0; e < inLen; ++e) {
            T val = nodeRng->relativeT(e, T(0.f), T(1.f));

+           // if probability is ok - we're saving scaled value
            if (double(val) < probVal)
                output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal);
        }
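The dropout comments above describe inverted dropout: every element independently survives with probability probVal (the random draw must fall below it), and surviving values are divided by probVal so the output keeps the same expected value as the input. A host-side sketch of the same rule in plain C++ (function name and RNG choice are illustrative):

    #include <random>
    #include <vector>

    // inverted dropout on a host vector; probVal is the keep probability
    std::vector<float> dropoutSketch(const std::vector<float>& in, double probVal, unsigned seed) {
        std::mt19937 rng(seed);
        std::uniform_real_distribution<double> uniform(0.0, 1.0);

        std::vector<float> out(in.size(), 0.0f);                   // dropped elements stay 0
        for (size_t e = 0; e < in.size(); ++e)
            if (uniform(rng) < probVal)                            // element survives ...
                out[e] = static_cast<float>(in[e] / probVal);      // ... and is scaled back up
        return out;
    }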
@@ -80,7 +82,7 @@ namespace helpers {
        std::vector<Nd4jLong> dims(reduceShape->lengthOf());
        reduceShape->syncToHost(); // to ensure that follows are actual
        bool fit = true;
-       // PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
        for( int i = 0; i < dims.size(); i++ ) {
            if (fit) {
                dims[i] = reduceShape->e<Nd4jLong>(i);

@@ -96,8 +98,7 @@ namespace helpers {
        REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank.");
        std::unique_ptr<NDArray> chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
        chunk->assign(1.f);
-       //chunk->applyRandom<randomOps::DropOutInverted<T>>(rng, nullptr, chunk.get(), &probValue);
-       //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
        dropoutSimple<T>(context.launchContext(), chunk.get(), chunk.get(), probValue, seed);
        // broadcast chunk to full matrix
        std::unique_ptr<NDArray> dropOutMultiplier(new NDArray(*input));

@@ -105,6 +106,7 @@ namespace helpers {

        *dropOutMultiplier += *chunk;

+       // FIXME: we could do this in one step, aren't we?
        output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr);
    }

@@ -113,8 +115,11 @@ namespace helpers {

    int dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
        auto xType = input->dataType();
+       NDArray::prepareSpecialUse({output}, {input});

        BUILD_SINGLE_SELECTOR(xType, return _dropOutFunctor, (context, input, output, reduceShape, seed, probValue), FLOAT_TYPES);

+       NDArray::registerSpecialUse({output}, {input});
    }

    /////////////////////////////////// backrpopagations ///////////////////////////////////////////////

@@ -136,6 +141,8 @@ namespace helpers {

        for (int e = tid; e < len; e += step) {
            const auto zOffset = shape::getIndexOffset(e, outputShape, len);

+           // if probability was non-zero on FF step, we'll scale grads back
            if (output[zOffset] != T(0.))
                output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue);

@@ -143,12 +150,17 @@ namespace helpers {
        }
        template <typename T>
        static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
+           // we're making additional FF run to see how probabilities played out with given seeds
            int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue);
            auto stream = context.launchContext()->getCudaStream();

+           NDArray::prepareSpecialUse({output}, {input, gradOut});

            if (ND4J_STATUS_OK == res)
                dropoutBPKernel<T><<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue);

+           NDArray::registerSpecialUse({output}, {input, gradOut});

            return res;
        }
@@ -239,6 +251,7 @@ namespace helpers {

        int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta);
        if (res == ND4J_STATUS_OK) {
+           // FIXME: can we make it single-loop?
            (*output) *= alpha;
            (*output) *= (*gradOut); //->applyPairwiseTransform<transform::Multiply>(gradOut, output, nullptr);
        }

@@ -43,7 +43,7 @@ namespace nd4j {
            }
            __syncthreads();

+           // we run things in blocks, 1 partition per block of threads
            for (Nd4jLong o = blockIdx.x; o < numOutputs; o += gridDim.x) {
                auto z = reinterpret_cast<X*>(vz[o]);

@@ -89,9 +89,11 @@ namespace nd4j {
            auto x = reinterpret_cast<X*>(vx);
            auto indices = reinterpret_cast<Y*>(vindices);

+           // we run things in blocks, 1 partition per block of threads
            for (int i = blockIdx.x; i < numOutputs; i += gridDim.x) {
                auto z = reinterpret_cast<X*>(vz[i]);

+               // each thread has own counter for partitions
                int outCnt = 0;

                for (Nd4jLong e = 0; e < iLength; e++) {

@@ -145,6 +147,7 @@ namespace nd4j {
                tadOffsets[i] = packZ.platformOffsets();
            }

+           // we copy pointers to device
            auto dOutBuffers = reinterpret_cast<void **>(pm.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void *)));
            auto dOutTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadShapes.data(), tadShapes.size() * sizeof(Nd4jLong *)));
            auto dOutTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(tadOffsets.data(), tadOffsets.size() * sizeof(Nd4jLong *)));

@@ -248,6 +251,7 @@ namespace nd4j {
                indicesShapes[e] = indices.at(e)->getSpecialShapeInfo();
            }

+           // copying pointers to buffers to device
            auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
            auto dIndicesBuffers = reinterpret_cast<void **>(pm.replicatePointer(indicesBuffers.data(), inputSize * sizeof(void *)));
            auto dInputShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputShapes.data(), inputSize * sizeof(Nd4jLong *)));

@@ -283,6 +287,7 @@ namespace nd4j {
                inputTadOffsets[e] = packX.platformOffsets();
            }

+           // copying pointers to buffers to device
            auto dInputBuffers = reinterpret_cast<void **>(pm.replicatePointer(inputBuffers.data(), inputSize * sizeof(void *)));
            auto dInputTadShapes = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadShapes.data(), inputSize * sizeof(Nd4jLong *)));
            auto dInputTadOffsets = reinterpret_cast<Nd4jLong **>(pm.replicatePointer(inputTadOffsets.data(), inputSize * sizeof(Nd4jLong *)));

@@ -313,6 +318,7 @@ namespace nd4j {

            NDArray::registerSpecialUse({}, {indices, input});

+           // TODO: it would be nice to have NDArray::registerSpecialUse signature that accepts something else beyond initializer_list
            for (auto v:outputList) {
                v->tickWriteDevice();
            }
@@ -29,6 +29,7 @@ namespace nd4j {

        Nd4jLong xCoord[MAX_RANK];

+       // each block of threads works on 1 input array
        for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
            auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];

@@ -39,6 +40,7 @@ namespace nd4j {
            auto xRank = shape::rank(xShapeInfo);
            auto xLength = shape::length(xShapeInfo);

+           // each element of this input array has own place within common output array
            for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
                shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
                auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);

@@ -65,6 +67,7 @@ namespace nd4j {
            hdShapes[e] = inputs[e]->specialShapeInfo();
        }

+       // copying pointers to device
        auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
        auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
        auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));

@@ -76,6 +79,7 @@ namespace nd4j {
    }

    void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
+       // FIXME: we want NDArrayFactory::prepareSpecialUse here eventually
        for (auto v:inputs)
            v->syncToDevice();

@@ -26,6 +26,7 @@ namespace ops {
    namespace helpers {
        template <typename T>
        void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) {
+           // classic one
            auto lambda = LAMBDA_TT(_x, _y, weight) {
                return _x - (_y * weight);
            };

@@ -44,6 +44,7 @@ namespace nd4j {

        X binSize = X((*max_val - *min_val) / numBins);

+       // nullify bins
        for (int e = threadIdx.x; e < numBins; e += blockDim.x) {
            bins[e] = (Z) 0;
        }
@@ -53,14 +54,12 @@ namespace nd4j {
            int idx = int((dx[e] - *min_val) / binSize);
            idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0);
            idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1));
-           nd4j::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1);
-           // bins[idx]++;
+           nd4j::math::atomics::nd4j_atomicAdd<Z>(&bins[idx], (Z)1);
        }
        __syncthreads();
+       // at this point all bins in shared memory are calculated, so we aggregate them now via threadfence trick

        // transfer shared memory to reduction memory

        if (gridDim.x > 1) {
            unsigned int *tc = (unsigned int *)reductionPointer;
            __shared__ bool amLast;
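The histogram hunk makes the template argument on nd4j_atomicAdd<Z> explicit so the overload matches the bin type, and drops the dead bins[idx]++ comment; the atomic is what keeps concurrent increments of the same shared-memory bin from losing updates. A reduced sketch of that shared-memory histogram stage, using plain CUDA atomicAdd for illustration (the grid-wide threadfence aggregation from the real kernel is replaced here by a simple global atomic merge):

    __global__ void histogramSketch(const float* dx, Nd4jLong length,
                                    float minVal, float binSize, int numBins, int* bins) {
        extern __shared__ int sharedBins[];              // one counter per bin, sized at launch

        // nullify bins
        for (int e = threadIdx.x; e < numBins; e += blockDim.x)
            sharedBins[e] = 0;
        __syncthreads();

        // every thread bins its elements; atomicAdd avoids lost updates on shared counters
        const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        for (Nd4jLong e = tid; e < length; e += blockDim.x * gridDim.x) {
            int idx = int((dx[e] - minVal) / binSize);
            idx = max(0, min(idx, numBins - 1));
            atomicAdd(&sharedBins[idx], 1);
        }
        __syncthreads();

        // merge the per-block histogram into the global one
        for (int e = threadIdx.x; e < numBins; e += blockDim.x)
            atomicAdd(&bins[e], sharedBins[e]);
    }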
@@ -64,6 +64,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray*

    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());

+   // we launch legacy IndexMax op, to get indices of max values along dimension
    auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);

    dim3 launchDims(256, 256, 16384);

@@ -41,12 +41,12 @@ namespace helpers {
    const T tbeta = static_cast<T>(beta);
    const T talpha = static_cast<T>(alpha);

+   // one block of threads processes 1 example within batch
    for (uint i = blockIdx.x; i < numTads; i += gridDim.x) {
        auto x = reinterpret_cast<T*>(vx) + xTadOffsets[i];
        auto z = reinterpret_cast<T*>(vz) + zTadOffsets[i];

-       // load everything into shared memory
+       // load everything into shared memory, so we'll operate on shared memory from now on
        shared[threadIdx.x] = x[threadIdx.x * xEws];
        __syncthreads();

@@ -94,7 +94,7 @@ namespace helpers {
    sharedY[threadIdx.x] = 0.f;
    __syncthreads();

+   // we're operating in shared memory
    for (int s = begin; s < end; s++)
        sharedY[threadIdx.x] = sharedY[threadIdx.x] + sharedX[s] * sharedX[s];
    __syncthreads();

@@ -37,7 +37,7 @@ namespace nd4j {
    static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
        auto output = reinterpret_cast<Z*>(voutput);

-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for (Nd4jLong e = tid; e < length; e += step) {

@@ -81,7 +81,13 @@ namespace nd4j {
    }

    void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+       NDArray::prepareSpecialUse({&output}, {});
+       for (auto v:inArrs)
+           v->syncToDevice();

        BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);

+       NDArray::registerSpecialUse({&output}, {});
    }

@@ -90,7 +96,7 @@ namespace nd4j {
    static __global__ void global_mergeMax_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
        auto output = reinterpret_cast<T*>(voutput);

-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for (Nd4jLong e = tid; e < length; e += step) {

@@ -131,7 +137,12 @@ namespace nd4j {
    }

    void mergeMax(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+       NDArray::prepareSpecialUse({&output}, {});
+       for (auto v:inArrs)
+           v->syncToDevice();

        BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES);
+       NDArray::registerSpecialUse({&output}, {});
    }

    //////////////////////////////////////////////////////////////////////////

@@ -139,7 +150,7 @@ namespace nd4j {
    static __global__ void global_mergeAvg_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
        auto output = reinterpret_cast<T*>(voutput);

-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for (Nd4jLong e = tid; e < length; e += step) {

@@ -178,7 +189,13 @@ namespace nd4j {
    }

    void mergeAvg(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+       NDArray::prepareSpecialUse({&output}, {});
+       for (auto v:inArrs)
+           v->syncToDevice();

        BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES);

+       NDArray::registerSpecialUse({&output}, {});
    }

    //////////////////////////////////////////////////////////////////////////

@@ -186,7 +203,7 @@ namespace nd4j {
    static __global__ void global_mergeAdd_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) {
        auto output = reinterpret_cast<T*>(voutput);

-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for (Nd4jLong e = tid; e < length; e += step) {

@@ -226,7 +243,13 @@ namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output), NUMERIC_TYPES);

    void mergeAdd(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output) {
+       NDArray::prepareSpecialUse({&output}, {});
+       for (auto v:inArrs)
+           v->syncToDevice();

        BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES);

+       NDArray::registerSpecialUse({&output}, {});
    }
}
}
@@ -31,18 +31,18 @@ namespace helpers {

    template <typename T>
    static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) {
-       __shared__ T *z, *x;
        __shared__ Nd4jLong bufferLength, arrLen;

+       auto z = reinterpret_cast<T*>(outputBuffer);
+       auto x = reinterpret_cast<T*>(inputBuffer);

        if (threadIdx.x == 0) {
-           z = reinterpret_cast<T*>(outputBuffer);
-           x = reinterpret_cast<T*>(inputBuffer);
            arrLen = shape::length(pTadShape);
            bufferLength = shape::length(outputShapeInfo);
        }
        __syncthreads();

-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;
        for (int t = tid; t < bufferLength; t += step) {
            auto tX = x + pTadOffsets[t];
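One detail of the rewrite above: z and x used to be __shared__ pointers filled in by thread 0 behind a barrier, even though every thread can derive them directly from the kernel arguments; only the values that genuinely need a single computation (arrLen, bufferLength) stay in shared memory. A stripped-down sketch of the resulting shape (kernel name and element type are illustrative):

    __global__ void fillSketch(void* outputBuffer, Nd4jLong length, float value) {
        // per-thread local: derived from the argument, no shared memory or extra __syncthreads() needed
        auto z = reinterpret_cast<float*>(outputBuffer);

        const auto tid  = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;
        for (Nd4jLong t = tid; t < length; t += step)
            z[t] = value;
    }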
@@ -77,8 +77,6 @@ namespace helpers {
        // manager.synchronize();
        sortedVals.tickWriteDevice();
        sortedVals.syncToHost();
-       sortedVals.printIndexedBuffer("Hello");
-       sortedVals.printBuffer("Hello line");
        auto stream = context->getCudaStream();
        fillUpElementKernel<T><<<32, 64, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), pTadShape, pTadOffsets, n);
    }

@@ -74,17 +74,14 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB
    ///////////////////////////////////////////////////////////////////
    void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) {

-       if(!n.isActualOnDeviceSide()) n.syncToDevice();
-       if(!x.isActualOnDeviceSide()) x.syncToDevice();
+       NDArray::prepareSpecialUse({&z}, {&n, &x});

        int threadsPerBlock = MAX_NUM_THREADS;
        int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;

        BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES);

-       n.tickReadHost();
-       x.tickReadHost();
-       z.tickWriteDevice();
+       NDArray::registerSpecialUse({&z}, {&n, &x});
    }

    BUILD_SINGLE_TEMPLATE(template void polyGammaCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void *vn, const Nd4jLong *nShapeInfo, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo), FLOAT_TYPES);

@@ -28,7 +28,7 @@ namespace helpers {
    template <typename T>
    static __global__ void global_range(void *output, Nd4jLong length, T start, T delta) {
        auto buff = reinterpret_cast<T*>(output);
-       const auto tid = blockIdx.x * gridDim.x + threadIdx.x;
+       const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = gridDim.x * blockDim.x;

        for(Nd4jLong i = tid; i < length; i += step)

@@ -43,10 +43,11 @@ namespace helpers {
    }

    void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) {
+       NDArray::prepareSpecialUse({&outVector}, {&start, &delta});
        BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES);
+       NDArray::registerSpecialUse({&outVector}, {&start, &delta});
    }

-   BUILD_SINGLE_TEMPLATE(template void _range, (nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector), NUMERIC_TYPES);
    }
    }
    }

@@ -26,13 +26,11 @@ namespace nd4j {
    namespace helpers {
        template<typename T>
        void toggle_bits__(NDArray &in, NDArray &out) {
-           NDArray::prepareSpecialUse({&out}, {&in});
            auto lambda = LAMBDA_T(_x) {
                return ~_x;//eUtils::flip_bits(_x);
            };

            in.applyLambda(lambda, &out);
-           NDArray::registerSpecialUse({&out}, {&in});
        }
        BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES);