[WIP] few more fixes (#182)

* one noop test

Signed-off-by: raver119 <raver119@gmail.com>

* skip input validation for no-input ops

Signed-off-by: raver119 <raver119@gmail.com>

* - one more noop empty test
- one more validation before sync

Signed-off-by: raver119 <raver119@gmail.com>

* typo

Signed-off-by: raver119 <raver119@gmail.com>

* one more validation fix

Signed-off-by: raver119 <raver119@gmail.com>

* CUDA empty reductions java side

Signed-off-by: raver119 <raver119@gmail.com>

* one svd test

Signed-off-by: raver119 <raver119@gmail.com>

* Corrected segment_mean helpers and added another test.

* Refactored segment_mean kernels to avoid race_condition.
master
raver119 2019-08-27 21:00:38 +03:00 committed by GitHub
parent 2144941313
commit b472d7d8c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 86 additions and 13 deletions

View File

@ -2728,11 +2728,13 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte
throw nd4j::cuda_exception::build("customOp execution failed", res); throw nd4j::cuda_exception::build("customOp execution failed", res);
for (auto v:context->fastpath_in()) { for (auto v:context->fastpath_in()) {
v->syncToDevice(); if (!v->isEmpty())
v->syncToDevice();
} }
for (auto v:context->fastpath_out()) { for (auto v:context->fastpath_out()) {
v->syncToDevice(); if (!v->isEmpty())
v->syncToDevice();
} }
return result; return result;

View File

@ -74,14 +74,14 @@ namespace helpers {
template <typename T, typename I> template <typename T, typename I>
static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) {
__shared__ T* val; __shared__ T* val;
__shared__ Nd4jLong xLen, zLen, segment, zIndex; __shared__ Nd4jLong xLen, zLen, zIndex;
__shared__ T* x; __shared__ T* x;
__shared__ T* z; __shared__ T* z;
__shared__ I* y; //int threadsPerSegment, start, finish; __shared__ I* y; //int threadsPerSegment, start, finish;
auto segment = blockIdx.x;// /
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; // threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses;
segment = blockIdx.x;// / threadsPerSegment; // threadsPerSegment;
x = reinterpret_cast<T*>(input); x = reinterpret_cast<T*>(input);
z = reinterpret_cast<T*>(output); z = reinterpret_cast<T*>(output);
y = reinterpret_cast<I*>(indices); y = reinterpret_cast<I*>(indices);
@ -117,12 +117,12 @@ namespace helpers {
template <typename T, typename I> template <typename T, typename I>
static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) {
__shared__ T* val; __shared__ T* val;
__shared__ Nd4jLong len, segment, zIndex, total; __shared__ Nd4jLong len, zIndex, total;
__shared__ T* z; __shared__ T* z;
__shared__ int threadsPerSegment, start, finish; __shared__ int threadsPerSegment, start, finish;
auto segment = indices[blockIdx.x]; // / threadsPerSegment;
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
segment = indices[blockIdx.x]; // / threadsPerSegment;
z = reinterpret_cast<T*>(outputBuf) + outputTadOffsets[segment]; z = reinterpret_cast<T*>(outputBuf) + outputTadOffsets[segment];
len = shape::length(inputTads); len = shape::length(inputTads);
start = starts[segment]; start = starts[segment];
@ -139,7 +139,7 @@ namespace helpers {
for (auto e = threadIdx.x; e < len; e += blockDim.x) { for (auto e = threadIdx.x; e < len; e += blockDim.x) {
auto xIndex = shape::getIndexOffset(e, inputTads, len); auto xIndex = shape::getIndexOffset(e, inputTads, len);
auto zIndex = shape::getIndexOffset(e, outputTads, len); auto zIndex = shape::getIndexOffset(e, outputTads, len);
z[zIndex] = T(x[xIndex]/lengths[segment]); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment]));
} }
} }
else { else {
@ -197,7 +197,6 @@ namespace helpers {
static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) {
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();
// NDArray classes = NDArrayFactory::create<int>('c', {numOfClasses, 2}); // NDArray classes = NDArrayFactory::create<int>('c', {numOfClasses, 2});
NDArray::prepareSpecialUse({output}, {input, indices});
NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses}); NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses});
NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses});
@ -226,7 +225,6 @@ namespace helpers {
dims.x = input->sizeAt(0); dims.x = input->sizeAt(0);
segmentMeanTadKernel<T,I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); segmentMeanTadKernel<T,I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets);
} }
NDArray::registerSpecialUse({output}, {input, indices});
} }
// -------------------------------------------------------------------------------------------------------------- // // -------------------------------------------------------------------------------------------------------------- //
@ -234,7 +232,7 @@ namespace helpers {
NDArray::prepareSpecialUse({output}, {input, indices}); NDArray::prepareSpecialUse({output}, {input, indices});
BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output), BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output),
NUMERIC_TYPES, INDEXING_TYPES); NUMERIC_TYPES, INDEXING_TYPES);
NDArray::prepareSpecialUse({output}, {input, indices}); NDArray::registerSpecialUse({output}, {input, indices});
} }
// -------------------------------------------------------------------------------------------------------------- // // -------------------------------------------------------------------------------------------------------------- //

View File

@ -372,6 +372,9 @@ namespace nd4j {
if (_descriptor->isSameMode()) { if (_descriptor->isSameMode()) {
if (index >= block.width()) { if (index >= block.width()) {
if (block.fastpath_in().size() == 0)
continue;
auto ia = block.fastpath_in()[0]; auto ia = block.fastpath_in()[0];
if (ia->dataType() != cType) { if (ia->dataType() != cType) {
@ -423,6 +426,9 @@ namespace nd4j {
if (_descriptor->isSameMode()) { if (_descriptor->isSameMode()) {
if (index >= block.width()) { if (index >= block.width()) {
if (block.width() == 0)
continue;
auto iv = block.variable(0); auto iv = block.variable(0);
if (iv->getNDArray()->dataType() != cType) { if (iv->getNDArray()->dataType() != cType) {

View File

@ -88,3 +88,33 @@ TEST_F(DeclarableOpsTests16, test_size_dtype_1) {
ASSERT_EQ(e, z); ASSERT_EQ(e, z);
} }
TEST_F(DeclarableOpsTests16, test_empty_noop_1) {
auto z = NDArrayFactory::empty<Nd4jLong>();
nd4j::ops::noop op;
auto status = op.execute({}, {&z}, {}, {}, {});
ASSERT_EQ(Status::OK(), status);
}
TEST_F(DeclarableOpsTests16, test_empty_noop_2) {
auto z = NDArrayFactory::empty<Nd4jLong>();
Context ctx(1);
ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo());
nd4j::ops::noop op;
auto status = op.execute(&ctx);
ASSERT_EQ(Status::OK(), status);
}
TEST_F(DeclarableOpsTests16, test_svd_1) {
auto x = NDArrayFactory::create<float>('c', {3, 3}, {0.7787856f, 0.80119777f, 0.72437465f, 0.23089433f, 0.72714126f, 0.18039072f,0.50563407f, 0.89252293f, 0.5461209f});
auto z = NDArrayFactory::create<float>('c', {3});
nd4j::ops::svd op;
auto status = op.execute({&x}, {&z}, {}, {0, 0, 16}, {});
ASSERT_EQ(Status::OK(), status);
}

View File

@ -1459,6 +1459,25 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) {
delete result; delete result;
} }
TEST_F(DeclarableOpsTests7, TestSegmentMean_022) {
auto x = NDArrayFactory::create<float>('c', {6, 3});//, {1, 2, 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18.});
auto idx = NDArrayFactory::create<int>({0, 0, 1, 1, 2,2});
auto z = NDArrayFactory::create<float>('c', {3, 3}); //, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5});
auto exp = NDArrayFactory::create<float>('c', {3, 3}, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5});
nd4j::ops::segment_mean op;
x.linspace(1.);
auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
ASSERT_EQ(result, Status::OK());
exp.printIndexedBuffer("Expect Mean");
z.printIndexedBuffer("Output Mean");
// exp.printShapeInfo("Exp Shape");
ASSERT_TRUE(exp.equalsTo(z));
// delete result;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) {
auto x = NDArrayFactory::create<double>('c', {4, 4}, {1.8, 2.5, 4., 9.,2.1, 2.4, 3., 9.,2.1, 2.1, 0.7, 0.1,3., 4.2, 2.2, 1.}); auto x = NDArrayFactory::create<double>('c', {4, 4}, {1.8, 2.5, 4., 9.,2.1, 2.4, 3., 9.,2.1, 2.1, 0.7, 0.1,3., 4.2, 2.2, 1.});

View File

@ -40,6 +40,7 @@ import org.nd4j.jita.workspace.CudaWorkspace;
import org.nd4j.linalg.api.buffer.BaseDataBuffer; import org.nd4j.linalg.api.buffer.BaseDataBuffer;
import org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.buffer.Utf8Buffer;
import org.nd4j.linalg.api.memory.enums.MemoryKind; import org.nd4j.linalg.api.memory.enums.MemoryKind;
import org.nd4j.linalg.api.memory.pointers.PagedPointer; import org.nd4j.linalg.api.memory.pointers.PagedPointer;
import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ndarray.INDArray;
@ -284,10 +285,16 @@ public class AtomicAllocator implements Allocator {
*/ */
@Override @Override
public Pointer getPointer(@NonNull DataBuffer buffer, CudaContext context) { public Pointer getPointer(@NonNull DataBuffer buffer, CudaContext context) {
if (buffer instanceof Utf8Buffer)
return null;
return memoryHandler.getDevicePointer(buffer, context); return memoryHandler.getDevicePointer(buffer, context);
} }
public Pointer getPointer(DataBuffer buffer) { public Pointer getPointer(DataBuffer buffer) {
if (buffer instanceof Utf8Buffer)
return null;
return memoryHandler.getDevicePointer(buffer, getDeviceContext()); return memoryHandler.getDevicePointer(buffer, getDeviceContext());
} }

View File

@ -961,12 +961,12 @@ public class CudaExecutioner extends DefaultOpExecutioner {
if (CudaEnvironment.getInstance().getConfiguration().isDebug()) if (CudaEnvironment.getInstance().getConfiguration().isDebug())
lastOp.set(op.opName()); lastOp.set(op.opName());
val tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), dimension); val tadBuffers = op.x().isEmpty() ? Pair.<DataBuffer, DataBuffer>makePair(op.x().data(), null) : tadManager.getTADOnlyShapeInfo(op.x(), dimension);
val hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst()); val hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
val devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context); val devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
val offsets = tadBuffers.getSecond(); val offsets = op.x().isEmpty() ? null : tadBuffers.getSecond();
val devTadOffsets = offsets == null ? null : AtomicAllocator.getInstance().getPointer(offsets, context); val devTadOffsets = offsets == null ? null : AtomicAllocator.getInstance().getPointer(offsets, context);
Pointer x = AtomicAllocator.getInstance().getPointer(op.x(), context); Pointer x = AtomicAllocator.getInstance().getPointer(op.x(), context);

View File

@ -309,6 +309,17 @@ public class EmptyTests extends BaseNd4jTest {
assertEquals(x, out); assertEquals(x, out);
} }
@Test
public void testEmptyNoop() {
val output = Nd4j.empty(DataType.LONG);
val op = DynamicCustomOp.builder("noop")
.addOutputs(output)
.build();
Nd4j.exec(op);
}
@Override @Override
public char ordering() { public char ordering() {
return 'c'; return 'c';