[WIP] few more fixes (#182)
* one noop test
* skip input validation for no-input ops
* one more noop empty test; one more validation before sync
* typo
* one more validation fix
* CUDA empty reductions, Java side
* one svd test
* Corrected segment_mean helpers and added another test.
* Refactored segment_mean kernels to avoid race condition.

Signed-off-by: raver119 <raver119@gmail.com>
parent 2144941313
commit b472d7d8c8
@@ -2728,11 +2728,13 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte
             throw nd4j::cuda_exception::build("customOp execution failed", res);
 
         for (auto v:context->fastpath_in()) {
-            v->syncToDevice();
+            if (!v->isEmpty())
+                v->syncToDevice();
         }
 
         for (auto v:context->fastpath_out()) {
-            v->syncToDevice();
+            if (!v->isEmpty())
+                v->syncToDevice();
         }
 
         return result;
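The new `if (!v->isEmpty())` guard skips host-to-device synchronization for empty arrays, which have no device buffer to transfer. A minimal standalone sketch of the same pattern, using a hypothetical Array type rather than libnd4j's NDArray:

#include <iostream>
#include <vector>

// Hypothetical stand-in for an ND array with an optional device buffer.
struct Array {
    std::vector<float> host;              // host-side data; an empty array has none
    bool isEmpty() const { return host.empty(); }
    void syncToDevice() {
        // Real code would issue a device copy here; an empty array has nothing
        // to transfer, and touching its (null) buffers would be invalid.
        std::cout << "synced " << host.size() << " elements\n";
    }
};

void syncInputs(const std::vector<Array*>& fastpathIn) {
    for (auto v : fastpathIn) {
        if (!v->isEmpty())                // same guard the patch adds around syncToDevice()
            v->syncToDevice();
    }
}

int main() {
    Array a{{1.f, 2.f, 3.f}}, empty{};
    std::vector<Array*> inputs{&a, &empty};
    syncInputs(inputs);                   // only "a" is synced; "empty" is skipped
}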
@@ -74,14 +74,14 @@ namespace helpers {
     template <typename T, typename I>
     static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) {
         __shared__ T* val;
-        __shared__ Nd4jLong xLen, zLen, segment, zIndex;
+        __shared__ Nd4jLong xLen, zLen, zIndex;
         __shared__ T* x;
         __shared__ T* z;
         __shared__ I* y; //int threadsPerSegment, start, finish;
-
+        auto segment = blockIdx.x;
         if (threadIdx.x == 0) {
 //            threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses;
-            segment = blockIdx.x;// / threadsPerSegment;
+//            threadsPerSegment;
             x = reinterpret_cast<T*>(input);
             z = reinterpret_cast<T*>(output);
             y = reinterpret_cast<I*>(indices);
@@ -117,12 +117,12 @@ namespace helpers {
     template <typename T, typename I>
     static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) {
         __shared__ T* val;
-        __shared__ Nd4jLong len, segment, zIndex, total;
+        __shared__ Nd4jLong len, zIndex, total;
         __shared__ T* z;
         __shared__ int threadsPerSegment, start, finish;
+        auto segment = indices[blockIdx.x]; // / threadsPerSegment;
 
         if (threadIdx.x == 0) {
-            segment = indices[blockIdx.x]; // / threadsPerSegment;
             z = reinterpret_cast<T*>(outputBuf) + outputTadOffsets[segment];
             len = shape::length(inputTads);
             start = starts[segment];
@@ -139,7 +139,7 @@ namespace helpers {
             for (auto e = threadIdx.x; e < len; e += blockDim.x) {
                 auto xIndex = shape::getIndexOffset(e, inputTads, len);
                 auto zIndex = shape::getIndexOffset(e, outputTads, len);
-                z[zIndex] = T(x[xIndex]/lengths[segment]);
+                nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment]));
             }
         }
         else {
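Replacing the plain store with nd4j_atomicAdd is the race-condition fix named in the commit message: segment now comes from indices[blockIdx.x], so several blocks can feed the same output segment and would otherwise overwrite each other's partial results in z[zIndex]. A reduced CUDA sketch of the same accumulation pattern (not the library kernel, and assuming z is zero-initialized before the launch):

#include <cstdio>

// Toy version of the mean-by-segment accumulation: one block per input row,
// several rows may share an output segment, so the partial contributions must
// be combined atomically instead of overwriting each other.
__global__ void segmentMeanToy(const float* x, const int* segments,
                               const int* lengths, float* z, int rowLen) {
    int segment = segments[blockIdx.x];               // which output row this block feeds
    const float* row = x + blockIdx.x * rowLen;
    float* out = z + segment * rowLen;

    for (int e = threadIdx.x; e < rowLen; e += blockDim.x) {
        // A plain "out[e] = row[e] / lengths[segment]" would race between blocks
        // that share a segment; atomicAdd accumulates the scaled contributions.
        atomicAdd(&out[e], row[e] / lengths[segment]);
    }
}

int main() {
    // Launch sketch only: real code would allocate and fill device buffers,
    // zero-initialize z, launch, then copy the result back, e.g.
    // segmentMeanToy<<<numRows, 128>>>(dX, dSegments, dLengths, dZ, rowLen);
    return 0;
}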
@@ -197,7 +197,6 @@ namespace helpers {
     static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) {
         auto stream = context->getCudaStream();
 //        NDArray classes = NDArrayFactory::create<int>('c', {numOfClasses, 2});
-        NDArray::prepareSpecialUse({output}, {input, indices});
 
         NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses});
         NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses});
@@ -226,7 +225,6 @@ namespace helpers {
             dims.x = input->sizeAt(0);
             segmentMeanTadKernel<T,I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets);
         }
-        NDArray::registerSpecialUse({output}, {input, indices});
 
     }
     // -------------------------------------------------------------------------------------------------------------- //
@@ -234,7 +232,7 @@ namespace helpers {
         NDArray::prepareSpecialUse({output}, {input, indices});
         BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output),
                               NUMERIC_TYPES, INDEXING_TYPES);
-        NDArray::prepareSpecialUse({output}, {input, indices});
+        NDArray::registerSpecialUse({output}, {input, indices});
     }
 
     // -------------------------------------------------------------------------------------------------------------- //
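The three hunks above also straighten out the special-buffer bookkeeping: the type-specialized helper no longer calls prepareSpecialUse/registerSpecialUse itself, and the dispatching wrapper now closes with registerSpecialUse instead of calling prepareSpecialUse a second time. A schematic sketch of the intended pairing, with hypothetical stub functions standing in for the NDArray statics:

#include <initializer_list>
#include <cstdio>

// Hypothetical stubs illustrating the pairing the patch restores: prepare the
// device buffers before the launch, register (not a second prepare) after it.
struct Arr {};

static void prepareSpecialUse(std::initializer_list<Arr*> writeList, std::initializer_list<Arr*> readList) {
    std::printf("prepare: %zu outputs, %zu inputs readied on device\n", writeList.size(), readList.size());
}

static void registerSpecialUse(std::initializer_list<Arr*> writeList, std::initializer_list<Arr*> readList) {
    std::printf("register: %zu outputs marked as written on device\n", writeList.size());
}

static void unsortedSegmentMeanFunctor(Arr* input, Arr* indices, Arr* output) {
    prepareSpecialUse({output}, {input, indices});
    // ... type dispatch and kernel launches happen here ...
    registerSpecialUse({output}, {input, indices});   // previously a mistaken second prepare
}

int main() {
    Arr in, idx, out;
    unsortedSegmentMeanFunctor(&in, &idx, &out);
}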
@@ -372,6 +372,9 @@ namespace nd4j {
             if (_descriptor->isSameMode()) {
 
                 if (index >= block.width()) {
+                    if (block.fastpath_in().size() == 0)
+                        continue;
+
                     auto ia = block.fastpath_in()[0];
 
                     if (ia->dataType() != cType) {
@@ -423,6 +426,9 @@ namespace nd4j {
             if (_descriptor->isSameMode()) {
 
                 if (index >= block.width()) {
+                    if (block.width() == 0)
+                        continue;
+
                     auto iv = block.variable(0);
 
                     if (iv->getNDArray()->dataType() != cType) {
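Both validation hunks protect the SAME-mode type check against ops with no inputs at all (for example, noop writing only an empty output): once index >= block.width(), the code falls back to the first input, which only exists when the fastpath/variable list is non-empty. A minimal sketch of the guard-before-index pattern, with hypothetical names:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the op context and array types.
struct Array { int dtype; };
struct Block {
    std::vector<Array*> fastpathIn;
    size_t width() const { return fastpathIn.size(); }
};

// Returns true if the candidate dtype is acceptable under SAME mode.
bool sameModeAllows(const Block& block, size_t index, int expectedType) {
    if (index >= block.width()) {
        // Without this guard, an op with zero inputs (e.g. noop with only an
        // empty output) would index fastpathIn[0] on an empty vector.
        if (block.fastpathIn.empty())
            return true;                           // nothing to compare against, skip the check

        return block.fastpathIn[0]->dtype == expectedType;
    }
    return true;  // in-range indices are validated elsewhere in this sketch
}

int main() {
    Block empty{};
    std::printf("%d\n", sameModeAllows(empty, 0, 5));  // prints 1, no out-of-bounds access
}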
@@ -88,3 +88,33 @@ TEST_F(DeclarableOpsTests16, test_size_dtype_1) {
 
     ASSERT_EQ(e, z);
 }
+
+TEST_F(DeclarableOpsTests16, test_empty_noop_1) {
+    auto z = NDArrayFactory::empty<Nd4jLong>();
+
+    nd4j::ops::noop op;
+    auto status = op.execute({}, {&z}, {}, {}, {});
+    ASSERT_EQ(Status::OK(), status);
+}
+
+TEST_F(DeclarableOpsTests16, test_empty_noop_2) {
+    auto z = NDArrayFactory::empty<Nd4jLong>();
+
+    Context ctx(1);
+    ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo());
+
+    nd4j::ops::noop op;
+    auto status = op.execute(&ctx);
+
+    ASSERT_EQ(Status::OK(), status);
+}
+
+TEST_F(DeclarableOpsTests16, test_svd_1) {
+    auto x = NDArrayFactory::create<float>('c', {3, 3}, {0.7787856f, 0.80119777f, 0.72437465f, 0.23089433f, 0.72714126f, 0.18039072f, 0.50563407f, 0.89252293f, 0.5461209f});
+    auto z = NDArrayFactory::create<float>('c', {3});
+
+    nd4j::ops::svd op;
+    auto status = op.execute({&x}, {&z}, {}, {0, 0, 16}, {});
+
+    ASSERT_EQ(Status::OK(), status);
+}
@@ -1459,6 +1459,25 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) {
     delete result;
 }
 
+TEST_F(DeclarableOpsTests7, TestSegmentMean_022) {
+    auto x = NDArrayFactory::create<float>('c', {6, 3});
+    auto idx = NDArrayFactory::create<int>({0, 0, 1, 1, 2, 2});
+    auto z = NDArrayFactory::create<float>('c', {3, 3});
+    auto exp = NDArrayFactory::create<float>('c', {3, 3}, {2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5});
+
+    nd4j::ops::segment_mean op;
+    x.linspace(1.);
+    auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
+    ASSERT_EQ(result, Status::OK());
+
+    exp.printIndexedBuffer("Expect Mean");
+    z.printIndexedBuffer("Output Mean");
+//    exp.printShapeInfo("Exp Shape");
+    ASSERT_TRUE(exp.equalsTo(z));
+
+//    delete result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) {
     auto x = NDArrayFactory::create<double>('c', {4, 4}, {1.8, 2.5, 4., 9., 2.1, 2.4, 3., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.});
@@ -40,6 +40,7 @@ import org.nd4j.jita.workspace.CudaWorkspace;
 import org.nd4j.linalg.api.buffer.BaseDataBuffer;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.buffer.Utf8Buffer;
 import org.nd4j.linalg.api.memory.enums.MemoryKind;
 import org.nd4j.linalg.api.memory.pointers.PagedPointer;
 import org.nd4j.linalg.api.ndarray.INDArray;
@@ -284,10 +285,16 @@ public class AtomicAllocator implements Allocator {
      */
     @Override
     public Pointer getPointer(@NonNull DataBuffer buffer, CudaContext context) {
+        if (buffer instanceof Utf8Buffer)
+            return null;
+
         return memoryHandler.getDevicePointer(buffer, context);
     }
 
     public Pointer getPointer(DataBuffer buffer) {
+        if (buffer instanceof Utf8Buffer)
+            return null;
+
         return memoryHandler.getDevicePointer(buffer, getDeviceContext());
     }
 
@@ -961,12 +961,12 @@ public class CudaExecutioner extends DefaultOpExecutioner {
         if (CudaEnvironment.getInstance().getConfiguration().isDebug())
             lastOp.set(op.opName());
 
-        val tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), dimension);
+        val tadBuffers = op.x().isEmpty() ? Pair.<DataBuffer, DataBuffer>makePair(op.x().data(), null) : tadManager.getTADOnlyShapeInfo(op.x(), dimension);
 
         val hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst());
         val devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context);
 
-        val offsets = tadBuffers.getSecond();
+        val offsets = op.x().isEmpty() ? null : tadBuffers.getSecond();
         val devTadOffsets = offsets == null ? null : AtomicAllocator.getInstance().getPointer(offsets, context);
 
         Pointer x = AtomicAllocator.getInstance().getPointer(op.x(), context);
@@ -309,6 +309,17 @@ public class EmptyTests extends BaseNd4jTest {
         assertEquals(x, out);
     }
 
+    @Test
+    public void testEmptyNoop() {
+        val output = Nd4j.empty(DataType.LONG);
+
+        val op = DynamicCustomOp.builder("noop")
+                .addOutputs(output)
+                .build();
+
+        Nd4j.exec(op);
+    }
+
     @Override
     public char ordering() {
         return 'c';