[WIP] maxpool2d_bp fix (#160)

* one test for maxpool2d_bp

Signed-off-by: raver119 <raver119@gmail.com>

* - maxpool2d_bp cuda fix for NaNs
- streamSync after each custom op execution

Signed-off-by: raver119 <raver119@gmail.com>
master
raver119 2019-08-24 09:20:57 +03:00 committed by GitHub
parent 95b2686ce5
commit f8364997c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 38 deletions

View File

@ -866,9 +866,10 @@ void initializeFunctions(Nd4jPointer *functions) {
Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) { Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) {
Nd4jPointer pointer; Nd4jPointer pointer;
// cudaHostAllocMapped |cudaHostAllocPortable // cudaHostAllocMapped |cudaHostAllocPortable
cudaError_t res = cudaHostAlloc(reinterpret_cast<void **>(&pointer), memorySize, cudaHostAllocDefault); auto res = cudaHostAlloc(reinterpret_cast<void **>(&pointer), memorySize, cudaHostAllocDefault);
if (res != 0) if (res != 0)
pointer = 0L; throw nd4j::cuda_exception::build("cudaHostAlloc(...) failed", res);
return pointer; return pointer;
} }
@ -884,7 +885,7 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) {
Nd4jPointer pointer; Nd4jPointer pointer;
auto res = cudaMalloc(reinterpret_cast<void **>(&pointer), memorySize); auto res = cudaMalloc(reinterpret_cast<void **>(&pointer), memorySize);
if (res != 0) if (res != 0)
pointer = 0L; throw nd4j::cuda_exception::build("cudaMalloc(...) failed", res);
return pointer; return pointer;
} }
@ -894,9 +895,9 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) {
* @param pointer pointer that'll be freed * @param pointer pointer that'll be freed
*/ */
int freeHost(Nd4jPointer pointer) { int freeHost(Nd4jPointer pointer) {
cudaError_t res = cudaFreeHost(reinterpret_cast<void *>(pointer)); auto res = cudaFreeHost(reinterpret_cast<void *>(pointer));
if (res != 0) if (res != 0)
pointer = 0L; throw nd4j::cuda_exception::build("cudaFreeHost(...) failed", res);
return 1L; return 1L;
} }
@ -907,9 +908,10 @@ int freeHost(Nd4jPointer pointer) {
* @param ptrToDeviceId pointer to deviceId. * @param ptrToDeviceId pointer to deviceId.
*/ */
int freeDevice(Nd4jPointer pointer, int deviceId) { int freeDevice(Nd4jPointer pointer, int deviceId) {
cudaError_t res = cudaFree(reinterpret_cast<void *>(pointer)); auto res = cudaFree(reinterpret_cast<void *>(pointer));
if (res != 0) if (res != 0)
pointer = 0L; throw nd4j::cuda_exception::build("cudaFree(...) failed", res);
return 1L; return 1L;
} }
@ -934,7 +936,7 @@ Nd4jPointer createStream() {
auto stream = new cudaStream_t(); auto stream = new cudaStream_t();
auto dZ = cudaStreamCreate(stream); auto dZ = cudaStreamCreate(stream);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaStreamCreate(...) failed"); throw nd4j::cuda_exception::build("cudaStreamCreate(...) failed", dZ);
return stream; return stream;
} }
@ -944,23 +946,21 @@ Nd4jPointer createEvent() {
CHECK_ALLOC(nativeEvent, "Failed to allocate new CUDA event buffer", sizeof(cudaEvent_t)); CHECK_ALLOC(nativeEvent, "Failed to allocate new CUDA event buffer", sizeof(cudaEvent_t));
cudaError_t dZ = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(&nativeEvent), cudaEventDisableTiming); auto dZ = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(&nativeEvent), cudaEventDisableTiming);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaEventCreateWithFlags(...) failed"); throw nd4j::cuda_exception::build("cudaEventCreateWithFlags(...) failed", dZ);
return nativeEvent; return nativeEvent;
} }
int registerEvent(Nd4jPointer event, Nd4jPointer stream) { int registerEvent(Nd4jPointer event, Nd4jPointer stream) {
cudaEvent_t *pEvent = reinterpret_cast<cudaEvent_t *>(&event); auto pEvent = reinterpret_cast<cudaEvent_t *>(&event);
cudaStream_t *pStream = reinterpret_cast<cudaStream_t *>(stream); auto pStream = reinterpret_cast<cudaStream_t *>(stream);
cudaError_t dZ = cudaEventRecord(*pEvent, *pStream); auto dZ = cudaEventRecord(*pEvent, *pStream);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaEventRecord(...) failed"); throw nd4j::cuda_exception::build("cudaEventRecord(...) failed", dZ);
return 1; return 1;
} }
@ -1065,53 +1065,48 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4j
} }
int memsetSync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) { int memsetSync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) {
cudaError_t dZ = cudaMemset(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size)); auto dZ = cudaMemset(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size));
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaMemset(...) failed"); throw nd4j::cuda_exception::build("cudaMemset(...) failed", dZ);
return 1; return 1;
} }
int memsetAsync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) { int memsetAsync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) {
cudaStream_t *pStream = reinterpret_cast<cudaStream_t *>(reserved); auto pStream = reinterpret_cast<cudaStream_t *>(reserved);
cudaError_t dZ = cudaMemsetAsync(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size), *pStream); auto dZ = cudaMemsetAsync(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size), *pStream);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaMemsetAsync(...) failed"); throw nd4j::cuda_exception::build("cudaMemsetAsync(...) failed", dZ);
return 1; return 1;
} }
int destroyEvent(Nd4jPointer event) { int destroyEvent(Nd4jPointer event) {
cudaEvent_t *pEvent = reinterpret_cast<cudaEvent_t *>(&event); auto pEvent = reinterpret_cast<cudaEvent_t *>(&event);
cudaError_t dZ = cudaEventDestroy(*pEvent); auto dZ = cudaEventDestroy(*pEvent);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaEvenDestroy(...) failed"); throw nd4j::cuda_exception::build("cudaEvenDestroy(...) failed", dZ);
return 1; return 1;
} }
int streamSynchronize(Nd4jPointer stream) { int streamSynchronize(Nd4jPointer stream) {
cudaStream_t *pStream = reinterpret_cast<cudaStream_t *>(stream); auto pStream = reinterpret_cast<cudaStream_t *>(stream);
cudaError_t dZ = cudaStreamSynchronize(*pStream); auto dZ = cudaStreamSynchronize(*pStream);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaStreamSynchronize(...) failed"); throw nd4j::cuda_exception::build("cudaStreamSynchronize(...) failed", dZ);
return 1L; return 1L;
} }
int eventSynchronize(Nd4jPointer event) { int eventSynchronize(Nd4jPointer event) {
cudaEvent_t *pEvent = reinterpret_cast<cudaEvent_t *>(&event); auto pEvent = reinterpret_cast<cudaEvent_t *>(&event);
cudaError_t dZ = cudaEventSynchronize(*pEvent); auto dZ = cudaEventSynchronize(*pEvent);
checkCudaErrors(dZ);
if (dZ != 0) if (dZ != 0)
throw std::runtime_error("cudaEventSynchronize(...) failed"); throw nd4j::cuda_exception::build("cudaEventSynchronize(...) failed", dZ);
return 1L; return 1L;
} }
@ -2697,13 +2692,16 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte
auto result = op->execute(context); auto result = op->execute(context);
// FIXME: remove once CUDA backend is 100% ready auto res = cudaStreamSynchronize(*context->launchContext()->getCudaStream());
if (res != 0)
throw nd4j::cuda_exception::build("customOp execution failed", res);
for (auto v:context->fastpath_in()) { for (auto v:context->fastpath_in()) {
v->makeBothActual(); v->syncToDevice();
} }
for (auto v:context->fastpath_out()) { for (auto v:context->fastpath_out()) {
v->makeBothActual(); v->syncToDevice();
} }
return result; return result;

View File

@ -907,6 +907,8 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
/*** max ***/ /*** max ***/
case 0: { case 0: {
coord2 = hstart;
coord3 = hend;
T max = -DataTypeUtils::max<T>(); T max = -DataTypeUtils::max<T>();
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {

View File

@ -732,4 +732,20 @@ public class CustomOpsTests extends BaseNd4jTest {
fail("Failed datatypes: " + failed.toString()); fail("Failed datatypes: " + failed.toString());
} }
} }
@Test
public void testMaxPool2Dbp_1() {
val x = Nd4j.create(DataType.HALF, 2,3,16,16).assign(Double.NaN);
val y = Nd4j.create(DataType.HALF, 2,3,8,8).assign(Double.NaN);
val z = Nd4j.create(DataType.HALF, 2,3,16,16);
val op = DynamicCustomOp.builder("maxpool2d_bp")
.addInputs(x, y)
.addOutputs(z)
.addIntegerArguments(2, 2, 2, 2, 8,8, 1,1,1, 0,0)
.build();
Nd4j.exec(op);
Nd4j.getExecutioner().commit();
}
} }