diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu
index 5c6dadbaf..e75aa422c 100755
--- a/libnd4j/blas/cuda/NativeOps.cu
+++ b/libnd4j/blas/cuda/NativeOps.cu
@@ -866,9 +866,10 @@ void initializeFunctions(Nd4jPointer *functions) {
 Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) {
     Nd4jPointer pointer;
     // cudaHostAllocMapped |cudaHostAllocPortable
-    cudaError_t res = cudaHostAlloc(reinterpret_cast<void **>(&pointer), memorySize, cudaHostAllocDefault);
+    auto res = cudaHostAlloc(reinterpret_cast<void **>(&pointer), memorySize, cudaHostAllocDefault);
     if (res != 0)
-        pointer = 0L;
+        throw nd4j::cuda_exception::build("cudaHostAlloc(...) failed", res);
+
     return pointer;
 }
 
@@ -884,7 +885,7 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) {
     Nd4jPointer pointer;
     auto res = cudaMalloc(reinterpret_cast<void **>(&pointer), memorySize);
     if (res != 0)
-        pointer = 0L;
+        throw nd4j::cuda_exception::build("cudaMalloc(...) failed", res);
 
     return pointer;
 }
 
@@ -894,9 +895,9 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) {
  * @param pointer pointer that'll be freed
  */
 int freeHost(Nd4jPointer pointer) {
-    cudaError_t res = cudaFreeHost(reinterpret_cast<void *>(pointer));
+    auto res = cudaFreeHost(reinterpret_cast<void *>(pointer));
     if (res != 0)
-        pointer = 0L;
+        throw nd4j::cuda_exception::build("cudaFreeHost(...) failed", res);
 
     return 1L;
 }
 
@@ -907,9 +908,10 @@ int freeHost(Nd4jPointer pointer) {
  * @param ptrToDeviceId pointer to deviceId.
  */
 int freeDevice(Nd4jPointer pointer, int deviceId) {
-    cudaError_t res = cudaFree(reinterpret_cast<void *>(pointer));
+    auto res = cudaFree(reinterpret_cast<void *>(pointer));
     if (res != 0)
-        pointer = 0L;
+        throw nd4j::cuda_exception::build("cudaFree(...) failed", res);
+
     return 1L;
 }
 
@@ -934,7 +936,7 @@ Nd4jPointer createStream() {
     auto stream = new cudaStream_t();
     auto dZ = cudaStreamCreate(stream);
     if (dZ != 0)
-        throw std::runtime_error("cudaStreamCreate(...) failed");
+        throw nd4j::cuda_exception::build("cudaStreamCreate(...) failed", dZ);
 
     return stream;
 }
 
@@ -944,23 +946,21 @@ Nd4jPointer createEvent() {
 
     CHECK_ALLOC(nativeEvent, "Failed to allocate new CUDA event buffer", sizeof(cudaEvent_t));
 
-    cudaError_t dZ = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(&nativeEvent), cudaEventDisableTiming);
-    checkCudaErrors(dZ);
+    auto dZ = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(&nativeEvent), cudaEventDisableTiming);
     if (dZ != 0)
-        throw std::runtime_error("cudaEventCreateWithFlags(...) failed");
+        throw nd4j::cuda_exception::build("cudaEventCreateWithFlags(...) failed", dZ);
 
     return nativeEvent;
 }
 
 int registerEvent(Nd4jPointer event, Nd4jPointer stream) {
-    cudaEvent_t *pEvent = reinterpret_cast<cudaEvent_t *>(&event);
-    cudaStream_t *pStream = reinterpret_cast<cudaStream_t *>(stream);
+    auto pEvent = reinterpret_cast<cudaEvent_t *>(&event);
+    auto pStream = reinterpret_cast<cudaStream_t *>(stream);
 
-    cudaError_t dZ = cudaEventRecord(*pEvent, *pStream);
-    checkCudaErrors(dZ);
+    auto dZ = cudaEventRecord(*pEvent, *pStream);
     if (dZ != 0)
-        throw std::runtime_error("cudaEventRecord(...) failed");
+        throw nd4j::cuda_exception::build("cudaEventRecord(...) failed", dZ);
 
     return 1;
 }
 
@@ -1065,53 +1065,48 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4jPointer reserved) {
 }
 
 int memsetSync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) {
-    cudaError_t dZ = cudaMemset(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size));
-    checkCudaErrors(dZ);
+    auto dZ = cudaMemset(reinterpret_cast<void *>(dst), value, static_cast<size_t>(size));
     if (dZ != 0)
-        throw std::runtime_error("cudaMemset(...) failed");
failed"); + throw nd4j::cuda_exception::build("cudaMemset(...) failed", dZ); return 1; } int memsetAsync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) { - cudaStream_t *pStream = reinterpret_cast(reserved); + auto pStream = reinterpret_cast(reserved); - cudaError_t dZ = cudaMemsetAsync(reinterpret_cast(dst), value, static_cast(size), *pStream); - checkCudaErrors(dZ); + auto dZ = cudaMemsetAsync(reinterpret_cast(dst), value, static_cast(size), *pStream); if (dZ != 0) - throw std::runtime_error("cudaMemsetAsync(...) failed"); + throw nd4j::cuda_exception::build("cudaMemsetAsync(...) failed", dZ); return 1; } int destroyEvent(Nd4jPointer event) { - cudaEvent_t *pEvent = reinterpret_cast(&event); - cudaError_t dZ = cudaEventDestroy(*pEvent); - checkCudaErrors(dZ); + auto pEvent = reinterpret_cast(&event); + auto dZ = cudaEventDestroy(*pEvent); if (dZ != 0) - throw std::runtime_error("cudaEvenDestroy(...) failed"); + throw nd4j::cuda_exception::build("cudaEvenDestroy(...) failed", dZ); return 1; } int streamSynchronize(Nd4jPointer stream) { - cudaStream_t *pStream = reinterpret_cast(stream); + auto pStream = reinterpret_cast(stream); - cudaError_t dZ = cudaStreamSynchronize(*pStream); - checkCudaErrors(dZ); + auto dZ = cudaStreamSynchronize(*pStream); if (dZ != 0) - throw std::runtime_error("cudaStreamSynchronize(...) failed"); + throw nd4j::cuda_exception::build("cudaStreamSynchronize(...) failed", dZ); return 1L; } int eventSynchronize(Nd4jPointer event) { - cudaEvent_t *pEvent = reinterpret_cast(&event); + auto pEvent = reinterpret_cast(&event); - cudaError_t dZ = cudaEventSynchronize(*pEvent); - checkCudaErrors(dZ); + auto dZ = cudaEventSynchronize(*pEvent); if (dZ != 0) - throw std::runtime_error("cudaEventSynchronize(...) failed"); + throw nd4j::cuda_exception::build("cudaEventSynchronize(...) 
failed", dZ); return 1L; } @@ -2697,13 +2692,16 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte auto result = op->execute(context); - // FIXME: remove once CUDA backend is 100% ready + auto res = cudaStreamSynchronize(*context->launchContext()->getCudaStream()); + if (res != 0) + throw nd4j::cuda_exception::build("customOp execution failed", res); + for (auto v:context->fastpath_in()) { - v->makeBothActual(); + v->syncToDevice(); } for (auto v:context->fastpath_out()) { - v->makeBothActual(); + v->syncToDevice(); } return result; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index e224329f0..98ab86dec 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -907,6 +907,8 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf /*** max ***/ case 0: { + coord2 = hstart; + coord3 = hend; T max = -DataTypeUtils::max(); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java index c2f5dedc5..6c4595a0c 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java @@ -732,4 +732,20 @@ public class CustomOpsTests extends BaseNd4jTest { fail("Failed datatypes: " + failed.toString()); } } + + @Test + public void testMaxPool2Dbp_1() { + val x = Nd4j.create(DataType.HALF, 2,3,16,16).assign(Double.NaN); + val y = Nd4j.create(DataType.HALF, 2,3,8,8).assign(Double.NaN); + val z = Nd4j.create(DataType.HALF, 2,3,16,16); + + val op = DynamicCustomOp.builder("maxpool2d_bp") + .addInputs(x, y) + .addOutputs(z) + .addIntegerArguments(2, 2, 2, 2, 8,8, 1,1,1, 0,0) + .build(); + + Nd4j.exec(op); + Nd4j.getExecutioner().commit(); + } }