[WIP] More fixes (#142)

* atomicAdd cc 70+ Signed-off-by: raver119 <raver119@gmail.com> * additional 8 bytes alocation Signed-off-by: raver119 <raver119@gmail.com> * missed include 2019 Signed-off-by: raver119 <raver119@gmail.com> * less spam Signed-off-by: raver119 <raver119@gmail.com>
2019-08-21 20:18:29 +03:00 · 2019-08-21 20:18:29 +03:00 · 3cf72e5e30
commit 3cf72e5e30
parent 348d9c59f7
6 changed files with 32 additions and 24 deletions
--- a/libnd4j/include/execution/cuda/ContextBuffers.cu
+++ b/libnd4j/include/execution/cuda/ContextBuffers.cu
@ -30,13 +30,13 @@

 namespace nd4j {
    ContextBuffers::ContextBuffers() {
-        nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId());
+        //nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId());
        _deviceId = AffinityManager::currentDeviceId();
    }

    ContextBuffers::~ContextBuffers() {
        if (_allocated) {
-            nd4j_printf("Releasing ContextBuffers\n","");
+            //nd4j_printf("Releasing ContextBuffers\n","");

            if (_allocationPointer != nullptr)
                cudaFree(_allocationPointer);
@ -69,7 +69,7 @@ namespace nd4j {
    }

    void ContextBuffers::initialize() {
-        nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId());
+        //nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId());

        auto res = cudaMalloc(reinterpret_cast<void**>(&_reductionPointer), 1024 * 1024 * 8);
        if (res != 0)
--- a/libnd4j/include/helpers/impl/Parameters.cpp
+++ b/libnd4j/include/helpers/impl/Parameters.cpp
@ -19,6 +19,7 @@
 //

 #include "../benchmark/Parameters.h"
+#include <stdexcept>

 namespace nd4j {
    Parameters* Parameters::addIntParam(std::string string, int param) {
--- a/libnd4j/include/op_boilerplate.h
+++ b/libnd4j/include/op_boilerplate.h
@ -1461,7 +1461,7 @@

 #ifdef _RELEASE

-#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast<void**>(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast<TT *>(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); }
+#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast<void**>(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast<TT *>(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); }
 #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast<void *>(VARIABLE));  if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; };

 #else
--- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
@ -177,7 +177,9 @@ namespace helpers {
    }

    void invertLowerMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) {
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
        BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
    }

    template <typename T>
@ -195,7 +197,9 @@ namespace helpers {
    }

    void invertUpperMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) {
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
        BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertUpperMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE);
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
    }

 //    template <typename T>
@ -242,11 +246,7 @@ namespace helpers {

    template <typename T, typename F>
    static __global__ void determinantKernel(T* compound, T* result, Nd4jLong len) {
-        __shared__ F tempRes;
-        if (blockIdx.x == 0) {
-            tempRes = (F)result[0];
-        }
-        __syncthreads();
+        F tempRes = (F)result[0];

        auto start = blockIdx.x * blockDim.x + threadIdx.x;
        auto step = blockDim.x * gridDim.x;
@ -256,18 +256,14 @@ namespace helpers {
        }
        __syncthreads();

-        if (blockIdx.x == 0) {
+        if (threadIdx.x == 0) {
            result[0] = (T)tempRes;
        }
    }

        template <typename T, typename F>
        static __global__ void determinantLogKernel(T* compound, T* result, Nd4jLong len) {
-            __shared__ F tempRes;
-            if (blockIdx.x == 0) {
-                tempRes = (F)result[0];
-            }
-            __syncthreads();
+            F tempRes =  (F)result[0];

            auto start = blockIdx.x * blockDim.x + threadIdx.x;
            auto step = blockDim.x * gridDim.x;
@ -277,7 +273,7 @@ namespace helpers {
            }
            __syncthreads();

-            if (blockIdx.x == 0) {
+            if (threadIdx.x == 0) {
                result[0] = (T)math::nd4j_log<F,F>(math::nd4j_abs(tempRes));
            }
        }
@ -520,7 +516,9 @@ namespace helpers {
    }

    int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
        BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
    }

    template <typename T>
@ -568,7 +566,9 @@ namespace helpers {
    }

    int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
        BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
    }

    template <typename T>
@ -631,26 +631,27 @@ namespace helpers {
        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {output->rankOf() - 2, output->rankOf() - 1});
        auto stream = context->getCudaStream();

-//        PRAGMA_OMP_PARALLEL_FOR
        for (auto i = 0LL; i < packX.numberOfTads(); i++) {
-            fillMatrix<T, T><<<1, n2, 128, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n);
+            fillMatrix<T, T><<<1, n2, 1024, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n);
            matrix.tickWriteDevice();
            compound.assign(matrix);
            lup_<T>(context, &compound, nullptr, nullptr);
-            fillLowerUpperKernel<T><<<n, n, 128>>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n);
+            fillLowerUpperKernel<T><<<n, n, 1024, *stream>>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n);
            matrix.assign(0);
            invertUpperMatrix(&upper, &matrix); // U^{-1}
            compound.assign(0);
            invertLowerMatrix(&lower, &compound); // L{-1}

            nd4j::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0);
-            returnMatrix<T, T><<<1, n2, 128, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n);
+            returnMatrix<T, T><<<1, n2, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n);
        }
        return Status::OK();
    }

    int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
        BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
    }

    bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input) {
@ -795,10 +796,12 @@ namespace helpers {
        double* output = outputBuf;
        double* input = inputBuf;

+        Nd4jLong* shapeOf = shape::shapeOf(tadShape);
+        Nd4jLong* strideOf = shape::stride(tadShape);
+
        for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) {
            double* current = input + tadOffsets[i];
-            Nd4jLong* shapeOf = shape::shapeOf(tadShape);
-            Nd4jLong* strideOf = shape::stride(tadShape);
+
            auto zIndex = shape::getIndexOffset(i, outputShape, batchNum);
            for (auto e = threadIdx.x; e < n; e += blockDim.x) {
                Nd4jLong diag[] = {e, e};
--- a/libnd4j/include/templatemath.h
+++ b/libnd4j/include/templatemath.h
@ -1057,6 +1057,9 @@ inline __device__ uint64_t nd4j_atomicAdd<uint64_t>(uint64_t* address, uint64_t

 template <>
 inline __device__ float16 nd4j_atomicAdd<float16>(float16* address, float16 val)  {
+#if __CUDA_ARCH__ >= 700
+    atomicAdd(reinterpret_cast<__half*>(address), val.data);
+#else
 	int* address_as_ull = (int*) address;

 	long addr = (long) address;
@ -1086,6 +1089,7 @@ inline __device__ float16 nd4j_atomicAdd<float16>(float16* address, float16 val)

 	if (!misaligned) return old.B.H;
 	else return old.B.L;
+#endif
 }

 template <>
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@ -1846,8 +1846,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_3) {
    ASSERT_EQ(ND4J_STATUS_OK, result->status());

    auto z = result->at(0);
-    //z->printIndexedBuffer("Output ");
-    //exp.printIndexedBuffer("Expected ");
+    exp.printIndexedBuffer("Expected ");
+    z->printIndexedBuffer("Output ");

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));