|
|
|
@ -31,34 +31,14 @@
|
|
|
|
|
namespace nd4j {
|
|
|
|
|
namespace ops {
|
|
|
|
|
namespace helpers {
|
|
|
|
|
// template <typename T>
|
|
|
|
|
// static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) {
|
|
|
|
|
// if (theFirst != theSecond) {
|
|
|
|
|
// auto start = threadIdx.x + blockIdx.x * blockDim.x;
|
|
|
|
|
// auto step = blockDim.x * gridDim.x;
|
|
|
|
|
// for (auto i = start; i < N; i += step) {
|
|
|
|
|
// Nd4jLong iCoord1[] = {theFirst, i};
|
|
|
|
|
// Nd4jLong iCoord2[] = {theSecond, i};
|
|
|
|
|
// auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2);
|
|
|
|
|
// auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2);
|
|
|
|
|
// //atomicExch(&matrix[iIndex1], matrix[iIndex2]);
|
|
|
|
|
// T e0 = matrix[iIndex1];
|
|
|
|
|
// T e1 = matrix[iIndex2];
|
|
|
|
|
// matrix[iIndex1] = e0;
|
|
|
|
|
// matrix[iIndex2] = e1;
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// BUILD_SINGLE_TEMPLATE(template void swapRows_, (NDArray* matrix, int theFirst, int theSecond), FLOAT_TYPES);
|
|
|
|
|
//
|
|
|
|
|
// void swapRows(NDArray* matrix, int theFirst, int theSecond) {
|
|
|
|
|
// BUILD_SINGLE_SELECTOR(matrix->dataType(), swapRows_, (matrix, theFirst, theSecond), FLOAT_TYPES);
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// invert the second diagonal for lower diagonal matrix
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void
|
|
|
|
|
invertKernelLow(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
|
|
|
|
|
T *inverted = reinterpret_cast<T *>(invertedBuf);
|
|
|
|
|
T *input = reinterpret_cast<T *>(inputBuf);
|
|
|
|
|
T* inverted = reinterpret_cast<T *>(invertedBuf);
|
|
|
|
|
T* input = reinterpret_cast<T*>(inputBuf);
|
|
|
|
|
|
|
|
|
|
auto start = threadIdx.x + blockIdx.x * blockDim.x;
|
|
|
|
|
auto step = blockDim.x * gridDim.x;
|
|
|
|
@ -71,11 +51,13 @@ namespace helpers {
|
|
|
|
|
auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2);
|
|
|
|
|
auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
|
|
|
|
|
auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
|
|
|
|
|
// invert lower triangular matrix
|
|
|
|
|
inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]);
|
|
|
|
|
// math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// invert diagonal vals to upper diagonal matrix
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void
|
|
|
|
|
upvertKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
|
|
|
|
@ -90,10 +72,13 @@ namespace helpers {
|
|
|
|
|
auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
|
|
|
|
|
auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
|
|
|
|
|
// math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]);
|
|
|
|
|
// invert diagonal elements
|
|
|
|
|
inverted[zIndex] /= input[xIndex];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// invert upper second diagonal
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void
|
|
|
|
|
upvertKernelUp(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
|
|
|
|
@ -120,18 +105,17 @@ namespace helpers {
|
|
|
|
|
|
|
|
|
|
for (int i = start; i < n - 1; i += step) {
|
|
|
|
|
Nd4jLong pos[] = {i, i + 1};
|
|
|
|
|
//Nd4jLong posY[] = {i, i};
|
|
|
|
|
Nd4jLong posX[] = {i + 1, i + 1};
|
|
|
|
|
auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2);
|
|
|
|
|
// auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
|
|
|
|
|
// auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
|
|
|
|
|
auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2);
|
|
|
|
|
auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2);
|
|
|
|
|
// invert upper matrix
|
|
|
|
|
math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]);
|
|
|
|
|
//inputMatrix->t<T>(i, i + 1) * invertedMatrix->t<T>(i + 1, i + 1) / inputMatrix->t<T>(i, i)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void
|
|
|
|
|
invertLowKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
|
|
|
|
@ -152,11 +136,14 @@ namespace helpers {
|
|
|
|
|
auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
|
|
|
|
|
auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ,
|
|
|
|
|
2);
|
|
|
|
|
// invert non-diagonal elements
|
|
|
|
|
math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// Invertion of upper triangular matrix non-diagonal elements when main and second diagonals already processed
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void
|
|
|
|
|
invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
|
|
|
|
@ -183,18 +170,20 @@ namespace helpers {
|
|
|
|
|
Nd4jLong posZ[] = {i, j};
|
|
|
|
|
Nd4jLong posY[] = {k, j};
|
|
|
|
|
Nd4jLong posX[] = {i, k};
|
|
|
|
|
// Nd4jLong posD[] = {i, i};
|
|
|
|
|
|
|
|
|
|
// inversion with Joardan Gauss transformation
|
|
|
|
|
auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2);
|
|
|
|
|
auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2);
|
|
|
|
|
// auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
|
|
|
|
|
auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2);
|
|
|
|
|
math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]);// / input[dIndex]);
|
|
|
|
|
// printf("(%d, %d) inverted[%lld] = %lf (-inverted[%lld] * input[%lld]\n", blockIdx.x, threadIdx.x, zIndex, inverted[zIndex], yIndex, xIndex);
|
|
|
|
|
// invert upper non-diagonal elements
|
|
|
|
|
math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// procedure to invert lower-triangular matrix.
|
|
|
|
|
// In current case lower triangular matrix has main diagonal with general values
|
|
|
|
|
//
|
|
|
|
|
template<typename T>
|
|
|
|
|
static void invertLowerMatrix_(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
|
|
|
|
|
int n = inputMatrix->rows();
|
|
|
|
@ -204,20 +193,26 @@ namespace helpers {
|
|
|
|
|
|
|
|
|
|
auto stream = context->getCudaStream();
|
|
|
|
|
|
|
|
|
|
// invert lower matrix
|
|
|
|
|
// invert main diagonal
|
|
|
|
|
upvertKernel<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
// invert the second diagonal
|
|
|
|
|
invertKernelLow<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
// invertKernelLow<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
// invert non-diagonal elements
|
|
|
|
|
invertLowKernel<T><<<n, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// caller for invert lower matrix routine
|
|
|
|
|
// Host-side dispatcher: inverts a lower-triangular matrix on device.
// Wraps the typed implementation with the NDArray host/device sync protocol.
void invertLowerMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
    // mark invertedMatrix as a device output and inputMatrix as a device input
    NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
    // dispatch on the input's element type to the templated implementation
    BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
    // flag device buffers as written so a later host read triggers a copy-back
    NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// procedure to invert upper-triangular matrix.
|
|
|
|
|
// In current case upper triangular matrix has main diagonal with all ones on it.
|
|
|
|
|
template<typename T>
|
|
|
|
|
static void invertUpperMatrix_(LaunchContext *context, NDArray* inputMatrix, NDArray* invertedMatrix) {
|
|
|
|
|
int n = inputMatrix->rows();
|
|
|
|
@ -227,342 +222,319 @@ namespace helpers {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//upvertKernel<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
// invert upper matrix
|
|
|
|
|
// invert the second diagonal
|
|
|
|
|
upvertKernelUp<T><<<1, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(),
|
|
|
|
|
inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
invertedMatrix->tickWriteDevice();
|
|
|
|
|
invertedMatrix->printIndexedBuffer("Step1 UP inversion");
|
|
|
|
|
|
|
|
|
|
// invert other elements
|
|
|
|
|
invertUpKernel<T><<<n, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(),inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// invertion of upper triangular matrix - runner routine
|
|
|
|
|
// Host-side dispatcher: inverts an upper-triangular matrix on device.
// Wraps the typed implementation with the NDArray host/device sync protocol.
void invertUpperMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
    // mark invertedMatrix as a device output and inputMatrix as a device input
    NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
    // dispatch on element type to the templated implementation
    BUILD_SINGLE_SELECTOR(invertedMatrix->dataType(), invertUpperMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
    // fix: this was a second prepareSpecialUse (copy-paste bug) — the epilogue must
    // registerSpecialUse so written device buffers are flagged, matching invertLowerMatrix
    NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
}
|
|
|
|
|
|
|
|
|
|
// template <typename T>
|
|
|
|
|
// static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) {
|
|
|
|
|
// int swapCount = 0;
|
|
|
|
|
// for(int i = blockIdx.x; i < rowNum; i += gridDim.x ) {
|
|
|
|
|
// auto pivotValue = T(0.0);
|
|
|
|
|
// auto pivot = -1;
|
|
|
|
|
//
|
|
|
|
|
// for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) {
|
|
|
|
|
// Nd4jLong rowCoord[] = {rowCounter, i};
|
|
|
|
|
// auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2);
|
|
|
|
|
// if(nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue ) {
|
|
|
|
|
// pivotValue = nd4j::math::nd4j_abs(compound[rowPos]);
|
|
|
|
|
// pivot = rowCounter;
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// if( pivotValue != T(0.0) ) {
|
|
|
|
|
// swapRows_<T>(compound, compoundShape, pivot, i, rowNum);
|
|
|
|
|
// swapRows_<T>(permutation, permutationShape, pivot, i, rowNum);
|
|
|
|
|
// if (pivot != i)
|
|
|
|
|
// swapCount++;
|
|
|
|
|
//
|
|
|
|
|
// for( int j = i + 1; j < rowNum; j++ ) {
|
|
|
|
|
// Nd4jLong posJIbuf[] = {j, i};
|
|
|
|
|
// Nd4jLong posIIbuf[] = {i, i};
|
|
|
|
|
// auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2);
|
|
|
|
|
// auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2);
|
|
|
|
|
//
|
|
|
|
|
// compound[posJI] /= compound[posII];
|
|
|
|
|
// for( int k = i + 1; k < rowNum; k++ ) {
|
|
|
|
|
// Nd4jLong posJKbuf[] = {j, k};
|
|
|
|
|
// Nd4jLong posIKbuf[] = {i, k};
|
|
|
|
|
// auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2);
|
|
|
|
|
// auto posIK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2);
|
|
|
|
|
// T arg = compound[posJI] * compound[posIK];
|
|
|
|
|
// compound[posJK] -= arg;
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// determinant kernel - accumulation product of all values on the main diagonal
|
|
|
|
|
// Determinant accumulation kernel: multiplies every main-diagonal element of a
// densely stored len x len matrix (row-major, row length == len) into result[0].
// result[0] is presumably pre-initialized to 1 by the caller — TODO confirm;
// this kernel only multiplies into it via atomics.
template<typename T>
static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) {
    const auto tid = threadIdx.x + blockDim.x * blockIdx.x;
    const auto gridSize = blockDim.x * gridDim.x;

    // grid-stride walk over the diagonal; element (i, i) sits at flat offset i * len + i
    for (auto i = tid; i < len; i += gridSize) {
        math::atomics::nd4j_atomicMul(&result[0], compound[i * len + i]);
    }
}
|
|
|
|
|
|
|
|
|
|
// template <typename T, typename F>
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) {
|
|
|
|
|
//F tempRes = result[0];
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// determinant logarithm - accumulation sum of all logarithm values on the main diagonal. All in logarithic values
|
|
|
|
|
// should be positive
|
|
|
|
|
// Log-determinant accumulation kernel: adds log(|diagonal element|) of a densely
// stored len x len matrix into result[0]. Caller presumably zero-initializes
// result[0] — TODO confirm. Note: |.| drops the determinant's sign.
template<typename T>
static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) {
    auto start = blockIdx.x * blockDim.x + threadIdx.x;
    auto step = blockDim.x * gridDim.x;
    // grid-stride loop over the main diagonal; (i, i) is at flat offset i * len + i
    for (auto i = start; i < len; i += step) {
        auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
        // sum logs of all diagonal elements
        math::atomics::nd4j_atomicAdd(result, math::nd4j_log<T,T>(math::nd4j_abs(compound[pos])));
    }
}
|
|
|
|
|
|
|
|
|
|
auto start = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
auto step = blockDim.x * gridDim.x;
|
|
|
|
|
for (auto i = start; i < len; i += step) {
|
|
|
|
|
auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
|
|
|
|
|
math::atomics::nd4j_atomicMul(&result[0], compound[pos]);
|
|
|
|
|
}
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// kernel to copy matrix with given shape to compound tensor with given pos
|
|
|
|
|
// output - a N-D tensor buffer with rank not less than 2, input - 2D square n x n matrix with n = rowLen
|
|
|
|
|
// Copies one square matrix out of a batched N-D input tensor into a dense
// scratch buffer, converting element type T -> F.
// pos    - flat element offset of the matrix's first element inside the input tensor
// rowLen - matrix edge length n; rowLen * rowLen elements are copied
// NOTE(review): outShape is unused — output is written linearly, so it is
// presumably a dense row-major n*n device buffer; confirm against callers.
template<typename T, typename F>
static __global__ void
fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
    // block-shared scalars, computed once by thread 0 and broadcast
    __shared__ F *matrix;
    __shared__ T *inputBuf;
    __shared__ Nd4jLong inputLen;
    __shared__ Nd4jLong n2;

    if (threadIdx.x == 0) {
        matrix = reinterpret_cast<F*>(output);
        inputBuf = reinterpret_cast<T*>(input);
        inputLen = shape::length(inputShape);
        n2 = rowLen * rowLen;
    }
    // all threads must see the shared values before using them
    __syncthreads();

    auto start = blockIdx.x * blockDim.x + threadIdx.x;
    auto step = blockDim.x * gridDim.x;

    // grid-stride copy: j indexes the dense output, k = pos + j indexes the input tensor
    for (int k = pos + start, j = start; j < n2; k += step, j += step) {
        auto xIndex = shape::getIndexOffset(k, inputShape, inputLen);
        matrix[j] = (F) inputBuf[xIndex];
    }
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// same as above, but without type conversion
|
|
|
|
|
// Writes a dense square scratch matrix (input) back into a batched N-D output
// tensor at flat element offset pos. Inverse of fillMatrix, without type conversion.
template<typename T>
static __global__ void
returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
    // block-shared scalars, computed once by thread 0 and broadcast
    __shared__ T* matrix;
    __shared__ T* outputBuf;
    __shared__ Nd4jLong outputLen;
    __shared__ Nd4jLong n2;

    if (threadIdx.x == 0) {
        matrix = reinterpret_cast<T *>(input);
        outputBuf = reinterpret_cast<T *>(output);
        // NOTE(review): length is taken from inputShape but is passed with
        // outputShape to getIndexOffset below — looks like it should be
        // shape::length(outputShape); confirm against getIndexOffset's contract.
        outputLen = shape::length(inputShape);
        n2 = rowLen * rowLen;
    }
    // all threads must see the shared values before using them
    __syncthreads();
    auto start = blockIdx.x * blockDim.x + threadIdx.x;
    auto step = blockDim.x * gridDim.x;

    // grid-stride copy: j indexes the dense source matrix, k = pos + j the output tensor
    for (int k = pos + start, j = start; j < n2; k += step, j += step) {
        auto zIndex = shape::getIndexOffset(k, outputShape, outputLen);
        outputBuf[zIndex] = matrix[j];
    }
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// fill up permutaion matrix kernel. Permutation matrix filled with zeros and ones
|
|
|
|
|
// Builds a permutation matrix from a pivot index vector: for each row i, column
// source[i] - 1 is set to one. The indices are 1-based (as produced by cuSolver
// getrf pivots). The output buffer is presumably zero-initialized by the caller
// — TODO confirm; only ones are written here.
template<typename F>
static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) {
    auto permutation = reinterpret_cast<F *>(output);

    const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
    const auto stride = blockDim.x * gridDim.x;
    // grid-stride loop: one matrix row per iteration
    for (auto row = tid; row < rowNum; row += stride) {
        // convert the 1-based pivot index to a 0-based column
        int col = source[row] - 1;
        Nd4jLong coords[] = {row, col};
        auto offset = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), coords, 2);
        permutation[offset] = F(1.f);
    }
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
// LUP decomposition runner - using CUBLAS SOLVER
|
|
|
|
|
// if permutation is given, then using LUP decomposition, LU decomposition otherwise
|
|
|
|
|
// L - lower triangular, U - upper triangular, P - permutation matricies
|
|
|
|
|
// PA = LU
|
|
|
|
|
//
|
|
|
|
|
// input - A matrix nxn
|
|
|
|
|
// compound - C matrix L + U - I, or main diagonal and lower - L matrix, from the 2nd diagonal - U matrix
|
|
|
|
|
template<typename T>
|
|
|
|
|
static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) {
|
|
|
|
|
auto stream = context->getCudaStream();
|
|
|
|
|
auto n = input->rows();
|
|
|
|
|
cusolverDnHandle_t cusolverH = nullptr;
|
|
|
|
|
// create solver handle
|
|
|
|
|
cusolverStatus_t status = cusolverDnCreate(&cusolverH);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
// set solver stream
|
|
|
|
|
status = cusolverDnSetStream(cusolverH, *stream);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("Cannot set up stream for cuda solver", status);
|
|
|
|
|
}
|
|
|
|
|
int lwork = 0;
|
|
|
|
|
int *d_info = nullptr;
|
|
|
|
|
// allocate memory for permutation vector
|
|
|
|
|
auto err = cudaMalloc((void **) &d_info, sizeof(int));
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
|
static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) {
|
|
|
|
|
// F tempRes = (F)result[0];
|
|
|
|
|
DataType dtype = input->dataType();
|
|
|
|
|
switch (dtype) { // there are two implementations with cublas for LUP decomposition - double and float
|
|
|
|
|
|
|
|
|
|
auto start = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
auto step = blockDim.x * gridDim.x;
|
|
|
|
|
for (auto i = start; i < len; i += step) {
|
|
|
|
|
auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
|
|
|
|
|
math::atomics::nd4j_atomicAdd(result, math::nd4j_log<T,T>(math::nd4j_abs(compound[pos])));
|
|
|
|
|
}
|
|
|
|
|
// __syncthreads();
|
|
|
|
|
//
|
|
|
|
|
// if (threadIdx.x == 0) {
|
|
|
|
|
// result[0] = (T)math::nd4j_log<F,F>(math::nd4j_abs(tempRes));
|
|
|
|
|
// }
|
|
|
|
|
}
|
|
|
|
|
case DataType::DOUBLE: {
|
|
|
|
|
double *d_work = nullptr;
|
|
|
|
|
// compute internal buffer size
|
|
|
|
|
double *matrix = reinterpret_cast<double *>(input->specialBuffer());
|
|
|
|
|
status = cusolverDnDgetrf_bufferSize(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
&lwork);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename T, typename F>
|
|
|
|
|
static __global__ void
|
|
|
|
|
fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
|
|
|
|
|
__shared__
|
|
|
|
|
F *matrix;
|
|
|
|
|
__shared__
|
|
|
|
|
T *inputBuf;
|
|
|
|
|
__shared__
|
|
|
|
|
Nd4jLong inputLen;
|
|
|
|
|
__shared__
|
|
|
|
|
Nd4jLong n2;
|
|
|
|
|
err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (threadIdx.x == 0) {
|
|
|
|
|
matrix = reinterpret_cast<F *>(output);
|
|
|
|
|
inputBuf = reinterpret_cast<T *>(input);
|
|
|
|
|
inputLen = shape::length(inputShape);
|
|
|
|
|
n2 = rowLen * rowLen;
|
|
|
|
|
}
|
|
|
|
|
__syncthreads();
|
|
|
|
|
auto start = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
auto step = blockDim.x * gridDim.x;
|
|
|
|
|
|
|
|
|
|
for (int k = pos + start, j = start; j < n2; k += step, j += step) {
|
|
|
|
|
auto xIndex = shape::getIndexOffset(k, inputShape, inputLen);
|
|
|
|
|
matrix[j] = (F) inputBuf[xIndex];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Writes a dense square scratch matrix (input) back into a batched N-D output
// tensor at flat element offset pos. Duplicate of the earlier returnMatrix,
// here with an explicit (T) cast on the store.
template<typename T>
static __global__ void
returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos,
             Nd4jLong rowLen) {
    // block-shared scalars, computed once by thread 0 and broadcast
    __shared__ T *matrix;
    __shared__ T *outputBuf;
    __shared__ Nd4jLong outputLen;
    __shared__ Nd4jLong n2;

    if (threadIdx.x == 0) {
        matrix = reinterpret_cast<T *>(input);
        outputBuf = reinterpret_cast<T *>(output);
        // NOTE(review): length comes from inputShape but is used with outputShape
        // below — presumably should be shape::length(outputShape); confirm.
        outputLen = shape::length(inputShape);
        n2 = rowLen * rowLen;
    }
    // all threads must see the shared values before using them
    __syncthreads();
    auto start = blockIdx.x * blockDim.x + threadIdx.x;
    auto step = blockDim.x * gridDim.x;

    // grid-stride copy: j indexes the dense source matrix, k = pos + j the output tensor
    for (int k = pos + start, j = start; j < n2; k += step, j += step) {
        auto zIndex = shape::getIndexOffset(k, outputShape, outputLen);
        outputBuf[zIndex] = (T) matrix[j];
    }
}
|
|
|
|
|
|
|
|
|
|
// Builds a permutation matrix from a 1-based pivot index vector (cuSolver getrf
// style): row i gets a one at column source[i] - 1. Duplicate of the earlier
// fillUpPermutation. Output is presumably zero-initialized by the caller — confirm.
template<typename F>
static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) {
    F *permutation = reinterpret_cast<F *>(output);

    auto start = blockIdx.x * blockDim.x + threadIdx.x;
    auto step = blockDim.x * gridDim.x;
    // grid-stride loop: one matrix row per iteration
    for (auto i = start; i < rowNum; i += step) {
        // pivot indices are 1-based; convert to a 0-based column
        int val = source[i] - 1;
        Nd4jLong posF[] = {i, val};
        auto pos = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), posF, 2);
        permutation[pos] = F(1.f);
    }
}
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
|
static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) {
|
|
|
|
|
auto stream = context->getCudaStream();
|
|
|
|
|
auto n = input->rows();
|
|
|
|
|
cusolverDnHandle_t cusolverH = nullptr;
|
|
|
|
|
cusolverStatus_t status = cusolverDnCreate(&cusolverH);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
status = cusolverDnSetStream(cusolverH, *stream);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("Cannot set up stream for cuda solver", status);
|
|
|
|
|
}
|
|
|
|
|
int lwork = 0;
|
|
|
|
|
int *d_info = nullptr;
|
|
|
|
|
|
|
|
|
|
auto err = cudaMalloc((void **) &d_info, sizeof(int));
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
DataType dtype = input->dataType();
|
|
|
|
|
switch (dtype) {
|
|
|
|
|
|
|
|
|
|
case DataType::DOUBLE: {
|
|
|
|
|
double *d_work = nullptr;
|
|
|
|
|
err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
double *matrix = reinterpret_cast<double *>(input->specialBuffer());
|
|
|
|
|
status = cusolverDnDgetrf_bufferSize(
|
|
|
|
|
if (permutation == nullptr)
|
|
|
|
|
status = cusolverDnDgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
&lwork);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
if (permutation == nullptr)
|
|
|
|
|
status = cusolverDnDgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
nullptr,
|
|
|
|
|
d_info);
|
|
|
|
|
else {
|
|
|
|
|
NDArray permutVector('c', {n}, nd4j::DataType::INT32, context);
|
|
|
|
|
int *permutationBuf = reinterpret_cast<int *>(permutVector.specialBuffer());
|
|
|
|
|
status = cusolverDnDgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
permutationBuf,
|
|
|
|
|
d_info);
|
|
|
|
|
fillUpPermutation<double> << < n, n, 1024, *stream >> >
|
|
|
|
|
(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n);
|
|
|
|
|
permutation->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_work);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case DataType::FLOAT32: {
|
|
|
|
|
float *matrix = reinterpret_cast<float *>(input->specialBuffer());
|
|
|
|
|
float *d_work = nullptr;
|
|
|
|
|
err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
status = cusolverDnSgetrf_bufferSize(
|
|
|
|
|
d_work,
|
|
|
|
|
nullptr,
|
|
|
|
|
d_info);
|
|
|
|
|
else {
|
|
|
|
|
NDArray permutVector('c', {n}, nd4j::DataType::INT32, context);
|
|
|
|
|
int *permutationBuf = reinterpret_cast<int *>(permutVector.specialBuffer());
|
|
|
|
|
status = cusolverDnDgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
&lwork);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (permutation == nullptr)
|
|
|
|
|
status = cusolverDnSgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
nullptr,
|
|
|
|
|
d_info);
|
|
|
|
|
else {
|
|
|
|
|
NDArray permutVector('c', {n}, nd4j::DataType::INT32, context);
|
|
|
|
|
int *permutationBuf = reinterpret_cast<int *>(permutVector.specialBuffer());
|
|
|
|
|
status = cusolverDnSgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
permutationBuf,
|
|
|
|
|
d_info);
|
|
|
|
|
fillUpPermutation<T> <<< n, n, 128, *stream >> >
|
|
|
|
|
(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n);
|
|
|
|
|
permutation->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_work);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
d_work,
|
|
|
|
|
permutationBuf,
|
|
|
|
|
d_info);
|
|
|
|
|
fillUpPermutation<double> << < n, n, 1024, *stream >> >
|
|
|
|
|
(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n);
|
|
|
|
|
permutation->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_work);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot make LU decomposition", status);
|
|
|
|
|
break;
|
|
|
|
|
case DataType::FLOAT32: {
|
|
|
|
|
float *matrix = reinterpret_cast<float*>(input->specialBuffer());
|
|
|
|
|
float *d_work = nullptr;
|
|
|
|
|
|
|
|
|
|
status = cusolverDnSgetrf_bufferSize(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
&lwork);
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (permutation == nullptr)
|
|
|
|
|
status = cusolverDnSgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
nullptr,
|
|
|
|
|
d_info);
|
|
|
|
|
else {
|
|
|
|
|
NDArray permutVector('c', {n}, nd4j::DataType::INT32, context);
|
|
|
|
|
int *permutationBuf = reinterpret_cast<int *>(permutVector.specialBuffer());
|
|
|
|
|
status = cusolverDnSgetrf(
|
|
|
|
|
cusolverH,
|
|
|
|
|
n,
|
|
|
|
|
n,
|
|
|
|
|
matrix,
|
|
|
|
|
n,
|
|
|
|
|
d_work,
|
|
|
|
|
permutationBuf,
|
|
|
|
|
d_info);
|
|
|
|
|
fillUpPermutation<T> <<< n, n, 128, *stream >> >
|
|
|
|
|
(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n);
|
|
|
|
|
permutation->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_work);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer",
|
|
|
|
|
err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_info);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver info buffer", err);
|
|
|
|
|
}
|
|
|
|
|
cusolverDnDestroy(cusolverH);
|
|
|
|
|
}
|
|
|
|
|
if (CUSOLVER_STATUS_SUCCESS != status) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot make LU decomposition", status);
|
|
|
|
|
}
|
|
|
|
|
err = cudaFree(d_info);
|
|
|
|
|
if (err) {
|
|
|
|
|
throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver info buffer", err);
|
|
|
|
|
}
|
|
|
|
|
cusolverDnDestroy(cusolverH);
|
|
|
|
|
// NDArray::registerSpecialUse({input}, {input});
|
|
|
|
|
input->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
input->tickWriteDevice();
|
|
|
|
|
}
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
|
|
|
|
|
|
|
|
|
|
// Explicit instantiation of lup_ for the native float types; must appear exactly once —
// the duplicated invocation produced a duplicate explicit instantiation (compile error).
BUILD_SINGLE_TEMPLATE(template void lup_,(LaunchContext * context, NDArray * input, NDArray * output, NDArray * permutation), FLOAT_NATIVE);
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------------------------------------------ //
// Computes the determinant of each inner-most {n, n} matrix of `input`, writing one scalar per
// matrix into `output`. Each slice is copied into a scratch device matrix (fillMatrix), LU-decomposed
// in place via lup_<T> (no pivot output requested), and determinantKernel then folds the result into
// the corresponding output element — presumably the product of the diagonal of the LU factors
// (kernel defined elsewhere in this file; TODO confirm it accounts for the permutation sign).
//
// input  - array whose two trailing dimensions form square {n, n} matrices (device buffers are used)
// output - one element per matrix; pre-filled with 1 so the kernel can accumulate multiplicatively
// Returns Status::OK() unconditionally; failures inside lup_ surface as cuda_exception.
//
// NOTE(review): the previous text of this function was duplicated wholesale by a bad merge
// (every statement appeared twice, including redefinitions of `offset`/`inputBuf`/`outputBuf`
// in the same scope). This is the deduplicated form. The unused `auto det` local and the
// most-vexing-parse `std::vector<int> dims();` (which declared a function, not a vector, and
// was never used) were removed.
template <typename T>
static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) {
    Nd4jLong n = input->sizeAt(-1);
    Nd4jLong n2 = n * n; // element count of one {n, n} slice

    auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {input->rankOf() - 2, input->rankOf() - 1});
    //auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {output->rankOf() - 1});

    // Scratch matrix reused for every slice; lup_ overwrites it with the LU factors.
    auto matrix = NDArrayFactory::create(input->ordering(), {n, n}, DataTypeUtils::fromT<T>(), context); //, block.getWorkspace());
    auto stream = context->getCudaStream();
    NDArray::prepareSpecialUse({output}, {input});
    dim3 launchDims(256, 256, 1024);
    output->assign(1.f); // start from 1: determinantKernel accumulates into the output element

    for (int e = 0; e < output->lengthOf(); e++) {
        Nd4jLong pos = e * n2; // flat offset of the e-th {n, n} slice inside input

        fillMatrix<T, T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n);

        // In-place LU decomposition; compound/permutation outputs are not needed here.
        lup_<T>(context, &matrix, nullptr, nullptr);

        auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf());
        auto inputBuf = reinterpret_cast<T *>(matrix.specialBuffer());
        auto outputBuf = reinterpret_cast<T *>(output->specialBuffer()) + offset;

        determinantKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(inputBuf, outputBuf, n);
    }
    NDArray::registerSpecialUse({output}, {input});

    return Status::OK();
}
|
|
|
|
|
|
|
|
|
|
int determinant(nd4j::LaunchContext *context, NDArray *input, NDArray *output) {
|
|
|
|
|
NDArray::prepareSpecialUse({output}, {input});
|
|
|
|
|