Shugeo cuda docs1 (#249)

* Added comments on axis shifts.

* Fixed LUP solver usage. Added helpers doc.

* Switched off OMP for roll and lup. Fixed OMP usage for ClipByGlobalNorm.

* Switched off OMP for ClipByGlobalNorm to reduce OMP ambiguity (see the sketch below).
master
shugeo 2019-09-09 16:27:45 +03:00 committed by raver119
parent b582e69e3b
commit c9f8a904ad
5 changed files with 318 additions and 345 deletions
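The OMP "ambiguity" above refers to accumulating into a shared variable inside a parallel-for without a reduction clause, which is a data race; the hunks below disable those pragmas. Reduction clauses only apply to plain scalars (or user-declared reductions), which is why clipByGlobalNorm_ below also switches its accumulator from an NDArray to a plain T. A minimal sketch of the problem and the standard fix (illustrative, not the nd4j macros):

```cpp
#include <vector>

// Racy: every thread updates `sum` concurrently; results vary run to run.
double sumRacy(const std::vector<double>& xs) {
    double sum = 0.0;
    #pragma omp parallel for
    for (long i = 0; i < (long)xs.size(); ++i)
        sum += xs[i];          // unsynchronized read-modify-write
    return sum;
}

// Fixed: the reduction clause gives each thread a private copy and combines them.
double sumReduced(const std::vector<double>& xs) {
    double sum = 0.0;
    #pragma omp parallel for reduction(+ : sum)
    for (long i = 0; i < (long)xs.size(); ++i)
        sum += xs[i];
    return sum;
}
```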

View File

@@ -50,13 +50,13 @@ namespace helpers {
     int n = inputMatrix->rows();
     invertedMatrix->assign(0.f);
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    // PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n; i++)
         invertedMatrix->p(i, i, 1.0f);
     if (inputMatrix->isIdentityMatrix()) return;
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 1; i < n; i++)
         invertedMatrix->t<T>(i, i - 1) = -inputMatrix->t<T>(i, i - 1);
@@ -83,11 +83,11 @@ namespace helpers {
         return;
     }
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n; i++)
         invertedMatrix->t<T>(i, i) /= inputMatrix->t<T>(i, i);
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n - 1; i++)
         invertedMatrix->t<T>(i, i + 1) -= (inputMatrix->t<T>(i, i + 1) * invertedMatrix->t<T>(i + 1, i + 1) / inputMatrix->t<T>(i, i));
@@ -124,7 +124,7 @@ namespace helpers {
     for (int i = 0; i < rowNum; i++) {
         pivotValue = T(0.0);
         pivot = -1;
-        PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue))
+        //PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue))
         for (int rowCounter = i; rowCounter < rowNum; rowCounter++) {
             if (nd4j::math::nd4j_abs(compoundMatrix.t<T>(rowCounter, i)) > pivotValue) {
                 pivotValue = nd4j::math::nd4j_abs(compoundMatrix.t<T>(rowCounter, i));
@@ -140,7 +140,7 @@ namespace helpers {
         for (int j = i + 1; j < rowNum; j++) {
             compoundMatrix.t<T>(j, i) /= compoundMatrix.t<T>(i, i);
-            PRAGMA_OMP_PARALLEL_FOR
+            //PRAGMA_OMP_PARALLEL_FOR
             for (int k = i + 1; k < rowNum; k++) {
                 compoundMatrix.t<T>(j, k) -= compoundMatrix.t<T>(j, i) * compoundMatrix.t<T>(i, k);
             }
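The pragma removed in the pivot-search hunk above guarded a loop where `pivot` and `pivotValue` are shared across iterations, so running it in parallel races on both. For contrast, a race-free OpenMP argmax sketch (hypothetical helper names, not the nd4j code):

```cpp
#include <cmath>
#include <vector>

// Minimal sketch: find the pivot row (largest |value| in a column) under OpenMP.
// `column` stands in for column i of compoundMatrix; names are illustrative only.
static int findPivotRow(const std::vector<double>& column, int begin) {
    double pivotValue = 0.0;
    int pivot = -1;
    #pragma omp parallel
    {
        double localValue = 0.0;   // per-thread maximum
        int localPivot = -1;
        #pragma omp for nowait
        for (int row = begin; row < (int)column.size(); ++row) {
            if (std::fabs(column[row]) > localValue) {
                localValue = std::fabs(column[row]);
                localPivot = row;
            }
        }
        #pragma omp critical       // merge per-thread results, once per thread
        {
            if (localValue > pivotValue) {
                pivotValue = localValue;
                pivot = localPivot;
            }
        }
    }
    return pivot;
}
```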

View File

@@ -43,7 +43,7 @@ namespace helpers {
     int remainShift = fullLen % actualShift;
     // stage 1) swap last actualShift elements with first ones.
-    PRAGMA_OMP_PARALLEL_FOR_IF(actualShift > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR //_IF(actualShift > Environment::getInstance()->elementwiseThreshold())
     for (int e = 0; e < actualShift; ++e) {
         int sourceIndex = fullLen - actualShift + e;
@@ -56,7 +56,7 @@ namespace helpers {
     }
     // stage 2) swap swapped actualShift elements with rest remainShiftCount times.
-    PRAGMA_OMP_PARALLEL_FOR_IF(shiftCount > Environment::getInstance()->tadThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR //_IF(shiftCount > Environment::getInstance()->tadThreshold())
     for (int count = 1; count < shiftCount; ++count) {
         for (int e = 0; e < actualShift; ++e) {
             int destinationIndex = fullLen - (count + 1) * actualShift + e;
@@ -91,7 +91,7 @@ namespace helpers {
     output->assign(input);
     auto source = output; //input;
-    for (auto i = 0; i < axes.size(); i++) {
+    for (size_t i = 0; i < axes.size(); i++) {
         int axe = axes[i];
         if (axe == source->rankOf() - 1) { // last dimension
             std::unique_ptr<ResultSet> listOfTensors(source->allTensorsAlongDimension({axe}));
@@ -115,7 +115,7 @@ namespace helpers {
     std::unique_ptr<ResultSet> listOfTensors(source->allTensorsAlongDimension({dims}));
     std::unique_ptr<ResultSet> listOfOutTensors(output->allTensorsAlongDimension({dims}));
-    //
+
     int fullLen = listOfTensors->size();
     int sizeAt = input->sizeAt(axe);
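Stage 2's outer loop is order-dependent: iteration `count` swaps against the block that iteration `count - 1` just wrote, so the parallel-for annotation was unsound and is now disabled. For reference, the intended semantics of a 1-D roll match std::rotate; a minimal sketch (illustrative only):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v{0, 1, 2, 3, 4, 5, 6};
    int shift = 2; // roll right by 2 -> {5, 6, 0, 1, 2, 3, 4}

    // std::rotate makes the element at (end - shift) the new front,
    // which is exactly a right roll by `shift`.
    std::rotate(v.begin(), v.end() - shift, v.end());

    for (int x : v) std::printf("%d ", x); // prints: 5 6 0 1 2 3 4
    std::printf("\n");
    return 0;
}
```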

View File

@@ -957,26 +957,27 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output,
 template <typename T>
 static void clipByGlobalNorm_(std::vector<NDArray*> const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
-    NDArray globalNorm = NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
-    PRAGMA_OMP_PARALLEL_FOR
+    T globalNorm = 0; //NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
+    // PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(sumT : globalNorm)
     for (size_t i = 0; i < inputs.size(); i++) {
         auto input = inputs[i];
         auto l2norm = input->reduceNumber(reduce::Norm2);
-        globalNorm += l2norm * l2norm;
+        globalNorm += l2norm.t<T>(0) * l2norm.t<T>(0);
     }
-    globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr); // = nd4j::math::nd4j_sqrt(globalNorm);
-    outputs[inputs.size()]->p(0, globalNorm);
-    const T factor = clipNorm / globalNorm.e<T>(0);
-    PRAGMA_OMP_PARALLEL_FOR
+    //globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr); // = nd4j::math::nd4j_sqrt(globalNorm);
+    auto normS = nd4j::math::nd4j_sqrt<T,T>(globalNorm);
+    outputs[inputs.size()]->p(0, normS);
+    const T factor = clipNorm / normS;
+    // PRAGMA_OMP_PARALLEL_FOR
     for (size_t e = 0; e < inputs.size(); e++) {
         // all-reduce
         auto input = inputs[e];
         auto output = outputs[e];
-        if (globalNorm.e<double>(0) <= clipNorm) {
+        if (normS <= clipNorm) {
             output->assign(input);
         }
         else {
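The rewritten helper accumulates sqrt(sum_t ||t||^2) in a plain scalar instead of a one-element NDArray, which removes the need for the racy parallel accumulation. A self-contained sketch of the same semantics over raw vectors (illustrative, not the nd4j API):

```cpp
#include <cmath>
#include <vector>

// Clip a list of "tensors" (flat vectors here) by their global L2 norm:
// globalNorm = sqrt(sum over tensors of ||t||^2); if it exceeds clipNorm,
// every tensor is scaled by clipNorm / globalNorm.
void clipByGlobalNorm(std::vector<std::vector<float>>& tensors, float clipNorm) {
    float sumSq = 0.f;
    for (const auto& t : tensors)
        for (float x : t)
            sumSq += x * x;
    const float globalNorm = std::sqrt(sumSq);
    if (globalNorm <= clipNorm) return;   // already within the bound
    const float factor = clipNorm / globalNorm;
    for (auto& t : tensors)
        for (float& x : t)
            x *= factor;
}
```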

View File

@@ -27,11 +27,11 @@ namespace helpers {
 void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) {
     output.resize(axisVector->lengthOf());
-    axisVector->tickReadDevice();
-    axisVector->syncToHost();
+    axisVector->tickReadDevice(); // mark input as read on device
+    axisVector->syncToHost(); // sync to host
     for (int e = 0; e < axisVector->lengthOf(); e++) {
         auto ca = axisVector->e<int>(e);
-        if (ca < 0)
+        if (ca < 0) // shift values on rank for negative vals
             ca += rank;
         output[e] = ca;
@@ -41,7 +41,7 @@ namespace helpers {
 void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) {
     for (int e = 0; e < axisVector.size(); e++) {
         auto a = axisVector[e];
-        if (a < 0)
+        if (a < 0) // shift vals on rank for negative vals
             axisVector[e] = a + rank;
     }
 }
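Both overloads normalize negative axes the NumPy way: an axis a < 0 refers to axis rank + a. A tiny worked example (illustrative helper, not the nd4j function):

```cpp
#include <cassert>
#include <vector>

// Normalize negative axes against a tensor rank, mirroring adjustAxis above.
std::vector<int> normalizeAxes(int rank, std::vector<int> axes) {
    for (auto& a : axes)
        if (a < 0) a += rank;
    return axes;
}

int main() {
    // For a rank-4 tensor, axis -1 is 3 and axis -4 is 0.
    auto axes = normalizeAxes(4, {-1, 1, -4});
    assert(axes == (std::vector<int>{3, 1, 0}));
    return 0;
}
```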

View File

@@ -31,29 +31,9 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-//  template <typename T>
-//  static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) {
-//      if (theFirst != theSecond) {
-//          auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//          auto step = blockDim.x * gridDim.x;
-//          for (auto i = start; i < N; i += step) {
-//              Nd4jLong iCoord1[] = {theFirst, i};
-//              Nd4jLong iCoord2[] = {theSecond, i};
-//              auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2);
-//              auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2);
-//              //atomicExch(&matrix[iIndex1], matrix[iIndex2]);
-//              T e0 = matrix[iIndex1];
-//              T e1 = matrix[iIndex2];
-//              matrix[iIndex1] = e0;
-//              matrix[iIndex2] = e1;
-//          }
-//      }
-//  }
-//  BUILD_SINGLE_TEMPLATE(template void swapRows_, (NDArray* matrix, int theFirst, int theSecond), FLOAT_TYPES);
-//
-//  void swapRows(NDArray* matrix, int theFirst, int theSecond) {
-//      BUILD_SINGLE_SELECTOR(matrix->dataType(), swapRows_, (matrix, theFirst, theSecond), FLOAT_TYPES);
-//  }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert the second diagonal for a lower diagonal matrix
 template<typename T>
 static __global__ void
 invertKernelLow(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -71,11 +51,13 @@ namespace helpers {
     auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2);
     auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
+    // invert lower triangular matrix
     inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]);
     // math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]);
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert diagonal vals to upper diagonal matrix
 template<typename T>
 static __global__ void
 upvertKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -90,10 +72,13 @@ namespace helpers {
     auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
     // math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]);
+    // invert diagonal elements
     inverted[zIndex] /= input[xIndex];
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert upper second diagonal
 template<typename T>
 static __global__ void
 upvertKernelUp(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -120,18 +105,17 @@ namespace helpers {
     for (int i = start; i < n - 1; i += step) {
         Nd4jLong pos[] = {i, i + 1};
-        //Nd4jLong posY[] = {i, i};
         Nd4jLong posX[] = {i + 1, i + 1};
         auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2);
-        // auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
-        // auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
         auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2);
         auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2);
-        math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]);
+        // invert upper matrix
+        math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]);
         //inputMatrix->t<T>(i, i + 1) * invertedMatrix->t<T>(i + 1, i + 1) / inputMatrix->t<T>(i, i)
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
 template<typename T>
 static __global__ void
 invertLowKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -152,11 +136,14 @@ namespace helpers {
     auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, 2);
+    // invert non-diagonal elements
     math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]);
             }
         }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// Inversion of upper triangular matrix non-diagonal elements, once the main and second diagonals are already processed
 template<typename T>
 static __global__ void
 invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -183,18 +170,20 @@ namespace helpers {
     Nd4jLong posZ[] = {i, j};
     Nd4jLong posY[] = {k, j};
     Nd4jLong posX[] = {i, k};
-    // Nd4jLong posD[] = {i, i};
+    // inversion with Jordan-Gauss transformation
     auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2);
     auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2);
-    // auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
     auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2);
-    math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]); // / input[dIndex]);
-    // printf("(%d, %d) inverted[%lld] = %lf (-inverted[%lld] * input[%lld]\n", blockIdx.x, threadIdx.x, zIndex, inverted[zIndex], yIndex, xIndex);
+    // invert upper non-diagonal elements
+    math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]);
             }
         }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// procedure to invert a lower-triangular matrix.
+// In this case the lower triangular matrix has a main diagonal with general values
+//
 template<typename T>
 static void invertLowerMatrix_(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     int n = inputMatrix->rows();
@@ -204,20 +193,26 @@ namespace helpers {
     auto stream = context->getCudaStream();
-    // invert lower matrix
     // invert main diagonal
     upvertKernel<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
     // invert the second diagonal
     invertKernelLow<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
-    // invertKernelLow<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
+    // invert non-diagonal elements
     invertLowKernel<T><<<n, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// caller for the lower-matrix inversion routine
 void invertLowerMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
     BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
     NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// procedure to invert an upper-triangular matrix.
+// In this case the upper triangular matrix has all ones on its main diagonal
 template<typename T>
 static void invertUpperMatrix_(LaunchContext *context, NDArray* inputMatrix, NDArray* invertedMatrix) {
     int n = inputMatrix->rows();
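For reference, the recurrences these kernels implement, reconstructed from the code above (a sketch, so treat the index bounds as approximate): with the output initialized to the identity, a lower triangular $L$ with a general diagonal is inverted as

```latex
% Inverse M = L^{-1} of a lower triangular matrix L, built in three passes
% (main diagonal, second diagonal, remaining elements), as in the kernels above:
\begin{align*}
  M_{ii}    &= \frac{1}{L_{ii}}, \\
  M_{i,i-1} &= -\frac{L_{i,i-1}}{L_{ii}\,L_{i-1,i-1}}, \\
  M_{ij}    &= -\frac{1}{L_{ii}} \sum_{k=j}^{i-1} L_{ik}\, M_{kj} \qquad (i > j + 1),
\end{align*}
% and for a unit upper triangular U (ones on the main diagonal):
\begin{align*}
  M_{ii} = 1, \qquad
  M_{i,i+1} = -U_{i,i+1}, \qquad
  M_{ij} = -\sum_{k=i+1}^{j} U_{ik}\, M_{kj} \quad (j > i + 1).
\end{align*}
```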
@@ -227,103 +222,60 @@ namespace helpers {
         return;
     }
-    //upvertKernel<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
+    // invert upper matrix
+    // invert the second diagonal
     upvertKernelUp<T><<<1, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(),
                                                inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
-    invertedMatrix->tickWriteDevice();
-    invertedMatrix->printIndexedBuffer("Step1 UP inversion");
+    // invert other elements
     invertUpKernel<T><<<n, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// inversion of upper triangular matrix - runner routine
 void invertUpperMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
     BUILD_SINGLE_SELECTOR(invertedMatrix->dataType(), invertUpperMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
     NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
 }
-//  template <typename T>
-//  static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) {
-//      int swapCount = 0;
-//      for (int i = blockIdx.x; i < rowNum; i += gridDim.x) {
-//          auto pivotValue = T(0.0);
-//          auto pivot = -1;
-//
-//          for (int rowCounter = i; rowCounter < rowNum; rowCounter++) {
-//              Nd4jLong rowCoord[] = {rowCounter, i};
-//              auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2);
-//              if (nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue) {
-//                  pivotValue = nd4j::math::nd4j_abs(compound[rowPos]);
-//                  pivot = rowCounter;
-//              }
-//          }
-//
-//          if (pivotValue != T(0.0)) {
-//              swapRows_<T>(compound, compoundShape, pivot, i, rowNum);
-//              swapRows_<T>(permutation, permutationShape, pivot, i, rowNum);
-//              if (pivot != i)
-//                  swapCount++;
-//
-//              for (int j = i + 1; j < rowNum; j++) {
-//                  Nd4jLong posJIbuf[] = {j, i};
-//                  Nd4jLong posIIbuf[] = {i, i};
-//                  auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2);
-//                  auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2);
-//
-//                  compound[posJI] /= compound[posII];
-//                  for (int k = i + 1; k < rowNum; k++) {
-//                      Nd4jLong posJKbuf[] = {j, k};
-//                      Nd4jLong posIKbuf[] = {i, k};
-//                      auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2);
-//                      auto posIK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2);
-//                      T arg = compound[posJI] * compound[posIK];
-//                      compound[posJK] -= arg;
-//                  }
-//              }
-//          }
-//      }
-//  }
-//  template <typename T, typename F>
+// ------------------------------------------------------------------------------------------------------------------ //
+// determinant kernel - accumulates the product of all values on the main diagonal
 template<typename T>
 static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) {
-    //F tempRes = result[0];
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
     for (auto i = start; i < len; i += step) {
         auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
+        // multiply all diagonal elements
         math::atomics::nd4j_atomicMul(&result[0], compound[pos]);
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// determinant logarithm - accumulates the sum of the logarithms of all values on the main diagonal. All logarithm
+// values should be positive
 template<typename T>
 static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) {
-    // F tempRes = (F)result[0];
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
     for (auto i = start; i < len; i += step) {
         auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
+        // sum logs of all diagonal elements
         math::atomics::nd4j_atomicAdd(result, math::nd4j_log<T,T>(math::nd4j_abs(compound[pos])));
     }
-    // __syncthreads();
-    //
-    // if (threadIdx.x == 0) {
-    //     result[0] = (T)math::nd4j_log<F,F>(math::nd4j_abs(tempRes));
-    // }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// kernel to copy a matrix with given shape into a compound tensor at a given pos
+// output - an N-D tensor buffer with rank not less than 2, input - 2D square n x n matrix with n = rowLen
 template<typename T, typename F>
 static __global__ void
 fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
-    __shared__
-    F *matrix;
-    __shared__
-    T *inputBuf;
-    __shared__
-    Nd4jLong inputLen;
-    __shared__
-    Nd4jLong n2;
+    __shared__ F *matrix;
+    __shared__ T *inputBuf;
+    __shared__ Nd4jLong inputLen;
+    __shared__ Nd4jLong n2;
     if (threadIdx.x == 0) {
         matrix = reinterpret_cast<F*>(output);
@@ -332,6 +284,7 @@ namespace helpers {
         n2 = rowLen * rowLen;
     }
     __syncthreads();
+
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
@@ -341,10 +294,11 @@ namespace helpers {
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// same as above, but without type conversion
 template<typename T>
 static __global__ void
-returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos,
-             Nd4jLong rowLen) {
+returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
     __shared__ T* matrix;
     __shared__ T* outputBuf;
     __shared__ Nd4jLong outputLen;
@@ -362,10 +316,12 @@ namespace helpers {
     for (int k = pos + start, j = start; j < n2; k += step, j += step) {
         auto zIndex = shape::getIndexOffset(k, outputShape, outputLen);
-        outputBuf[zIndex] = (T) matrix[j];
+        outputBuf[zIndex] = matrix[j];
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// kernel to fill up a permutation matrix. The permutation matrix is filled with zeros and ones
 template<typename F>
 static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) {
     F *permutation = reinterpret_cast<F *>(output);
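A permutation matrix has exactly one 1 in each row and column. Given a permutation vector perm where perm[i] names the column holding row i's one, building the dense matrix is a one-liner per row; a host-side sketch (the kernel's `source` array encodes pivots for this purpose, though its exact convention is solver-specific):

```cpp
#include <vector>

// Build a dense n x n permutation matrix from a permutation vector:
// row i gets a 1 in column perm[i], zeros elsewhere.
std::vector<float> permutationMatrix(const std::vector<int>& perm) {
    const int n = (int)perm.size();
    std::vector<float> P(n * n, 0.f);
    for (int i = 0; i < n; ++i)
        P[i * n + perm[i]] = 1.f;
    return P;
}
```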
@@ -380,37 +336,43 @@ namespace helpers {
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// LUP decomposition runner - using the cuSolver library
+// if a permutation is given, LUP decomposition is used, LU decomposition otherwise
+// L - lower triangular, U - upper triangular, P - permutation matrices
+// PA = LU
+//
+// input - A matrix nxn
+// compound - C matrix L + U - I, i.e. the main diagonal and below hold L, above the main diagonal holds U
 template<typename T>
 static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) {
     auto stream = context->getCudaStream();
     auto n = input->rows();
     cusolverDnHandle_t cusolverH = nullptr;
+    // create solver handle
     cusolverStatus_t status = cusolverDnCreate(&cusolverH);
     if (CUSOLVER_STATUS_SUCCESS != status) {
         throw cuda_exception::build("Cannot create cuSolver handle", status);
     }
+    // set solver stream
     status = cusolverDnSetStream(cusolverH, *stream);
     if (CUSOLVER_STATUS_SUCCESS != status) {
         throw cuda_exception::build("Cannot set up stream for cuda solver", status);
     }
     int lwork = 0;
     int *d_info = nullptr;
+    // allocate memory for the solver info buffer
     auto err = cudaMalloc((void **) &d_info, sizeof(int));
     if (err) {
         throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err);
     }
     DataType dtype = input->dataType();
-    switch (dtype) {
+    switch (dtype) { // there are two cuSolver implementations of LUP decomposition - double and float
     case DataType::DOUBLE: {
         double *d_work = nullptr;
-        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
-        if (err) {
-            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
-                                        err);
-        }
+        // compute internal buffer size
         double *matrix = reinterpret_cast<double *>(input->specialBuffer());
         status = cusolverDnDgetrf_bufferSize(
             cusolverH,
@@ -422,6 +384,13 @@ namespace helpers {
         if (CUSOLVER_STATUS_SUCCESS != status) {
             throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
         }
+        err = cudaMalloc((void **) &d_work, sizeof(double) * lwork);
+        if (err) {
+            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
+                                        err);
+        }
         if (permutation == nullptr)
             status = cusolverDnDgetrf(
                 cusolverH,
@@ -458,11 +427,6 @@ namespace helpers {
     case DataType::FLOAT32: {
         float *matrix = reinterpret_cast<float*>(input->specialBuffer());
         float *d_work = nullptr;
-        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
-        if (err) {
-            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
-                                        err);
-        }
         status = cusolverDnSgetrf_bufferSize(
             cusolverH,
@@ -475,6 +439,12 @@ namespace helpers {
         if (CUSOLVER_STATUS_SUCCESS != status) {
             throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
        }
+        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
+        if (err) {
+            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
+                                        err);
+        }
         if (permutation == nullptr)
             status = cusolverDnSgetrf(
                 cusolverH,
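The bug these hunks fix is one of ordering: d_work was previously allocated before cusolverDnSgetrf_bufferSize had computed lwork, i.e. while lwork was still zero. A minimal sketch of the correct call sequence for the float path (error handling elided; assumes a valid n x n device matrix d_A in column-major layout):

```cpp
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Sketch: in-place LU factorization of an n x n float matrix with cuSolver.
// Correct order: create handle -> query workspace size -> allocate -> factorize.
void luFactorSketch(float* d_A, int n, cudaStream_t stream) {
    cusolverDnHandle_t handle = nullptr;
    cusolverDnCreate(&handle);
    cusolverDnSetStream(handle, stream);

    int lwork = 0;
    // 1) query the required workspace size first...
    cusolverDnSgetrf_bufferSize(handle, n, n, d_A, n, &lwork);

    float* d_work = nullptr;
    int* d_ipiv = nullptr;
    int* d_info = nullptr;
    // 2) ...then allocate with the now-known lwork
    cudaMalloc(&d_work, sizeof(float) * lwork);
    cudaMalloc(&d_ipiv, sizeof(int) * n);
    cudaMalloc(&d_info, sizeof(int));

    // 3) factorize in place: on exit d_A holds L (unit diagonal implied) and U,
    //    d_ipiv holds the pivot indices, d_info the status
    cusolverDnSgetrf(handle, n, n, d_A, n, d_work, d_ipiv, d_info);

    cudaFree(d_work); cudaFree(d_ipiv); cudaFree(d_info);
    cusolverDnDestroy(handle);
}
```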
@@ -520,9 +490,11 @@ namespace helpers {
     // NDArray::registerSpecialUse({input}, {input});
     input->tickWriteDevice();
 }
+// ------------------------------------------------------------------------------------------------------------------ //
 BUILD_SINGLE_TEMPLATE(template void lup_, (LaunchContext *context, NDArray *input, NDArray *output, NDArray *permutation), FLOAT_NATIVE);
+// ------------------------------------------------------------------------------------------------------------------ //
 template<typename T>
 static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) {
     Nd4jLong n = input->sizeAt(-1);