cavis/libnd4j/include/ops/declarable/helpers/cuda/solve.cu

/*******************************************************************************
 * Copyright (c) 2020 Konduit, K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
//  @author GS <sgazeos@gmail.com>
//

#include <op_boilerplate.h>
#include <NDArray.h>
#include <NDArrayFactory.h>
#include <MmulHelper.h>

#include <execution/Threads.h>
#include <ConstantTadHelper.h>
#include "../triangular_solve.h"
#include "../lup.h"
#include "../solve.h"

namespace nd4j {
    namespace ops {
        namespace helpers {
    
            template <typename T>
            static __global__ void oneOnDiagonalKernel(T* ioBuf, Nd4jLong* ioShape, Nd4jLong* tadShape, Nd4jLong* tadOffsets, Nd4jLong batchNum, Nd4jLong rowNum) {
                for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) {
                    auto matrixPart = ioBuf + tadOffsets[i];
                    for (auto j = threadIdx.x; j < rowNum; j += blockDim.x) {
                        Nd4jLong pos[] = {j, j};
                        auto offset = shape::getOffset(tadShape, pos);

                        matrixPart[offset] = T(1.f);
                    }
                }
            }

            template <typename T>
            static __global__ void restorePermutationsKernel(T* PBuf, Nd4jLong* PShapeInfo, int const* permutationsBuf,
            Nd4jLong* PTadShapeInfo, Nd4jLong* PTadSOffsets, Nd4jLong* permutationsTadShapeInfo, Nd4jLong* permutationsTadOffsets, Nd4jLong batchNum, Nd4jLong rowNum) {
                for (auto batch = blockIdx.x; batch < batchNum; batch += gridDim.x) {
                    auto permutations = permutationsBuf + permutationsTadOffsets[batch];
                    auto P = PBuf + PTadSOffsets[batch];

                    for (auto row = threadIdx.x; row < rowNum; row += blockDim.x) {
                        //auto posX[] = {row};
                        Nd4jLong posZ[] = {row, permutations[row]};
                        auto zOffset = shape::getOffset(PTadShapeInfo, posZ);
                        P[zOffset] = T(1.f);
                    }
                }
            }

            template <typename T>
            static int solveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput,
                                     bool adjoint, NDArray* output) {
                NDArray::prepareSpecialUse({output}, {leftInput, rightInput});
                // stage 1: LU decomposition batched
                auto leftOutput = leftInput->ulike();
                auto permuShape = rightInput->getShapeAsVector(); permuShape.pop_back();
                auto permutations = NDArrayFactory::create<int>('c', permuShape, context);
                helpers::lu(context, leftInput, &leftOutput, &permutations);
                auto leftLower = leftOutput.dup();
                auto rightOutput = rightInput->ulike();
                auto leftLowerTad = ConstantTadHelper::getInstance()->tadForDimensions(leftLower.getShapeInfo(), {-2, -1});
                auto stream = context->getCudaStream();
                oneOnDiagonalKernel<T><<<128, 256, 256, *stream>>>(leftLower.dataBuffer()->specialAsT<T>(), leftLower.specialShapeInfo(), leftLowerTad.specialShapeInfo(), leftLowerTad.specialOffsets(), leftLowerTad.numberOfTads(), leftLower.sizeAt(-1));
                auto P = leftOutput.ulike(); P.nullify();
                auto PTad = ConstantTadHelper::getInstance()->tadForDimensions(P.getShapeInfo(), {-2, -1});
                auto permutationsTad = ConstantTadHelper::getInstance()->tadForDimensions(permutations.getShapeInfo(), {-1});
                restorePermutationsKernel<T><<<128, 256, 256, *stream>>>(P.dataBuffer()->specialAsT<T>(), P.specialShapeInfo(), permutations.dataBuffer()->specialAsT<int>(),
                        PTad.specialShapeInfo(), PTad.specialOffsets(), permutationsTad.specialShapeInfo(), permutationsTad.specialOffsets(), permutationsTad.numberOfTads(), permutations.sizeAt(-1));
                P.tickWriteDevice();
                auto rightPart = rightInput->ulike();
                MmulHelper::matmul(&P, rightInput, &rightPart, 0, 0);
                
                // stage 2: triangularSolveFunctor for Lower with given b
                helpers::triangularSolveFunctor(context, &leftLower, &rightPart, true, false, &rightOutput);
                // stage 3: triangularSolveFunctor for Upper with output of previous stage
                helpers::triangularSolveFunctor(context, &leftOutput, &rightOutput, false, false, output);
                NDArray::registerSpecialUse({output}, {leftInput, rightInput});

                return Status::OK();
            }

            int solveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) {
                BUILD_SINGLE_SELECTOR(leftInput->dataType(), return solveFunctor_, (context, leftInput, rightInput, adjoint, output), FLOAT_TYPES);
            }

            template <typename T>
            static __global__ void adjointKernel(T* output, Nd4jLong batchSize, Nd4jLong rows, Nd4jLong columns, Nd4jLong* outputTads, 
                                                 Nd4jLong* outputOffsets) {

                for (auto b = blockIdx.x; b < batchSize; b += gridDim.x) {
                    auto outputPart = output + outputOffsets[b];
                    for (auto r = threadIdx.x; r < rows; r += blockDim.x) {
                        for (auto c = threadIdx.y; c < r; c += blockDim.y) {
                            Nd4jLong zPos[] = {r, c};
                            Nd4jLong xPos[] = {c, r};
                            auto zIndex = shape::getOffset(outputTads, zPos);
                            auto xIndex = shape::getOffset(outputTads, xPos);
                            math::nd4j_swap(outputPart[zIndex], outputPart[xIndex]);
                        }
                    }
                }

            }

            template <typename T>
            static void adjointMatrix_(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) {
                NDArray::prepareSpecialUse({output}, {input});
                auto inputTads = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {-2, -1});
                auto outputTads = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {-2, -1});
                auto stream = context->getCudaStream();
                auto outputBuf = reinterpret_cast<T*>(output->specialBuffer());
                auto rows = input->sizeAt(-2);
                auto columns = input->sizeAt(-1);
                output->assign(input);
                adjointKernel<T><<<128, 256, 256, *stream>>>(outputBuf, outputTads.numberOfTads(), rows, columns, outputTads.specialShapeInfo(), outputTads.specialOffsets());
                NDArray::registerSpecialUse({output}, {input});
            }

            void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) {
                BUILD_SINGLE_SELECTOR(input->dataType(), adjointMatrix_, (context, input, output), FLOAT_TYPES);
            }

        }
    }
}
Shugeo solve linear (#191) * linear equations systems solve op. Initial commit. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed compiling issues. Signed-off-by: shugeo <sgazeos@gmail.com> * Linear equations systems solve. The next stage commit. Signed-off-by: shugeo <sgazeos@gmail.com> * Added test for linear equations systems solve operation. Signed-off-by: shugeo <sgazeos@gmail.com> * Added additional test and fixed lower matrix retrievance. * Implementation for solve of the systems of linear equations." Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored permutation generation. Signed-off-by: shugeo <sgazeos@gmail.com> * Added restore for permutations batched with cuda helper for solve op. Signed-off-by: shugeo <sgazeos@gmail.com> * Finished cuda implementation for solve op helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored cpu helpers for solve op. Signed-off-by: shugeo <sgazeos@gmail.com> * Fix gtest output on Windows * Fixed issue with permutation matrix for cuda implementation. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed issue with permutation matrix for cpu implementation. Signed-off-by: shugeo <sgazeos@gmail.com> * Eliminated waste comments. Signed-off-by: shugeo <sgazeos@gmail.com> * LinearSolve added * Mapping added * Javadoc added * Refactored implementation of triangular_solve helpers and tests for solve matrix equations generally. Signed-off-by: shugeo <sgazeos@gmail.com> * Added a test for solve op. Signed-off-by: shugeo <sgazeos@gmail.com> * Solve test added * Fix for TF import Co-authored-by: Serhii Shepel <9946053+sshepel@users.noreply.github.com> Co-authored-by: raver119 <raver119@gmail.com> Co-authored-by: Alexander Stoyakin <alexander.stoyakin@gmail.com> 2020-02-04 06:59:11 +01:00			`/*******************************************************************************`
			`* Copyright (c) 2020 Konduit, K.K.`
			`*`
			`* This program and the accompanying materials are made available under the`
			`* terms of the Apache License, Version 2.0 which is available at`
			`* https://www.apache.org/licenses/LICENSE-2.0.`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* License for the specific language governing permissions and limitations`
			`* under the License.`
			`*`
			`* SPDX-License-Identifier: Apache-2.0`
			`******************************************************************************/`

			`//`
			`// @author GS <sgazeos@gmail.com>`
			`//`

			`#include <op_boilerplate.h>`
			`#include <NDArray.h>`
			`#include <NDArrayFactory.h>`
			`#include <MmulHelper.h>`

			`#include <execution/Threads.h>`
			`#include <ConstantTadHelper.h>`
			`#include "../triangular_solve.h"`
			`#include "../lup.h"`
			`#include "../solve.h"`

			`namespace nd4j {`
			`namespace ops {`
			`namespace helpers {`

			`template <typename T>`
			`static __global__ void oneOnDiagonalKernel(T* ioBuf, Nd4jLong* ioShape, Nd4jLong* tadShape, Nd4jLong* tadOffsets, Nd4jLong batchNum, Nd4jLong rowNum) {`
			`for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) {`
			`auto matrixPart = ioBuf + tadOffsets[i];`
			`for (auto j = threadIdx.x; j < rowNum; j += blockDim.x) {`
			`Nd4jLong pos[] = {j, j};`
			`auto offset = shape::getOffset(tadShape, pos);`

			`matrixPart[offset] = T(1.f);`
			`}`
			`}`
			`}`

			`template <typename T>`
			`static __global__ void restorePermutationsKernel(T* PBuf, Nd4jLong* PShapeInfo, int const* permutationsBuf,`
			`Nd4jLong* PTadShapeInfo, Nd4jLong* PTadSOffsets, Nd4jLong* permutationsTadShapeInfo, Nd4jLong* permutationsTadOffsets, Nd4jLong batchNum, Nd4jLong rowNum) {`
			`for (auto batch = blockIdx.x; batch < batchNum; batch += gridDim.x) {`
			`auto permutations = permutationsBuf + permutationsTadOffsets[batch];`
			`auto P = PBuf + PTadSOffsets[batch];`

			`for (auto row = threadIdx.x; row < rowNum; row += blockDim.x) {`
			`//auto posX[] = {row};`
			`Nd4jLong posZ[] = {row, permutations[row]};`
			`auto zOffset = shape::getOffset(PTadShapeInfo, posZ);`
			`P[zOffset] = T(1.f);`
			`}`
			`}`
			`}`

			`template <typename T>`
			`static int solveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput,`
			`bool adjoint, NDArray* output) {`
			`NDArray::prepareSpecialUse({output}, {leftInput, rightInput});`
			`// stage 1: LU decomposition batched`
			`auto leftOutput = leftInput->ulike();`
			`auto permuShape = rightInput->getShapeAsVector(); permuShape.pop_back();`
			`auto permutations = NDArrayFactory::create<int>('c', permuShape, context);`
			`helpers::lu(context, leftInput, &leftOutput, &permutations);`
			`auto leftLower = leftOutput.dup();`
			`auto rightOutput = rightInput->ulike();`
			`auto leftLowerTad = ConstantTadHelper::getInstance()->tadForDimensions(leftLower.getShapeInfo(), {-2, -1});`
			`auto stream = context->getCudaStream();`
			`oneOnDiagonalKernel<T><<<128, 256, 256, *stream>>>(leftLower.dataBuffer()->specialAsT<T>(), leftLower.specialShapeInfo(), leftLowerTad.specialShapeInfo(), leftLowerTad.specialOffsets(), leftLowerTad.numberOfTads(), leftLower.sizeAt(-1));`
			`auto P = leftOutput.ulike(); P.nullify();`
			`auto PTad = ConstantTadHelper::getInstance()->tadForDimensions(P.getShapeInfo(), {-2, -1});`
			`auto permutationsTad = ConstantTadHelper::getInstance()->tadForDimensions(permutations.getShapeInfo(), {-1});`
			`restorePermutationsKernel<T><<<128, 256, 256, *stream>>>(P.dataBuffer()->specialAsT<T>(), P.specialShapeInfo(), permutations.dataBuffer()->specialAsT<int>(),`
			`PTad.specialShapeInfo(), PTad.specialOffsets(), permutationsTad.specialShapeInfo(), permutationsTad.specialOffsets(), permutationsTad.numberOfTads(), permutations.sizeAt(-1));`
			`P.tickWriteDevice();`
			`auto rightPart = rightInput->ulike();`
			`MmulHelper::matmul(&P, rightInput, &rightPart, 0, 0);`

			`// stage 2: triangularSolveFunctor for Lower with given b`
			`helpers::triangularSolveFunctor(context, &leftLower, &rightPart, true, false, &rightOutput);`
			`// stage 3: triangularSolveFunctor for Upper with output of previous stage`
			`helpers::triangularSolveFunctor(context, &leftOutput, &rightOutput, false, false, output);`
			`NDArray::registerSpecialUse({output}, {leftInput, rightInput});`

			`return Status::OK();`
			`}`

			`int solveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) {`
			`BUILD_SINGLE_SELECTOR(leftInput->dataType(), return solveFunctor_, (context, leftInput, rightInput, adjoint, output), FLOAT_TYPES);`
			`}`

			`template <typename T>`
			`static __global__ void adjointKernel(T* output, Nd4jLong batchSize, Nd4jLong rows, Nd4jLong columns, Nd4jLong* outputTads,`
			`Nd4jLong* outputOffsets) {`

			`for (auto b = blockIdx.x; b < batchSize; b += gridDim.x) {`
			`auto outputPart = output + outputOffsets[b];`
			`for (auto r = threadIdx.x; r < rows; r += blockDim.x) {`
			`for (auto c = threadIdx.y; c < r; c += blockDim.y) {`
			`Nd4jLong zPos[] = {r, c};`
			`Nd4jLong xPos[] = {c, r};`
			`auto zIndex = shape::getOffset(outputTads, zPos);`
			`auto xIndex = shape::getOffset(outputTads, xPos);`
			`math::nd4j_swap(outputPart[zIndex], outputPart[xIndex]);`
			`}`
			`}`
			`}`

			`}`

			`template <typename T>`
			`static void adjointMatrix_(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) {`
			`NDArray::prepareSpecialUse({output}, {input});`
			`auto inputTads = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {-2, -1});`
			`auto outputTads = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {-2, -1});`
			`auto stream = context->getCudaStream();`
			`auto outputBuf = reinterpret_cast<T*>(output->specialBuffer());`
			`auto rows = input->sizeAt(-2);`
			`auto columns = input->sizeAt(-1);`
			`output->assign(input);`
			`adjointKernel<T><<<128, 256, 256, *stream>>>(outputBuf, outputTads.numberOfTads(), rows, columns, outputTads.specialShapeInfo(), outputTads.specialOffsets());`
			`NDArray::registerSpecialUse({output}, {input});`
			`}`

			`void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) {`
			`BUILD_SINGLE_SELECTOR(input->dataType(), adjointMatrix_, (context, input, output), FLOAT_TYPES);`
			`}`

			`}`
			`}`
			`}`