cavis/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu

/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
//  @author GS <sgazeos@gmail.com>
//

#include <ops/declarable/helpers/sequence_mask.h>

namespace nd4j {
namespace ops {
namespace helpers {

    /**
     * Device kernel that marks mask positions as true.
     *
     * For every input element k (flat index into the input) and every position
     * i in [0, maxIndex), the output element with flat index (k * maxIndex + i)
     * is set to B(true) when i < input[k]. Positions that do not satisfy the
     * condition are NOT written by this kernel.
     * NOTE(review): this presumably relies on the caller having cleared the
     * output buffer beforehand - confirm against the calling op.
     *
     * Work distribution: blocks stride over i (step gridDim.x), threads of a
     * block stride over k (step blockDim.x), so any 1D grid/block size works.
     *
     * @tparam I          element type of the input (sequence lengths)
     * @tparam B          element type of the output mask
     * @param inputBuf    device buffer holding the input elements
     * @param inputShape  device shape info describing the input
     * @param outputBuf   device buffer receiving the mask
     * @param outputShape device shape info describing the output
     * @param maxIndex    number of mask positions per input element
     */
    template <typename I, typename B>
    static __global__ void sequenceMaskKernel(void* inputBuf, Nd4jLong* inputShape, void* outputBuf, Nd4jLong* outputShape, int maxIndex) {

        // Plain pointer casts are free, so keep them in registers rather than shared memory.
        auto input = reinterpret_cast<I*>(inputBuf);
        auto output = reinterpret_cast<B*>(outputBuf);

        // The input length is computed once per block and shared with all threads.
        __shared__ Nd4jLong inputLen;
        if (threadIdx.x == 0)
            inputLen = shape::length(inputShape);
        __syncthreads();

        // 64-bit signed indices: avoids unsigned comparison against a negative maxIndex
        // and 32-bit wrap-around when computing k * maxIndex + i.
        for (Nd4jLong i = blockIdx.x; i < maxIndex; i += gridDim.x)
            for (Nd4jLong k = threadIdx.x; k < inputLen; k += blockDim.x) {
                const I seqLen = input[shape::getIndexOffset(k, inputShape)];
                // Non-positive lengths never match; for positive lengths both sides are
                // non-negative, so the unsigned 64-bit comparison is exact for every integer I.
                if (seqLen > static_cast<I>(0) && static_cast<unsigned long long>(i) < static_cast<unsigned long long>(seqLen))
                    output[shape::getIndexOffset(k * maxIndex + i, outputShape)] = B(true);
            }

    }

    /**
     * Typed host-side launcher for sequenceMaskKernel.
     *
     * Launches one block per mask position (maxIndex blocks) and up to
     * kMaxThreadsPerBlock threads per block. The kernel strides over the input
     * with blockDim.x, so clamping the block size does not change the result;
     * it only keeps the launch configuration valid for inputs longer than the
     * per-block thread limit of the device.
     *
     * When there is nothing to compute (maxIndex <= 0 or an empty input) the
     * function returns without launching and without touching the arrays,
     * because a zero-sized grid or block is an invalid launch configuration.
     *
     * @tparam I       element type of the input (sequence lengths)
     * @tparam B       element type of the output mask
     * @param context  launch context providing the CUDA stream
     * @param input    array of sequence lengths
     * @param output   array receiving the mask
     * @param maxIndex number of mask positions per input element
     */
    template <typename I, typename B>
    static void sequenceMask_(LaunchContext* context, NDArray* input, NDArray* output, int maxIndex) {
        // Conservative block size that is valid on all CUDA devices.
        const unsigned int kMaxThreadsPerBlock = 256;

        const Nd4jLong inputLen = input->lengthOf();
        if (maxIndex <= 0 || inputLen <= 0)
            return;

        const unsigned int threadsPerBlock = inputLen < static_cast<Nd4jLong>(kMaxThreadsPerBlock)
                                                 ? static_cast<unsigned int>(inputLen)
                                                 : kMaxThreadsPerBlock;

        // x: grid size, y: block size, z: dynamic shared memory size in bytes.
        dim3 launchDims(maxIndex, threadsPerBlock, 128);
        NDArray::prepareSpecialUse({output}, {input});
        auto stream = context->getCudaStream();
        sequenceMaskKernel<I, B><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), maxIndex);
        NDArray::registerSpecialUse({output}, {input});
    }

    /**
     * Public entry point of the sequence mask helper.
     *
     * Reads the runtime data types of the input and output arrays and forwards
     * the call to the matching sequenceMask_<I, B> instantiation.
     *
     * @param context  launch context forwarded to the typed implementation
     * @param input    array of sequence lengths
     * @param output   array receiving the mask
     * @param maxIndex number of mask positions per input element
     */
    void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) {
        const auto lengthsType = input->dataType();
        const auto maskType = output->dataType();
        BUILD_DOUBLE_SELECTOR(lengthsType, maskType, sequenceMask_,
                              (context, input, output, maxIndex),
                              INTEGER_TYPES, LIBND4J_TYPES_EXTENDED);
    }

    // Explicit instantiations of sequenceMask_ for the same two type lists
    // (INTEGER_TYPES, LIBND4J_TYPES_EXTENDED) that the selector in sequenceMask() dispatches over.
    BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output, int maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED);
}
}
}
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`/*******************************************************************************`
			`* Copyright (c) 2015-2018 Skymind, Inc.`
			`*`
			`* This program and the accompanying materials are made available under the`
			`* terms of the Apache License, Version 2.0 which is available at`
			`* https://www.apache.org/licenses/LICENSE-2.0.`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* License for the specific language governing permissions and limitations`
			`* under the License.`
			`*`
			`* SPDX-License-Identifier: Apache-2.0`
			`******************************************************************************/`

			`//`
			`// @author GS <sgazeos@gmail.com>`
			`//`

			`#include <ops/declarable/helpers/sequence_mask.h>`

			`namespace nd4j {`
			`namespace ops {`
			`namespace helpers {`

[WIP] More of CUDA operations (#69) * initial commit Signed-off-by: raver119 <raver119@gmail.com> * - gruCell_bp further Signed-off-by: Yurii <yurii@skymind.io> * - further work on gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * Inverse matrix cublas implementation. Partial working revision. * Separation of segment ops helpers. Max separation. * Separated segment_min ops. * Separation of segment_mean/sum/prod/sqrtN ops heleprs. * Fixed diagonal processing with LUP decomposition. * Modified inversion approach using current state of LU decomposition. * Implementation of matrix_inverse op with cuda kernels. Working revision. * Implemented sequence_mask cuda helper. Eliminated waste printf with matrix_inverse implementation. Added proper tests. * - further work on gruCell_bp (ff/cuda) Signed-off-by: Yurii <yurii@skymind.io> * comment one test for gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * - provide cuda static_rnn Signed-off-by: Yurii <yurii@skymind.io> * Refactored random_shuffle op to use new random generator. * Refactored random_shuffle op helper. * Fixed debug tests with random ops tests. * Implement random_shuffle op cuda kernel helper and tests. * - provide cuda scatter_update Signed-off-by: Yurii <yurii@skymind.io> * Implementation of random_shuffle for linear case with cuda kernels and tests. * Implemented random_shuffle with cuda kernels. Final revision. * - finally gruCell_bp is completed Signed-off-by: Yurii <yurii@skymind.io> * Dropout op cuda helper implementation. * Implemented dropout_bp cuda helper. * Implemented alpha_dropout_bp with cuda kernel helpers. * Refactored helper. * Implementation of suppresion helper with cuda kernels. * - provide cpu code fot hsvToRgb, rgbToHsv, adjustHue Signed-off-by: Yurii <yurii@skymind.io> * Using sort by value method. * Implementation of image.non_max_suppression op cuda-based helper. 
* - correcting and testing adjust_hue, adjust_saturation cpu/cuda code Signed-off-by: Yurii <yurii@skymind.io> * Added cuda device prefixes to declarations. * Implementation of hashcode op with cuda helper. Initital revision. * rnn cu impl removed Signed-off-by: raver119 <raver119@gmail.com> 2019-07-20 08:58:44 +03:00			`template <typename I, typename B>`
			`static __global__ void sequenceMaskKernel(void* inputBuf, Nd4jLong* inputShape, void* outputBuf, Nd4jLong* outputShape, int maxIndex) {`

			`__shared__ I* input;`
			`__shared__ B* output;`
			`__shared__ Nd4jLong inputLen, outputLen;`
			`if (threadIdx.x == 0) {`
			`input = reinterpret_cast<I*>(inputBuf);`
			`output = reinterpret_cast<B*>(outputBuf);`
			`inputLen = shape::length(inputShape);`
			`outputLen = shape::length(outputShape);`
			`}`
syncthreads (#136) Signed-off-by: raver119 <raver119@gmail.com> 2019-08-20 18:28:43 +03:00			`__syncthreads();`
[WIP] More of CUDA operations (#69) * initial commit Signed-off-by: raver119 <raver119@gmail.com> * - gruCell_bp further Signed-off-by: Yurii <yurii@skymind.io> * - further work on gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * Inverse matrix cublas implementation. Partial working revision. * Separation of segment ops helpers. Max separation. * Separated segment_min ops. * Separation of segment_mean/sum/prod/sqrtN ops heleprs. * Fixed diagonal processing with LUP decomposition. * Modified inversion approach using current state of LU decomposition. * Implementation of matrix_inverse op with cuda kernels. Working revision. * Implemented sequence_mask cuda helper. Eliminated waste printf with matrix_inverse implementation. Added proper tests. * - further work on gruCell_bp (ff/cuda) Signed-off-by: Yurii <yurii@skymind.io> * comment one test for gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * - provide cuda static_rnn Signed-off-by: Yurii <yurii@skymind.io> * Refactored random_shuffle op to use new random generator. * Refactored random_shuffle op helper. * Fixed debug tests with random ops tests. * Implement random_shuffle op cuda kernel helper and tests. * - provide cuda scatter_update Signed-off-by: Yurii <yurii@skymind.io> * Implementation of random_shuffle for linear case with cuda kernels and tests. * Implemented random_shuffle with cuda kernels. Final revision. * - finally gruCell_bp is completed Signed-off-by: Yurii <yurii@skymind.io> * Dropout op cuda helper implementation. * Implemented dropout_bp cuda helper. * Implemented alpha_dropout_bp with cuda kernel helpers. * Refactored helper. * Implementation of suppresion helper with cuda kernels. * - provide cpu code fot hsvToRgb, rgbToHsv, adjustHue Signed-off-by: Yurii <yurii@skymind.io> * Using sort by value method. * Implementation of image.non_max_suppression op cuda-based helper. 
* - correcting and testing adjust_hue, adjust_saturation cpu/cuda code Signed-off-by: Yurii <yurii@skymind.io> * Added cuda device prefixes to declarations. * Implementation of hashcode op with cuda helper. Initital revision. * rnn cu impl removed Signed-off-by: raver119 <raver119@gmail.com> 2019-07-20 08:58:44 +03:00
			`for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x)`
			`for(auto k = threadIdx.x; k < inputLen; k += blockDim.x)`
[WIP] bunch of improvements (#257) * - profiling bias_add op - add some documentation Signed-off-by: Yurii <yurii@skymind.io> * - minor change Signed-off-by: Yurii <yurii@skymind.io> * - provide addBias cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - improve shape::getIndexOffset and change its signature Signed-off-by: Yurii <yurii@skymind.io> * - same as previous Signed-off-by: Yurii <yurii@skymind.io> * - improve and change signature in some shape:: stuff which has to do with calculation of offsets for array elements Signed-off-by: Yurii <yurii@skymind.io> * - minor changes in flatten Signed-off-by: Yurii <shyrma@skymind.io> * - add function shape::getIndexOffsetOrdered Signed-off-by: Yurii <shyrma@skymind.io> * - correct shape::getIndexOffsetOrdered() Signed-off-by: Yurii <shyrma@skymind.io> * - move getIndexOffsetOrdered to flatten.h header in order to isolate this function Signed-off-by: Yurii <shyrma@skymind.io> 2019-09-11 20:12:09 +03:00			`if (i < input[shape::getIndexOffset(k, inputShape)])`
			`output[shape::getIndexOffset(k * maxIndex + i, outputShape)] = B(true);`
[WIP] More of CUDA operations (#69) * initial commit Signed-off-by: raver119 <raver119@gmail.com> * - gruCell_bp further Signed-off-by: Yurii <yurii@skymind.io> * - further work on gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * Inverse matrix cublas implementation. Partial working revision. * Separation of segment ops helpers. Max separation. * Separated segment_min ops. * Separation of segment_mean/sum/prod/sqrtN ops heleprs. * Fixed diagonal processing with LUP decomposition. * Modified inversion approach using current state of LU decomposition. * Implementation of matrix_inverse op with cuda kernels. Working revision. * Implemented sequence_mask cuda helper. Eliminated waste printf with matrix_inverse implementation. Added proper tests. * - further work on gruCell_bp (ff/cuda) Signed-off-by: Yurii <yurii@skymind.io> * comment one test for gruCell_bp Signed-off-by: Yurii <yurii@skymind.io> * - provide cuda static_rnn Signed-off-by: Yurii <yurii@skymind.io> * Refactored random_shuffle op to use new random generator. * Refactored random_shuffle op helper. * Fixed debug tests with random ops tests. * Implement random_shuffle op cuda kernel helper and tests. * - provide cuda scatter_update Signed-off-by: Yurii <yurii@skymind.io> * Implementation of random_shuffle for linear case with cuda kernels and tests. * Implemented random_shuffle with cuda kernels. Final revision. * - finally gruCell_bp is completed Signed-off-by: Yurii <yurii@skymind.io> * Dropout op cuda helper implementation. * Implemented dropout_bp cuda helper. * Implemented alpha_dropout_bp with cuda kernel helpers. * Refactored helper. * Implementation of suppresion helper with cuda kernels. * - provide cpu code fot hsvToRgb, rgbToHsv, adjustHue Signed-off-by: Yurii <yurii@skymind.io> * Using sort by value method. * Implementation of image.non_max_suppression op cuda-based helper. 
* - correcting and testing adjust_hue, adjust_saturation cpu/cuda code Signed-off-by: Yurii <yurii@skymind.io> * Added cuda device prefixes to declarations. * Implementation of hashcode op with cuda helper. Initital revision. * rnn cu impl removed Signed-off-by: raver119 <raver119@gmail.com> 2019-07-20 08:58:44 +03:00
			`}`

			`template <typename I, typename B>`
			`static void sequenceMask_(LaunchContext* context, NDArray* input, NDArray* output, int maxIndex) {`
			`dim3 launchDims(maxIndex, input->lengthOf(), 128);`
			`NDArray::prepareSpecialUse({output}, {input});`
			`auto stream = context->getCudaStream();`
			`sequenceMaskKernel<I, B><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), maxIndex);`
			`NDArray::registerSpecialUse({output}, {input});`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`}`

			`void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) {`
Shugeo sequence mask fix2 (#216) * Fixed sequence_mask op and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Cuda fix for sequence_mask op. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed sequence_mask op for both platforms and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed solve and triangular_solve for more than 2D for adjoint cases. Signed-off-by: shugeo <sgazeos@gmail.com> * Added adjoint solve test again. Signed-off-by: shugeo <sgazeos@gmail.com> * Added a set of tests for triangual_solve and generic solve ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added a pair tests for triangular_solve Signed-off-by: shugeo <sgazeos@gmail.com> * Added tests for triangular_solve op. Signed-off-by: shugeo <sgazeos@gmail.com> 2020-02-06 20:06:50 +02:00			`BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), sequenceMask_, (context, input, output, maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED);`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`}`

Shugeo sequence mask fix2 (#216) * Fixed sequence_mask op and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Cuda fix for sequence_mask op. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed sequence_mask op for both platforms and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed solve and triangular_solve for more than 2D for adjoint cases. Signed-off-by: shugeo <sgazeos@gmail.com> * Added adjoint solve test again. Signed-off-by: shugeo <sgazeos@gmail.com> * Added a set of tests for triangual_solve and generic solve ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added a pair tests for triangular_solve Signed-off-by: shugeo <sgazeos@gmail.com> * Added tests for triangular_solve op. Signed-off-by: shugeo <sgazeos@gmail.com> 2020-02-06 20:06:50 +02:00			`BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output, int maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED);`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`}`
			`}`
			`}`