cavis/libnd4j/include/loops/cuda/transform/transform_same.cu

/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author raver119@gmail.com
//

#include <Environment.h>
#include <loops/transform_same.h>
#include <types/types.h>
#include <op_boilerplate.h>

#include <loops/legacy_ops.h>
#include <helpers/DebugHelper.h>

using namespace simdOps;

/**
 * Kernel entry point for transforms whose input and output share element type X.
 *
 * The kernel holds no logic of its own: it forwards its arguments to
 * TransformSame<X>::transformCuda<OpType>. xRank and zRank are accepted as
 * part of the signature but are not passed on.
 */
template <typename X, typename OpType>
__global__ void transformSameSimple(void *x, Nd4jLong *xShapeInfo, int xRank,
                                    void *params,
                                    void *z, Nd4jLong *zShapeInfo, int zRank,
                                    int *allocationPointer,
                                    void *reductionPointer,
                                    Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) {

    using Transform = functions::transform::TransformSame<X>;

    Transform::template transformCuda<OpType>(x, xShapeInfo,
                                              params,
                                              z, zShapeInfo,
                                              allocationPointer, reductionPointer,
                                              tadShapeInfo, tadOffsets);
}


namespace functions {
    namespace transform {

        /**
         * Host-side entry point for a same-type transform identified by opNum.
         *
         * Hands the launch configuration, stream and buffer arguments to
         * intermediateShaped through DISPATCH_BY_OPNUM_T over TRANSFORM_SAME_OPS
         * (presumably the macro picks the OpType matching opNum - confirm against
         * op_boilerplate.h), then invokes DEBUG_KERNEL with the stream and opNum.
         */
        template<typename X>
        _CUDA_H void TransformSame<X>::executeTransformShaped(
                dim3 launchDims, cudaStream_t *stream, int opNum,
                void *x, Nd4jLong *xShape, int xRank,
                void *extraParams,
                void *z, Nd4jLong *zShape, int zRank,
                int *allocationPointer, void *reductionPointer,
                Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) {

            DISPATCH_BY_OPNUM_T(intermediateShaped,
                                PARAMS(launchDims, stream,
                                       x, xShape, xRank,
                                       extraParams,
                                       z, zShape, zRank,
                                       allocationPointer, reductionPointer,
                                       tadShapeInfo, tadOffsets),
                                TRANSFORM_SAME_OPS);

            DEBUG_KERNEL(stream, opNum);
        }


        /**
         * Device-side body of a same-type transform: applies OpType::op to the
         * elements of x and writes the results to z (both are treated as X*).
         *
         * If OpType::requiresSpecial is set, the whole job is delegated to
         * OpType::execSpecialCuda and nothing else is done here. Otherwise every
         * thread of the launch walks the element range [0, length) starting at
         * its global thread id and advancing by the total number of launched
         * threads, so any grid/block configuration covers the full buffer.
         *
         * All element indices are kept in Nd4jLong so that buffers with more
         * than 2^31 - 1 elements are handled without index overflow.
         *
         * @param vx                 input buffer, reinterpreted as X*
         * @param xShapeInfo         shape descriptor of the input
         * @param vparams            extra op arguments, reinterpreted as X* and passed to the op
         * @param vz                 output buffer, reinterpreted as X*; may be the same pointer as vx
         * @param zShapeInfo         shape descriptor of the output
         * @param allocationPointer  forwarded to execSpecialCuda only
         * @param vreductionPointer  forwarded (as X*) to execSpecialCuda only
         * @param tadShapeInfo       forwarded to execSpecialCuda only
         * @param tadOffsets         forwarded to execSpecialCuda only
         */
        template<typename X>
        template <typename OpType>
        __device__ void TransformSame<X>::transformCuda(void *vx, Nd4jLong *xShapeInfo,
                                                        void *vparams,
                                                        void *vz, Nd4jLong *zShapeInfo,
                                                        int *allocationPointer, void *vreductionPointer,
                                                        Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) {

            auto x = static_cast<X*>(vx);
            auto z = static_cast<X*>(vz);
            auto params = static_cast<X*>(vparams);
            auto reductionPointer = static_cast<X*>(vreductionPointer);

            if(OpType::requiresSpecial) {
                OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets);
                return;
            } else {
                // Shape properties are read once per block by thread 0 and shared with the rest.
                __shared__ Nd4jLong xEws;
                __shared__ Nd4jLong zEws;
                __shared__ char xOrder;
                __shared__ char zOrder;
                __shared__ Nd4jLong length;

                if (threadIdx.x == 0) {
                    xEws = shape::elementWiseStride(xShapeInfo);
                    zEws = shape::elementWiseStride(zShapeInfo);
                    xOrder = shape::order(xShapeInfo);
                    zOrder = shape::order(zShapeInfo);
                    // The iteration count comes from x; z is assumed to hold at least as many elements.
                    length = shape::length(xShapeInfo);
                }
                // Make thread 0's writes visible before any thread reads the shared values.
                __syncthreads();

                // Widen before multiplying so neither the thread id nor the stride can wrap in 32 bits.
                const Nd4jLong tid = static_cast<Nd4jLong>(blockIdx.x) * blockDim.x + threadIdx.x;
                const Nd4jLong totalThreads = static_cast<Nd4jLong>(gridDim.x) * blockDim.x;

                if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') {
                    // Both buffers are 'c'-ordered with a positive element-wise stride:
                    // the i-th element sits at i * ews, no per-element offset lookup needed.
                    for (Nd4jLong i = tid; i < length; i += totalThreads)
                        z[i * zEws] = OpType::op(x[i * xEws], params);
                }
                else {
                    if(vx == vz) {
                        // Same buffer for input and output: one offset computation serves both sides.
                        for (Nd4jLong i = tid; i < length; i += totalThreads) {
                            auto xOffset = shape::getIndexOffset(i, xShapeInfo);
                            z[xOffset] = OpType::op(x[xOffset], params);
                        }
                    }
                    else {
                        // Distinct buffers: resolve the offset separately in each shape.
                        for (Nd4jLong i = tid; i < length; i += totalThreads) {
                            auto xOffset = shape::getIndexOffset(i, xShapeInfo);
                            auto zOffset = shape::getIndexOffset(i, zShapeInfo);
                            z[zOffset] = OpType::op(x[xOffset], params);
                        }
                    }
                }
            }
        }


		/**
		 * Host-side launcher for a single concrete OpType.
		 *
		 * Launches transformSameSimple<X, OpType> on *stream and then reports
		 * any error through nd4j::DebugHelper::checkErrorCode.
		 *
		 * launchDims is interpreted as (x = number of blocks, y = threads per
		 * block, z = dynamic shared memory in bytes). The block size is taken
		 * from launchDims.y; previously launchDims.x was passed for both the
		 * grid and the block, which ignored the requested block size and made
		 * the launch fail whenever the grid size exceeded the per-block thread
		 * limit.
		 * NOTE(review): this (grid, block, shared bytes) layout is assumed, not
		 * shown in this file - confirm that every caller fills launchDims.y.
		 *
		 * @param launchDims          launch configuration as described above
		 * @param stream              pointer to the CUDA stream to launch on
		 * @param x, xShape, xRank    input buffer, its shape descriptor and rank
		 * @param extraParams         extra op arguments forwarded to the kernel
		 * @param z, zShape, zRank    output buffer, its shape descriptor and rank
		 * @param allocationPointer   forwarded to the kernel
		 * @param reductionPointer    forwarded to the kernel
		 * @param tadShapeInfo        forwarded to the kernel
		 * @param tadOffsets          forwarded to the kernel
		 */
		template<typename X>
		template <typename OpType>
		_CUDA_H void TransformSame<X>::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer,  Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) {
			transformSameSimple<X, OpType><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets);
			nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) failed");
		}

        // Explicit instantiation of the TransformSame class template, generated by macro
        // (presumably once per element type listed in LIBND4J_TYPES - confirm against
        // the macro definition in the project headers).
        BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES);
    }
}
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`/*******************************************************************************`
			`* Copyright (c) 2015-2018 Skymind, Inc.`
			`*`
			`* This program and the accompanying materials are made available under the`
			`* terms of the Apache License, Version 2.0 which is available at`
			`* https://www.apache.org/licenses/LICENSE-2.0.`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* License for the specific language governing permissions and limitations`
			`* under the License.`
			`*`
			`* SPDX-License-Identifier: Apache-2.0`
			`******************************************************************************/`

			`//`
			`// @author raver119@gmail.com`
			`//`

			`#include <Environment.h>`
			`#include <loops/transform_same.h>`
			`#include <types/types.h>`
			`#include <op_boilerplate.h>`

			`#include <loops/legacy_ops.h>`
			`#include <helpers/DebugHelper.h>`

			`using namespace simdOps;`

			`template <typename X, typename OpType>`
			`__global__ void transformSameSimple(void x, Nd4jLong xShapeInfo, int xRank,`
			`void *params,`
			`void z, Nd4jLong zShapeInfo, int zRank,`
			`int *allocationPointer,`
			`void *reductionPointer,`
			`Nd4jLong tadShapeInfo, Nd4jLong tadOffsets) {`

			`functions::transform::TransformSame<X>::template transformCuda<OpType>(x,xShapeInfo,params,z,zShapeInfo,allocationPointer,reductionPointer, tadShapeInfo, tadOffsets);`
			`}`


			`namespace functions {`
			`namespace transform {`

			`template<typename X>`
			`_CUDA_H void TransformSame<X>::executeTransformShaped(dim3 launchDims, cudaStream_t stream, int opNum, void x, Nd4jLong xShape, int xRank, void extraParams, void z, Nd4jLong zShape, int zRank, int allocationPointer, void reductionPointer, Nd4jLong tadShapeInfo, Nd4jLong tadOffsets) {`
			`DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), TRANSFORM_SAME_OPS);`

			`DEBUG_KERNEL(stream, opNum);`
			`}`


			`template<typename X>`
			`template <typename OpType>`
			`__device__ void TransformSame<X>::transformCuda(void vx, Nd4jLong xShapeInfo,`
			`void *vparams,`
			`void vz, Nd4jLong zShapeInfo,`
			`int allocationPointer, void vreductionPointer,`
			`Nd4jLong tadShapeInfo, Nd4jLong tadOffsets) {`

			`auto x = static_cast<X*>(vx);`
			`auto z = static_cast<X*>(vz);`
			`auto params = static_cast<X*>(vparams);`
			`auto reductionPointer = static_cast<X*>(vreductionPointer);`

			`if(OpType::requiresSpecial) {`
			`OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets);`
			`return;`
			`} else {`
			`__shared__ Nd4jLong xEws;`
			`__shared__ Nd4jLong zEws;`
			`__shared__ char xOrder;`
			`__shared__ char zOrder;`
			`__shared__ Nd4jLong length;`

			`if (threadIdx.x == 0) {`

			`xEws = shape::elementWiseStride(xShapeInfo);`
			`zEws = shape::elementWiseStride(zShapeInfo);`
			`xOrder = shape::order(xShapeInfo);`
			`zOrder = shape::order(zShapeInfo);`
			`length = shape::length(xShapeInfo);`
			`}`
			`__syncthreads();`

			`auto tid = blockIdx.x * blockDim.x + threadIdx.x;`
			`int totalThreads = gridDim.x * blockDim.x;`

Oleh tenzor mmul (#231) * Libnd4j: TensorMMul backprop op #8174, raw implementation Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 merge master and some corrections Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 algorithm update, need testing, sync with master * Libnd4j: TensorMMul backprop op #8174 fixed incorrect B axes calculation Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 optimize axes identification and fix bug of indeces overlapping, added first test. need testing with different shapes Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 some fixes and improvements need more testing Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed order of matrix multiply Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed issue of incorrect axes definition, add tests based on TF, need additional testing for case dLdC not equal 1 Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed scalar case add test Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed bp algorithm, axes definition, need some mode testing with different orders combination f,c; c,f f,f and add some checks for inputs Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 some checks and corrections added tests, exists the problem with different input orders support A-f B-c and A-f B-f Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 sync master Signed-off-by: Oleg <oleg.semeniv@gmail.com> * - correct bug in MmulHelper::tensorDot(a, b, c, axes_a, axes_b,permutForC) Signed-off-by: Yurii <iuriish@yahoo.com> * Libnd4j: TensorMMul backprop op #8174 code clean up and refactoring Signed-off-by: Oleg <oleg.semeniv@gmail.com> * - add check for linspase 
ordered permutations in ShapeUtils::evalShapeForTensorDot Signed-off-by: Yurii <iuriish@yahoo.com> * - provide additional code in shape::reshape stuff in order to reduce amount of allocation/copy operations during reshaping procedure Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on problem of wrong shape evaluation during permute/reshape procedures Signed-off-by: Yurii <iuriish@yahoo.com> * - still looking for bug reason in reshape/permute stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in transform cuda native ops Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in NDArray::assign Signed-off-by: Yurii <iuriish@yahoo.com> * - remove old shape::reshape stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - add possibility to disable copy of old buffer to new buffer during reshape operation in NDArray class Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in tensorDot which had to do with wrong pointers assigments Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: Oleh <oleg.semeniv@gmail.com> 2020-02-13 19:33:54 +02:00			`if(xEws > 0 && zEws > 0 && xOrder == zOrder && xOrder == 'c') {`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00
			`for (int i = tid; i < length; i += totalThreads)`
			`z[i * zEws] = OpType::op(x[i * xEws], params);`
			`}`
			`else {`
			`if(vx == vz) {`
[WIP] minor (#218) * - initial docs commit - merge* cuda fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com> 2019-09-02 11:25:48 +03:00			`for (Nd4jLong i = tid; i < length; i+= totalThreads) {`
[WIP] bunch of improvements (#257) * - profiling bias_add op - add some docementation Signed-off-by: Yurii <yurii@skymind.io> * - minor change Signed-off-by: Yurii <yurii@skymind.io> * - provide addBias cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - improve shape::getIndexOfffset and change its signature Signed-off-by: Yurii <yurii@skymind.io> * - same as previous Signed-off-by: Yurii <yurii@skymind.io> * - improve and change signature in some shape:: stuff which has to do with calculation of offsets for array elements Signed-off-by: Yurii <yurii@skymind.io> * - minor changes in flatten Signed-off-by: Yurii <shyrma@skymind.io> * - add function shape::getIndexOffsetOrdered Signed-off-by: Yurii <shyrma@skymind.io> * - correct shape::getIndexOffsetOrdered() Signed-off-by: Yurii <shyrma@skymind.io> * - move getIndexOffsetOrdered to flatten.h header in order to isolate this function Signed-off-by: Yurii <shyrma@skymind.io> 2019-09-11 20:12:09 +03:00			`auto xOffset = shape::getIndexOffset(i, xShapeInfo);`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`z[xOffset] = OpType::op(x[xOffset], params);`
			`}`
			`}`
			`else {`
[WIP] minor (#218) * - initial docs commit - merge* cuda fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com> * one more fix Signed-off-by: raver119 <raver119@gmail.com> 2019-09-02 11:25:48 +03:00			`for (Nd4jLong i = tid; i < length; i+= totalThreads) {`
[WIP] bunch of improvements (#257) * - profiling bias_add op - add some docementation Signed-off-by: Yurii <yurii@skymind.io> * - minor change Signed-off-by: Yurii <yurii@skymind.io> * - provide addBias cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - improve shape::getIndexOfffset and change its signature Signed-off-by: Yurii <yurii@skymind.io> * - same as previous Signed-off-by: Yurii <yurii@skymind.io> * - improve and change signature in some shape:: stuff which has to do with calculation of offsets for array elements Signed-off-by: Yurii <yurii@skymind.io> * - minor changes in flatten Signed-off-by: Yurii <shyrma@skymind.io> * - add function shape::getIndexOffsetOrdered Signed-off-by: Yurii <shyrma@skymind.io> * - correct shape::getIndexOffsetOrdered() Signed-off-by: Yurii <shyrma@skymind.io> * - move getIndexOffsetOrdered to flatten.h header in order to isolate this function Signed-off-by: Yurii <shyrma@skymind.io> 2019-09-11 20:12:09 +03:00			`auto xOffset = shape::getIndexOffset(i, xShapeInfo);`
			`auto zOffset = shape::getIndexOffset(i, zShapeInfo);`
Eclipse Migration Initial Commit 2019-06-06 15:21:15 +03:00			`z[zOffset] = OpType::op(x[xOffset], params);`
			`}`
			`}`
			`}`
			`}`
			`};`


			`template<typename X>`
			`template <typename OpType>`
			`_CUDA_H void TransformSame<X>::intermediateShaped(dim3 launchDims, cudaStream_t stream, void x, Nd4jLong xShape, int xRank, void extraParams, void z, Nd4jLong zShape, int zRank, int allocationPointer, void reductionPointer, Nd4jLong tadShapeInfo, Nd4jLong tadOffsets) {`
			`transformSameSimple<X, OpType><<<launchDims.x, launchDims.x, launchDims.z, *stream>>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets);`
			`nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) failed");`
			`}`

			`BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES);`
			`}`
			`}`