/* ****************************************************************************** * * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ #ifndef REDUCE_BOOL_H #define REDUCE_BOOL_H #include //#include #include #include #ifdef _OPENMP #include #endif #include #include #include #include #include #include #pragma once #ifdef __CUDACC__ #include #include #endif #include "legacy_ops.h" //an op for the kernel namespace functions { namespace reduce { /** * A reduce function * reduces a vector down to * a subset of itself * via aggregating member * elements. 
*/
/*
 * ReduceBoolFunction is a declaration-only interface: every member below is a
 * static function and no definition appears in this header. The member set is
 * selected by __CUDACC__: when it is defined the CUDA (__device__ / __host__)
 * entry points are declared, otherwise the CPU entry points are declared.
 *
 * NOTE(review): in this copy of the file each `template` keyword appears
 * without a parameter list. `Z` is used as a return type below, so the class
 * presumably has (at least) an output type parameter, and the per-member
 * `template` presumably introduces an operation type -- confirm against the
 * upstream header before relying on either.
 */
template class ReduceBoolFunction {
public:
#ifdef __CUDACC__
    /**
     * Device-side helper (declaration only).
     *
     * NOTE(review): judging by the name, this presumably combines the partial
     * results stored in sPartials -- confirm against the definition.
     *
     * @param sPartials   untyped pointer to the partial-results buffer
     * @param tid         presumably the index of the calling thread -- TODO confirm
     * @param numItems    presumably the number of partials to combine -- TODO confirm
     * @param extraParams untyped pointer to additional parameters
     */
    template static __device__ void aggregatePartials(void *sPartials, Nd4jLong tid, Nd4jLong numItems,
                                                      void *extraParams);

    /**
     * Device-side scalar reduction (declaration only).
     *
     * NOTE(review): presumably reduces all of vx into a single value written
     * through vz -- confirm against the definition.
     *
     * @param vx               untyped pointer to the input buffer
     * @param xShapeInfo       shape information for vx
     * @param extraParams      untyped pointer to additional parameters
     * @param vz               untyped pointer to the output buffer
     * @param zShapeInfo       shape information for vz
     * @param reductionBuffer  untyped scratch buffer; usage is not visible here
     * @param tadOnlyShapeInfo additional shape information; usage is not visible here
     */
    template static __device__ void execScalarCuda(const void *vx, const Nd4jLong *xShapeInfo, void *extraParams,
                                                   void *vz, const Nd4jLong *zShapeInfo, void *reductionBuffer,
                                                   const Nd4jLong *tadOnlyShapeInfo);

    /**
     * Device-side reduction driven by two shape descriptors (declaration only).
     *
     * NOTE(review): the "outer"/"inner" naming suggests the input is walked
     * as outer sub-arrays each reduced over an inner sub-array -- confirm
     * against the definition.
     *
     * @param vx                 untyped pointer to the input buffer
     * @param outerXTadShapeInfo shape information for the outer part of the input
     * @param innerXTadShapeInfo shape information for the inner part of the input
     * @param extraParams        untyped pointer to additional parameters
     * @param vreductionBuffer   untyped scratch buffer; usage is not visible here
     * @param vz                 untyped pointer to the output buffer
     * @param zShapeInfo         shape information for vz
     */
    template static __device__ void transformCudaXD(const void *vx, const Nd4jLong *outerXTadShapeInfo,
                                                    const Nd4jLong *innerXTadShapeInfo, void *extraParams,
                                                    void *vreductionBuffer, void *vz, const Nd4jLong *zShapeInfo);

    /**
     * Host-side entry point for the scalar reduction (declaration only).
     *
     * NOTE(review): presumably launches the scalar kernel with launchDims on
     * *stream; the `h`-prefixed shape pointers are presumably host-resident
     * copies of the corresponding shape information -- confirm.
     *
     * @param launchDims       launch configuration
     * @param stream           pointer to the CUDA stream to use
     * @param vx               untyped pointer to the input buffer
     * @param xShapeInfo       shape information for vx
     * @param hXShapeInfo      second shape descriptor for vx (see note above)
     * @param extraParams      untyped pointer to additional parameters
     * @param vz               untyped pointer to the output buffer
     * @param zShapeInfo       shape information for vz
     * @param hZShapeInfo      second shape descriptor for vz (see note above)
     * @param dimension        array of dimensions
     * @param dimensionLength  number of entries in dimension
     * @param reductionBuffer  untyped scratch buffer; usage is not visible here
     * @param tadOnlyShapeInfo additional shape information; usage is not visible here
     */
    template static __host__ void intermediateScalar(dim3 launchDims, cudaStream_t *stream, const void *vx,
                                                     const Nd4jLong *xShapeInfo, const Nd4jLong *hXShapeInfo,
                                                     void *extraParams, void *vz, const Nd4jLong *zShapeInfo,
                                                     const Nd4jLong *hZShapeInfo, int *dimension,
                                                     int dimensionLength, void *reductionBuffer,
                                                     const Nd4jLong *tadOnlyShapeInfo);

    /**
     * Host-side entry point for the reduction along dims (declaration only).
     *
     * NOTE(review): the `d`/`h` prefixes presumably distinguish device and
     * host copies of the same shape information -- confirm.
     *
     * @param launchDims       launch configuration
     * @param stream           pointer to the CUDA stream to use
     * @param vx               untyped pointer to the input buffer
     * @param dXShapeInfo      shape information for vx (presumably device copy)
     * @param hXShapeInfo      shape information for vx (presumably host copy)
     * @param extraParams      untyped pointer to additional parameters
     * @param vreductionBuffer untyped scratch buffer; usage is not visible here
     * @param vz               untyped pointer to the output buffer
     * @param dZShapeInfo      shape information for vz (presumably device copy)
     * @param hZShapeInfo      shape information for vz (presumably host copy)
     * @param dims             array of dimensions; its length is not passed here
     */
    template static __host__ void intermediateXD(dim3 launchDims, cudaStream_t *stream, const void *vx,
                                                 const Nd4jLong *dXShapeInfo, const Nd4jLong *hXShapeInfo,
                                                 void *extraParams, void *vreductionBuffer, void *vz,
                                                 const Nd4jLong *dZShapeInfo, const Nd4jLong *hZShapeInfo,
                                                 const int* dims);

    /**
     * Host-side scalar reduction taking an operation number (declaration only).
     * Unlike intermediateScalar it is not preceded by `template` and takes an
     * extra opNum argument.
     *
     * NOTE(review): opNum presumably selects which operation is run -- confirm.
     * The remaining parameters match intermediateScalar.
     */
    static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, const void *vx,
                                          const Nd4jLong *xShapeInfo, const Nd4jLong* hXShapeInfo,
                                          void *extraParams, void *vz, const Nd4jLong *zShapeInfo,
                                          const Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength,
                                          void *reductionBuffer, const Nd4jLong *tadOnlyShapeInfo);

    /**
     * Host-side reduction along dims taking an operation number (declaration
     * only). Unlike intermediateXD it is not preceded by `template` and takes
     * an extra opNum argument.
     *
     * NOTE(review): opNum presumably selects which operation is run -- confirm.
     * The remaining parameters match intermediateXD.
     */
    static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, const void *vx,
                                      const Nd4jLong *dXShapeInfo, const Nd4jLong *hXShapeInfo,
                                      void *extraParams, void *vreductionBuffer, void *vz,
                                      const Nd4jLong *dZShapeInfo, const Nd4jLong *hZShapeInfo,
                                      const int *dims);
#else

    /**
     * Reduce the input down to a single value and return it.
     *
     * @param x           untyped pointer to the input buffer
     * @param xShapeInfo  shape information for the input
     * @param extraParams untyped pointer to additional parameters
     * @return the reduced value, of type Z
     */
    template static _CUDA_H Z execScalar(const void *x, const Nd4jLong *xShapeInfo, void *extraParams);

    /**
     * Overload of execScalar that returns void and takes an output buffer.
     *
     * NOTE(review): the result is presumably stored through z -- confirm.
     *
     * @param x           untyped pointer to the input buffer
     * @param xShapeInfo  shape information for the input
     * @param extraParams untyped pointer to additional parameters
     * @param z           untyped pointer to the output buffer
     * @param zShapeInfo  shape information for z
     */
    template static _CUDA_H void execScalar(const void *x, const Nd4jLong *xShapeInfo, void *extraParams, void *z,
                                            const Nd4jLong *zShapeInfo);

    /**
     * Value-returning execScalar overload taking an operation number; it is
     * not preceded by `template`.
     *
     * NOTE(review): opNum presumably selects which operation is run -- confirm.
     */
    static Z execScalar(int opNum, const void *x, const Nd4jLong *xShapeInfo, void *extraParams);

    /**
     * Output-buffer execScalar overload taking an operation number; it is not
     * preceded by `template`.
     *
     * NOTE(review): opNum presumably selects which operation is run -- confirm.
     */
    static void execScalar(int opNum, const void *x, const Nd4jLong *xShapeInfo, void *extraParams, void *z,
                           const Nd4jLong *zShapeInfo);

    /**
     * exec overload taking an operation number; it is not preceded by
     * `template`. Apart from opNum its parameters match the workspace-taking
     * exec declared next.
     *
     * NOTE(review): opNum presumably selects which operation is run -- confirm.
     */
    static void exec(int opNum, sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo,
                     void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int *dims);

    /**
     * Execute on the cpu
     *
     * @param workspace    workspace object; how it is used is not visible here
     * @param vx           the input data
     * @param xShapeInfo   the shape information for vx
     * @param vextraParams the extra parameters
     * @param vz           the result buffer
     * @param zShapeInfo   the shape information for vz
     * @param dims         the dimensions to perform the reduce along; the
     *                     number of entries is not passed to this function
     */
    template static void _CUDA_H exec(sd::memory::Workspace* workspace, const void *vx, const Nd4jLong *xShapeInfo,
                                      void *vextraParams, void *vz, const Nd4jLong *zShapeInfo, const int *dims);

    /**
     * CPU implementation
     *
     * @param x               the input data
     * @param xShapeInfo      the shape information for the input data
     * @param extraParams     the extra parameters for the problem
     * @param result          the result buffer
     * @param resultShapeInfo the shape information for the result buffer
     */
    template static void _CUDA_H exec(const void *x, const Nd4jLong *xShapeInfo, void *extraParams, void *result,
                                      const Nd4jLong *resultShapeInfo);

    /**
     * Reduce down to 1 number. This overload takes an element stride and a
     * length instead of a shape-information pointer.
     *
     * @param x                  the input
     * @param xElementWiseStride presumably the distance, in elements, between
     *                           consecutive inputs -- TODO confirm
     * @param length             presumably the number of elements to reduce
     *                           -- TODO confirm
     * @param extraParams        the extra params
     * @return the reduced value, of type Z
     */
    template static Z _CUDA_H execScalar(const void *x, Nd4jLong xElementWiseStride, Nd4jLong length,
                                         void *extraParams);
#endif
};

#ifdef __CUDACC__
/**
 * Device-side free function, declared only when __CUDACC__ is defined; no
 * definition is visible in this header.
 *
 * NOTE(review): judging by the name and the T** argument, this presumably
 * points *sPartials at a shared-memory region of sMemSize -- confirm against
 * the definition, including whether sMemSize is in bytes or elements. As with
 * the class, the `template` keyword here has lost its parameter list; `T` is
 * presumably its type parameter.
 *
 * @param extraParams pointer to additional parameters of type T
 * @param sPartials   pointer to the partials pointer
 * @param sMemSize    size value; units are not visible here
 */
template __device__ void initializeShared(T *extraParams, T **sPartials, int sMemSize);
#endif
} /* namespace reduce */
} /* namespace functions */
#endif /* REDUCE_BOOL_H */