274 lines
12 KiB
C++
Executable File
274 lines
12 KiB
C++
Executable File
/* ******************************************************************************
|
|
*
|
|
*
|
|
* This program and the accompanying materials are made available under the
|
|
* terms of the Apache License, Version 2.0 which is available at
|
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
*
|
|
* See the NOTICE file distributed with this work for additional
|
|
* information regarding copyright ownership.
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
* License for the specific language governing permissions and limitations
|
|
* under the License.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
******************************************************************************/
|
|
|
|
/*
|
|
* reduce3.h
|
|
*
|
|
* Created on: Dec 28, 2015
|
|
* Author: agibsonccc
|
|
*/
|
|
|
|
#ifndef REDUCE3_H_
|
|
#define REDUCE3_H_
|
|
|
|
#define EXTRA_PARAMS_LENGTH 10
|
|
|
|
#include <math/templatemath.h>
|
|
#ifdef _OPENMP
|
|
#include <omp.h>
|
|
#endif
|
|
#include <system/pairwise_util.h>
|
|
#include <system/dll.h>
|
|
#include <helpers/shape.h>
|
|
#include <helpers/TAD.h>
|
|
#include <ops/ops.h>
|
|
#include <system/op_boilerplate.h>
|
|
#include <helpers/OmpLaunchHelper.h>
|
|
#include <helpers/DebugHelper.h>
|
|
|
|
#ifdef __CUDACC__
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#endif
|
|
|
|
|
|
#include "legacy_ops.h"
|
|
|
|
using namespace simdOps;
|
|
|
|
namespace functions {
|
|
namespace reduce3 {
|
|
|
|
/**
|
|
* Reduce involving
|
|
* 2 arrays
|
|
*/
|
|
template<typename X, typename Y>
|
|
class Reduce3 {
|
|
|
|
public:
|
|
|
|
#ifdef __CUDACC__
|
|
virtual __device__
|
|
inline Y opAtomic(X d1, X d2, Y *extraParamsRef) = 0;
|
|
|
|
/**
|
|
* Aggregate shared memory
|
|
* @param sPartialsRef
|
|
* @param tid
|
|
* @param extraParams
|
|
*/
|
|
template<typename OpType>
|
|
static __device__ void aggregatePartials(void* sPartials, Nd4jLong tid, Nd4jLong numItems, void *extraParams);
|
|
|
|
template<typename OpType>
|
|
static __device__ void execScalarCuda(const void *x, const Nd4jLong *xShapeInfo,
|
|
const void *y, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *z, const Nd4jLong *zShapeInfo,
|
|
int *allocationPointer, void *reductionBuffer,
|
|
const Nd4jLong *tadOnlyShapeInfo);
|
|
|
|
template<typename OpType>
|
|
static __device__ void transformAll(const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *xTadShapeInfo, const Nd4jLong *xOffsets,
|
|
const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets);
|
|
|
|
/**
|
|
Perform a reduction
|
|
@param n the number of elements
|
|
@param xOffset the starting offset
|
|
@param dx the data to perform the reduction on
|
|
@param incx the increment on which to perform the reduction
|
|
@param extraParams extra parameters used for calculations
|
|
@param result where to store the result of the reduction
|
|
*/
|
|
template<typename OpType>
|
|
static __device__ void transform(const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *tadOnlyShapeInfo, const Nd4jLong *tadOffsets,
|
|
const Nd4jLong *yTadOnlyShapeInfo, const Nd4jLong *yTadOffsets);
|
|
|
|
|
|
static __device__ void execCuda(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *tadOnlyShapeInfo, const Nd4jLong *tadOffsets,
|
|
const Nd4jLong *yTadOnlyShapeInfo, const Nd4jLong *yTadOffsets);
|
|
|
|
|
|
static __device__ void execAllCuda(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *tadOnlyShapeInfo, const Nd4jLong *tadOffsets,
|
|
const Nd4jLong *yTadOnlyShapeInfo, const Nd4jLong *yTadOffsets);
|
|
|
|
|
|
static __device__ void execScalarCuda(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int * allocationPointer, void *reductionBuffer,
|
|
const Nd4jLong *tadOnlyShapeInfo);
|
|
|
|
|
|
static __host__ void exec(dim3 launchDims, cudaStream_t *stream,
|
|
int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *tadOnlyShapeInfo, const Nd4jLong *tadOffsets,
|
|
const Nd4jLong *yTadOnlyShapeInfo, const Nd4jLong *yTadOffsets);
|
|
|
|
static __host__ void execAll(dim3 launchDims, cudaStream_t *stream,
|
|
int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int postProcessOrNot,
|
|
int *allocationPointer,
|
|
const Nd4jLong *tadOnlyShapeInfo, const Nd4jLong *tadOffsets,
|
|
const Nd4jLong *yTadOnlyShapeInfo, const Nd4jLong *yTadOffsets);
|
|
|
|
static __host__ void execScalar(dim3 launchDims, cudaStream_t *stream,
|
|
int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *extraParams,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int* allocationPointer, void *reductionBuffer,
|
|
const Nd4jLong *tadOnlyShapeInfo);
|
|
|
|
#else
|
|
|
|
template<typename OpType>
|
|
static void execScalar(const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *vextraParams,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo);
|
|
|
|
|
|
static void execScalar(int opNum,
|
|
const void *x, const Nd4jLong *xShapeInfo,
|
|
void *extraParamsVals,
|
|
const void *y, const Nd4jLong *yShapeInfo,
|
|
void *z, const Nd4jLong *zShapeInfo);
|
|
|
|
|
|
template<typename OpType>
|
|
static void exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *vextraParams,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int64_t start, int64_t stop);
|
|
|
|
|
|
template<typename OpType>
|
|
static void exec(const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *vextraParams,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
|
|
int64_t start, int64_t stop);
|
|
|
|
|
|
template<typename OpType>
|
|
static void execAll(const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *vextraParams,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
const Nd4jLong *xTadShapeInfo, const Nd4jLong *xOffsets,
|
|
const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets,
|
|
int64_t start, int64_t stop);
|
|
|
|
|
|
static void exec(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *extraParamsVals,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
int64_t start, int64_t stop);
|
|
|
|
|
|
static void exec(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *extraParamsVals,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets,
|
|
int64_t start, int64_t stop);
|
|
|
|
|
|
static void execAll(int opNum,
|
|
const void *vx, const Nd4jLong *xShapeInfo,
|
|
void *extraParamsVals,
|
|
const void *vy, const Nd4jLong *yShapeInfo,
|
|
void *vz, const Nd4jLong *zShapeInfo,
|
|
int *dimension, int dimensionLength,
|
|
const Nd4jLong *xTadShapeInfo, const Nd4jLong *xOffsets,
|
|
const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets,
|
|
int64_t start, int64_t stop);
|
|
#endif
|
|
};
|
|
|
|
|
|
|
|
}
|
|
}
|
|
|
|
#ifdef __CUDACC__
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* REDUCE3_H_ */
|