__device__ static inline void invertedMetaPairwiseStridedGeneric(const int opTypeA, const int opTypeB, Nd4jLong N, T *dx, Nd4jLong xStride, T *dy, Nd4jLong yStride, T *dz, Nd4jLong zStride, T *extraA, T *extraB, T scalarA, T scalarB) {
__device__ static inline void invertedMetaPairwiseStridedNumericGeneric(const int opTypeA, const int opNumA, const int opTypeB, const int opNumB, Nd4jLong N, T *dx, Nd4jLong xStride, T *dy, Nd4jLong yStride, T *dz, Nd4jLong zStride, T *extraA, T *extraB, T scalarA, T scalarB) {
// Kernel set for pairwise + scalar ops, strided variant. Parameters: const int opTypeA, const int opTypeB, Nd4jLong N, T *dx, int xStride, T *dy, int yStride, T *dz, int zStride, T *extraA, T *extraB, T scalarA, T scalarB
//DISPATCH_KERNEL_META(invertedMetaPairwiseStrided_Pairwise_Scalar_, invertedMetaPairwiseStridedGeneric, float, metaOps::InvertedMetaOp, INPUT(const int opTypeA, const int opTypeB, Nd4jLong N, float *dx, int xStride, float *dy, int yStride, float *dz, int zStride, float *extraA, float *extraB, float scalarA, float scalarB), PARAMS(opTypeA, opTypeB, N, dx, xStride, dy, yStride, dz, zStride, extraA, extraB, scalarA, scalarB), OPS_A(PAIRWISE_TRANSFORM_OPS), OPS_B(SCALAR_OPS))
//DISPATCH_KERNEL_META(invertedMetaPairwiseStrided_Pairwise_Scalar_, invertedMetaPairwiseStridedGeneric, double, metaOps::InvertedMetaOp, INPUT(const int opTypeA, const int opTypeB, Nd4jLong N, double *dx, int xStride, double *dy, int yStride, double *dz, int zStride, double *extraA, double *extraB, double scalarA, double scalarB), PARAMS(opTypeA, opTypeB, N, dx, xStride, dy, yStride, dz, zStride, extraA, extraB, scalarA, scalarB), OPS_A(PAIRWISE_TRANSFORM_OPS), OPS_B(SCALAR_OPS))
//DISPATCH_KERNEL_META(invertedMetaPairwiseStrided_Pairwise_Scalar_, invertedMetaPairwiseStridedGeneric, float16, metaOps::InvertedMetaOp, INPUT(const int opTypeA, const int opTypeB, Nd4jLong N, float16 *dx, int xStride, float16 *dy, int yStride, float16 *dz, int zStride, float16 *extraA, float16 *extraB, float16 scalarA, float16 scalarB), PARAMS(opTypeA, opTypeB, N, dx, xStride, dy, yStride, dz, zStride, extraA, extraB, scalarA, scalarB), OPS_A(PAIRWISE_TRANSFORM_OPS), OPS_B(SCALAR_OPS))
#endif
namespace functions {
namespace grid {
/**
* Forward declaration of a device-side executor that is parameterised by two ops,
* A and B, each identified by an (opType, opNum) pair, and receives two operands
* (x, y) plus a pointer to extra arguments. Returns a single value of type T.
*
* NOTE(review): the definition is not visible next to this declaration; presumably
* op A is applied to (x, y) and op B to its result - confirm against the definition.
*
* Declared __noinline__, i.e. the compiler is asked not to inline calls to it.
*
* @tparam T type of the operands, of the extras buffer elements and of the result
*/
template <typename T>
__device__ __noinline__ T invertedOpExecutorB(const int opTypeA, const int opNumA, const int opTypeB, const int opNumB, T x, T y, T *extras);
/**
* Forward declaration of the device-side executor for ops that take two operands
* (x, y) plus a pointer to extra arguments; the op is selected by opType / opNum
* and the result is returned as a T.
*
* Declared __noinline__, i.e. the compiler is asked not to inline calls to it.
*
* @tparam T type of the operands, of the extras buffer elements and of the result
*/
template <typename T>
__device__ __noinline__ T execute_2OEF(const int opType, const int opNum, T x, T y, T *extras);
/**
* Forward declaration of the device-side executor for ops that take a single
* operand x plus a pointer to extra arguments; returns a T.
*
* NOTE(review): the definition is not visible here; presumably opType / opNum
* select the op in the same way as for execute_2OEF - confirm against the definition.
*
* Declared __noinline__, i.e. the compiler is asked not to inline calls to it.
*
* @tparam T type of the operand, of the extras buffer elements and of the result
*/
template <typename T>
__device__ __noinline__ T execute_1OEF(const int opType, const int opNum, T x, T *extras);
/**
* This method executes various ops that take 2 operands (x, y) plus extras.
* @tparam T type of the operands, of the extras buffer elements and of the returned value
*/
template <typename T>
__device__ __noinline__ T execute_2OEF(const int opType, const int opNum, T x, T y, T *extras) {
T z;
switch(opType) {
case 2: {
EXECUTE_NOE((x, y, extras), OPS_A(PAIRWISE_TRANSFORM_OPS));
intermediate = functions::grid::execute_2OEF<T>(opTypeA, opNumA, x, y, paramsA);
// Executing second op, opB
T intermediate2 = functions::grid::execute_1OEF<T>(opTypeB, opNumB, intermediate, paramsB);
// just returning result now
return intermediate2;
}
template<typename T>
template<typename OpType>
__device__ void GRIDStrided<T>::transformCuda(Nd4jLong n, T *dx, T *dy, Nd4jLong incx, Nd4jLong incy, T *params, T *result, Nd4jLong incz,int *allocationPointer, UnifiedSharedMemory *manager,Nd4jLong *tadOnlyShapeInfo) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (incx == incy && incy == incz && incx == 1) {
for (Nd4jLong i = tid; i < n; i += gridDim.x * blockDim.x) {
result[i] = OpType::op(dx[i], dy[i], params);
}
} else {
for (Nd4jLong i = tid; i < n; i += gridDim.x * blockDim.x) {
__device__ void GRIDStrided<T>::transformCuda(const int opTypeA, const int opNumA, const int opTypeB, const int opNumB, Nd4jLong n, T *dx, T *dy, Nd4jLong incx, Nd4jLong incy, T *params, T *result, Nd4jLong incz,int *allocationPointer, UnifiedSharedMemory *manager,Nd4jLong *tadOnlyShapeInfo) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (incx == incy && incy == incz && incx == 1) {
for (Nd4jLong i = tid; i < n; i += gridDim.x * blockDim.x) {