static inline __device__ void transform(Nd4jLong n, T scalar,T *dy,T *params, T *result, int *indexes, int *allocationBuffer, UnifiedSharedMemory *manager) {
static inline __device__ void transformCuda(T scalar, T *dy, int *shapeInfo, T *params, T *result, int *resultShapeInfo, int *allocationBuffer, UnifiedSharedMemory *manager) {
int *xShape = shape::shapeOf(shapeInfo);
int *xStride = shape::stride(shapeInfo);
char xOrder = shape::order(shapeInfo);
int xRank = shape::rank(shapeInfo);
int xOffset = shape::offset(shapeInfo);
int xElementWiseStride = shape::elementWiseStride(shapeInfo);
int resultElementWiseStride = shape::elementWiseStride(resultShapeInfo);
static inline void __device__ transformCuda(T *x, int *xShapeInfo, T *extraParams, T *z, int *zShapeInfo, T *scalars, int *dimension, int dimensionLength, int *tadShapeInfo, Nd4jLong *tadOffsets, int *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) {
if (tadShapeInfoZ == nullptr) {
tadShapeInfoZ = tadShapeInfo;
tadOffsetsZ = tadOffsets;
}
// tad preparation: cache the per-TAD layout values used by the loops below.
// tadEWS is the element-wise stride of the input TAD; zEWS is the
// element-wise stride of the output TAD and therefore must be read from
// tadShapeInfoZ. tadShapeInfoZ was defaulted to tadShapeInfo above when the
// caller passed nullptr, so dereferencing it here is always valid and the
// "no separate Z shape" case behaves exactly as before.
int tadEWS = shape::elementWiseStride(tadShapeInfo);
int zEWS = shape::elementWiseStride(tadShapeInfoZ);
int tadRank = shape::rank(tadShapeInfo);
int tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength);
// number of TADs = total element count of x divided by the elements per TAD
int numTads =shape::length(xShapeInfo) / tadLength;
// main loop, rolling over tads
for (int r = blockIdx.x; r < numTads; r+=gridDim.x) {
Nd4jLong offset = tadOffsets[r];
Nd4jLong offsetZ = tadOffsetsZ[r];
T scalar = scalars[r];
if (tadEWS >= 1 && zEWS >= 1) {
T *oZ = z + offsetZ;
T *oX = x + offset;
for (int f = threadIdx.x; f < tadLength; f+= blockDim.x) {