/******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ // // @author raver119@gmail.com // #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace simdOps; namespace functions { namespace summarystats { template void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *z, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot,bool biasCorrected,int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { functions::summarystats::SummaryStatsReduce::transform(op,dx,xShapeInfo,extraParams,z,zShapeInfo,dimension,dimensionLength,biasCorrected,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets); } /** * * @param sPartialsRef * @param tid * @param extraParams */ template template _CUDA_D void SummaryStatsReduce::aggregatePartials(SummaryStatsData **sPartialsRef, Nd4jLong tid, Nd4jLong numElements, void *vextraParams) { // start the shared memory loop on the next power of 2 less // than the block size. If block size is not a power of 2, // accumulate the intermediate sums in the remainder range. auto extraParams = static_cast(vextraParams); SummaryStatsData *sPartials = *sPartialsRef; Nd4jLong floorPow2 = blockDim.x; if (floorPow2 & (floorPow2 - 1)) { while (floorPow2 & (floorPow2 - 1)) { floorPow2 &= floorPow2 - 1; } if (tid >= floorPow2) { SummaryStatsData prev = sPartials[tid - floorPow2]; SummaryStatsData curr = sPartials[tid]; sPartials[tid - floorPow2] = update(prev, curr, extraParams); } __syncthreads(); } for (Nd4jLong activeThreads = floorPow2 >> 1; activeThreads; activeThreads >>= 1) { if (tid < activeThreads && tid + activeThreads < numElements) { SummaryStatsData curr = sPartials[tid]; SummaryStatsData next = sPartials[tid + activeThreads]; sPartials[tid] = update(curr, next, extraParams); } __syncthreads(); } }; /** * @param n n is the number of * elements to loop through * @param dx the data to operate on * @param xVectorInfo the meta data for the vector: * 0 is the offset * 1 is the increment/stride * 2 is the real length of the buffer (n and dx.length won't always be the same) * 3 is the element wise stride for the buffer * 4 is the number of elements it takes to get to the next row/column/tensor * @param gpuInformation * 0 is the block size * 1 is the grid size * 2 is the shared memory size * @param problemDefinition * 0 is the number of elements per vector * 1 is the number of vectors */ template template _CUDA_D void SummaryStatsReduce::transform(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *vreductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { auto dx = static_cast(vx); auto z = static_cast(vz); auto extraParams = static_cast(vextraParams); auto reductionBuffer = static_cast(vreductionBuffer); int tid = blockIdx.x * blockDim.x + threadIdx.x; __shared__ volatile int resultScalar; __shared__ int xElementWiseStride; int numElements = blockDim.x; //shared memory space for storing intermediate results __shared__ SummaryStatsData *sPartials; if(threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; sPartials = reinterpret_cast*>(shmem); } __syncthreads(); Z startingVal = startingValue(dx); SummaryStatsData val; val.initWithValue(startingVal); val.n = 0; sPartials[threadIdx.x] = val; //length for the tad __shared__ volatile int xLength; __shared__ volatile int resultLength; SummaryStatsData reduction; reduction.initWithValue(0.0); reduction.n = 0; if (threadIdx.x == 0) { if (zShapeInfo != nullptr) resultLength = shape::length(zShapeInfo); else resultLength = 1; if (dimensionLength == 1) { if (resultLength == 1 && (dimension == nullptr || dimension[0] == MAX_DIMENSION)) resultScalar = 1; else resultScalar = 0; } else resultScalar = 0; if (resultLength == 1) resultScalar = 1; auto xStride = shape::stride(xShapeInfo); auto xOrder = shape::order(xShapeInfo); if (dimension != nullptr && (dimension[0] != MAX_DIMENSION && dimensionLength == 1)) { xElementWiseStride = xStride[dimension[0]]; } else { xElementWiseStride = shape::elementWiseStride(xShapeInfo); } xLength = shape::length(xShapeInfo); } __syncthreads(); if (!resultScalar) { __shared__ int tadLength; __shared__ int tadEWS; __shared__ int numTads; if (threadIdx.x == 0) { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); numTads = shape::length(xShapeInfo) / tadLength; } __syncthreads(); if (tadEWS == 0) { for (int r = blockIdx.x; r < numTads; r += gridDim.x) { auto tadOffsetForBlock = tadOffsets[r]; val.initWithValue(startingVal); val.n = 0; sPartials[threadIdx.x] = val; for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[xOffset]); sPartials[threadIdx.x] = update(sPartials[threadIdx.x], OpType::op(indexVal2, extraParams), extraParams); } __syncthreads(); aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) { z[r] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]); } __syncthreads(); } } else { for (int i = blockIdx.x; i < numTads; i += gridDim.x) { auto tadOffsetForBlock = tadOffsets[i]; val.initWithValue(startingVal); val.n = 0; sPartials[threadIdx.x] = val; for (int x = threadIdx.x; x < tadLength; x += blockDim.x) { auto indexX = tadOffsetForBlock + x * tadEWS; SummaryStatsData indexVal2; indexVal2.initWithValue(dx[indexX]); sPartials[threadIdx.x] = update(sPartials[threadIdx.x], OpType::op(indexVal2, extraParams), extraParams); } __syncthreads(); aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) { z[i] = OpType::getValue(postProcessOrNot, sPartials[threadIdx.x]); //postProcess(sPartials[0],tadLength ,extraParams); } } } } else if (resultScalar) { __shared__ int n; if (threadIdx.x == 0) { xElementWiseStride = shape::elementWiseStride(xShapeInfo); n = shape::length(xShapeInfo); } __syncthreads(); if (xElementWiseStride >= 1) { for (Nd4jLong i = tid; i < n; i += (blockDim.x * gridDim.x)) { SummaryStatsData indexVal2; indexVal2.initWithValue(dx[i * xElementWiseStride]); reduction = update(reduction, indexVal2, extraParams); } } else { for (Nd4jLong i = tid; i < n; i += blockDim.x * gridDim.x) { auto offset = shape::getIndexOffset(i, xShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[offset]); reduction = update(reduction, indexVal2, extraParams); } } sPartials[threadIdx.x] = reduction; __syncthreads(); aggregatePartials(&sPartials, threadIdx.x, blockDim.x, extraParams); __syncthreads(); if (gridDim.x > 1) { __shared__ bool amLast; unsigned int *tc = (unsigned int *)reductionBuffer; tid = threadIdx.x; if (threadIdx.x == 0) { SummaryStatsData *pBuffer = (SummaryStatsData*) reductionBuffer; pBuffer[blockIdx.x] = sPartials[0]; } __threadfence(); __syncthreads(); if (tid == 0) { unsigned int ticket = atomicInc(&tc[16384], gridDim.x); amLast = (ticket == gridDim.x - 1); } __syncthreads(); if (amLast) { tc[16384] = 0; SummaryStatsData* pBuffer = (SummaryStatsData*) reductionBuffer; Z startingVal = startingValue(dx); SummaryStatsData val; val.initWithValue(startingVal); val.n = 0; sPartials[threadIdx.x] = val; for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x) { sPartials[threadIdx.x] = update(sPartials[threadIdx.x], pBuffer[i], extraParams); } __syncthreads(); aggregatePartials(&sPartials, threadIdx.x, gridDim.x, extraParams); __syncthreads(); if (tid == 0) { z[0] = OpType::getValue(postProcessOrNot, sPartials[0]); } } } else { if (tid == 0) { unsigned int *tc = (unsigned *)reductionBuffer; tc[16384] = 0; z[0] = z[0] = OpType::getValue(postProcessOrNot, sPartials[0]); } } } }; template _CUDA_D void SummaryStatsReduce::transform(const int opNum, void *dx, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { DISPATCH_BY_OPNUM_TT(transform, PARAMS(dx, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationBuffer, reductionBuffer, tadOnlyShapeInfo, tadOffsets), SUMMARY_STATS_OPS); }; template _CUDA_H void SummaryStatsReduce::execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer) { auto x = static_cast(vx); auto extraParams = static_cast(vextraParams); auto z = reinterpret_cast(vz); auto reductionPointerA = reinterpret_cast(reductionBuffer); if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("D16 opNum:[%i]\n", opNum); summaryStatsReduceT<<>>( opNum, x, xShapeInfo, shape::rank(hxShapeInfo), extraParams, z, zShapeInfo, shape::rank(hzShapeInfo), nullptr, 1, 1,biasCorrected, nullptr, reductionPointerA, tadShapeInfo, tadOffsets); // this is blocking method since method should return scalar sd::DebugHelper::checkErrorCode(stream, "execSSReduceScalar(...) failed"); } template _CUDA_H void SummaryStatsReduce::execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer) { auto x = static_cast(vx); auto z = static_cast(vz); auto extraParams = static_cast(vextraParams); if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F17 opNum:[%i]\n", opNum); auto reductionPointerA = reinterpret_cast(reductionBuffer); summaryStatsReduceT<<>>( opNum, x, xShapeInfo, shape::rank(hxShapeInfo), extraParams, z, zShapeInfo, shape::rank(hzShapeInfo), nullptr, 1, 1,biasCorrected, nullptr, reductionPointerA, tadShapeInfo, tadOffsets); DEBUG_KERNEL(stream, opNum); } template _CUDA_H void SummaryStatsReduce::execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer) { auto x = static_cast(vx); auto z = static_cast(vz); auto extraParams = static_cast(vextraParams); if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("D18 opNum:[%i]\n", opNum); summaryStatsReduceT<<>>( opNum, x, xShapeInfo, shape::rank(hxShapeInfo), extraParams, z, zShapeInfo, shape::rank(hzShapeInfo), dimension, dimensionLength, 1, biasCorrected, nullptr, reinterpret_cast(reductionBuffer), tadShapeInfo, tadOffsets); DEBUG_KERNEL(stream, opNum); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT SummaryStatsReduce, , LIBND4J_TYPES, FLOAT_TYPES); } }