* Libnd4j: TensorMMul backprop op #8174, raw implementation Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 merge master and some corrections Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 algorithm update, need testing, sync with master * Libnd4j: TensorMMul backprop op #8174 fixed incorrect B axes calculation Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 optimize axes identification and fix bug of indices overlapping, added first test. need testing with different shapes Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 some fixes and improvements need more testing Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed order of matrix multiply Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed issue of incorrect axes definition, add tests based on TF, need additional testing for case dLdC not equal 1 Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed scalar case add test Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 fixed bp algorithm, axes definition, need some more testing with different orders combination f,c; c,f f,f and add some checks for inputs Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 some checks and corrections added tests, exists the problem with different input orders support A-f B-c and A-f B-f Signed-off-by: Oleg <oleg.semeniv@gmail.com> * Libnd4j: TensorMMul backprop op #8174 sync master Signed-off-by: Oleg <oleg.semeniv@gmail.com> * - correct bug in MmulHelper::tensorDot(a, b, c, axes_a, axes_b,permutForC) Signed-off-by: Yurii <iuriish@yahoo.com> * Libnd4j: TensorMMul backprop op #8174 code clean up and refactoring Signed-off-by: Oleg <oleg.semeniv@gmail.com> * - add check for linspace ordered permutations in 
ShapeUtils::evalShapeForTensorDot Signed-off-by: Yurii <iuriish@yahoo.com> * - provide additional code in shape::reshape stuff in order to reduce amount of allocation/copy operations during reshaping procedure Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on problem of wrong shape evaluation during permute/reshape procedures Signed-off-by: Yurii <iuriish@yahoo.com> * - still looking for bug reason in reshape/permute stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in transform cuda native ops Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in NDArray::assign Signed-off-by: Yurii <iuriish@yahoo.com> * - remove old shape::reshape stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - add possibility to disable copy of old buffer to new buffer during reshape operation in NDArray class Signed-off-by: Yurii <iuriish@yahoo.com> * - correct bug in tensorDot which had to do with wrong pointers assignments Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: Oleh <oleg.semeniv@gmail.com>
		
			
				
	
	
		
			1631 lines
		
	
	
		
			75 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			1631 lines
		
	
	
		
			75 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| /*******************************************************************************
 | |
|  * Copyright (c) 2015-2018 Skymind, Inc.
 | |
|  * Copyright (c) 2019 Konduit K.K.
 | |
|  *
 | |
|  * This program and the accompanying materials are made available under the
 | |
|  * terms of the Apache License, Version 2.0 which is available at
 | |
|  * https://www.apache.org/licenses/LICENSE-2.0.
 | |
|  *
 | |
|  * Unless required by applicable law or agreed to in writing, software
 | |
|  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 | |
|  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 | |
|  * License for the specific language governing permissions and limitations
 | |
|  * under the License.
 | |
|  *
 | |
|  * SPDX-License-Identifier: Apache-2.0
 | |
|  ******************************************************************************/
 | |
| 
 | |
| //
 | |
| // @author Yurii Shyrma (iuriish@yahoo.com)
 | |
| //
 | |
| 
 | |
#include <ops/declarable/helpers/convolutions.h>
#include <ops/declarable/helpers/im2col.h>
#include <ops/declarable/helpers/col2im.h>
#include<ops/declarable/helpers/addBias.h>
#include <exceptions/cuda_exception.h>
#include <NDArrayFactory.h>
#include <MmulHelper.h>
#include <PointersManager.h>
#include <templatemath.h>

#include <memory>
 | |
| 
 | |
| namespace nd4j {
 | |
| namespace ops  {
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// vol [bS, iC, iD, iH, iW] is convoluted to col [bS, iC, kD, kH, kW, oD, oH, oW]
//
// Launch layout: 1D grid of 1D blocks, one thread per element of `columns`; threads whose global
// index is not smaller than the length of `columns` exit right after the block-wide barrier.
// Dynamic shared memory: at least blockDim.x * 8 * sizeof(Nd4jLong) bytes, used as per-thread
// scratch space holding the 8 coordinates of the column element being processed.
//
// volume,  volShapeInfo - input buffer and its shape info
// columns, colShapeInfo - output buffer and its shape info
// sD, sH, sW            - strides   along depth / height / width
// pD, pH, pW            - paddings  along depth / height / width
// dD, dH, dW            - dilations along depth / height / width
template <typename T>
static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo, void* columns, const Nd4jLong* colShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    const T* vol = reinterpret_cast<const T*>(volume);
          T* col = reinterpret_cast<T*>(columns);

    __shared__ int colRank;
    __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        colRank = 8;

        colLen = shape::length(colShapeInfo);

        // spatial sizes of the volume [bS, iC, iD, iH, iW] (element 0 of shape info precedes the dims)
        iD = volShapeInfo[3];
        iH = volShapeInfo[4];
        iW = volShapeInfo[5];
    }
    __syncthreads();

    // widen before multiplying: blockIdx.x * blockDim.x is evaluated in 32-bit unsigned arithmetic
    // otherwise and would silently wrap around for very large column buffers
    const Nd4jLong colInd = static_cast<Nd4jLong>(blockIdx.x) * blockDim.x + threadIdx.x;

    // safe to leave here: no further block-wide barrier follows
    if(colInd >= colLen)
        return;

    auto coords = sharedMem + threadIdx.x * colRank;

    shape::index2coords(colInd, colShapeInfo, coords);

    // coords = {b, c, kDep, kRow, kCol, colD, colH, colW}

    const auto colOffset = shape::getOffset(colShapeInfo, coords);

    // overwrite the kernel coordinates with the matching volume coordinates, so that the first
    // five entries of coords become {b, c, volDep, volRow, volCol}
    coords[2] = -pD + coords[2] * dD + coords[5] * sD;     // const auto volDep = (-pD + kDep * dD) + colD * sD;
    coords[3] = -pH + coords[3] * dH + coords[6] * sH;     // const auto volRow = (-pH + kRow * dH) + colH * sH;
    coords[4] = -pW + coords[4] * dW + coords[7] * sW;     // const auto volCol = (-pW + kCol * dW) + colW * sW;

    // the unsigned comparison rejects negative coordinates and coordinates past the end in one test;
    // such positions lie in the padding area and are filled with zero
    if (static_cast<unsigned>(coords[2]) >= static_cast<unsigned>(iD) || static_cast<unsigned>(coords[3]) >= static_cast<unsigned>(iH) || static_cast<unsigned>(coords[4]) >= static_cast<unsigned>(iW))
        col[colOffset] = static_cast<T>(0.);
    else
        col[colOffset] = vol[shape::getOffset(volShapeInfo, coords)];
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Host-side launcher of vol2colCuda<T>: starts the kernel on *stream with the requested grid size,
// block size and amount of dynamic shared memory (in bytes), forwarding all other arguments unchanged.
template <typename T>
static void vol2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                const void* volume, const Nd4jLong* volShapeInfo,
                                      void* columns, const Nd4jLong* colShapeInfo,
                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    const dim3 gridDims(blocksPerGrid);
    const dim3 blockDims(threadsPerBlock);

    vol2colCuda<T><<<gridDims, blockDims, sharedMem, *stream>>>(volume, volShapeInfo,
                                                                columns, colShapeInfo,
                                                                sD, sH, sW,
                                                                pD, pH, pW,
                                                                dD, dH, dW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Unrolls `vol` into `col` on the device: computes the launch configuration (one thread per element
// of col, rank-many Nd4jLong coordinates of shared memory per thread plus 128 spare bytes),
// runs the vol2col kernel for the data type of `vol` and synchronizes through the pointers manager.
void ConvolutionUtils::vol2col(nd4j::graph::Context& block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    PointersManager manager(block.launchContext(), "vol2col");

    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid   = (col.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;     // ceil(length / threadsPerBlock)
    const int sharedMem       = col.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;

    const cudaStream_t* stream = block.launchContext()->getCudaStream();

    NDArray::prepareSpecialUse({&col}, {&vol});
    BUILD_SINGLE_SELECTOR(vol.dataType(), vol2colCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, stream, vol.getSpecialBuffer(), vol.getSpecialShapeInfo(), col.specialBuffer(), col.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
    NDArray::registerSpecialUse({&col}, {&vol});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// columns [bS, iC, kD, kH, kW, oD, oH, oW] to be de-convoluted to volume [bS, iC, iD, iH, iW]
//
// Threads walk the volume elements with a grid-stride loop; for every volume element the thread sums
// all column entries that map onto it and stores the sum, so no atomics are needed.
// Dynamic shared memory: at least blockDim.x * 8 * sizeof(uint) bytes of per-thread coordinate scratch.
template <typename T>
static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo, void* volume, const Nd4jLong* volShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    const T* col = reinterpret_cast<const T*>(columns);
          T* vol = reinterpret_cast<T*>(volume);

    __shared__ uint kD, kH, kW, oD, oH, oW, *sharedMem;
    __shared__ Nd4jLong volLen;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<uint*>(shmem);

        volLen  = shape::length(volShapeInfo);

        // effective (dilated) kernel extents, built from the kernel sizes stored in colShapeInfo[3..5]
        kD = dD * (colShapeInfo[3] - 1) + 1;
        kH = dH * (colShapeInfo[4] - 1) + 1;
        kW = dW * (colShapeInfo[5] - 1) + 1;

        // output sizes stored in colShapeInfo[6..8]
        oD = colShapeInfo[6];
        oH = colShapeInfo[7];
        oW = colShapeInfo[8];
    }
    __syncthreads();

    uint* coords = sharedMem + threadIdx.x * 8;

    const auto firstInd = blockIdx.x * blockDim.x + threadIdx.x;
    const auto indStep  = gridDim.x * blockDim.x;

    for (Nd4jLong i = firstInd; i < volLen; i += indStep) {

        shape::index2coords(i, volShapeInfo, coords);       // coords = {b, c, volD, volH, volW}

        const auto volOffset = shape::getOffset(volShapeInfo, coords);

        // part of the column offset contributed by the batch and channel coordinates
        const auto bSiCoffset = coords[0] * colShapeInfo[9] + coords[1] * colShapeInfo[10];

        // position inside the padded volume
        const uint imD = coords[2] + pD;
        const uint imH = coords[3] + pH;
        const uint imW = coords[4] + pW;

        // half-open ranges [start, end) of output positions whose window may cover this element
        const uint colDend   = nd4j::math::nd4j_min<uint>(imD / sD + 1, oD);
        const uint colHend   = nd4j::math::nd4j_min<uint>(imH / sH + 1, oH);
        const uint colWend   = nd4j::math::nd4j_min<uint>(imW / sW + 1, oW);

        const uint colDstart = (imD < kD) ? 0 : (imD - kD) / sD + 1;
        const uint colHstart = (imH < kH) ? 0 : (imH - kH) / sH + 1;
        const uint colWstart = (imW < kW) ? 0 : (imW - kW) / sW + 1;

        T val = 0;

        for(uint colD = colDstart; colD < colDend; ++colD) {

            // offset inside the dilated kernel; only multiples of the dilation hit a real kernel tap
            const uint kerD = imD - colD * sD;
            if(kerD % dD != 0)
                continue;

            const auto offsetD = bSiCoffset + (kerD/dD)*colShapeInfo[11] + colD*colShapeInfo[14];

            for(uint colH = colHstart; colH < colHend; ++colH) {

                const uint kerH = imH - colH * sH;
                if(kerH % dH != 0)
                    continue;

                const auto offsetDH = offsetD + (kerH/dH)*colShapeInfo[12] + colH*colShapeInfo[15];

                for(uint colW = colWstart; colW < colWend; ++colW) {

                    const uint kerW = imW - colW * sW;
                    if(kerW % dW == 0)
                        val += col[offsetDH + (kerW/dW)*colShapeInfo[13] + colW*colShapeInfo[16]];
                }
            }
        }

        vol[volOffset] = val;
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Host-side launcher of col2volCuda<T>: starts the kernel on *stream with the requested grid size,
// block size and amount of dynamic shared memory (in bytes), forwarding all other arguments unchanged.
template <typename T>
static void col2volCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                const void* columns, const Nd4jLong* colShapeInfo,
                                      void* volume, const Nd4jLong* volShapeInfo,
                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    const dim3 gridDims(blocksPerGrid);
    const dim3 blockDims(threadsPerBlock);

    col2volCuda<T><<<gridDims, blockDims, sharedMem, *stream>>>(columns, colShapeInfo,
                                                                volume, volShapeInfo,
                                                                sD, sH, sW,
                                                                pD, pH, pW,
                                                                dD, dH, dW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Folds `col` back into `vol` on the device: computes the launch configuration (enough blocks to
// cover every element of vol, rank-of-col many uint coordinates of shared memory per thread plus
// 256 spare bytes), runs the col2vol kernel for the data type of `vol` and synchronizes through
// the pointers manager.
void ConvolutionUtils::col2vol(nd4j::graph::Context& block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {

    PointersManager manager(block.launchContext(), "col2vol");

    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid   = (vol.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;     // ceil(length / threadsPerBlock)
    const int sharedMem       = col.rankOf() * sizeof(uint) * threadsPerBlock  + 256;

    const cudaStream_t* stream = block.launchContext()->getCudaStream();

    NDArray::prepareSpecialUse({&vol}, {&col});
    BUILD_SINGLE_SELECTOR(vol.dataType(), col2volCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, stream, col.getSpecialBuffer(), col.getSpecialShapeInfo(), vol.specialBuffer(), vol.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
    NDArray::registerSpecialUse({&vol}, {&col});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// 2D convolution implemented as im2col followed by a tensor contraction with the weights.
template <typename X, typename Y>
static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    // weights [kH, kW, iC, oC] always
    // bias    [oC], may be nullptr
    // output  [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)

    // kH  filter(kernel) height
    // kW  filter(kernel) width
    // sH  strides height
    // sW  strides width
    // pH  paddings height
    // pW  paddings width
    // dH  dilations height
    // dW  dilations width
    // paddingMode 0-VALID, 1-SAME
    // isNCHW     1-NCHW,  0-NHWC

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);

    std::vector<int> permutForOutput;

    // owns the permuted input in the NHWC case, so that it is released on every exit path,
    // including when one of the calls below throws
    std::unique_ptr<const NDArray> permutedInput;

    if(isNCHW)
        permutForOutput = {0, 3, 1, 2};                                             // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
    else {
        permutedInput.reset(new NDArray(input->permute({0, 3, 1, 2})));            // [bS, iH, iW, iC] -> [bS, iC, iH, iW] if NHWC
        input = permutedInput.get();
    }

    NDArray col('c', {bS, oH, oW, kH, kW, iC}, input->dataType(), input->getContext());
    NDArray colP = col.permute({0, 5, 3, 4, 1, 2});            // {bS, iC, kH, kW, oH, oW}
    NDArray mmulResult('f', {bS*oH*oW, oC}, output->dataType(), output->getContext());

    //----- calculation of output -----//
    auto ctx = block.launchContext();
    helpers::im2col(*ctx, *input, colP, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
    MmulHelper::tensorDot(&col, weights, &mmulResult, {3,4,5}, {0,1,2}, {}); // [bS, oH, oW, kH, kW, iC] x [kH, kW, iC, oC] = [bS, oH, oW, oC]

    //----- assign outTemp to output  -----//
    if(isNCHW) {
        mmulResult.reshapei({bS, oH, oW, oC});
        mmulResult.permutei(permutForOutput);
    }
    output->assign(mmulResult);

    //----- add biases if required -----//
    if(bias)
        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
        helpers::addBias(block, *output, *bias, *output, isNCHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Public entry point of the 2D convolution: hands all arguments over unchanged to conv2d_ through
// BUILD_SINGLE_SELECTOR_TWICE, keyed on the data type of `input` and restricted to FLOAT_TYPES.
void ConvolutionUtils::conv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    const auto inputType = input->dataType();

    BUILD_SINGLE_SELECTOR_TWICE(inputType, conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Depthwise 2D convolution implemented as im2col followed by a per-channel tensor contraction.
template <typename X, typename Y>
static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    // input     [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    // weights   [kH, kW, iC, mC] always
    // bias      [oC] = iC*mC, may be nullptr
    // output    [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)

    // kH           filter(kernel) height
    // kW           filter(kernel) width
    // sH           strides height
    // sW           strides width
    // pH           paddings height
    // pW           paddings width
    // dH           dilations height
    // dW           dilations width
    // paddingMode   0-VALID, 1-SAME
    // isNCHW       1-NCHW,  0-NHWC

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weights->sizeAt(indWmC);                           // channels multiplier

    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,0,4,5,2,3}, {iC,bS*oH*oW,kH*kW}};  // [bS,iC,kH,kW,oH,oW] -> [iC,bS,oH,oW,kH,kW] -> [iC,bS*oH*oW,kH*kW]
    std::vector<std::vector<Nd4jLong>> modifOutput;
    std::vector<Nd4jLong> outReShape;

    // owns the permuted input in the NHWC case, so that it is released on every exit path,
    // including when one of the calls below throws
    std::unique_ptr<const NDArray> permutedInput;

    if(!isNCHW) {
        outReShape = {bS, oH, oW, iC, mC};                                              // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
        modifOutput = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
        permutedInput.reset(new NDArray(input->permute({0, 3, 1, 2})));                // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
        input = permutedInput.get();
    }
    else {
        outReShape = {bS, iC, mC, oH, oW};                                              // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
        modifOutput = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
    }

    if(paddingMode == 1)                       // SAME
        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);

    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
    NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false);

    helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
    MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput);              // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]

    //----- add biases if required -----//
    if(bias)
        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
        helpers::addBias(block, *output, *bias, *output, isNCHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Public entry point of the depthwise 2D convolution: hands all arguments over unchanged to
// depthwiseConv2d_ through BUILD_SINGLE_SELECTOR_TWICE, keyed on the data type of `input` and
// restricted to FLOAT_TYPES.
void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    const auto inputType = input->dataType();

    BUILD_SINGLE_SELECTOR_TWICE(inputType, depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Separable 2D convolution: a depthwise convolution, optionally followed by a pointwise (1x1) convolution.
template <typename X, typename Y>
static void sconv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    // input         [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
    // weightsDepth  [kH, kW, iC, mC]  always
    // weightsPoint  [1, 1, iC*mC, oC] always, may be nullptr
    // bias          [oC], oC = iC*mC if weightsPoint=nullptr
    // output is     [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)

    //  kH         filter(kernel) height
    //  kW         filter(kernel) width
    //  sH         strides height
    //  sW         strides width
    //  pH         paddings height
    //  pW         paddings width
    //  dH         dilations height
    //  dW         dilations width
    //  paddingMode 0-VALID, 1-SAME
    //  isNCHW     1-NCHW,  0-NHWC

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier

    // owns the intermediate depthwise result when a pointwise convolution follows, so that it is
    // released on every exit path, including when one of the convolutions below throws
    std::unique_ptr<NDArray> depthResult;

    NDArray* outputDepth = output;
    if(weightsPoint) {                      // if pointwise convolution is expected
        depthResult.reset(new NDArray(output->ordering(), !isNCHW ? std::vector<Nd4jLong>({bS, oH, oW, iC*mC}) : std::vector<Nd4jLong>({bS, iC*mC, oH, oW}), input->dataType(), input->getContext()));
        outputDepth = depthResult.get();
    }

    // ----- perform depthwise convolution (if weightsPoint is absent then oC = iC*mC) ----- //
    // bias is applied here only when no pointwise step follows
    ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, weightsPoint ? nullptr : bias, outputDepth, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW);

    // ----- perform pointwise convolution (oH = iH, oW = iW) ----- //
    if (weightsPoint)
        ConvolutionUtils::conv2d(block, outputDepth, weightsPoint, bias, output, 1,1, 1,1, 0,0, 1,1, paddingMode, isNCHW);             // in this case oH=iH, oW=iW
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Public entry point of the separable 2D convolution: hands all arguments over unchanged to
// sconv2d_ through BUILD_SINGLE_SELECTOR_TWICE, keyed on the data type of `input` and restricted
// to FLOAT_TYPES.
void ConvolutionUtils::sconv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    const auto inputType = input->dataType();

    BUILD_SINGLE_SELECTOR_TWICE(inputType, sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Average pooling kernel.
//   input  is [bS, iC, iH, iW]
//   output is [bS, iC, oH, oW]
// Threads walk the output elements with a grid-stride loop; the linear output index is decoded as
// (batch, channel, row, column) with the column varying fastest, and buffers are addressed through
// the strides read from the shape infos. The kernel itself uses no dynamic shared memory.
// extraParam0 == 1 divides every window sum by kH*kW (padding included); any other value divides
// by the number of window positions left after clipping the window to the input.
template <typename X, typename Z>
static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    const auto x = reinterpret_cast<const X*>(vx);
          auto z = reinterpret_cast<Z*>(vz);

    __shared__ int bS, iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;

    if (threadIdx.x == 0) {

        // sizes
        bS = shape::sizeAt(xShapeInfo, 0);
        iC = shape::sizeAt(xShapeInfo, 1);
        iH = shape::sizeAt(xShapeInfo, 2);
        iW = shape::sizeAt(xShapeInfo, 3);
        oH = shape::sizeAt(zShapeInfo, 2);
        oW = shape::sizeAt(zShapeInfo, 3);

        // input strides
        const auto xStrides = shape::stride(xShapeInfo);
        strideB = xStrides[0];
        strideC = xStrides[1];
        strideY = xStrides[2];
        strideX = xStrides[3];

        // output strides
        const auto zStrides = shape::stride(zShapeInfo);
        strideOB = zStrides[0];
        strideOC = zStrides[1];
        strideOY = zStrides[2];
        strideOX = zStrides[3];

        length = shape::length(zShapeInfo);

        // replace kernel H/W with *effective* kernel H/W accounting for dilation
        kHEff = kH + (kH-1)*(dH-1);
        kWEff = kW + (kW-1)*(dW-1);
    }
    __syncthreads();

    const int firstIndex = blockIdx.x * blockDim.x + threadIdx.x;

    for (int index = firstIndex; index < length; index += blockDim.x * gridDim.x) {

        // decode the linear output index, column first
        int rest = index;
        const int outCol  = rest % oW;   rest /= oW;
        const int outRow  = rest % oH;   rest /= oH;
        const int channel = rest % iC;
        const int batch   = rest / iC;

        // pooling window in input coordinates, before clipping
        int hBegin = sH * outRow - pH;
        int wBegin = sW * outCol - pW;
        int hEnd   = hBegin + kHEff;
        int wEnd   = wBegin + kWEff;

        // clip the window to the input, moving each border by whole dilation steps
        if (hBegin < 0) {
            const int steps = nd4j::math::nd4j_ceil<Z,int>(static_cast<Z>(-hBegin) / static_cast<Z>(dH));
            hBegin += steps * dH;
        }
        if (wBegin < 0) {
            const int steps = nd4j::math::nd4j_ceil<Z,int>(static_cast<Z>(-wBegin) / static_cast<Z>(dW));
            wBegin += steps * dW;
        }
        if (hEnd > iH) {
            const int steps = nd4j::math::nd4j_ceil<Z,int>(static_cast<Z>(hEnd - iH) / static_cast<Z>(dH));
            hEnd -= steps * dH;
        }
        if (wEnd > iW) {
            const int steps = nd4j::math::nd4j_ceil<Z,int>(static_cast<Z>(wEnd - iW) / static_cast<Z>(dW));
            wEnd -= steps * dW;
        }

        // number of window positions actually visited (accounts for dilation)
        const int rowsInWindow = nd4j::math::nd4j_ceil<double,int>(static_cast<double>(hEnd - hBegin) / static_cast<double>(dH));
        const int colsInWindow = nd4j::math::nd4j_ceil<double,int>(static_cast<double>(wEnd - wBegin) / static_cast<double>(dW));
        const int pool_size = rowsInWindow * colsInWindow;

        Z sum = 0.0f;

        const X *inSlice = x + (batch * strideB + channel * strideC);

        for (int h = hBegin; h < hEnd; h += dH) {
            for (int w = wBegin; w < wEnd; w += dW) {
                sum += static_cast<Z>(inSlice[h * strideY + w * strideX]);
            }
        }

        // case 1: include padding, otherwise (case 0): exclude padding
        const int divide_factor = (extraParam0 == 1) ? kH * kW : pool_size;

        z[batch * strideOB + channel * strideOC + outCol * strideOX + outRow * strideOY] = sum / static_cast<Z>(divide_factor);
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
// Host-side launcher of avgPooling2dCuda<X, Z>: uses a fixed configuration of 512 blocks with
// 512 threads each and 4192 bytes of dynamic shared memory, on the CUDA stream of `block`.
template <typename X, typename Z>
static void avgPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    const int blocksPerGrid   = 512;
    const int threadsPerBlock = 512;
    const int sharedMemBytes  = 4192;

    avgPooling2dCuda<X, Z><<<blocksPerGrid, threadsPerBlock, sharedMemBytes, *block.getCudaStream()>>>(vx, vxShapeInfo,
                                                                                                       vz, vzShapeInfo,
                                                                                                       kH, kW, sH, sW, pH, pW, dH, dW,
                                                                                                       extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * P-norm pooling forward kernel for rank-4 arrays.
 *
 * Layout (per the author's notes): input x is [bS, iC, iH, iW], output z is [bS, iC, oH, oW].
 * For every output element the kernel accumulates |x|^extraParam0 over the dilated pooling
 * window (clipped to the input borders) and stores sum^(1/extraParam0).
 *
 * Each thread starts at its global index and advances by blockDim.x * gridDim.x until all
 * shape::length(zShapeInfo) output elements are processed, so any 1D launch configuration works.
 * The kernel declares only static shared memory.
 *
 * @param vx, xShapeInfo  input buffer and its shape descriptor
 * @param vz, zShapeInfo  output buffer and its shape descriptor
 * @param kH, kW          kernel height / width
 * @param sH, sW          strides along height / width
 * @param pH, pW          paddings along height / width
 * @param dH, dW          dilations; used as loop steps and divisors, so they are expected to be >= 1
 * @param extraParam0     exponent p of the norm
 */
template <typename X, typename Z>
static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    // input is  [bS, iC, iH, iW]
    // output is [bS, iC, oH, oW]

    const auto x = reinterpret_cast<const X*>(vx);
          auto z = reinterpret_cast<Z*>(vz);

    __shared__ int iC, oH, oW, iH, iW, kHEff, kWEff;
    // strides and total length are kept in 64 bit so that offsets of large arrays are not truncated
    __shared__ Nd4jLong strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length;

    if (threadIdx.x == 0) {
        iC = shape::sizeAt(xShapeInfo, 1);
        oH = shape::sizeAt(zShapeInfo, 2);
        oW = shape::sizeAt(zShapeInfo, 3);
        iH = shape::sizeAt(xShapeInfo, 2);
        iW = shape::sizeAt(xShapeInfo, 3);

        strideB = shape::stride(xShapeInfo)[0];
        strideC = shape::stride(xShapeInfo)[1];
        strideY = shape::stride(xShapeInfo)[2];
        strideX = shape::stride(xShapeInfo)[3];

        strideOB = shape::stride(zShapeInfo)[0];
        strideOC = shape::stride(zShapeInfo)[1];
        strideOY = shape::stride(zShapeInfo)[2];
        strideOX = shape::stride(zShapeInfo)[3];

        length = shape::length(zShapeInfo);

        // replace kernel H/W with *effective* kernel H/W accounting for dilation
        kHEff = kH + (kH - 1) * (dH - 1);
        kWEff = kW + (kW - 1) * (dW - 1);
    }
    __syncthreads();

    const Nd4jLong step  = static_cast<Nd4jLong>(blockDim.x) * gridDim.x;
    const Nd4jLong first = static_cast<Nd4jLong>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (Nd4jLong index = first; index < length; index += step) {

        // decompose the linear output index into (n, c, ph, pw)
        const int pw = index % oW;
        const int ph = (index / oW) % oH;
        const int c  = (index / oW / oH) % iC;
        const int n  = index / oW / oH / iC;

        int hstart = sH * ph - pH;
        int wstart = sW * pw - pW;
        int hend = hstart + kHEff;
        int wend = wstart + kWEff;

        // Clip the window to the input borders while staying on the dilation grid.
        // Integer ceil-division is exact for every Z (a ceil computed in a half-precision Z is not),
        // and matches the clipping used by the 3D / backprop pooling kernels.
        if (hstart < 0)
            hstart += dH * ((-hstart + dH - 1) / dH);
        if (wstart < 0)
            wstart += dW * ((-wstart + dW - 1) / dW);
        if (hend > iH)
            hend -= dH * ((hend - iH + dH - 1) / dH);
        if (wend > iW)
            wend -= dW * ((wend - iW + dW - 1) / dW);

        Z sum = 0.f;

        const X *inSlice = x + (n * strideB + c * strideC);

        for (int h = hstart; h < hend; h += dH)
            for (int w = wstart; w < wend; w += dW)
                sum += nd4j::math::nd4j_pow<Z, Z, Z>(static_cast<Z>(nd4j::math::nd4j_abs<X>(inSlice[h * strideY + w * strideX])), extraParam0);

        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = nd4j::math::nd4j_pow<Z, Z, Z>(sum, (Z) 1.0f / extraParam0);
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher of pnormPooling2dCuda<X, Z>.
 *
 * Uses a fixed launch configuration (512 blocks of 512 threads, 4192 bytes of dynamic
 * shared memory) on the CUDA stream obtained from the given launch context.
 * NOTE(review): 4192 looks like it may have been meant as 4096 - confirm before changing.
 */
template <typename X, typename Z>
static void pnormPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    const int numBlocks  = 512;
    const int numThreads = 512;
    const int smemBytes  = 4192;

    pnormPooling2dCuda<X, Z><<<numBlocks, numThreads, smemBytes, *block.getCudaStream()>>>(
        vx, vxShapeInfo,
        vz, vzShapeInfo,
        kH, kW,
        sH, sW,
        pH, pW,
        dH, dW,
        extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Max pooling forward kernel for rank-4 arrays.
 *
 * Layout (per the author's notes): input x is [bS, iC, iH, iW], output z is [bS, iC, oH, oW].
 * For every output element the kernel scans the dilated pooling window (clipped to the input
 * borders) and stores the largest value found. If the clipped window contains no element,
 * the initial value -DataTypeUtils::max<Z>() is stored.
 *
 * Each thread starts at its global index and advances by blockDim.x * gridDim.x until all
 * shape::length(zShapeInfo) output elements are processed, so any 1D launch configuration works.
 * The kernel declares only static shared memory.
 *
 * @param vx, xShapeInfo  input buffer and its shape descriptor
 * @param vz, zShapeInfo  output buffer and its shape descriptor
 * @param kH, kW          kernel height / width
 * @param sH, sW          strides along height / width
 * @param pH, pW          paddings along height / width
 * @param dH, dW          dilations; used as loop steps and divisors, so they are expected to be >= 1
 * @param extraParam0     not used by this kernel
 */
template <typename X, typename Z>
static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    // input is  [bS, iC, iH, iW]
    // output is [bS, iC, oH, oW]

    const auto x = reinterpret_cast<const X*>(vx);
          auto z = reinterpret_cast<Z*>(vz);

    __shared__ int iC, oH, oW, iH, iW, kHEff, kWEff;
    // strides and total length are kept in 64 bit so that offsets of large arrays are not truncated
    __shared__ Nd4jLong strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length;

    if (threadIdx.x == 0) {
        iC = shape::sizeAt(xShapeInfo, 1);
        oH = shape::sizeAt(zShapeInfo, 2);
        oW = shape::sizeAt(zShapeInfo, 3);
        iH = shape::sizeAt(xShapeInfo, 2);
        iW = shape::sizeAt(xShapeInfo, 3);

        strideB = shape::stride(xShapeInfo)[0];
        strideC = shape::stride(xShapeInfo)[1];
        strideY = shape::stride(xShapeInfo)[2];
        strideX = shape::stride(xShapeInfo)[3];

        strideOB = shape::stride(zShapeInfo)[0];
        strideOC = shape::stride(zShapeInfo)[1];
        strideOY = shape::stride(zShapeInfo)[2];
        strideOX = shape::stride(zShapeInfo)[3];

        length = shape::length(zShapeInfo);

        // replace kernel H/W with *effective* kernel H/W accounting for dilation
        kHEff = kH + (kH - 1) * (dH - 1);
        kWEff = kW + (kW - 1) * (dW - 1);
    }
    __syncthreads();

    const Nd4jLong step  = static_cast<Nd4jLong>(blockDim.x) * gridDim.x;
    const Nd4jLong first = static_cast<Nd4jLong>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (Nd4jLong index = first; index < length; index += step) {

        // decompose the linear output index into (n, c, ph, pw)
        const int pw = index % oW;
        const int ph = (index / oW) % oH;
        const int c  = (index / oW / oH) % iC;
        const int n  = index / oW / oH / iC;

        int hstart = sH * ph - pH;
        int wstart = sW * pw - pW;
        int hend = hstart + kHEff;
        int wend = wstart + kWEff;

        // Clip the window to the input borders while staying on the dilation grid.
        // Integer ceil-division is exact for every Z (a ceil computed in a half-precision Z is not),
        // and matches the clipping used by the 3D / backprop pooling kernels.
        if (hstart < 0)
            hstart += dH * ((-hstart + dH - 1) / dH);
        if (wstart < 0)
            wstart += dW * ((-wstart + dW - 1) / dW);
        if (hend > iH)
            hend -= dH * ((hend - iH + dH - 1) / dH);
        if (wend > iW)
            wend -= dW * ((wend - iW + dW - 1) / dW);

        Z max = -nd4j::DataTypeUtils::max<Z>();

        const X *inSlice = x + (n * strideB + c * strideC);

        for (int h = hstart; h < hend; h += dH) {
            for (int w = wstart; w < wend; w += dW) {
                Z v = static_cast<Z>(inSlice[h * strideY + w * strideX]);
                if (v > max)
                    max = v;
            }
        }

        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = max;
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher of maxPooling2dCuda<X, Z>.
 *
 * Uses a fixed launch configuration (512 blocks of 512 threads, 4192 bytes of dynamic
 * shared memory) on the CUDA stream obtained from the given launch context.
 * NOTE(review): 4192 looks like it may have been meant as 4096 - confirm before changing.
 */
template <typename X, typename Z>
static void maxPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {

    const int numBlocks  = 512;
    const int numThreads = 512;
    const int smemBytes  = 4192;

    maxPooling2dCuda<X,Z><<<numBlocks, numThreads, smemBytes, *block.getCudaStream()>>>(
        vx, vxShapeInfo,
        vz, vzShapeInfo,
        kH, kW,
        sH, sW,
        pH, pW,
        dH, dW,
        extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Runs 2D pooling (max / avg / pnorm, selected by poolingMode) on the device.
 *
 * The input is synced to the device if its device copy is stale, the matching launcher is
 * dispatched on input.dataType(), both arrays get their device read/write ticks, and finally
 * the launch context's stream is synchronized.
 *
 * @throws std::runtime_error  when poolingMode is none of MAX_POOL, AVG_POOL, PNORM_POOL
 * @throws cuda_exception      (via cuda_exception::build) when the stream synchronization reports an error
 */
void ConvolutionUtils::pooling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) {

    // the kernels read the device buffer, so bring it up to date first
    if (!input.isActualOnDeviceSide())
        input.syncToDevice();

    if (poolingMode == MAX_POOL) {
        BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), maxPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
    }
    else if (poolingMode == AVG_POOL) {
        BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), avgPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
    }
    else if (poolingMode == PNORM_POOL) {
        BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), pnormPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
    }
    else {
        throw std::runtime_error("Pooling2D: Unknown PoolingType used");
    }

    // record that the device side wrote the output and read the input
    output.tickWriteDevice();
    input.tickReadDevice();

    // wait for the kernel and surface any error it produced
    const auto syncStatus = cudaStreamSynchronize(*block.launchContext()->getCudaStream());
    if (syncStatus != 0)
        throw cuda_exception::build("Pooling2D failed", syncStatus);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * 3D pooling forward kernel for rank-5 arrays, one thread per output element.
 *
 * Layout (per the author's notes): x is [bS, iC, iD, iH, iW], z is [bS, iC, oD, oH, oW].
 *
 * poolingMode: 0 = max, 1 = avg, 2 = pnorm; for any other value nothing is written.
 * extraParam0: avg   -> 0 divides by the number of visited window elements (padding excluded),
 *                       1 divides by kD*kH*kW (padding included), other values leave the sum undivided;
 *              pnorm -> the exponent p.
 *
 * Launch requirements visible in the code: 1D grid/block; dynamic shared memory holding
 * 5 Nd4jLong coordinates per thread of the block (used as per-thread scratch).
 */
template <typename T>
__global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    const T* in  = reinterpret_cast<const T*>(vx);
          T* out = reinterpret_cast<T*>(vz);

    __shared__ int rank, effKD, effKH, effKW, inD, inH, inW, kernelVolume;
    __shared__ Nd4jLong *coordsPool, outLen;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        coordsPool = reinterpret_cast<Nd4jLong*>(shmem);

        outLen = shape::length(zShapeInfo);
        rank   = 5;

        // effective kernel extents once dilation is taken into account
        effKD = kD + (kD - 1) * (dD - 1);
        effKH = kH + (kH - 1) * (dH - 1);
        effKW = kW + (kW - 1) * (dW - 1);

        // spatial extents are read from shape-info slots 3..5
        inD = xShapeInfo[3];
        inH = xShapeInfo[4];
        inW = xShapeInfo[5];

        kernelVolume = kD * kH * kW;
    }
    __syncthreads();

    const auto outIdx = threadIdx.x + blockIdx.x * blockDim.x;

    if (outIdx >= outLen)
        return;

    // per-thread coordinate scratch inside the dynamic shared memory
    auto pos = coordsPool + threadIdx.x * rank;

    shape::index2coords(outIdx, zShapeInfo, pos);

    const auto outOffset = shape::getOffset(zShapeInfo, pos);

    int dBeg = pos[2] * sD - pD;
    int hBeg = pos[3] * sH - pH;
    int wBeg = pos[4] * sW - pW;
    int dEnd = dBeg + effKD;
    int hEnd = hBeg + effKH;
    int wEnd = wBeg + effKW;

    // clip the window to the input while staying on the dilation grid (integer ceil-division)
    if (dBeg < 0) dBeg += dD * ((-dBeg + dD - 1) / dD);
    if (hBeg < 0) hBeg += dH * ((-hBeg + dH - 1) / dH);
    if (wBeg < 0) wBeg += dW * ((-wBeg + dW - 1) / dW);
    if (dEnd > inD) dEnd -= dD * ((dEnd - inD + dD - 1) / dD);
    if (hEnd > inH) hEnd -= dH * ((hEnd - inH + dH - 1) / dH);
    if (wEnd > inW) wEnd -= dW * ((wEnd - inW + dW - 1) / dW);

    if (poolingMode == 0) {
        /*** max ***/
        T best = -DataTypeUtils::max<T>();

        for (pos[2] = dBeg; pos[2] < dEnd; pos[2] += dD) {
            for (pos[3] = hBeg; pos[3] < hEnd; pos[3] += dH) {
                for (pos[4] = wBeg; pos[4] < wEnd; pos[4] += dW) {
                    T candidate = in[shape::getOffset(xShapeInfo, pos)];
                    if (candidate > best)
                        best = candidate;
                }
            }
        }

        out[outOffset] = best;
    }
    else if (poolingMode == 1) {
        /*** avg ***/
        T acc = static_cast<T>(0.);

        for (pos[2] = dBeg; pos[2] < dEnd; pos[2] += dD) {
            for (pos[3] = hBeg; pos[3] < hEnd; pos[3] += dH) {
                for (pos[4] = wBeg; pos[4] < wEnd; pos[4] += dW)
                    acc += in[shape::getOffset(xShapeInfo, pos)];
            }
        }

        if (extraParam0 == 0) {
            // exclude padding: divide by the number of elements actually visited along each axis
            uint cntD = (dEnd - dBeg) / dD + ((dEnd - dBeg) % dD == 0 ? 0 : 1);
            uint cntH = (hEnd - hBeg) / dH + ((hEnd - hBeg) % dH == 0 ? 0 : 1);
            uint cntW = (wEnd - wBeg) / dW + ((wEnd - wBeg) % dW == 0 ? 0 : 1);
            acc /= static_cast<T>(cntD * cntH * cntW);
        }
        else if (extraParam0 == 1) {
            // include padding: divide by the full kernel volume
            acc /= kernelVolume;
        }

        out[outOffset] = acc;
    }
    else if (poolingMode == 2) {
        /*** pnorm ***/
        T acc = static_cast<T>(0.);

        for (pos[2] = dBeg; pos[2] < dEnd; pos[2] += dD) {
            for (pos[3] = hBeg; pos[3] < hEnd; pos[3] += dH) {
                for (pos[4] = wBeg; pos[4] < wEnd; pos[4] += dW)
                    acc += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(in[shape::getOffset(xShapeInfo, pos)]), extraParam0);
            }
        }

        acc = nd4j::math::nd4j_pow<T,T,T>(acc, (T) 1.f / extraParam0);

        out[outOffset] = acc;
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher of pooling3dCuda<T>: forwards every argument to the kernel using the
 * caller-supplied grid size, block size, dynamic shared-memory size and stream.
 */
template <typename T>
static void pooling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                  const void* vx, const Nd4jLong* xShapeInfo,
                                  void* vz, const Nd4jLong* zShapeInfo,
                                  const int kD, const int kH, const int kW,
                                  const int sD, const int sH, const int sW,
                                  const int pD, const int pH, const int pW,
                                  const int dD, const int dH, const int dW,
                                  const int poolingMode, const int extraParam0) {

    const dim3 grid(blocksPerGrid);
    const dim3 threads(threadsPerBlock);

    pooling3dCuda<T><<<grid, threads, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vz, zShapeInfo,
        kD, kH, kW,
        sD, sH, sW,
        pD, pH, pW,
        dD, dH, dW,
        poolingMode, extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Runs 3D pooling on the device: one thread per output element, MAX_NUM_THREADS / 2 threads
 * per block, and per-thread coordinate scratch (rank * sizeof(Nd4jLong)) plus 128 extra bytes
 * of dynamic shared memory. The launcher is dispatched on input.dataType().
 */
void ConvolutionUtils::pooling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    PointersManager manager(block.launchContext(), "pooling3d");

    // launch geometry: ceil(outputLength / numThreads) blocks
    const int numThreads = MAX_NUM_THREADS / 2;
    const int numBlocks  = (output.lengthOf() + numThreads - 1) / numThreads;
    const int smemBytes  = output.rankOf() * sizeof(Nd4jLong) * numThreads  + 128;

    NDArray::prepareSpecialUse({&output}, {&input});

    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dCudaLauncher, (numBlocks, numThreads, smemBytes, block.launchContext()->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);

    NDArray::registerSpecialUse({&output}, {&input});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * 2D pooling backprop kernel for rank-4 arrays, one thread per gradO element.
 *
 * Layout (per the author's notes): x (input) is [bS, iC, iH, iW], y (gradO) is [bS, iC, oH, oW],
 * z (gradI, the output of this kernel) is [bS, iC, iH, iW].
 *
 * poolingMode: 0 = max, 1 = avg, 2 = pnorm; for any other value nothing is written.
 * extraParam0: avg   -> 0 divides the gradient by the number of visited window elements,
 *                       1 divides by kH*kW, other values leave it undivided;
 *              pnorm -> the exponent p.
 *
 * Contributions are added to z with atomic adds; nothing in this kernel clears z first.
 * Launch requirements visible in the code: 1D grid/block; dynamic shared memory holding
 * 4 Nd4jLong coordinates per thread of the block.
 */
template <typename T>
__global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    const T* in      = reinterpret_cast<const T*>(vx);
    const T* gradOut = reinterpret_cast<const T*>(vy);
          T* gradIn  = reinterpret_cast<T*>(vz);

    __shared__ int rank, effKH, effKW, inH, inW, kernelArea;
    __shared__ Nd4jLong *coordsPool, gradOutLen;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        coordsPool = reinterpret_cast<Nd4jLong*>(shmem);

        gradOutLen = shape::length(yShapeInfo);
        rank       = 4;

        // effective kernel extents once dilation is taken into account
        effKH = kH + (kH - 1) * (dH - 1);
        effKW = kW + (kW - 1) * (dW - 1);

        // spatial extents are read from shape-info slots 3..4
        inH = xShapeInfo[3];
        inW = xShapeInfo[4];

        kernelArea = kH * kW;
    }
    __syncthreads();

    const auto gradOutIdx = threadIdx.x + blockIdx.x * blockDim.x;

    if (gradOutIdx >= gradOutLen)
        return;

    // per-thread coordinate scratch inside the dynamic shared memory
    auto pos = coordsPool + threadIdx.x * rank;

    shape::index2coords(gradOutIdx, yShapeInfo, pos);

    const auto gradOutOffset = shape::getOffset(yShapeInfo, pos);

    int hBeg = pos[2] * sH - pH;
    int wBeg = pos[3] * sW - pW;
    int hEnd = hBeg + effKH;
    int wEnd = wBeg + effKW;

    // clip the window to the input while staying on the dilation grid (integer ceil-division)
    if (hBeg < 0) hBeg += dH * ((-hBeg + dH - 1) / dH);
    if (wBeg < 0) wBeg += dW * ((-wBeg + dW - 1) / dW);
    if (hEnd > inH) hEnd -= dH * ((hEnd - inH + dH - 1) / dH);
    if (wEnd > inW) wEnd -= dW * ((wEnd - inW + dW - 1) / dW);

    if (poolingMode == 0) {
        /*** max ***/
        // position of the window maximum; defaults to the window start
        Nd4jLong bestH = hBeg;
        Nd4jLong bestW = wBeg;

        T best = -DataTypeUtils::max<T>();

        for (pos[2] = hBeg; pos[2] < hEnd; pos[2] += dH) {
            for (pos[3] = wBeg; pos[3] < wEnd; pos[3] += dW) {
                T candidate = in[shape::getOffset(xShapeInfo, pos)];
                if (candidate > best) {
                    best  = candidate;
                    bestH = pos[2];
                    bestW = pos[3];
                }
            }
        }

        // the whole incoming gradient goes to the location of the maximum
        pos[2] = bestH;
        pos[3] = bestW;
        auto gradInOffset = shape::getOffset(zShapeInfo, pos);
        nd4j::math::atomics::nd4j_atomicAdd<T>(&gradIn[gradInOffset], gradOut[gradOutOffset]);
    }
    else if (poolingMode == 1) {
        /*** avg ***/
        T share = gradOut[gradOutOffset];

        if (extraParam0 == 0) {
            // exclude padding: divide by the number of visited elements (accounts for dilation)
            share /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hEnd - hBeg) / static_cast<double>(dH)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wEnd - wBeg) / static_cast<double>(dW));
        }
        else if (extraParam0 == 1) {
            // include padding: divide by the full kernel area
            share /= kernelArea;
        }

        for (pos[2] = hBeg; pos[2] < hEnd; pos[2] += dH) {
            for (pos[3] = wBeg; pos[3] < wEnd; pos[3] += dW)
                nd4j::math::atomics::nd4j_atomicAdd<T>(&gradIn[shape::getOffset(zShapeInfo, pos)], share);
        }
    }
    else if (poolingMode == 2) {
        /*** pnorm ***/
        T acc   = static_cast<T>(0.);
        T scale = gradOut[gradOutOffset];

        // first pass: sum of |x|^p over the window
        for (pos[2] = hBeg; pos[2] < hEnd; pos[2] += dH) {
            for (pos[3] = wBeg; pos[3] < wEnd; pos[3] += dW)
                acc += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(in[shape::getOffset(xShapeInfo, pos)]), extraParam0);
        }

        scale *= nd4j::math::nd4j_pow<T,T,T>(acc, ((T)1.f - extraParam0) / extraParam0);

        // second pass: distribute scale * |x|^(p-1) * sgn(x) to every window element
        for (pos[2] = hBeg; pos[2] < hEnd; pos[2] += dH) {
            for (pos[3] = wBeg; pos[3] < wEnd; pos[3] += dW) {
                const auto inOffset     = shape::getOffset(xShapeInfo, pos);
                const auto gradInOffset = shape::getOffset(zShapeInfo, pos);
                nd4j::math::atomics::nd4j_atomicAdd<T>(&gradIn[gradInOffset], scale * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(in[inOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(in[inOffset]));
            }
        }
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher of pooling2dBPCuda<T>: forwards every argument to the kernel using the
 * caller-supplied grid size, block size, dynamic shared-memory size and stream.
 */
template <typename T>
static void pooling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                    const void* vx, const Nd4jLong* xShapeInfo,
                                    const void* vy, const Nd4jLong* yShapeInfo,
                                    void* vz, const Nd4jLong* zShapeInfo,
                                    const int kH, const int kW,
                                    const int sH, const int sW,
                                    const int pH, const int pW,
                                    const int dH, const int dW,
                                    const int poolingMode, const int extraParam0) {

    const dim3 grid(blocksPerGrid);
    const dim3 threads(threadsPerBlock);

    pooling2dBPCuda<T><<<grid, threads, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vy, yShapeInfo,
        vz, zShapeInfo,
        kH, kW,
        sH, sW,
        pH, pW,
        dH, dW,
        poolingMode, extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Runs the 2D pooling backprop on the device: gradI is zeroed, then one thread per gradO
 * element is launched (256 threads per block) with per-thread coordinate scratch
 * (rank * sizeof(Nd4jLong)) plus 128 extra bytes of dynamic shared memory.
 * The launcher is dispatched on input.dataType().
 */
void ConvolutionUtils::pooling2dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    // the kernel only adds into gradI, so it has to start from zeros
    gradI.nullify();

    PointersManager manager(block.launchContext(), "pooling2dBP");

    // launch geometry: ceil(gradOLength / numThreads) blocks
    const int numThreads = 256;
    const int numBlocks  = (gradO.lengthOf() + numThreads - 1) / numThreads;
    const int smemBytes  = gradO.rankOf() * sizeof(Nd4jLong) * numThreads + 128;

    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});

    BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBPCudaLauncher, (numBlocks, numThreads, smemBytes, block.launchContext()->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);

    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * 3D pooling backprop kernel for rank-5 arrays, one thread per gradO element.
 *
 * Layout (per the author's notes): x (input) is [bS, iC, iD, iH, iW], y (gradO) is
 * [bS, iC, oD, oH, oW], z (gradI, the output of this kernel) is [bS, iC, iD, iH, iW].
 *
 * poolingMode: 0 = max, 1 = avg, 2 = pnorm; for any other value nothing is written.
 * extraParam0: avg   -> 0 divides the gradient by the number of visited window elements,
 *                       1 divides by kD*kH*kW, other values leave it undivided;
 *              pnorm -> the exponent p.
 *
 * Contributions are added to z with atomic adds; nothing in this kernel clears z first.
 * Launch requirements visible in the code: 1D grid/block; dynamic shared memory holding
 * 5 Nd4jLong coordinates per thread of the block.
 */
template <typename T>
__global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    // x: input [bS, iC, iD, iH, iW]
    // y: gradO [bS, iC, oD, oH, oW]
    // z: gradI [bS, iC, iD, iH, iW] -> gradI is output in this function

    const T* x = reinterpret_cast<const T*>(vx);
    const T* y = reinterpret_cast<const T*>(vy);
          T* z = reinterpret_cast<T*>(vz);

    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
    __shared__ Nd4jLong *sharedMem, yLen;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        yLen = shape::length(yShapeInfo);
        rank = 5;

        // effective kernel extents once dilation is taken into account
        kDeff = kD + (kD - 1) * (dD - 1);
        kHeff = kH + (kH - 1) * (dH - 1);
        kWeff = kW + (kW - 1) * (dW - 1);

        // spatial extents are read from shape-info slots 3..5
        iD = xShapeInfo[3];
        iH = xShapeInfo[4];
        iW = xShapeInfo[5];

        kProd = kD * kH * kW;
    }
    __syncthreads();

    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;

    if(yInd >= yLen)
        return;

    // per-thread coordinate scratch inside the dynamic shared memory
    auto coords = sharedMem + threadIdx.x * rank;

    shape::index2coords(yInd, yShapeInfo, coords);

    const auto yOffset = shape::getOffset(yShapeInfo, coords);

    int dstart = coords[2] * sD - pD;
    int hstart = coords[3] * sH - pH;
    int wstart = coords[4] * sW - pW;
    int dend = dstart + kDeff;
    int hend = hstart + kHeff;
    int wend = wstart + kWeff;

    // clip the window to the input while staying on the dilation grid (integer ceil-division)
    if(dstart < 0)
        dstart += dD * ((-dstart + dD - 1) / dD);
    if(hstart < 0)
        hstart += dH * ((-hstart + dH - 1) / dH);
    if(wstart < 0)
        wstart += dW * ((-wstart + dW - 1) / dW);
    if(dend > iD)
        dend -= dD * ((dend - iD + dD - 1) / dD);
    if(hend > iH)
        hend -= dH * ((hend - iH + dH - 1) / dH);
    if(wend > iW)
        wend -= dW * ((wend - iW + dW - 1) / dW);

    switch (poolingMode) {

        /*** max ***/
        case 0: {

            // Seed the arg-max position with the window start (as the 2D backprop kernel does).
            // Without this, a window in which no element compares greater than the initial max
            // (e.g. NaN inputs or an empty window) would leave the position uninitialised and
            // the atomic add below would target an arbitrary address.
            Nd4jLong coord2 = dstart;
            Nd4jLong coord3 = hstart;
            Nd4jLong coord4 = wstart;

            T max = -DataTypeUtils::max<T>();
            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){
                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
                        T val = x[shape::getOffset(xShapeInfo, coords)];
                        if (val > max) {
                            max = val;
                            coord2 = coords[2];
                            coord3 = coords[3];
                            coord4 = coords[4];
                        }
                    }
                }
            }

            // the whole incoming gradient goes to the location of the maximum
            coords[2] = coord2;
            coords[3] = coord3;
            coords[4] = coord4;
            nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]);
        }
        break;

        /*** avg ***/
        case 1: {

            T val = y[yOffset];

            if (extraParam0 == 0)         //Exclude padding
                val /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend - dstart) / static_cast<double>(dD))  * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend - hstart) / static_cast<double>(dH))     * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend - wstart)    / static_cast<double>(dW));   //Accounts for dilation
            else if (extraParam0 == 1)    //Include padding
                val /= kProd;

            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
                        nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], val);
        }
        break;

        /*** pnorm ***/
        case 2: {

            T sum = static_cast<T>(0.);
            T val = y[yOffset];

            // first pass: sum of |x|^p over the window
            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
                        sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);

            val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);

            // second pass: distribute val * |x|^(p-1) * sgn(x) to every window element
            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
                        const auto xOffset = shape::getOffset(xShapeInfo, coords);
                        const auto zOffset = shape::getOffset(zShapeInfo, coords);
                        nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
                    }
                }
            }
        }
        break;
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher of pooling3dBPCuda<T>: forwards every argument to the kernel using the
 * caller-supplied grid size, block size, dynamic shared-memory size and stream.
 */
template <typename T>
static void pooling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                    const void* vx, const Nd4jLong* xShapeInfo,
                                    const void* vy, const Nd4jLong* yShapeInfo,
                                    void* vz, const Nd4jLong* zShapeInfo,
                                    const int kD, const int kH, const int kW,
                                    const int sD, const int sH, const int sW,
                                    const int pD, const int pH, const int pW,
                                    const int dD, const int dH, const int dW,
                                    const int poolingMode, const int extraParam0) {

    const dim3 grid(blocksPerGrid);
    const dim3 threads(threadsPerBlock);

    pooling3dBPCuda<T><<<grid, threads, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vy, yShapeInfo,
        vz, zShapeInfo,
        kD, kH, kW,
        sD, sH, sW,
        pD, pH, pW,
        dD, dH, dW,
        poolingMode, extraParam0);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Backprop of 3D pooling on the device: zeroes gradI, then launches the
 * pooling3dBPCuda kernel (selected by input.dataType() over FLOAT_TYPES) with
 * input, gradO and gradI, and finally synchronizes through PointersManager.
 */
void ConvolutionUtils::pooling3dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {

    // gradI must be all zeros before the kernel starts adding into it
    gradI.nullify();

    PointersManager manager(block.launchContext(), "pooling3dBP");

    // launch geometry: enough threads to cover every element of gradO
    const int blockSize = MAX_NUM_THREADS / 2;
    const int gridSize  = (gradO.lengthOf() + blockSize - 1) / blockSize;

    // dynamic shared memory: rank Nd4jLong slots for each thread of a block, plus 128 extra bytes
    const int shmemBytes = gradO.rankOf() * sizeof(Nd4jLong) * blockSize + 128;

    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBPCudaLauncher,
                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
                           gradI.specialBuffer(), gradI.specialShapeInfo(),
                           kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW,
                           poolingMode, extraParam0),
                          FLOAT_TYPES);
    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Backprop of 2D convolution via im2col / tensorDot / col2im.
 *
 * Always writes gradI; writes gradW and gradB only when the respective pointer
 * is non-null. The bias argument and the template parameters X, Y are not used
 * by the body.
 *
 * For NHWC data the input and gradI are accessed through permuted [bS, iC, iH, iW]
 * arrays. Those arrays (and the flattened gradB, if one is needed) are scope-bound
 * objects rather than new/delete pairs, so nothing leaks when one of the helper
 * calls throws.
 */
template <typename X, typename Y>
static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    // weights [kH, kW, iC, oC] always
    // bias    [oC]
    // gradO   [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next

    // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
    // gradW    [kH, kW, iC, oC] always
    // gradB    [oC]

    // kH         filter(kernel) height
    // kW         filter(kernel) width
    // sH         strides height
    // sW         strides width
    // pH         paddings height
    // pW         paddings width
    // dH         dilations height
    // dW         dilations width
    // paddingMode 0-VALID, 1-SAME
    // isNCHW     0-NHWC, 1-NCHW

    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);

    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);

    // axes of gradO that hold bS, oH, oW
    std::vector<int> gradOaxesForDot;
    if(!isNCHW)
        gradOaxesForDot = {0, 1, 2};
    else
        gradOaxesForDot = {0, 2, 3};

    // the whole computation, written against [bS, iC, iH, iW]-ordered input and gradI
    auto computeGradients = [&](const NDArray& inputNCHW, NDArray& gradINCHW) {

        NDArray columns(inputNCHW.ordering(), {bS, iC, kH, kW, oH, oW}, inputNCHW.dataType(), inputNCHW.getContext());

        // ----- calculation of gradW ----- //
        if(gradW) {
            auto ctx = block.launchContext();
            helpers::im2col(*ctx, inputNCHW, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, inputNCHW.getContext()));   // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
            nd4j::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, {2, 0, 1, 3});       // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC]
        }

        // ----- calculation of gradB ----- //
        if(gradB) {
            if(gradB->rankOf() == 2) {
                // reduce into a rank-1 reshape of gradB; it is released when this scope ends
                NDArray gradBFlat(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
                gradO->reduceAlongDimension(reduce::Sum, gradBFlat, gradOaxesForDot, false);                // sum over bS, oH, oW
            }
            else
                gradO->reduceAlongDimension(reduce::Sum, *gradB, gradOaxesForDot, false);                   // sum over bS, oH, oW
        }

        //----- calculation of gradI -----//
        nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2, 3, 1, 0, 4, 5});  // [kH, kW, iC, oC]/[oC, iC, kH, kW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]

        helpers::col2im(*block.launchContext(), columns, gradINCHW, sH, sW, pH, pW, iH, iW, dH, dW);      // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
    };

    if(isNCHW) {
        computeGradients(*input, *gradI);
    }
    else {
        // constructed from the same permute() results the former `new NDArray(...)` calls used
        const NDArray inputNCHW(input->permute({0, 3, 1, 2}));                  // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
              NDArray gradINCHW(gradI->permute({0, 3, 1, 2}));                  // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
        computeGradients(inputNCHW, gradINCHW);
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Public entry point for 2D convolution backprop.
 * Selects the conv2dBP_ instantiation from input->dataType() (restricted to
 * FLOAT_TYPES) and forwards all arguments to it unchanged.
 */
void ConvolutionUtils::conv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {
    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_,
                                (block,
                                 input, weights, bias, gradO,
                                 gradI, gradW, gradB,
                                 kH, kW, sH, sW, pH, pW, dH, dW,
                                 paddingMode, isNCHW),
                                FLOAT_TYPES);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Backprop of depthwise 2D convolution via im2col / tensorDot / col2im.
 *
 * Always writes gradW and gradI; writes gradB only when gradB is non-null.
 * The bias argument and the template parameters X, Y are not used by the body.
 *
 * For NHWC data the input and gradI are accessed through permuted [bS, iC, iH, iW]
 * arrays. Those arrays (and the flattened gradB, if one is needed) are scope-bound
 * objects rather than new/delete pairs, so nothing leaks when one of the helper
 * calls throws.
 */
template <typename X, typename Y>
static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {

    // input    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
    // weights  [kH, kW, iC, mC] always
    // bias     [oC] = [iC*mC]
    // gradO    [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
    // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
    // gradW    [kH, kW, iC, mC] always
    // gradB    [oC]

    //  kH          filter(kernel) height
    //  kW          filter(kernel) width
    //  sH          strides height
    //  sW          strides width
    //  pH          paddings height
    //  pW          paddings width
    //  dH          dilations height
    //  dW          dilations width
    //  paddingMode  0-VALID, 1-SAME
    //  isNCHW      0-NHWC, 1-NCHW

    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
    mC = weights->sizeAt(indWmC);                           // channels multiplier

    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,2,3,0,4,5}, {iC, kH*kW, bS*oH*oW}};      // [bS,iC,kH,kW,oH,oW] -> [iC, kH*kW, bS*oH*oW]
    std::vector<std::vector<Nd4jLong>> modifGradO1, modifGradO2;
    std::vector<Nd4jLong> gradOreShape;

    if(!isNCHW) {
        gradOreShape = {bS, oH, oW, iC, mC};                                            // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
        modifGradO1 = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
        modifGradO2 = {{3,0,1,2},{iC, mC, bS*oH*oW}};                                   // [bS,oH,oW,iC*mC] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
    }
    else {
        gradOreShape = {bS, iC, mC, oH, oW};                                            // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
        modifGradO1 = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
        modifGradO2 = {{1,0,2,3},{iC, mC, bS*oH*oW}};                                   // [bS,iC*mC,oH,oW] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
    }

    if(paddingMode == 1)                       // SAME
        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);

    // the whole computation, written against [bS, iC, iH, iW]-ordered input and gradI
    auto computeGradients = [&](const NDArray& inputNCHW, NDArray& gradINCHW) {

        NDArray columns(inputNCHW.ordering(), {bS, iC, kH, kW, oH, oW}, inputNCHW.dataType(), inputNCHW.getContext());
        NDArray gradOreshaped = gradO->reshape(gradO->ordering(), gradOreShape);

        // ----- calculation of gradW ----- //
        helpers::im2col(*inputNCHW.getContext(), inputNCHW, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, inputNCHW.getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
        nd4j::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, {{2,0,1,3},{iC,kH*kW,mC}});  // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC]

        // ----- calculation of gradB ----- //
        if(gradB) {
            if(gradB->rankOf() == 2) {
                // reduce into a rank-1 reshape of gradB; it is released when this scope ends
                NDArray gradBFlat(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
                gradO->reduceAlongDimension(reduce::Sum, gradBFlat, {0,indOoH,indOoH+1}, false);            // sum over bS, oH, oW
            }
            else
                gradO->reduceAlongDimension(reduce::Sum, *gradB, {0,indOoH,indOoH+1}, false);               // sum over bS, oH, oW
        }

        //----- calculation of gradI -----//
        nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {{2,0,1,3},{iC,kH*kW,mC}}, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW]
        helpers::col2im(*inputNCHW.getContext(), columns, gradINCHW, sH, sW, pH, pW, iH, iW, dH, dW);                // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
    };

    if(isNCHW) {
        computeGradients(*input, *gradI);
    }
    else {
        // constructed from the same permute() results the former `new NDArray(...)` calls used
        const NDArray inputNCHW(input->permute({0, 3, 1, 2}));                          // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
              NDArray gradINCHW(gradI->permute({0, 3, 1, 2}));                          // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
        computeGradients(inputNCHW, gradINCHW);
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Public entry point for depthwise 2D convolution backprop.
 * Selects the depthwiseConv2dBP_ instantiation from input->dataType() (restricted
 * to FLOAT_TYPES) and forwards every argument except `block`, which the
 * implementation does not take.
 */
void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) {
    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_,
                                (input, weights, bias, gradO,
                                 gradI, gradW, gradB,
                                 kH, kW, sH, sW, pH, pW, dH, dW,
                                 paddingMode, isNCHW),
                                FLOAT_TYPES);
}
 | |
| 
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Upsampling kernel for rank-4 arrays: each thread writes one element of z,
 * copied from the x element whose height/width coordinates are the z
 * coordinates integer-divided by factorH / factorW.
 *
 * Launch layout: 1D grid of 1D blocks; threads whose global index is >= length(z)
 * do nothing. Dynamic shared memory must hold 4 Nd4jLong values per thread of
 * the block (per-thread coordinate scratch).
 */
template <typename T>
__global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorH, const int factorW, const bool isNCHW) {

    // x has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
    // z has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)

    __shared__ Nd4jLong *coordsPool, outLen;
    __shared__ int coordsPerThread, heightDim;

    // thread 0 publishes the block-wide constants; the barrier below makes them visible
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        coordsPool = reinterpret_cast<Nd4jLong*>(shmem);

        coordsPerThread = 4;
        heightDim       = isNCHW ? 2 : 1;
        outLen          = shape::length(zShapeInfo);
    }
    __syncthreads();

    const auto outIdx = blockIdx.x * blockDim.x + threadIdx.x;

    if(outIdx < outLen) {

        // private slice of the shared scratch buffer
        auto myCoords = coordsPool + threadIdx.x * coordsPerThread;

        shape::index2coords(outIdx, zShapeInfo, myCoords);
        const auto dstOffset = shape::getOffset(zShapeInfo, myCoords);

        // map the output position back onto the source position
        myCoords[heightDim]     /= factorH;
        myCoords[heightDim + 1] /= factorW;
        const auto srcOffset = shape::getOffset(xShapeInfo, myCoords);

        reinterpret_cast<T*>(vz)[dstOffset] = reinterpret_cast<const T*>(vx)[srcOffset];
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher for upsampling2dCuda<T>: enqueues the kernel on *stream
 * with the given grid size, block size and dynamic shared-memory size (bytes),
 * forwarding the remaining arguments unchanged. No error check or sync here.
 */
template <typename T>
static void upsampling2dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                     const void* vx, const Nd4jLong* xShapeInfo,
                                           void* vz, const Nd4jLong* zShapeInfo,
                                     const int factorH, const int factorW, const bool isNCHW) {

    upsampling2dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vz, zShapeInfo,
        factorH, factorW, isNCHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Device upsampling of a rank-4 array: launches upsampling2dCuda (selected by
 * input.dataType() over FLOAT_TYPES) to fill `output` from `input`, then
 * synchronizes through PointersManager.
 */
void ConvolutionUtils::upsampling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {

    PointersManager manager(block.launchContext(), "upsampling2d");

    // launch geometry: enough threads to cover every element of output
    const int blockSize = MAX_NUM_THREADS / 2;
    const int gridSize  = (output.lengthOf() + blockSize - 1) / blockSize;

    // dynamic shared memory: rank Nd4jLong slots for each thread of a block, plus 128 extra bytes
    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;

    NDArray::prepareSpecialUse({&output}, {&input});
    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2dCudaLauncher,
                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
                           output.specialBuffer(), output.specialShapeInfo(),
                           factorH, factorW, isNCHW),
                          FLOAT_TYPES);
    NDArray::registerSpecialUse({&output}, {&input});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Upsampling kernel for rank-5 arrays: each thread writes one element of z,
 * copied from the x element whose depth/height/width coordinates are the z
 * coordinates integer-divided by factorD / factorH / factorW.
 *
 * Launch layout: 1D grid of 1D blocks; threads whose global index is >= length(z)
 * do nothing. Dynamic shared memory must hold 5 Nd4jLong values per thread of
 * the block (per-thread coordinate scratch).
 */
template <typename T>
__global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {

    // x has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
    // z has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)

    __shared__ Nd4jLong *coordsPool, outLen;
    __shared__ int coordsPerThread, depthDim;

    // thread 0 publishes the block-wide constants; the barrier below makes them visible
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        coordsPool = reinterpret_cast<Nd4jLong*>(shmem);

        coordsPerThread = 5;
        depthDim        = isNCDHW ? 2 : 1;
        outLen          = shape::length(zShapeInfo);
    }
    __syncthreads();

    const auto outIdx = blockIdx.x * blockDim.x + threadIdx.x;

    if(outIdx < outLen) {

        // private slice of the shared scratch buffer
        auto myCoords = coordsPool + threadIdx.x * coordsPerThread;

        shape::index2coords(outIdx, zShapeInfo, myCoords);
        const auto dstOffset = shape::getOffset(zShapeInfo, myCoords);

        // map the output position back onto the source position
        myCoords[depthDim]     /= factorD;
        myCoords[depthDim + 1] /= factorH;
        myCoords[depthDim + 2] /= factorW;
        const auto srcOffset = shape::getOffset(xShapeInfo, myCoords);

        reinterpret_cast<T*>(vz)[dstOffset] = reinterpret_cast<const T*>(vx)[srcOffset];
    }
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher for upsampling3dCuda<T>: enqueues the kernel on *stream
 * with the given grid size, block size and dynamic shared-memory size (bytes),
 * forwarding the remaining arguments unchanged. No error check or sync here.
 */
template <typename T>
static void upsampling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                     const void* vx, const Nd4jLong* xShapeInfo,
                                           void* vz, const Nd4jLong* zShapeInfo,
                                     const int factorD, const int factorH, const int factorW, const bool isNCDHW) {

    upsampling3dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vz, zShapeInfo,
        factorD, factorH, factorW, isNCDHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Device upsampling of a rank-5 array: launches upsampling3dCuda (selected by
 * input.dataType() over FLOAT_TYPES) to fill `output` from `input`, then
 * synchronizes through PointersManager.
 */
void ConvolutionUtils::upsampling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {

    PointersManager manager(block.launchContext(), "upsampling3d");

    // launch geometry: enough threads to cover every element of output
    const int blockSize = MAX_NUM_THREADS / 2;
    const int gridSize  = (output.lengthOf() + blockSize - 1) / blockSize;

    // dynamic shared memory: rank Nd4jLong slots for each thread of a block, plus 128 extra bytes
    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;

    NDArray::prepareSpecialUse({&output}, {&input});
    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3dCudaLauncher,
                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
                           output.specialBuffer(), output.specialShapeInfo(),
                           factorD, factorH, factorW, isNCDHW),
                          FLOAT_TYPES);
    NDArray::registerSpecialUse({&output}, {&input});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Backprop kernel of 2D upsampling: each thread produces one element of z (gradI)
 * as the sum of the factorH x factorW block of x (gradO) elements that map onto it.
 * The factors are derived inside the kernel as the ratio of the x and z extents
 * along the height and width dimensions.
 *
 * Launch layout: 1D grid of 1D blocks; threads whose global index is >= length(z)
 * do nothing. Dynamic shared memory must hold 4 Nd4jLong values per thread of
 * the block (per-thread coordinate scratch).
 */
template <typename T>
__global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCHW) {

    // x (gradO) has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
    // z (gradI) has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)

    const T* x = reinterpret_cast<const T*>(vx);
          T* z = reinterpret_cast<T*>(vz);

    __shared__ int rank, dimIH;
    __shared__ uint factorH, factorW;
    __shared__ Nd4jLong *sharedMem, zLen;

    // thread 0 publishes the block-wide constants; the barrier below makes them visible
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        dimIH = isNCHW ? 2 : 1;
        zLen  = shape::length(zShapeInfo);
        rank  = 4;

        // the extent of dimension d is read at shapeInfo[d + 1] (index 0 presumably holds the rank)
        factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
        factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
    }
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

    if(zInd >= zLen)
        return;

    // private slice of the shared scratch buffer
    auto coords = sharedMem + threadIdx.x * rank;

    shape::index2coords(zInd, zShapeInfo, coords);

    const auto zOffset = shape::getOffset(zShapeInfo, coords);

    // [start, end) ranges, in x coordinates, of the block that collapses onto this z element
    const Nd4jLong xStartH = coords[dimIH]     * factorH;
    const Nd4jLong xStartW = coords[dimIH + 1] * factorW;
    const Nd4jLong xEndH   = xStartH + factorH;
    const Nd4jLong xEndW   = xStartW + factorW;

    // accumulate in a register of type T and store once, instead of a global
    // read-modify-write on z per iteration; the arithmetic is still done in T
    T sum = static_cast<T>(0.f);

    for(coords[dimIH] = xStartH; coords[dimIH] < xEndH; ++coords[dimIH])
        for(coords[dimIH + 1] = xStartW; coords[dimIH + 1] < xEndW; ++coords[dimIH + 1])
            sum += x[shape::getOffset(xShapeInfo, coords)];

    z[zOffset] = sum;
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher for upsampling2dBPCuda<T>: enqueues the kernel on *stream
 * with the given grid size, block size and dynamic shared-memory size (bytes),
 * forwarding the remaining arguments unchanged. No error check or sync here.
 */
template <typename T>
static void upsampling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                       const void* vx, const Nd4jLong* xShapeInfo,
                                             void* vz, const Nd4jLong* zShapeInfo,
                                       const bool isNCHW) {

    upsampling2dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vz, zShapeInfo,
        isNCHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Device backprop of 2D upsampling: launches upsampling2dBPCuda (selected by
 * gradI.dataType() over FLOAT_TYPES) to fill gradI from gradO, then
 * synchronizes through PointersManager.
 */
void ConvolutionUtils::upsampling2dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {

    PointersManager manager(block.launchContext(), "upsampling2d_bp");

    // launch geometry: enough threads to cover every element of gradI
    const int blockSize = MAX_NUM_THREADS / 2;
    const int gridSize  = (gradI.lengthOf() + blockSize - 1) / blockSize;

    // dynamic shared memory: rank Nd4jLong slots for each thread of a block, plus 128 extra bytes
    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;

    NDArray::prepareSpecialUse({&gradI}, {&gradO});
    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling2dBPCudaLauncher,
                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
                           gradI.specialBuffer(), gradI.specialShapeInfo(),
                           isNCHW),
                          FLOAT_TYPES);
    NDArray::registerSpecialUse({&gradI}, {&gradO});

    manager.synchronize();
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Backprop kernel of 3D upsampling: each thread produces one element of z (gradI)
 * as the sum of the factorD x factorH x factorW block of x (gradO) elements that
 * map onto it. The factors are derived inside the kernel as the ratio of the x and
 * z extents along the depth, height and width dimensions.
 *
 * Launch layout: 1D grid of 1D blocks; threads whose global index is >= length(z)
 * do nothing. Dynamic shared memory must hold 5 Nd4jLong values per thread of
 * the block (per-thread coordinate scratch).
 */
template <typename T>
__global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCDHW) {

    // x (gradO) has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
    // z (gradI) has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)

    const T* x = reinterpret_cast<const T*>(vx);
          T* z = reinterpret_cast<T*>(vz);

    __shared__ int rank, dimID;
    __shared__ uint factorD, factorH, factorW;
    __shared__ Nd4jLong *sharedMem, zLen;

    // thread 0 publishes the block-wide constants; the barrier below makes them visible
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        dimID = isNCDHW ? 2 : 1;
        zLen  = shape::length(zShapeInfo);
        rank  = 5;

        // the extent of dimension d is read at shapeInfo[d + 1] (index 0 presumably holds the rank)
        factorD = xShapeInfo[dimID + 1] / zShapeInfo[dimID + 1];
        factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
        factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
    }
    __syncthreads();

    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;

    if(zInd >= zLen)
        return;

    // private slice of the shared scratch buffer
    auto coords = sharedMem + threadIdx.x * rank;

    shape::index2coords(zInd, zShapeInfo, coords);

    const auto zOffset = shape::getOffset(zShapeInfo, coords);

    // [start, end) ranges, in x coordinates, of the block that collapses onto this z element
    const Nd4jLong xStartD = coords[dimID]     * factorD;
    const Nd4jLong xStartH = coords[dimID + 1] * factorH;
    const Nd4jLong xStartW = coords[dimID + 2] * factorW;
    const Nd4jLong xEndD   = xStartD + factorD;
    const Nd4jLong xEndH   = xStartH + factorH;
    const Nd4jLong xEndW   = xStartW + factorW;

    // accumulate in a register of type T and store once, instead of a global
    // read-modify-write on z per iteration; the arithmetic is still done in T
    T sum = static_cast<T>(0.f);

    for(coords[dimID] = xStartD; coords[dimID] < xEndD; ++coords[dimID])
        for(coords[dimID + 1] = xStartH; coords[dimID + 1] < xEndH; ++coords[dimID + 1])
            for(coords[dimID + 2] = xStartW; coords[dimID + 2] < xEndW; ++coords[dimID + 2])
                sum += x[shape::getOffset(xShapeInfo, coords)];

    z[zOffset] = sum;
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Host-side launcher for upsampling3dBPCuda<T>: enqueues the kernel on *stream
 * with the given grid size, block size and dynamic shared-memory size (bytes),
 * forwarding the remaining arguments unchanged. No error check or sync here.
 */
template <typename T>
static void upsampling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
                                       const void* vx, const Nd4jLong* xShapeInfo,
                                             void* vz, const Nd4jLong* zShapeInfo,
                                       const bool isNCDHW) {

    upsampling3dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
        vx, xShapeInfo,
        vz, zShapeInfo,
        isNCDHW);
}
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////////
 | |
/**
 * Device backprop of 3D upsampling: launches upsampling3dBPCuda (selected by
 * gradI.dataType() over FLOAT_TYPES) to fill gradI from gradO, then
 * synchronizes through PointersManager.
 */
void ConvolutionUtils::upsampling3dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW) {

    PointersManager manager(block.launchContext(), "upsampling3d_bp");

    // launch geometry: enough threads to cover every element of gradI
    const int blockSize = MAX_NUM_THREADS / 2;
    const int gridSize  = (gradI.lengthOf() + blockSize - 1) / blockSize;

    // dynamic shared memory: rank Nd4jLong slots for each thread of a block, plus 128 extra bytes
    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;

    NDArray::prepareSpecialUse({&gradI}, {&gradO});
    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling3dBPCudaLauncher,
                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
                           gradI.specialBuffer(), gradI.specialShapeInfo(),
                           isNCDHW),
                          FLOAT_TYPES);
    NDArray::registerSpecialUse({&gradI}, {&gradO});

    manager.synchronize();
}
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| }
 | |
| } |