/*******************************************************************************
 * Copyright (c) 2019 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author Yurii Shyrma (iuriish@yahoo.com)
// @author sgazeos@gmail.com
// @author raver119@gmail.com
//

#include <ops/declarable/helpers/transforms.h>
#include <helpers/ShapeUtils.h>
#include <helpers/PointersManager.h>
#include <array/NDArrayFactory.h>

namespace sd {
namespace ops {
namespace helpers {

//////////////////////////////////////////////////////////////////////////
// rescales each sub-array (TAD) of z whose L2 norm (or mean norm, if useAverage) exceeds clipNorm
template<typename T>
__global__ static void clipByNormCuda(const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int* dimensions, const int dimsLen, const bool useAverage) {

    const T clipNorm = *reinterpret_cast<const T*>(vClipNorm);
    const T* norm    = reinterpret_cast<const T*>(vNorm);
          T* z       = reinterpret_cast<T*>(vz);

    __shared__ Nd4jLong zLen, tadLen, totalThreads;

    if (threadIdx.x == 0) {
        zLen         = shape::length(zShapeInfo);
        tadLen       = zLen / shape::length(normShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }
    __syncthreads();

    int zCoords[MAX_RANK], normCoords[MAX_RANK];

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {

        shape::index2coords(i, zShapeInfo, zCoords);

        // deduce norm coords
        for (int j = 0; j < dimsLen; ++j)
            normCoords[j] = zCoords[dimensions[j]];

        const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];

        if (actualNorm > clipNorm)
            z[shape::getOffset(zShapeInfo, zCoords)] *= clipNorm / actualNorm;
    }
}

//////////////////////////////////////////////////////////////////////////
template<typename T>
__host__ static void clipByNormCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t* stream,
                                            const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo,
                                            void* vz, const Nd4jLong* zShapeInfo,
                                            const int* dimensions, const int dimsLen, const bool useAverage) {

    clipByNormCuda<T><<<blocksPerGrid, threadsPerBlock, 512, *stream>>>(vClipNorm, vNorm, normShapeInfo, vz, zShapeInfo, dimensions, dimsLen, useAverage);
}

//////////////////////////////////////////////////////////////////////////
void clipByNorm(sd::LaunchContext* context, NDArray& input, NDArray& output, const std::vector<int>& dims, const NDArray& clipNorm, const bool isInplace, const bool useAverage) {

    NDArray* z = nullptr;

    if (isInplace) {
        z = &input;
    }
    else {
        output.assign(input);
        z = &output;
    }

    if (dims.empty()) {

        const NDArray actualNorm = useAverage ?
                                   z->reduceAlongDimension(reduce::Norm2, {}) / z->lengthOf() :
                                   z->reduceAlongDimension(reduce::Norm2, {});

        if (actualNorm.e<float>(0) > clipNorm.e<float>(0))
            *z *= clipNorm / actualNorm;
    }
    else {

        const NDArray actualNorms = z->reduceAlongDimension(reduce::Norm2, dims);

        std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(z->rankOf(), dims);

        const int threadsPerBlock = MAX_NUM_THREADS / 2;
        const int blocksPerGrid   = (z->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;

        PointersManager manager(context, "clipByNorm");

        const int* dimensions = reinterpret_cast<const int*>(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));

        NDArray::prepareSpecialUse({z}, {z, &actualNorms, &clipNorm});
        BUILD_SINGLE_SELECTOR(z->dataType(), clipByNormCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), clipNorm.specialBuffer(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage), FLOAT_TYPES);
        NDArray::registerSpecialUse({z}, {z, &actualNorms, &clipNorm});

        manager.synchronize();
    }
}

//////////////////////////////////////////////////////////////////////////
// backprop of clipByNorm: gradI = gradO where the norm is within the bound,
// otherwise gradI = (clipNorm / norm) * gradO * (1 - x * sum / (norm * norm))
template<typename T>
__global__ static void clipByNormBpCuda(const void* vClipNorm,
                                        const void* vx, const Nd4jLong* xShapeInfo,        // input
                                        const void* vy, const Nd4jLong* yShapeInfo,        // gradO
                                        const void* vNorm, const Nd4jLong* normShapeInfo,
                                        const void* vSum, const Nd4jLong* sumShapeInfo,
                                              void* vz, const Nd4jLong* zShapeInfo,        // gradI
                                        const int* dimensions, const int dimsLen, const bool useAverage) {

    const T clipNorm = *reinterpret_cast<const T*>(vClipNorm);
    const T* norm    = reinterpret_cast<const T*>(vNorm);
    const T* sum     = reinterpret_cast<const T*>(vSum);
    const T* x       = reinterpret_cast<const T*>(vx);
    const T* y       = reinterpret_cast<const T*>(vy);
          T* z       = reinterpret_cast<T*>(vz);

    __shared__ Nd4jLong zLen, tadLen, totalThreads;
    __shared__ bool sameOffsets;

    if (threadIdx.x == 0) {
        zLen         = shape::length(zShapeInfo);
        tadLen       = zLen / shape::length(normShapeInfo);
        totalThreads = gridDim.x * blockDim.x;

        sameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo);
    }
    __syncthreads();

    int zCoords[MAX_RANK], normCoords[MAX_RANK];

    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (Nd4jLong i = tid; i < zLen; i += totalThreads) {

        shape::index2coords(i, zShapeInfo, zCoords);

        const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
        const auto yOffset = sameOffsets ? zOffset : shape::getOffset(yShapeInfo, zCoords);

        // deduce norm coords
        for (int j = 0; j < dimsLen; ++j)
            normCoords[j] = zCoords[dimensions[j]];

        const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];

        if (actualNorm > clipNorm) {

            const T sumVal = sum[shape::getOffset(sumShapeInfo, normCoords)];
            const auto xOffset = sameOffsets ? zOffset : shape::getOffset(xShapeInfo, zCoords);

            z[zOffset] = (clipNorm / actualNorm) * y[yOffset] * (static_cast<T>(1.f) - (x[xOffset] * sumVal) / (actualNorm * actualNorm));
        }
        else
            z[zOffset] = y[yOffset];
    }
}

//////////////////////////////////////////////////////////////////////////
template<typename T>
void clipByNormBp_(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dims, const NDArray& clipNorm, const bool useAverage) {

    const int rank = input.rankOf();

    auto actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);

    if (actualNorms.lengthOf() == 1) {

        const T norm = useAverage ?
                       actualNorms.e<T>(0) / static_cast<T>(input.lengthOf()) :
                       actualNorms.e<T>(0);

        auto clipVal = clipNorm.e<T>(0);

        if (norm > clipVal) {

            const T sum     = input.reduceNumber(reduce::Sum).e<T>(0);    // reduce to scalar
            const T factor1 = clipVal / norm;
            const T factor2 = static_cast<T>(1.f) / (norm * norm);        // 1 / (norm*norm)

            auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
                return factor1 * y * (static_cast<T>(1.f) - factor2 * x * sum);
            };

            const_cast<NDArray&>(input).applyPairwiseLambda(const_cast<NDArray&>(gradO), lambda, gradI);
        }
        else
            gradI.assign(gradO);
    }
    else {

        const NDArray actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);
        const NDArray sums        = input.reduceAlongDimension(reduce::Sum,   dims);

        std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(gradI.rankOf(), dims);

        const int threadsPerBlock = MAX_NUM_THREADS / 2;
        const int blocksPerGrid   = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;

        PointersManager manager(context, "clipByNormBp");

        const int* dimensions = reinterpret_cast<const int*>(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));

        NDArray::prepareSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});
        clipByNormBpCuda<T><<<blocksPerGrid, threadsPerBlock, 512, *context->getCudaStream()>>>(clipNorm.specialBuffer(), input.specialBuffer(), input.specialShapeInfo(), gradO.specialBuffer(), gradO.specialShapeInfo(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), sums.specialBuffer(), sums.specialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage);
        NDArray::registerSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});

        manager.synchronize();
    }
}
BUILD_SINGLE_TEMPLATE(template void clipByNormBp_, (sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage), FLOAT_TYPES);

//////////////////////////////////////////////////////////////////////////
void clipByNormBp(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector<int>& dimensions, const NDArray& clipNorm, const bool useAverage) {

    const NDArray& castedInput = gradI.dataType() == input.dataType() ?
                                 input : input.cast(gradI.dataType());

    BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBp_, (context, castedInput, gradO, gradI, dimensions, clipNorm, useAverage), FLOAT_TYPES);
}

//////////////////////////////////////////////////////////////////////////
template <typename T>
void clipByGlobalNorm_(sd::LaunchContext* context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {

    NDArray globalNorm = NDArrayFactory::create<T>(0, inputs[0]->getContext());    // sqrt(sum([l2norm(t)**2 for t in t_list]))

    for (size_t i = 0; i < inputs.size(); i++) {
        auto input  = inputs[i];
        auto l2norm = input->reduceNumber(reduce::Norm2);
        globalNorm += l2norm * l2norm;
    }

    globalNorm.applyTransform(transform::Sqrt, globalNorm);    // = sd::math::nd4j_sqrt(globalNorm);
    outputs[inputs.size()]->p(0, globalNorm);
    globalNorm.syncToHost();

    const T factor = static_cast<T>(clipNorm) / globalNorm.e<T>(0);

    for (size_t e = 0; e < inputs.size(); e++) {
        // all-reduce
        auto input  = inputs[e];
        auto output = outputs[e];

        if (globalNorm.e<double>(0) <= clipNorm) {
            output->assign(input);
        }
        else {
            auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
            input->applyLambda(lambda, *output);
        }
    }
}

void clipByGlobalNorm(sd::LaunchContext* context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
    BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (context, inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (sd::LaunchContext* context, std::vector<NDArray*> const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace), FLOAT_TYPES);

template <typename T>
static __global__ void clipByValueKernel(void* input, const Nd4jLong* inputShape, void* output, const Nd4jLong* outputShape, double leftBound, double rightBound) {

    __shared__ T* outputBuf;
    __shared__ T* inputBuf;
    __shared__ Nd4jLong length;
    __shared__ bool linearBuffers;

    if (threadIdx.x == 0) {
        outputBuf = reinterpret_cast<T*>(output);
        inputBuf  = reinterpret_cast<T*>(input);
        length    = shape::length(inputShape);
        linearBuffers = shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape) && shape::elementWiseStride(inputShape) == 1;
    }
    __syncthreads();

    const auto tid  = blockIdx.x * blockDim.x + threadIdx.x;
    const auto step = gridDim.x * blockDim.x;

    for (Nd4jLong e = tid; e < length; e += step) {
        if (linearBuffers) {
            if (inputBuf[e] > rightBound)
                outputBuf[e] = (T) rightBound;
            else if (inputBuf[e] < leftBound)
                outputBuf[e] = (T) leftBound;
            else
                outputBuf[e] = inputBuf[e];
        }
        else {
            auto inputOffset  = shape::getIndexOffset(e, inputShape);
            auto outputOffset = shape::getIndexOffset(e, outputShape);
            if (inputBuf[inputOffset] > rightBound)
                outputBuf[outputOffset] = (T) rightBound;
            else if (inputBuf[inputOffset] < leftBound)
                outputBuf[outputOffset] = (T) leftBound;
            else
                outputBuf[outputOffset] = inputBuf[inputOffset];    // copy through the input offset, not the output one
        }
    }
}

template <typename T>
static void clipByValue_(sd::LaunchContext* context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
    auto stream = context->getCudaStream();
    if (!input.isActualOnDeviceSide())
        input.syncToDevice();

    NDArray::prepareSpecialUse({&output}, {&input});
    clipByValueKernel<T><<<256, 512, 8192, *stream>>>(input.specialBuffer(), input.specialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftBound, rightBound);
    NDArray::registerSpecialUse({&output}, {&input});
}

void clipByValue(sd::LaunchContext* context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
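    // dispatch on the input data type; the element-wise clipping kernel is instantiated for floating-point types only (FLOAT_TYPES)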
    BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (context, input, leftBound, rightBound, output), FLOAT_TYPES);
}

BUILD_SINGLE_TEMPLATE(template void clipByValue_, (sd::LaunchContext* context, NDArray& input, double leftBound, double rightBound, NDArray& output), FLOAT_TYPES);

}
}
}