/******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ // // @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018 // #include #include #include #include #include #include #include #include #include namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeMaxIndexCudaLauncher(void** inArrs, void** inShapes, const int numArrays, void* voutput, Nd4jLong* outputShape, Nd4jLong length) { auto output = reinterpret_cast(voutput); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += step) { T mVal = -DataTypeUtils::max(); Z mIdx(0); for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); auto val = x[shape::getIndexOffset(e, xShape)];; if (mVal < val) { mIdx = static_cast(i); mVal = val; } } output[shape::getIndexOffset(e, outputShape)] = mIdx; } } template static void mergeMaxIndex_(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { int nArrSize = static_cast(inArrs.size()); std::vector inBuffers(nArrSize), inShapes(nArrSize); for (int e = 0; e < nArrSize; e++) { inBuffers[e] = inArrs[e]->getSpecialBuffer(); inShapes[e] = inArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeMaxIndex"); auto pInBuffers = reinterpret_cast(manager.replicatePointer(inBuffers.data(), inBuffers.size() * sizeof(void*))); auto pInShapes = reinterpret_cast(manager.replicatePointer(inShapes.data(), inShapes.size() * sizeof(void*))); auto length = output.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeMaxIndexCudaLauncher << getCudaStream() >> > (pInBuffers, pInShapes, nArrSize, output.getSpecialBuffer(), output.getSpecialShapeInfo(), length); manager.synchronize(); } void mergeMaxIndex(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({ &output }, inArrs); BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (context, inArrs, output), LIBND4J_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({ &output }, inArrs); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeMaxCudaLauncher(void** inArrs, void** inShapes, const int numArrays, void* voutput, Nd4jLong* outputShape, Nd4jLong length) { auto output = reinterpret_cast(voutput); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += step) { T mVal = -DataTypeUtils::max(); for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); auto val = x[shape::getIndexOffset(e, xShape)];; if (mVal < val) mVal = val; } output[shape::getIndexOffset(e, outputShape)] = mVal; } } template static void mergeMax_(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { int nArrsSize = static_cast(inArrs.size()); std::vector inBuffers(nArrsSize), inShapes(nArrsSize); for (int e = 0; e < nArrsSize; e++) { inBuffers[e] = inArrs[e]->getSpecialBuffer(); inShapes[e] = inArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeMax"); auto pInBuffers = reinterpret_cast(manager.replicatePointer(inBuffers.data(), inBuffers.size() * sizeof(void*))); auto pInShapes = reinterpret_cast(manager.replicatePointer(inShapes.data(), inShapes.size() * sizeof(void*))); auto length = output.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeMaxCudaLauncher << getCudaStream() >> > (pInBuffers, pInShapes, nArrsSize, output.getSpecialBuffer(), output.getSpecialShapeInfo(), length); manager.synchronize(); } void mergeMax(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({ &output }, inArrs); BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (context, inArrs, output), LIBND4J_TYPES); NDArray::registerSpecialUse({ &output }, inArrs); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeMaxBpCudaLauncher(void** inArrs, void** inShapes, void* vgradient, Nd4jLong* gradientShape, const int numArrays, void** outArrs, void** outShapes, Nd4jLong length, bool bSameOrderAndEws1) { auto grad = reinterpret_cast(vgradient); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; int coords[MAX_RANK]; for (Nd4jLong e = tid; e < length; e += step) { T mVal = -DataTypeUtils::max(); int nMaxIndex = 0; auto xOffset = e, zOffset = e, gradOffset = e; if (!bSameOrderAndEws1) { shape::index2coords(e, gradientShape, coords); gradOffset = shape::getOffset(gradientShape, coords); } for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); if (!bSameOrderAndEws1) { auto xShape = reinterpret_cast(inShapes[i]); xOffset = shape::getOffset(xShape, coords); } auto val = x[xOffset]; if (mVal < val) { mVal = val; nMaxIndex = i; } } // outputs have to be pre-nullify if (!bSameOrderAndEws1) { auto outShape = reinterpret_cast(outShapes[nMaxIndex]); zOffset = shape::getOffset(outShape, coords); } auto output = reinterpret_cast(outArrs[nMaxIndex]); output[zOffset] = grad[gradOffset]; } } template static void mergeMaxBp_(sd::LaunchContext* context, const std::vector& inArrs, std::vector& outArrs, int nArrSize, bool bSameOrderAndEws1) { std::vector inBuffers(nArrSize), inShapes(nArrSize), outBuffers(nArrSize), outShapes(nArrSize); for (int e = 0; e < nArrSize; e++) { inBuffers[e] = inArrs[e]->getSpecialBuffer(); inShapes[e] = inArrs[e]->getSpecialShapeInfo(); outBuffers[e] = outArrs[e]->getSpecialBuffer(); outShapes[e] = outArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeMaxBp"); auto pInBuffers = reinterpret_cast(manager.replicatePointer(inBuffers.data(), inBuffers.size() * sizeof(void*))); auto pInShapes = reinterpret_cast(manager.replicatePointer(inShapes.data(), inShapes.size() * sizeof(void*))); auto pOutBuffers = reinterpret_cast(manager.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void*))); auto pOutShapes = reinterpret_cast(manager.replicatePointer(outShapes.data(), outShapes.size() * sizeof(void*))); auto length = inArrs[nArrSize]->lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeMaxBpCudaLauncher << getCudaStream() >> > (pInBuffers, pInShapes, inArrs[nArrSize]->getSpecialBuffer(), inArrs[nArrSize]->getSpecialShapeInfo(), nArrSize, pOutBuffers, pOutShapes, length, bSameOrderAndEws1); manager.synchronize(); } void mergeMaxBp(sd::LaunchContext* context, const std::vector& inArrs, std::vector& outArrs) { // not use gradient int nArrSize = static_cast(inArrs.size() - 1); const std::vector& out = reinterpret_cast&>(outArrs); NDArray::prepareSpecialUse(out, inArrs); bool bSameOrderAndEws1 = (1 == inArrs[nArrSize]->ews()); auto ordering = inArrs[nArrSize]->ordering(); for (int i = 0; i < nArrSize; ++i) { bSameOrderAndEws1 &= (ordering == inArrs[i]->ordering()); bSameOrderAndEws1 &= (1 == inArrs[i]->ews()); bSameOrderAndEws1 &= (ordering == outArrs[i]->ordering()); bSameOrderAndEws1 &= (1 == outArrs[i]->ews()); } BUILD_SINGLE_SELECTOR(inArrs[nArrSize]->dataType(), mergeMaxBp_, (context, inArrs, outArrs, nArrSize, bSameOrderAndEws1), LIBND4J_TYPES); NDArray::registerSpecialUse( out, inArrs ); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeAvgCudaLauncher(void** inArrs, void** inShapes, const int numArrays, void* voutput, Nd4jLong* outputShape, Nd4jLong length) { auto output = reinterpret_cast(voutput); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += step) { T sum(0.0f); for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); sum += x[shape::getIndexOffset(e, xShape)]; } output[shape::getIndexOffset(e, outputShape)] = sum / numArrays; } } template static void mergeAvg_(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { std::vector inBuffers(inArrs.size()), inShapes(inArrs.size()); for (int e = 0; e < inArrs.size(); e++) { inBuffers[e] = inArrs[e]->getSpecialBuffer(); inShapes[e] = inArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeAvg"); auto pInBuffers = reinterpret_cast(manager.replicatePointer(inBuffers.data(), inBuffers.size() * sizeof(void*))); auto pInShapes = reinterpret_cast(manager.replicatePointer(inShapes.data(), inShapes.size() * sizeof(void*))); auto length = output.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeAvgCudaLauncher << getCudaStream() >> > (pInBuffers, pInShapes, (int)inArrs.size(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), length); manager.synchronize(); } void mergeAvg(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({ &output }, inArrs); BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (context, inArrs, output), FLOAT_TYPES); NDArray::registerSpecialUse({ &output }, inArrs); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeAvgBpCudaLauncher(void* vgradient, Nd4jLong* gradientShape, void** outArrs, void** outShapes, const int numArrays, Nd4jLong length, bool bSameOrderAndEws1) { auto grad = reinterpret_cast(vgradient); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; int coords[MAX_RANK]; for (Nd4jLong e = tid; e < length; e += step) { auto zOffset = e, gradOffset = e; if (!bSameOrderAndEws1) { shape::index2coords(e, gradientShape, coords); gradOffset = shape::getOffset(gradientShape, coords); } for (int i = 0; i < numArrays; i++) { if (!bSameOrderAndEws1) { auto outShape = reinterpret_cast(outShapes[i]); zOffset = shape::getOffset(outShape, coords); } auto output = reinterpret_cast(outArrs[i]); output[zOffset] = grad[gradOffset] / numArrays; } } } template static void mergeAvgBp_(sd::LaunchContext* context, const NDArray& gradient, std::vector& outArrs, bool bSameOrderAndEws1) { int nArrSize = static_cast(outArrs.size()); std::vector outBuffers(nArrSize), outShapes(nArrSize); for (int e = 0; e < nArrSize; e++) { outBuffers[e] = outArrs[e]->getSpecialBuffer(); outShapes[e] = outArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeAvgBp"); auto pOutBuffers = reinterpret_cast(manager.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void*))); auto pOutShapes = reinterpret_cast(manager.replicatePointer(outShapes.data(), outShapes.size() * sizeof(void*))); auto length = gradient.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeAvgBpCudaLauncher << getCudaStream() >> > (gradient.getSpecialBuffer(), gradient.getSpecialShapeInfo(), pOutBuffers, pOutShapes, nArrSize, length, bSameOrderAndEws1); manager.synchronize(); } void mergeAvgBp(sd::LaunchContext* context, const NDArray& gradient, std::vector& outArrs) { const std::vector& out = reinterpret_cast&>(outArrs); NDArray::prepareSpecialUse( out, { &gradient }); bool bSameOrderAndEws1 = (1 == gradient.ews()); auto ordering = gradient.ordering(); for (const auto& v : outArrs) { bSameOrderAndEws1 &= (ordering == v->ordering()); bSameOrderAndEws1 &= (1 == v->ews()); } BUILD_SINGLE_SELECTOR(gradient.dataType(), mergeAvgBp_, (context, gradient, outArrs, bSameOrderAndEws1), LIBND4J_TYPES); NDArray::prepareSpecialUse(out, { &gradient }); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeAddCudaLauncher(void** inArrs, void** inShapes, const int numArrays, void* voutput, Nd4jLong* outputShape, Nd4jLong length) { auto output = reinterpret_cast(voutput); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += step) { T sum(0.0f); for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); sum += x[shape::getIndexOffset(e, xShape)]; } output[shape::getIndexOffset(e, outputShape)] = sum; } } template static void mergeAdd_(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { int nArrSize = static_cast(inArrs.size()); std::vector inBuffers(nArrSize), inShapes(nArrSize); for (int e = 0; e < nArrSize; e++) { inBuffers[e] = inArrs[e]->getSpecialBuffer(); inShapes[e] = inArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeAdd"); auto pInBuffers = reinterpret_cast(manager.replicatePointer(inBuffers.data(), inBuffers.size() * sizeof(void*))); auto pInShapes = reinterpret_cast(manager.replicatePointer(inShapes.data(), inShapes.size() * sizeof(void*))); auto length = output.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeAddCudaLauncher << getCudaStream() >> > (pInBuffers, pInShapes, nArrSize, output.getSpecialBuffer(), output.getSpecialShapeInfo(), length); manager.synchronize(); } BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (sd::LaunchContext* context, const std::vector& inArrs, NDArray& output), NUMERIC_TYPES); void mergeAdd(sd::LaunchContext* context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({ &output }, inArrs); BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (context, inArrs, output), NUMERIC_TYPES); NDArray::registerSpecialUse({ &output }, inArrs); } ////////////////////////////////////////////////////////////////////////// template static __global__ void mergeAddBpCudaLauncher(void* vgradient, Nd4jLong* gradientShape, void** outArrs, void** outShapes, const int numArrays, Nd4jLong length, bool bSameOrderAndEws1) { auto grad = reinterpret_cast(vgradient); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; int coords[MAX_RANK]; for (Nd4jLong e = tid; e < length; e += step) { auto zOffset = e, gradOffset = e; if (!bSameOrderAndEws1) { shape::index2coords(e, gradientShape, coords); gradOffset = shape::getOffset(gradientShape, coords); } for (int i = 0; i < numArrays; i++) { if (!bSameOrderAndEws1) { auto outShape = reinterpret_cast(outShapes[i]); zOffset = shape::getOffset(outShape, coords); } auto output = reinterpret_cast(outArrs[i]); output[zOffset] = grad[gradOffset]; } } } template static void mergeAddBp_(sd::LaunchContext* context, const NDArray& gradient, std::vector& outArrs, bool bSameOrderAndEws1) { int nArrSize = static_cast(outArrs.size()); std::vector outBuffers(nArrSize), outShapes(nArrSize); for (int e = 0; e < nArrSize; e++) { outBuffers[e] = outArrs[e]->getSpecialBuffer(); outShapes[e] = outArrs[e]->getSpecialShapeInfo(); } PointersManager manager(context, "mergeAddBp"); auto pOutBuffers = reinterpret_cast(manager.replicatePointer(outBuffers.data(), outBuffers.size() * sizeof(void*))); auto pOutShapes = reinterpret_cast(manager.replicatePointer(outShapes.data(), outShapes.size() * sizeof(void*))); auto length = gradient.lengthOf(); const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; mergeAddBpCudaLauncher << getCudaStream() >> > (gradient.getSpecialBuffer(), gradient.getSpecialShapeInfo(), pOutBuffers, pOutShapes, nArrSize, length, bSameOrderAndEws1); manager.synchronize(); } void mergeAddBp(sd::LaunchContext* context, const NDArray& gradient, std::vector& outArrs) { const std::vector& out = reinterpret_cast& >(outArrs); NDArray::prepareSpecialUse( out, { &gradient }); bool bSameOrderAndEws1 = (1 == gradient.ews()); auto ordering = gradient.ordering(); for (const auto& v : outArrs) { bSameOrderAndEws1 &= (ordering == v->ordering()); bSameOrderAndEws1 &= (1 == v->ews()); } BUILD_SINGLE_SELECTOR(gradient.dataType(), mergeAddBp_, (context, gradient, outArrs, bSameOrderAndEws1), LIBND4J_TYPES); NDArray::prepareSpecialUse( out, { &gradient }); } } } }