From cb4c9377b19710ad6e086b56759b93778e8185af Mon Sep 17 00:00:00 2001
From: Yurii Shyrma <iuriish@yahoo.com>
Date: Mon, 2 Sep 2019 16:25:58 +0300
Subject: [PATCH] Shyrma docs (#222)

* - documenting and profiling matrix_set_diag cuda kernel

Signed-off-by: Yurii <iuriish@yahoo.com>

* - correct formula of pnorm pooling in cuda 2d/3d kernels
- remove helper matrix_diag which duplicates work of helper matrix_set_diag

Signed-off-by: Yurii <iuriish@yahoo.com>
---
 .../generic/parity_ops/matrixSetDiag.cpp      |   7 +-
 .../generic/parity_ops/matrix_diag.cpp        |  64 +++++-----
 .../ops/declarable/headers/parity_ops.h       |  16 ++-
 .../declarable/helpers/cpu/convolutions.cpp   |   2 +-
 .../declarable/helpers/cpu/matrixSetDiag.cpp  |  57 +++++----
 .../declarable/helpers/cpu/matrix_diag.cpp    |  65 ----------
 .../declarable/helpers/cuda/convolutions.cu   |  23 ++--
 .../declarable/helpers/cuda/matrixSetDiag.cu  | 110 +++++++++++-------
 .../declarable/helpers/cuda/matrix_diag.cu    |  95 ---------------
 .../ops/declarable/helpers/matrixSetDiag.h    |   3 +-
 .../ops/declarable/helpers/matrix_diag.h      |  34 ------
 .../layers_tests/DeclarableOpsTests3.cpp      |  28 ++---
 .../tests_cpu/layers_tests/SortCudaTests.cu   |   6 +-
 13 files changed, 190 insertions(+), 320 deletions(-)
 delete mode 100644 libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
 delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
 delete mode 100644 libnd4j/include/ops/declarable/helpers/matrix_diag.h

diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
index f63469817..3a52057a5 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp
@@ -15,7 +15,7 @@
  ******************************************************************************/

 //
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include <ops/declarable/CustomOperations.h>
@@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
     for(int i = 0; i < diagonal->rankOf() - 1; ++i)
         REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());

-    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)),
-                 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
+    REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));

-    helpers::matrixSetDiag(block.launchContext(), input, diagonal, output);
+    helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);

     return Status::OK();
 }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
index 8fa5bfa41..c430fd4d2 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp
@@ -15,49 +15,53 @@
  ******************************************************************************/

 //
-// Created to use with batched tensor by GS 3/21/2018
+// @author GS 3/21/2018
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include <ops/declarable/CustomOperations.h>
-#include <ops/declarable/helpers/matrix_diag.h>
-
+#include <ops/declarable/helpers/matrixSetDiag.h>

 namespace nd4j {
-    namespace ops {
-        CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
-            auto input  = INPUT_VARIABLE(0);
-            auto output = OUTPUT_VARIABLE(0);
+namespace ops {

-            REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!");
+CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {

-            output->nullify();
-            return helpers::matrixDiag(block.launchContext(), input, output);
-        }
+    auto diagonal = INPUT_VARIABLE(0);
+    auto output   = OUTPUT_VARIABLE(0);

-        DECLARE_SHAPE_FN(matrix_diag) {
-            Nd4jLong* outShapeInfo = nullptr;
-            auto in = inputShape->at(0);
-            int inRank = shape::rank(in);
+    REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at least a vector, but scalar was given!");

-            int outRank = inRank + 1;
-            auto lastDimension = shape::sizeAt(in, -1);
+    helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);

-            ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
-            outShapeInfo[0] = outRank;
-            for(int i = 0; i < inRank; ++i)
-                outShapeInfo[i + 1] = shape::sizeAt(in, i);
-            outShapeInfo[outRank] = lastDimension;
+    return Status::OK();
+}

-            ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
+DECLARE_SHAPE_FN(matrix_diag) {

-            return SHAPELIST(CONSTANT(outShapeInfo));
-        }
+    Nd4jLong* outShapeInfo = nullptr;
+    auto in = inputShape->at(0);
+    int inRank = shape::rank(in);

-        DECLARE_TYPES(matrix_diag) {
-            getOpDescriptor()
-                    ->setAllowedInputTypes(nd4j::DataType::ANY)
-                    ->setSameMode(true);
-        }
+    int outRank = inRank + 1;
+    auto lastDimension = shape::sizeAt(in, -1);
+
+    ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
+    outShapeInfo[0] = outRank;
+    for(int i = 0; i < inRank; ++i)
+        outShapeInfo[i + 1] = shape::sizeAt(in, i);
+    outShapeInfo[outRank] = lastDimension;
+
+    ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
+
+    return SHAPELIST(CONSTANT(outShapeInfo));
+}
+
+DECLARE_TYPES(matrix_diag) {
+    getOpDescriptor()
+        ->setAllowedInputTypes(nd4j::DataType::ANY)
+        ->setSameMode(true);
+}

 }
 }
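Note: after this change matrix_diag has no helper of its own; it simply calls matrixSetDiag with zeroPad = true, passing its output as both input and output. Its shape function appends a copy of the diagonal's last dimension, e.g. a diagonal of shape [A,B,N] yields an output of shape [A,B,N,N]. A minimal sketch of that shape rule in plain C++ (the helper name is illustrative, not part of the library):

    #include <cstdint>
    #include <vector>

    // Shape rule used by matrix_diag's DECLARE_SHAPE_FN above:
    // output shape = input shape with the last dimension duplicated.
    std::vector<int64_t> matrixDiagShape(const std::vector<int64_t>& diagShape) {
        std::vector<int64_t> outShape(diagShape);   // keep all input dimensions
        outShape.push_back(diagShape.back());       // append the last one again
        return outShape;                            // e.g. {2,3} -> {2,3,3}
    }
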
diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h
index f9278fb36..c86f28499 100644
--- a/libnd4j/include/ops/declarable/headers/parity_ops.h
+++ b/libnd4j/include/ops/declarable/headers/parity_ops.h
@@ -76,8 +76,20 @@ namespace nd4j {
         #endif

         /**
-         * Returns a batched matrix tensor with new batched diagonal values.
-         */
+         * Inserts the elements provided by the diagonal array into the main diagonal of the innermost matrices of the input array
+         *
+         * Input arrays:
+         *    input:    input array, considered as a batch of matrices
+         *    diagonal: array containing the elements to be inserted into the input array,
+         *              the following rank condition must be satisfied: diagonal_rank = input_rank - 1,
+         *              the shapes of the diagonal and input arrays must coincide in all but the last input dimension,
+         *              for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
+         *              and the last dimension of the diagonal array must be equal to the smaller of the last two input dimensions,
+         *              that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
+         *
+         * Output array:
+         *    has the same shape as the input; the corresponding diagonal elements are substituted
+         */
         #if NOT_EXCLUDED(OP_matrix_set_diag)
         DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
         #endif
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
index dd5516461..3d04bc129 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
@@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                             for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
                                 for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
                                     for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                        pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                        pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]);
                 }
                 else {
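Note: the one-line convolutions.cpp change above (mirrored in the CUDA kernels below) corrects the pnorm-pooling backprop. For y = (sum_k |x_k|^p)^(1/p) the derivative is dy/dx_i = |x_i|^(p-1) * sgn(x_i) * y^(1-p); the old code dropped the sgn(x_i) factor, so gradients came out with the wrong sign wherever x_i < 0. A scalar sketch of the corrected formula for one pooling window (plain C++, names illustrative):

    #include <cmath>
    #include <cstddef>

    // Corrected pnorm-pooling gradient for one window:
    // y = (sum_k |x_k|^p)^(1/p);  dy/dx_i = |x_i|^(p-1) * sgn(x_i) * y^(1-p)
    void pnormPoolBP(const float* x, float* gradX, std::size_t len, float p, float gradOut) {
        float sum = 0.f;
        for (std::size_t k = 0; k < len; ++k)
            sum += std::pow(std::fabs(x[k]), p);
        const float y = std::pow(sum, 1.f / p);
        for (std::size_t i = 0; i < len; ++i) {
            const float sgn = float((x[i] > 0.f) - (x[i] < 0.f));  // the factor the patch adds
            gradX[i] = gradOut * std::pow(std::fabs(x[i]), p - 1.f) * sgn * std::pow(y, 1.f - p);
        }
    }
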
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
index 7180a88b3..e974755ac 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@@ -15,7 +15,7 @@
  ******************************************************************************/

 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include "ResultSet.h"
 #include <ops/declarable/helpers/matrixSetDiag.h>
@@ -27,31 +27,48 @@ namespace helpers {

 //////////////////////////////////////////////////////////////////////////
-// Returns a batched matrix tensor with new batched diagonal values.
-// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-template <typename T>
-static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
+template <typename T>
+void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {

-    *output = *input;
+    // input and output are the same array (x == z) when zeroPad = true
+    // xRank = zRank, xRank = yRank + 1
+    // xLen = zLen

-    const int lastDimSize   = input->sizeAt(-1);
-    const int last2DimSize  = input->sizeAt(-1) * input->sizeAt(-2);
-    const int lastSmallDim  = diagonal->sizeAt(-1);
-    const int batchSize     = input->lengthOf()/last2DimSize;
+    const T* x = input.bufferAsT<T>();
+    const T* y = diagonal.bufferAsT<T>();
+          T* z = output.bufferAsT<T>();

-    for(int i = 0; i < batchSize; ++i )
-        for(int j = 0; j < lastSmallDim; ++j) {
-            output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
-        }
-
+    const Nd4jLong* xShapeInfo = input.getShapeInfo();
+    const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
+    const Nd4jLong* zShapeInfo = output.getShapeInfo();
+
+    const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
+
+    const int  xRank = input.rankOf();
+    const auto xLen  = input.lengthOf();
+
+    std::vector<Nd4jLong> coords(xRank);    // we use the same coordinates storage both for input and output since their ranks are the same
+
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
+    for (Nd4jLong i = 0; i < xLen; ++i) {
+
+        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
+
+        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
+        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
+
+        // condition to be on diagonal of innermost matrix
+        if(coords[xRank - 2] == coords[xRank - 1])
+            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
+        else
+            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
+    }
 }

-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
+//////////////////////////////////////////////////////////////////////////
+void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
+    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
+}

 }
 }
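Note: the rewritten CPU helper replaces the old batch/stride arithmetic with one flat parallel loop over all elements: each linear index is unflattened into coordinates, the coordinates are mapped to buffer offsets for x and z separately (their strides may differ), and an element lies on the main diagonal of its innermost matrix exactly when coords[rank-2] == coords[rank-1]. A dense row-major sketch of that index walk (illustrative signatures, not the shape:: API itself):

    #include <cstdint>

    // Unflatten a linear index into per-dimension coordinates (row-major).
    void indexToCoords(int rank, const int64_t* shape, int64_t i, int64_t* coords) {
        for (int d = rank - 1; d >= 0; --d) {   // peel dimensions from the innermost
            coords[d] = i % shape[d];
            i /= shape[d];
        }
    }

    // Map coordinates to a buffer offset through explicit strides.
    int64_t coordsToOffset(int rank, const int64_t* strides, const int64_t* coords) {
        int64_t off = 0;
        for (int d = 0; d < rank; ++d)
            off += coords[d] * strides[d];
        return off;   // equals the linear index only when strides are the dense defaults
    }
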
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
deleted file mode 100644
index 3f9883b54..000000000
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// Created by GS on 3/21/2018.
-//
-
-#include "ResultSet.h"
-#include <ops/declarable/helpers/matrix_diag.h>
-#include <Status.h>
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-
-//////////////////////////////////////////////////////////////////////////
-// Returns a batched matrix tensor with new batched diagonal values.
-// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-template <typename T>
-static int _matrixDiag(const NDArray* input, NDArray* output) {
-
-    auto listOut  = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
-    auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
-
-    if (listOut->size() != listDiag->size()) {
-        nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
-        return ND4J_STATUS_VALIDATION;
-    }
-    int lastDimension = input->sizeAt(-1);
-    // TODO: tune this properlys
-    int lO = listOut->size();
-    PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
-    for(int i = 0; i < lO; ++i)
-        for (int e = 0; e < lastDimension; e++)
-            listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
-
-    delete listOut;
-    delete listDiag;
-
-    return Status::OK();
-}
-
-    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
-
-}
-}
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index 87e7c4f08..c08551318 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
                 val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0);

-                for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
-                    for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
-                        nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
+                    for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
+                        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
+                        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+                        nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset]));
+                    }
+                }
             }
             break;
         }
@@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
                 val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0);

-                for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                    for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                        for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                            nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
+                for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
+                    for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
+                        for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
+                            const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
+                            const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+                            nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset]));
+                        }
+                    }
+                }
             }
             break;
         }
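Note: unlike the CPU path, the CUDA pooling backprop must accumulate through atomics, because overlapping pooling windows let several threads add into the same gradient-input cell concurrently. A toy CUDA kernel showing the pattern (not the library's pooling2dBPCuda itself):

    // Several threads may target the same out[] cell; atomicAdd serializes
    // only the colliding updates, leaving the rest fully parallel.
    __global__ void scatterAddSketch(const float* vals, const int* targets, float* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            atomicAdd(&out[targets[i]], vals[i]);
    }
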
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index 95eb5f439..01baaffb4 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -15,63 +15,87 @@
  ******************************************************************************/

 //
-// Created by Yurii Shyrma on 07.12.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
 //

 #include "ResultSet.h"
 #include <ops/declarable/helpers/matrixSetDiag.h>
+#include <PointersManager.h>

-namespace nd4j {
-namespace ops {
+namespace nd4j    {
+namespace ops     {
 namespace helpers {

+///////////////////////////////////////////////////////////////////
+template<typename T>
+__global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {

-    template <typename T>
-    static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) {
-        __shared__ T* z;
-        __shared__ T const* x;
-        __shared__ Nd4jLong outLength, diagonalLen;
-        if (threadIdx.x == 0) {
-            z = reinterpret_cast<T*>(outputBuffer);
-            x = reinterpret_cast<T const*>(diagonalBuffer);
-            outLength = shape::length(outputShape);
-            diagonalLen = shape::length(diagonalShape);
-        }
-        __syncthreads();
+    // x - input,    shape [A,B,C]
+    // y - diagonal, shape [A,B]
+    // z - output,   shape [A,B,C]
+    // input and output are the same array (x == z) when zeroPad = true

-        for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
-            for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
-//              z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j];
-                z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
-            }
-    }
+    const auto x = reinterpret_cast<const T*>(vx);
+    const auto y = reinterpret_cast<const T*>(vy);
+          auto z = reinterpret_cast<T*>(vz);

-    //////////////////////////////////////////////////////////////////////////
-    // Returns a batched matrix tensor with new batched diagonal values.
-    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-    template <typename T>
-    static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
+    __shared__ int xRank;                   // xRank = zRank, xRank = yRank + 1
+    __shared__ Nd4jLong xLen, *sharedMem;   // xLen = zLen
+    __shared__ bool areSameOffsets;

-        *output = *input;
+    if (threadIdx.x == 0) {

-        const int lastDimSize = input->sizeAt(-1);
-        const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
-        const int lastSmallDim = diagonal->sizeAt(-1);
-        const int batchSize = input->lengthOf()/last2DimSize;
-        auto stream = context->getCudaStream();
-        dim3 launchDims(256, 512, 8192);
-        matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
-//// #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
-//        for(int i = 0; i < batchSize; ++i )
-//            for(int j = 0; j < lastSmallDim; ++j) {
-//                output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
-//            }
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
+
+        xRank = shape::rank(xShapeInfo);
+        xLen  = shape::length(xShapeInfo);
     }

-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
-    }
+    __syncthreads();

-    BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
+    auto coords = sharedMem + threadIdx.x * xRank;  // each thread gets xRank Nd4jLong cells of shared memory for its coordinates
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
+
+        shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
+
+        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
+        const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
+
+        // condition to be on diagonal of innermost matrix
+        if(coords[xRank - 2] == coords[xRank - 1])
+            z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
+        else
+            z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename T>
+static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
+
+    matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
+}
+
+///////////////////////////////////////////////////////////////////
+void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
+
+    const int threadsPerBlock = MAX_NUM_THREADS / 2;
+    const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
+
+    PointersManager manager(context, "matrixSetDiag");
+
+    NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
+    BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
+    NDArray::registerSpecialUse({&output}, {&input, &diagonal});
+
+    manager.synchronize();
+}

 }
 }
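Note on the launch configuration above: every thread needs rank coordinates, and they are carved out of dynamic shared memory, so the per-block allocation is threadsPerBlock * rank * sizeof(Nd4jLong) plus a small safety margin (+128 bytes in the patch); the grid covers the array with ceil(len / threadsPerBlock) blocks and a grid-stride loop handles any remainder. A standalone sketch of that arithmetic (constants illustrative):

    #include <cstddef>

    // Launch arithmetic mirroring the host function above (values illustrative):
    // 512 stands in for MAX_NUM_THREADS / 2.
    void launchConfigSketch(long long len, int rank) {
        const int threadsPerBlock = 512;
        const int blocksPerGrid   = (int)((len + threadsPerBlock - 1) / threadsPerBlock); // ceil(len / threads)
        const std::size_t sharedMem = threadsPerBlock * sizeof(long long) * rank + 128;   // coords + margin
        // kernel<<<blocksPerGrid, threadsPerBlock, sharedMem, stream>>>(...);
        (void)blocksPerGrid; (void)sharedMem;
    }
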
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
deleted file mode 100644
index 78304510d..000000000
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag.cu
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// Created by GS on 3/21/2018.
-//
-
-#include "ResultSet.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-
-    template <typename T>
-    static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
-                                            Nd4jLong* tadOnlyInputShapeInfo,  Nd4jLong *tadInputOffsets,
-                                            Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
-        int totalThreads = blockDim.x;
-        for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
-            auto yOffset = tadInputOffsets[i];
-            auto xOffset = tadOutputOffsets[i];
-            for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
-                Nd4jLong coords[2] = {j, j};
-                Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
-                //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
-                *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
-            }
-        }
-    }
-    //////////////////////////////////////////////////////////////////////////
-    // Returns a batched matrix tensor with new batched diagonal values.
-    // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
-
-    template <typename T>
-    static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        cudaStream_t* stream = context->getCudaStream();
-        //auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
-        //auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
-
-        //auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
-        std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
-        const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
-        //printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
-        //tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
-        std::vector<int> inputDims({input->rankOf() - 1});
-        std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
-
-        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
-        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
-
-        if (!input->isActualOnDeviceSide())
-            input->syncToDevice();
-
-        if (!output->isActualOnDeviceSide())
-            output->syncToDevice();
-
-        // create cuda stream and LaunchContext
-        cudaError_t cudaResult;
-
-        dim3 launchDims(256, 512, 8192);
-        matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
-
-        return Status::OK();
-    }
-
-    int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
-        BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
-
-}
-}
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
index ea5a1a4ad..fb7d57d18 100644
--- a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
+++ b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h
@@ -28,8 +28,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {

-    void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output);
-
+    void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
 }
 }
diff --git a/libnd4j/include/ops/declarable/helpers/matrix_diag.h b/libnd4j/include/ops/declarable/helpers/matrix_diag.h
deleted file mode 100644
index 0cbbcef16..000000000
--- a/libnd4j/include/ops/declarable/helpers/matrix_diag.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// @author GS
-//
-#ifndef __MATRIX_DIAG_HELPERS__
-#define __MATRIX_DIAG_HELPERS__
-#include
-#include
-
-namespace nd4j {
-namespace ops {
-namespace helpers {
-
-    int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
-
-}
-}
-}
-#endif
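Note: with the matrix_diag helper files deleted, both ops now share the single matrixSetDiag signature above. matrix_set_diag passes zeroPad = false (off-diagonal elements copied from input), while matrix_diag passes its output as both x and z with zeroPad = true (off-diagonal elements zeroed). A toy model of that flag on one n-by-n row-major matrix:

    // zeroPad=false keeps x off the diagonal (matrix_set_diag);
    // zeroPad=true writes zeros there (matrix_diag, where x aliases z).
    void setDiagSketch(const float* x, const float* y, float* z, int n, bool zeroPad) {
        for (int r = 0; r < n; ++r)
            for (int c = 0; c < n; ++c)
                z[r * n + c] = (r == c) ? y[r] : (zeroPad ? 0.0f : x[r * n + c]);
    }
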
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
index 1ec9650f9..7d166f831 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp
@@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
     auto v = result->at(0);
     auto i = result->at(1);

-    v->printIndexedBuffer("Values");
-    i->printIndexedBuffer("Indices");
-    i->printShapeInfo("Indices shape");
+    // v->printIndexedBuffer("Values");
+    // i->printIndexedBuffer("Indices");
+    // i->printShapeInfo("Indices shape");

     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
     auto i = result->at(1);
     auto c = result->at(2);

-    v->printShapeInfo();
-    v->printIndexedBuffer("Values");
-    i->printShapeInfo();
-    i->printIndexedBuffer("Indices");
-    c->printShapeInfo();
-    c->printIndexedBuffer("Counts");
+    // v->printShapeInfo();
+    // v->printIndexedBuffer("Values");
+    // i->printShapeInfo();
+    // i->printIndexedBuffer("Indices");
+    // c->printShapeInfo();
+    // c->printIndexedBuffer("Counts");

     ASSERT_TRUE(expV.isSameShape(v));
     ASSERT_TRUE(expV.equalsTo(v));
@@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
     auto result1 = op.execute({&x}, {1.}, {1});
     ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
     auto z1 = result1->at(0);
-    z1->printIndexedBuffer("Z1");
+    // z1->printIndexedBuffer("Z1");
     auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
-    exp1.printIndexedBuffer("EXP1");
-    z1->printShapeInfo("Z1 shape");
-    exp1.printShapeInfo("EXP1 shape");
+    // exp1.printIndexedBuffer("EXP1");
+    // z1->printShapeInfo("Z1 shape");
+    // exp1.printShapeInfo("EXP1 shape");

     ASSERT_TRUE(exp1.isSameShape(z1));
     ASSERT_TRUE(exp1.equalsTo(z1));
@@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {

     auto exp = MmulHelper::mmul(&x, &y);

-    exp->printShapeInfo("exp shape");
+    // exp->printShapeInfo("exp shape");

     nd4j::ops::batched_gemm op;
     auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
diff --git a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
index 49c1f7a95..6913722be 100644
--- a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu
@@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
     sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);

     k.tickWriteDevice();
     v.tickWriteDevice();
-    k.printIndexedBuffer("KEYS");
+    // k.printIndexedBuffer("KEYS");

     ASSERT_EQ(ek, k);
     ASSERT_EQ(ev, v);
 }
@@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {

     k.tickWriteDevice();
     v.tickWriteDevice();

-    k.printIndexedBuffer("k");
-    v.printIndexedBuffer("v");
+    // k.printIndexedBuffer("k");
+    // v.printIndexedBuffer("v");

     ASSERT_EQ(ek, k);
     ASSERT_EQ(ev, v);
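Note: the test edits above only silence debug printing; coverage is unchanged. For reference, exercising the reworked op end to end looks roughly like this, in the style of the tests in this patch (shapes and values illustrative, not a test from this change set):

    #include <NDArrayFactory.h>
    #include <ops/declarable/CustomOperations.h>

    void matrixSetDiagUsageSketch() {
        auto input    = NDArrayFactory::create<float>('c', {2, 3, 3});  // batch of two 3x3 matrices
        auto diagonal = NDArrayFactory::create<float>('c', {2, 3});     // one diagonal per matrix
        input.assign(1.f);
        diagonal.assign(5.f);

        nd4j::ops::matrix_set_diag op;
        auto result = op.execute({&input, &diagonal}, {}, {});
        // result->at(0) equals input with each 3x3 main diagonal replaced by 5s
        delete result;
    }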