Shyrma docs (#222)

* - documenting and profiling matrix_set_diag cuda kernel

Signed-off-by: Yurii <yurii@skymind.io>

* - correct formula of pnorm pooling in cuda 2d/3d kernels
- remove helper matrix_diag which duplicates work of helper matrix_set_diag

Signed-off-by: Yurii <yurii@skymind.io>
master
Yurii Shyrma 2019-09-02 16:25:58 +03:00 committed by raver119
parent 106524663b
commit cb4c9377b1
13 changed files with 190 additions and 320 deletions

View File

@ -15,7 +15,7 @@
******************************************************************************/ ******************************************************************************/
// //
// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017 // @author Yurii Shyrma (iuriish@yahoo.com)
// //
#include <op_boilerplate.h> #include <op_boilerplate.h>
@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
for(int i = 0; i < diagonal->rankOf() - 1; ++i) for(int i = 0; i < diagonal->rankOf() - 1; ++i)
REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str()); REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());
REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)), REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
helpers::matrixSetDiag(block.launchContext(), input, diagonal, output); helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);
return Status::OK(); return Status::OK();
} }

View File

@ -15,49 +15,53 @@
******************************************************************************/ ******************************************************************************/
// //
// Created to use with batched tensor by GS <sgazeos@gmail.com> 3/21/2018 // @author GS <sgazeos@gmail.com> 3/21/2018
// @author Yurii Shyrma (iuriish@yahoo.com)
// //
#include <ops/declarable/CustomOperations.h> #include <ops/declarable/CustomOperations.h>
#include <ops/declarable/helpers/matrix_diag.h> #include <ops/declarable/helpers/matrixSetDiag.h>
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);
REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!"); CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
output->nullify(); auto diagonal = INPUT_VARIABLE(0);
return helpers::matrixDiag(block.launchContext(), input, output); auto output = OUTPUT_VARIABLE(0);
}
DECLARE_SHAPE_FN(matrix_diag) { REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at list a vector, but scalar was given!");
Nd4jLong* outShapeInfo = nullptr;
auto in = inputShape->at(0);
int inRank = shape::rank(in);
int outRank = inRank + 1; helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);
auto lastDimension = shape::sizeAt(in, -1);
ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong); return Status::OK();
outShapeInfo[0] = outRank; }
for(int i = 0; i < inRank; ++i)
outShapeInfo[i + 1] = shape::sizeAt(in, i);
outShapeInfo[outRank] = lastDimension;
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in)); DECLARE_SHAPE_FN(matrix_diag) {
return SHAPELIST(CONSTANT(outShapeInfo)); Nd4jLong* outShapeInfo = nullptr;
} auto in = inputShape->at(0);
int inRank = shape::rank(in);
DECLARE_TYPES(matrix_diag) { int outRank = inRank + 1;
getOpDescriptor() auto lastDimension = shape::sizeAt(in, -1);
->setAllowedInputTypes(nd4j::DataType::ANY)
->setSameMode(true); ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
} outShapeInfo[0] = outRank;
for(int i = 0; i < inRank; ++i)
outShapeInfo[i + 1] = shape::sizeAt(in, i);
outShapeInfo[outRank] = lastDimension;
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
return SHAPELIST(CONSTANT(outShapeInfo));
}
DECLARE_TYPES(matrix_diag) {
getOpDescriptor()
->setAllowedInputTypes(nd4j::DataType::ANY)
->setSameMode(true);
}
} }
} }

View File

@ -76,8 +76,20 @@ namespace nd4j {
#endif #endif
/** /**
* Returns a batched matrix tensor with new batched diagonal values. * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
*/ *
* Input arrays:
* input: input array, considered as batch of matrices
* diagonal: array containing elements to be inserted into input array,
* following rank condition should be satisfied: diagonal_rank = input_rank - 1,
* the shapes of diagonal and input arrays must be equal except last dimension of input array,
* for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
* also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions
* that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
*
* Output array:
* has the same shape as input, corresponding diagonal elements are substituted
*/
#if NOT_EXCLUDED(OP_matrix_set_diag) #if NOT_EXCLUDED(OP_matrix_set_diag)
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0); DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
#endif #endif

View File

@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f); pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kd + kh + kw]);
} }
else { else {

View File

@ -15,7 +15,7 @@
******************************************************************************/ ******************************************************************************/
// //
// Created by Yurii Shyrma on 07.12.2017. // @author Yurii Shyrma (iuriish@yahoo.com)
// //
#include "ResultSet.h" #include "ResultSet.h"
@ -27,31 +27,48 @@ namespace helpers {
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Returns a batched matrix tensor with new batched diagonal values. template<typename T>
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
template <typename T>
static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
*output = *input; // input and output are the same array (x == z) when zeroPad = true
// xRank = zRank, xRank = yRank + 1
// xLen = zLen
const int lastDimSize = input->sizeAt(-1); const T* x = input.bufferAsT<T>();
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2); const T* y = diagonal.bufferAsT<T>();
const int lastSmallDim = diagonal->sizeAt(-1); T* z = output.bufferAsT<T>();
const int batchSize = input->lengthOf()/last2DimSize;
for(int i = 0; i < batchSize; ++i ) const Nd4jLong* xShapeInfo = input.getShapeInfo();
for(int j = 0; j < lastSmallDim; ++j) { const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j)); const Nd4jLong* zShapeInfo = output.getShapeInfo();
}
const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
const int xRank = input.rankOf();
const auto xLen = input.lengthOf();
std::vector<Nd4jLong> coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
for (Nd4jLong i = 0; i < xLen; ++i) {
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
// condition to be on diagonal of innermost matrix
if(coords[xRank - 2] == coords[xRank - 1])
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
else
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
}
} }
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) { //////////////////////////////////////////////////////////////////////////
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES); void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
} BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
} }
} }

View File

@ -1,65 +0,0 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
//
#include "ResultSet.h"
#include <ops/declarable/helpers/matrix_diag.h>
#include <Status.h>
namespace nd4j {
namespace ops {
namespace helpers {
//////////////////////////////////////////////////////////////////////////
// Returns a batched matrix tensor with new batched diagonal values.
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
template <typename T>
static int _matrixDiag(const NDArray* input, NDArray* output) {
auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
if (listOut->size() != listDiag->size()) {
nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
return ND4J_STATUS_VALIDATION;
}
int lastDimension = input->sizeAt(-1);
// TODO: tune this properlys
int lO = listOut->size();
PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
for(int i = 0; i < lO; ++i)
for (int e = 0; e < lastDimension; e++)
listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
delete listOut;
delete listDiag;
return Status::OK();
}
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
}
}
}

View File

@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0); val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f)); const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
}
}
} }
break; break;
} }
@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0); val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f)); const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
}
}
}
} }
break; break;
} }

View File

@ -15,63 +15,87 @@
******************************************************************************/ ******************************************************************************/
// //
// Created by Yurii Shyrma on 07.12.2017. // @author Yurii Shyrma (iuriish@yahoo.com)
// //
#include "ResultSet.h" #include "ResultSet.h"
#include <ops/declarable/helpers/matrixSetDiag.h> #include <ops/declarable/helpers/matrixSetDiag.h>
#include <PointersManager.h>
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
///////////////////////////////////////////////////////////////////
template<typename T>
__global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
template <typename T> // x - input, shape [A,B,C]
static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) { // y - diagonal, shape [A,B]
__shared__ T* z; // z - output, shape [A,B,C]
__shared__ T const* x; // input and output are the same array (x == z) when zeroPad = true
__shared__ Nd4jLong outLength, diagonalLen;
if (threadIdx.x == 0) {
z = reinterpret_cast<T*>(outputBuffer);
x = reinterpret_cast<T const*>(diagonalBuffer);
outLength = shape::length(outputShape);
diagonalLen = shape::length(diagonalShape);
}
__syncthreads();
for(int i = blockIdx.x; i < batchSize; i+= gridDim.x ) const auto x = reinterpret_cast<const T*>(vx);
for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) { const auto y = reinterpret_cast<const T*>(vy);
// z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j]; auto z = reinterpret_cast<T*>(vz);
z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
}
}
//////////////////////////////////////////////////////////////////////////
// Returns a batched matrix tensor with new batched diagonal values.
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
template <typename T>
static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
*output = *input;
const int lastDimSize = input->sizeAt(-1); __shared__ int xRank; // xRank = zRank, xRank = yRank + 1
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2); __shared__ Nd4jLong xLen, *sharedMem; // xLen = zLen
const int lastSmallDim = diagonal->sizeAt(-1); __shared__ bool areSameOffsets;
const int batchSize = input->lengthOf()/last2DimSize;
auto stream = context->getCudaStream();
dim3 launchDims(256, 512, 8192);
matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
//// #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
// for(int i = 0; i < batchSize; ++i )
// for(int j = 0; j < lastSmallDim; ++j) {
// output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
// }
if (threadIdx.x == 0) {
extern __shared__ unsigned char shmem[];
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
xRank = shape::rank(xShapeInfo);
xLen = shape::length(xShapeInfo);
} }
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) { __syncthreads();
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES); auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
// condition to be on diagonal of innermost matrix
if(coords[xRank - 2] == coords[xRank - 1])
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
else
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
}
}
///////////////////////////////////////////////////////////////////
template<typename T>
static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
}
///////////////////////////////////////////////////////////////////
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
const int threadsPerBlock = MAX_NUM_THREADS / 2;
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
PointersManager manager(context, "matrixSetDiag");
NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
NDArray::registerSpecialUse({&output}, {&input, &diagonal});
manager.synchronize();
}
} }
} }

View File

@ -1,95 +0,0 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
//
#include "ResultSet.h"
#include <ops/declarable/helpers/matrix_diag.h>
#include <Status.h>
#include <ShapeUtils.h>
#include <ShapeUtils.h>
#include <TAD.h>
#include <cuda_exception.h>
#include <helpers/ConstantTadHelper.h>
namespace nd4j {
namespace ops {
namespace helpers {
template <typename T>
static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets,
Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
int totalThreads = blockDim.x;
for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
auto yOffset = tadInputOffsets[i];
auto xOffset = tadOutputOffsets[i];
for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
Nd4jLong coords[2] = {j, j};
Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
//shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
*(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
}
}
}
//////////////////////////////////////////////////////////////////////////
// Returns a batched matrix tensor with new batched diagonal values.
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
template <typename T>
static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
cudaStream_t* stream = context->getCudaStream();
//auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
//auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
//auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
//printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
//tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
std::vector<int> inputDims({input->rankOf() - 1});
std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
if (!input->isActualOnDeviceSide())
input->syncToDevice();
if (!output->isActualOnDeviceSide())
output->syncToDevice();
// create cuda stream and LaunchContext
cudaError_t cudaResult;
dim3 launchDims(256, 512, 8192);
matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
return Status::OK();
}
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
}
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
}
}
}

View File

@ -28,8 +28,7 @@ namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output); void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
} }
} }

View File

@ -1,34 +0,0 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author GS <sgazeos@gmail.com>
//
#ifndef __MATRIX_DIAG_HELPERS__
#define __MATRIX_DIAG_HELPERS__
#include <op_boilerplate.h>
#include <NDArray.h>
namespace nd4j {
namespace ops {
namespace helpers {
int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
}
}
}
#endif

View File

@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
auto v = result->at(0); auto v = result->at(0);
auto i = result->at(1); auto i = result->at(1);
v->printIndexedBuffer("Values"); // v->printIndexedBuffer("Values");
i->printIndexedBuffer("Indices"); // i->printIndexedBuffer("Indices");
i->printShapeInfo("Indices shape"); // i->printShapeInfo("Indices shape");
ASSERT_TRUE(expV.isSameShape(v)); ASSERT_TRUE(expV.isSameShape(v));
ASSERT_TRUE(expV.equalsTo(v)); ASSERT_TRUE(expV.equalsTo(v));
@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
auto i = result->at(1); auto i = result->at(1);
auto c = result->at(2); auto c = result->at(2);
v->printShapeInfo(); // v->printShapeInfo();
v->printIndexedBuffer("Values"); // v->printIndexedBuffer("Values");
i->printShapeInfo(); // i->printShapeInfo();
i->printIndexedBuffer("Indices"); // i->printIndexedBuffer("Indices");
c->printShapeInfo(); // c->printShapeInfo();
c->printIndexedBuffer("Counts"); // c->printIndexedBuffer("Counts");
ASSERT_TRUE(expV.isSameShape(v)); ASSERT_TRUE(expV.isSameShape(v));
ASSERT_TRUE(expV.equalsTo(v)); ASSERT_TRUE(expV.equalsTo(v));
@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
auto result1 = op.execute({&x}, {1.}, {1}); auto result1 = op.execute({&x}, {1.}, {1});
ASSERT_EQ(result1->status(), ND4J_STATUS_OK); ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
auto z1 = result1->at(0); auto z1 = result1->at(0);
z1->printIndexedBuffer("Z1"); // z1->printIndexedBuffer("Z1");
auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false); auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
exp1.printIndexedBuffer("EXP1"); // exp1.printIndexedBuffer("EXP1");
z1->printShapeInfo("Z1 shape"); // z1->printShapeInfo("Z1 shape");
exp1.printShapeInfo("EXP1 shape"); // exp1.printShapeInfo("EXP1 shape");
ASSERT_TRUE(exp1.isSameShape(z1)); ASSERT_TRUE(exp1.isSameShape(z1));
ASSERT_TRUE(exp1.equalsTo(z1)); ASSERT_TRUE(exp1.equalsTo(z1));
@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {
auto exp = MmulHelper::mmul(&x, &y); auto exp = MmulHelper::mmul(&x, &y);
exp->printShapeInfo("exp shape"); // exp->printShapeInfo("exp shape");
nd4j::ops::batched_gemm op; nd4j::ops::batched_gemm op;
auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3}); auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});

View File

@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true); sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
k.tickWriteDevice(); k.tickWriteDevice();
v.tickWriteDevice(); v.tickWriteDevice();
k.printIndexedBuffer("KEYS"); // k.printIndexedBuffer("KEYS");
ASSERT_EQ(ek, k); ASSERT_EQ(ek, k);
ASSERT_EQ(ev, v); ASSERT_EQ(ev, v);
} }
@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {
k.tickWriteDevice(); k.tickWriteDevice();
v.tickWriteDevice(); v.tickWriteDevice();
k.printIndexedBuffer("k"); // k.printIndexedBuffer("k");
v.printIndexedBuffer("v"); // v.printIndexedBuffer("v");
ASSERT_EQ(ek, k); ASSERT_EQ(ek, k);
ASSERT_EQ(ev, v); ASSERT_EQ(ev, v);