Shyrma docs (#222)
* - documenting and profiling matrix_set_diag cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - correct formula of pnorm pooling in cuda 2d/3d kernels - remove helper matrix_diag which duplicates work of helper matrix_set_diag Signed-off-by: Yurii <yurii@skymind.io>master
parent
106524663b
commit
cb4c9377b1
|
@ -15,7 +15,7 @@
|
|||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||
//
|
||||
|
||||
#include <op_boilerplate.h>
|
||||
|
@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
|
|||
for(int i = 0; i < diagonal->rankOf() - 1; ++i)
|
||||
REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());
|
||||
|
||||
REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)),
|
||||
0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
|
||||
REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
|
||||
|
||||
helpers::matrixSetDiag(block.launchContext(), input, diagonal, output);
|
||||
helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -15,49 +15,53 @@
|
|||
******************************************************************************/
|
||||
|
||||
//
|
||||
// Created to use with batched tensor by GS <sgazeos@gmail.com> 3/21/2018
|
||||
// @author GS <sgazeos@gmail.com> 3/21/2018
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||
//
|
||||
|
||||
#include <ops/declarable/CustomOperations.h>
|
||||
#include <ops/declarable/helpers/matrix_diag.h>
|
||||
|
||||
#include <ops/declarable/helpers/matrixSetDiag.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
|
||||
auto input = INPUT_VARIABLE(0);
|
||||
auto output = OUTPUT_VARIABLE(0);
|
||||
namespace ops {
|
||||
|
||||
REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!");
|
||||
CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
|
||||
|
||||
output->nullify();
|
||||
return helpers::matrixDiag(block.launchContext(), input, output);
|
||||
}
|
||||
auto diagonal = INPUT_VARIABLE(0);
|
||||
auto output = OUTPUT_VARIABLE(0);
|
||||
|
||||
DECLARE_SHAPE_FN(matrix_diag) {
|
||||
Nd4jLong* outShapeInfo = nullptr;
|
||||
auto in = inputShape->at(0);
|
||||
int inRank = shape::rank(in);
|
||||
REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at list a vector, but scalar was given!");
|
||||
|
||||
int outRank = inRank + 1;
|
||||
auto lastDimension = shape::sizeAt(in, -1);
|
||||
helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);
|
||||
|
||||
ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
|
||||
outShapeInfo[0] = outRank;
|
||||
for(int i = 0; i < inRank; ++i)
|
||||
outShapeInfo[i + 1] = shape::sizeAt(in, i);
|
||||
outShapeInfo[outRank] = lastDimension;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
|
||||
DECLARE_SHAPE_FN(matrix_diag) {
|
||||
|
||||
return SHAPELIST(CONSTANT(outShapeInfo));
|
||||
}
|
||||
Nd4jLong* outShapeInfo = nullptr;
|
||||
auto in = inputShape->at(0);
|
||||
int inRank = shape::rank(in);
|
||||
|
||||
DECLARE_TYPES(matrix_diag) {
|
||||
getOpDescriptor()
|
||||
->setAllowedInputTypes(nd4j::DataType::ANY)
|
||||
->setSameMode(true);
|
||||
}
|
||||
int outRank = inRank + 1;
|
||||
auto lastDimension = shape::sizeAt(in, -1);
|
||||
|
||||
ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
|
||||
outShapeInfo[0] = outRank;
|
||||
for(int i = 0; i < inRank; ++i)
|
||||
outShapeInfo[i + 1] = shape::sizeAt(in, i);
|
||||
outShapeInfo[outRank] = lastDimension;
|
||||
|
||||
ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in));
|
||||
|
||||
return SHAPELIST(CONSTANT(outShapeInfo));
|
||||
}
|
||||
|
||||
DECLARE_TYPES(matrix_diag) {
|
||||
getOpDescriptor()
|
||||
->setAllowedInputTypes(nd4j::DataType::ANY)
|
||||
->setSameMode(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -76,8 +76,20 @@ namespace nd4j {
|
|||
#endif
|
||||
|
||||
/**
|
||||
* Returns a batched matrix tensor with new batched diagonal values.
|
||||
*/
|
||||
* Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
|
||||
*
|
||||
* Input arrays:
|
||||
* input: input array, considered as batch of matrices
|
||||
* diagonal: array containing elements to be inserted into input array,
|
||||
* following rank condition should be satisfied: diagonal_rank = input_rank - 1,
|
||||
* the shapes of diagonal and input arrays must be equal except last dimension of input array,
|
||||
* for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
|
||||
* also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions
|
||||
* that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
|
||||
*
|
||||
* Output array:
|
||||
* has the same shape as input, corresponding diagonal elements are substituted
|
||||
*/
|
||||
#if NOT_EXCLUDED(OP_matrix_set_diag)
|
||||
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
|
||||
#endif
|
||||
|
|
|
@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
|||
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
||||
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
|
||||
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kd + kh + kw]);
|
||||
}
|
||||
else {
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
******************************************************************************/
|
||||
|
||||
//
|
||||
// Created by Yurii Shyrma on 07.12.2017.
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||
//
|
||||
|
||||
#include "ResultSet.h"
|
||||
|
@ -27,31 +27,48 @@ namespace helpers {
|
|||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Returns a batched matrix tensor with new batched diagonal values.
|
||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
||||
template <typename T>
|
||||
static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
||||
template<typename T>
|
||||
void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||
|
||||
*output = *input;
|
||||
// input and output are the same array (x == z) when zeroPad = true
|
||||
// xRank = zRank, xRank = yRank + 1
|
||||
// xLen = zLen
|
||||
|
||||
const int lastDimSize = input->sizeAt(-1);
|
||||
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
|
||||
const int lastSmallDim = diagonal->sizeAt(-1);
|
||||
const int batchSize = input->lengthOf()/last2DimSize;
|
||||
const T* x = input.bufferAsT<T>();
|
||||
const T* y = diagonal.bufferAsT<T>();
|
||||
T* z = output.bufferAsT<T>();
|
||||
|
||||
for(int i = 0; i < batchSize; ++i )
|
||||
for(int j = 0; j < lastSmallDim; ++j) {
|
||||
output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
|
||||
}
|
||||
|
||||
const Nd4jLong* xShapeInfo = input.getShapeInfo();
|
||||
const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
|
||||
const Nd4jLong* zShapeInfo = output.getShapeInfo();
|
||||
|
||||
const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
|
||||
|
||||
const int xRank = input.rankOf();
|
||||
const auto xLen = input.lengthOf();
|
||||
|
||||
std::vector<Nd4jLong> coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same
|
||||
|
||||
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
||||
for (Nd4jLong i = 0; i < xLen; ++i) {
|
||||
|
||||
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
|
||||
|
||||
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
|
||||
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
|
||||
|
||||
// condition to be on diagonal of innermost matrix
|
||||
if(coords[xRank - 2] == coords[xRank - 1])
|
||||
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
|
||||
else
|
||||
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
||||
}
|
||||
}
|
||||
|
||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
||||
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES);
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||
BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,65 +0,0 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
|
||||
//
|
||||
|
||||
#include "ResultSet.h"
|
||||
#include <ops/declarable/helpers/matrix_diag.h>
|
||||
#include <Status.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Returns a batched matrix tensor with new batched diagonal values.
|
||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
||||
template <typename T>
|
||||
static int _matrixDiag(const NDArray* input, NDArray* output) {
|
||||
|
||||
auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
|
||||
auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
|
||||
|
||||
if (listOut->size() != listDiag->size()) {
|
||||
nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
|
||||
return ND4J_STATUS_VALIDATION;
|
||||
}
|
||||
int lastDimension = input->sizeAt(-1);
|
||||
// TODO: tune this properlys
|
||||
int lO = listOut->size();
|
||||
PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
|
||||
for(int i = 0; i < lO; ++i)
|
||||
for (int e = 0; e < lastDimension; e++)
|
||||
listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
|
||||
|
||||
delete listOut;
|
||||
delete listDiag;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
||||
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
|||
|
||||
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
||||
|
||||
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
|
||||
for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
|
||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
|
||||
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
|
||||
for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
|
||||
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
|
||||
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
|
||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
|||
|
||||
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
||||
|
||||
for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
|
||||
for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
|
||||
for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
|
||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
|
||||
for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
|
||||
for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
|
||||
for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
|
||||
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
|
||||
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
|
||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -15,63 +15,87 @@
|
|||
******************************************************************************/
|
||||
|
||||
//
|
||||
// Created by Yurii Shyrma on 07.12.2017.
|
||||
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||
//
|
||||
|
||||
#include "ResultSet.h"
|
||||
#include <ops/declarable/helpers/matrixSetDiag.h>
|
||||
#include <PointersManager.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<typename T>
|
||||
__global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
|
||||
|
||||
template <typename T>
|
||||
static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) {
|
||||
__shared__ T* z;
|
||||
__shared__ T const* x;
|
||||
__shared__ Nd4jLong outLength, diagonalLen;
|
||||
if (threadIdx.x == 0) {
|
||||
z = reinterpret_cast<T*>(outputBuffer);
|
||||
x = reinterpret_cast<T const*>(diagonalBuffer);
|
||||
outLength = shape::length(outputShape);
|
||||
diagonalLen = shape::length(diagonalShape);
|
||||
}
|
||||
__syncthreads();
|
||||
// x - input, shape [A,B,C]
|
||||
// y - diagonal, shape [A,B]
|
||||
// z - output, shape [A,B,C]
|
||||
// input and output are the same array (x == z) when zeroPad = true
|
||||
|
||||
for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
|
||||
for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
|
||||
// z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j];
|
||||
z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Returns a batched matrix tensor with new batched diagonal values.
|
||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
||||
template <typename T>
|
||||
static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
||||
*output = *input;
|
||||
const auto x = reinterpret_cast<const T*>(vx);
|
||||
const auto y = reinterpret_cast<const T*>(vy);
|
||||
auto z = reinterpret_cast<T*>(vz);
|
||||
|
||||
const int lastDimSize = input->sizeAt(-1);
|
||||
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
|
||||
const int lastSmallDim = diagonal->sizeAt(-1);
|
||||
const int batchSize = input->lengthOf()/last2DimSize;
|
||||
auto stream = context->getCudaStream();
|
||||
dim3 launchDims(256, 512, 8192);
|
||||
matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
|
||||
//// #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
|
||||
// for(int i = 0; i < batchSize; ++i )
|
||||
// for(int j = 0; j < lastSmallDim; ++j) {
|
||||
// output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
|
||||
// }
|
||||
__shared__ int xRank; // xRank = zRank, xRank = yRank + 1
|
||||
__shared__ Nd4jLong xLen, *sharedMem; // xLen = zLen
|
||||
__shared__ bool areSameOffsets;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
extern __shared__ unsigned char shmem[];
|
||||
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||
|
||||
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
|
||||
|
||||
xRank = shape::rank(xShapeInfo);
|
||||
xLen = shape::length(xShapeInfo);
|
||||
}
|
||||
|
||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
||||
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
|
||||
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
|
||||
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
|
||||
|
||||
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
|
||||
|
||||
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
|
||||
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
|
||||
|
||||
// condition to be on diagonal of innermost matrix
|
||||
if(coords[xRank - 2] == coords[xRank - 1])
|
||||
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
|
||||
else
|
||||
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<typename T>
|
||||
static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
|
||||
|
||||
matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||
|
||||
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
|
||||
|
||||
PointersManager manager(context, "matrixSetDiag");
|
||||
|
||||
NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
|
||||
BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
|
||||
NDArray::registerSpecialUse({&output}, {&input, &diagonal});
|
||||
|
||||
manager.synchronize();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,95 +0,0 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
|
||||
//
|
||||
|
||||
#include "ResultSet.h"
|
||||
#include <ops/declarable/helpers/matrix_diag.h>
|
||||
#include <Status.h>
|
||||
#include <ShapeUtils.h>
|
||||
#include <ShapeUtils.h>
|
||||
#include <TAD.h>
|
||||
#include <cuda_exception.h>
|
||||
#include <helpers/ConstantTadHelper.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
|
||||
template <typename T>
|
||||
static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
|
||||
Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets,
|
||||
Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
|
||||
int totalThreads = blockDim.x;
|
||||
for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
|
||||
auto yOffset = tadInputOffsets[i];
|
||||
auto xOffset = tadOutputOffsets[i];
|
||||
for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
|
||||
Nd4jLong coords[2] = {j, j};
|
||||
Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
|
||||
//shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
|
||||
*(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
|
||||
}
|
||||
}
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Returns a batched matrix tensor with new batched diagonal values.
|
||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
||||
|
||||
template <typename T>
|
||||
static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
||||
cudaStream_t* stream = context->getCudaStream();
|
||||
//auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
|
||||
//auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
|
||||
|
||||
//auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
|
||||
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
|
||||
const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
|
||||
//printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
|
||||
//tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
|
||||
std::vector<int> inputDims({input->rankOf() - 1});
|
||||
std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
|
||||
|
||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
|
||||
|
||||
if (!input->isActualOnDeviceSide())
|
||||
input->syncToDevice();
|
||||
|
||||
if (!output->isActualOnDeviceSide())
|
||||
output->syncToDevice();
|
||||
|
||||
// create cuda stream and LaunchContext
|
||||
cudaError_t cudaResult;
|
||||
|
||||
dim3 launchDims(256, 512, 8192);
|
||||
matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
||||
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
@ -28,8 +28,7 @@ namespace nd4j {
|
|||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output);
|
||||
|
||||
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author GS <sgazeos@gmail.com>
|
||||
//
|
||||
#ifndef __MATRIX_DIAG_HELPERS__
|
||||
#define __MATRIX_DIAG_HELPERS__
|
||||
#include <op_boilerplate.h>
|
||||
#include <NDArray.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
|
|||
|
||||
auto v = result->at(0);
|
||||
auto i = result->at(1);
|
||||
v->printIndexedBuffer("Values");
|
||||
i->printIndexedBuffer("Indices");
|
||||
i->printShapeInfo("Indices shape");
|
||||
// v->printIndexedBuffer("Values");
|
||||
// i->printIndexedBuffer("Indices");
|
||||
// i->printShapeInfo("Indices shape");
|
||||
ASSERT_TRUE(expV.isSameShape(v));
|
||||
ASSERT_TRUE(expV.equalsTo(v));
|
||||
|
||||
|
@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
|
|||
auto i = result->at(1);
|
||||
auto c = result->at(2);
|
||||
|
||||
v->printShapeInfo();
|
||||
v->printIndexedBuffer("Values");
|
||||
i->printShapeInfo();
|
||||
i->printIndexedBuffer("Indices");
|
||||
c->printShapeInfo();
|
||||
c->printIndexedBuffer("Counts");
|
||||
// v->printShapeInfo();
|
||||
// v->printIndexedBuffer("Values");
|
||||
// i->printShapeInfo();
|
||||
// i->printIndexedBuffer("Indices");
|
||||
// c->printShapeInfo();
|
||||
// c->printIndexedBuffer("Counts");
|
||||
|
||||
ASSERT_TRUE(expV.isSameShape(v));
|
||||
ASSERT_TRUE(expV.equalsTo(v));
|
||||
|
@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
|
|||
auto result1 = op.execute({&x}, {1.}, {1});
|
||||
ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
|
||||
auto z1 = result1->at(0);
|
||||
z1->printIndexedBuffer("Z1");
|
||||
// z1->printIndexedBuffer("Z1");
|
||||
auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
|
||||
exp1.printIndexedBuffer("EXP1");
|
||||
z1->printShapeInfo("Z1 shape");
|
||||
exp1.printShapeInfo("EXP1 shape");
|
||||
// exp1.printIndexedBuffer("EXP1");
|
||||
// z1->printShapeInfo("Z1 shape");
|
||||
// exp1.printShapeInfo("EXP1 shape");
|
||||
ASSERT_TRUE(exp1.isSameShape(z1));
|
||||
ASSERT_TRUE(exp1.equalsTo(z1));
|
||||
|
||||
|
@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {
|
|||
|
||||
auto exp = MmulHelper::mmul(&x, &y);
|
||||
|
||||
exp->printShapeInfo("exp shape");
|
||||
// exp->printShapeInfo("exp shape");
|
||||
|
||||
nd4j::ops::batched_gemm op;
|
||||
auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
|
||||
|
|
|
@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
|
|||
sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
|
||||
k.tickWriteDevice();
|
||||
v.tickWriteDevice();
|
||||
k.printIndexedBuffer("KEYS");
|
||||
// k.printIndexedBuffer("KEYS");
|
||||
ASSERT_EQ(ek, k);
|
||||
ASSERT_EQ(ev, v);
|
||||
}
|
||||
|
@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {
|
|||
k.tickWriteDevice();
|
||||
v.tickWriteDevice();
|
||||
|
||||
k.printIndexedBuffer("k");
|
||||
v.printIndexedBuffer("v");
|
||||
// k.printIndexedBuffer("k");
|
||||
// v.printIndexedBuffer("v");
|
||||
|
||||
ASSERT_EQ(ek, k);
|
||||
ASSERT_EQ(ev, v);
|
||||
|
|
Loading…
Reference in New Issue