Shyrma docs (#222)
* - documenting and profiling matrix_set_diag cuda kernel Signed-off-by: Yurii <yurii@skymind.io> * - correct formula of pnorm pooling in cuda 2d/3d kernels - remove helper matrix_diag which duplicates work of helper matrix_set_diag Signed-off-by: Yurii <yurii@skymind.io>master
parent
106524663b
commit
cb4c9377b1
|
@ -15,7 +15,7 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// @author Yurii Shyrma (iuriish@yahoo.com), created on 07.12.2017
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <op_boilerplate.h>
|
#include <op_boilerplate.h>
|
||||||
|
@ -38,10 +38,9 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) {
|
||||||
for(int i = 0; i < diagonal->rankOf() - 1; ++i)
|
for(int i = 0; i < diagonal->rankOf() - 1; ++i)
|
||||||
REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());
|
REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str());
|
||||||
|
|
||||||
REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)),
|
REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min<Nd4jLong>(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
|
||||||
0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1));
|
|
||||||
|
|
||||||
helpers::matrixSetDiag(block.launchContext(), input, diagonal, output);
|
helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false);
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,26 +15,30 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// Created to use with batched tensor by GS <sgazeos@gmail.com> 3/21/2018
|
// @author GS <sgazeos@gmail.com> 3/21/2018
|
||||||
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <ops/declarable/CustomOperations.h>
|
#include <ops/declarable/CustomOperations.h>
|
||||||
#include <ops/declarable/helpers/matrix_diag.h>
|
#include <ops/declarable/helpers/matrixSetDiag.h>
|
||||||
|
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
|
|
||||||
CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
|
CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) {
|
||||||
auto input = INPUT_VARIABLE(0);
|
|
||||||
|
auto diagonal = INPUT_VARIABLE(0);
|
||||||
auto output = OUTPUT_VARIABLE(0);
|
auto output = OUTPUT_VARIABLE(0);
|
||||||
|
|
||||||
REQUIRE_TRUE(!input->isScalar(), 0, "CUSTOM_OP matrix_diag: input array must be at list a vector, but scalar was given!");
|
REQUIRE_TRUE(!diagonal->isScalar(), 0, "CUSTOM_OP matrix_diag: input diagonal array must be at list a vector, but scalar was given!");
|
||||||
|
|
||||||
output->nullify();
|
helpers::matrixSetDiag(block.launchContext(), *output, *diagonal, *output, true);
|
||||||
return helpers::matrixDiag(block.launchContext(), input, output);
|
|
||||||
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
DECLARE_SHAPE_FN(matrix_diag) {
|
DECLARE_SHAPE_FN(matrix_diag) {
|
||||||
|
|
||||||
Nd4jLong* outShapeInfo = nullptr;
|
Nd4jLong* outShapeInfo = nullptr;
|
||||||
auto in = inputShape->at(0);
|
auto in = inputShape->at(0);
|
||||||
int inRank = shape::rank(in);
|
int inRank = shape::rank(in);
|
||||||
|
|
|
@ -76,7 +76,19 @@ namespace nd4j {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a batched matrix tensor with new batched diagonal values.
|
* Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array
|
||||||
|
*
|
||||||
|
* Input arrays:
|
||||||
|
* input: input array, considered as batch of matrices
|
||||||
|
* diagonal: array containing elements to be inserted into input array,
|
||||||
|
* following rank condition should be satisfied: diagonal_rank = input_rank - 1,
|
||||||
|
* the shapes of diagonal and input arrays must be equal except last dimension of input array,
|
||||||
|
* for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C],
|
||||||
|
* also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions
|
||||||
|
* that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2])
|
||||||
|
*
|
||||||
|
* Output array:
|
||||||
|
* has the same shape as input, corresponding diagonal elements are substituted
|
||||||
*/
|
*/
|
||||||
#if NOT_EXCLUDED(OP_matrix_set_diag)
|
#if NOT_EXCLUDED(OP_matrix_set_diag)
|
||||||
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
|
DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0);
|
||||||
|
|
|
@ -2411,7 +2411,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
||||||
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
|
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kd + kh + kw]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// Created by Yurii Shyrma on 07.12.2017.
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "ResultSet.h"
|
#include "ResultSet.h"
|
||||||
|
@ -27,32 +27,49 @@ namespace helpers {
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// Returns a batched matrix tensor with new batched diagonal values.
|
|
||||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static void _matrixSetDiag(const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||||
|
|
||||||
*output = *input;
|
// input and output are the same array (x == z) when zeroPad = true
|
||||||
|
// xRank = zRank, xRank = yRank + 1
|
||||||
|
// xLen = zLen
|
||||||
|
|
||||||
const int lastDimSize = input->sizeAt(-1);
|
const T* x = input.bufferAsT<T>();
|
||||||
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
|
const T* y = diagonal.bufferAsT<T>();
|
||||||
const int lastSmallDim = diagonal->sizeAt(-1);
|
T* z = output.bufferAsT<T>();
|
||||||
const int batchSize = input->lengthOf()/last2DimSize;
|
|
||||||
|
|
||||||
for(int i = 0; i < batchSize; ++i )
|
const Nd4jLong* xShapeInfo = input.getShapeInfo();
|
||||||
for(int j = 0; j < lastSmallDim; ++j) {
|
const Nd4jLong* yShapeInfo = diagonal.getShapeInfo();
|
||||||
output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
|
const Nd4jLong* zShapeInfo = output.getShapeInfo();
|
||||||
|
|
||||||
|
const bool areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
|
||||||
|
|
||||||
|
const int xRank = input.rankOf();
|
||||||
|
const auto xLen = input.lengthOf();
|
||||||
|
|
||||||
|
std::vector<Nd4jLong> coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same
|
||||||
|
|
||||||
|
PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords))
|
||||||
|
for (Nd4jLong i = 0; i < xLen; ++i) {
|
||||||
|
|
||||||
|
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data());
|
||||||
|
|
||||||
|
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank);
|
||||||
|
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank);
|
||||||
|
|
||||||
|
// condition to be on diagonal of innermost matrix
|
||||||
|
if(coords[xRank - 2] == coords[xRank - 1])
|
||||||
|
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)];
|
||||||
|
else
|
||||||
|
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||||
|
BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, zeroPad), LIBND4J_TYPES);
|
||||||
}
|
}
|
||||||
|
|
||||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
|
||||||
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (input, diagonal, output), LIBND4J_TYPES);
|
|
||||||
}
|
|
||||||
|
|
||||||
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,65 +0,0 @@
|
||||||
/*******************************************************************************
|
|
||||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
||||||
*
|
|
||||||
* This program and the accompanying materials are made available under the
|
|
||||||
* terms of the Apache License, Version 2.0 which is available at
|
|
||||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
||||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
||||||
* License for the specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
//
|
|
||||||
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "ResultSet.h"
|
|
||||||
#include <ops/declarable/helpers/matrix_diag.h>
|
|
||||||
#include <Status.h>
|
|
||||||
|
|
||||||
namespace nd4j {
|
|
||||||
namespace ops {
|
|
||||||
namespace helpers {
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Returns a batched matrix tensor with new batched diagonal values.
|
|
||||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
|
||||||
template <typename T>
|
|
||||||
static int _matrixDiag(const NDArray* input, NDArray* output) {
|
|
||||||
|
|
||||||
auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
|
|
||||||
auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
|
|
||||||
|
|
||||||
if (listOut->size() != listDiag->size()) {
|
|
||||||
nd4j_printf("matrix_diag: Input matrix has wrong shape.", "");
|
|
||||||
return ND4J_STATUS_VALIDATION;
|
|
||||||
}
|
|
||||||
int lastDimension = input->sizeAt(-1);
|
|
||||||
// TODO: tune this properlys
|
|
||||||
int lO = listOut->size();
|
|
||||||
PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold())
|
|
||||||
for(int i = 0; i < lO; ++i)
|
|
||||||
for (int e = 0; e < lastDimension; e++)
|
|
||||||
listOut->at(i)->p(e, e, listDiag->at(i)->e<T>(e));
|
|
||||||
|
|
||||||
delete listOut;
|
|
||||||
delete listDiag;
|
|
||||||
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
|
||||||
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (input, output), LIBND4J_TYPES);
|
|
||||||
}
|
|
||||||
|
|
||||||
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (const NDArray* input, NDArray* output), LIBND4J_TYPES);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -957,9 +957,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
||||||
|
|
||||||
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
||||||
|
|
||||||
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
|
for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
|
||||||
for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
|
for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
|
||||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
|
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
|
||||||
|
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
|
||||||
|
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1123,10 +1127,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
|
||||||
|
|
||||||
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
val *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
||||||
|
|
||||||
for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
|
for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
|
||||||
for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
|
for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
|
||||||
for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
|
for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
|
||||||
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0 - 1.f));
|
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
|
||||||
|
const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
|
||||||
|
nd4j::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(x[xOffset]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,63 +15,87 @@
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
//
|
//
|
||||||
// Created by Yurii Shyrma on 07.12.2017.
|
// @author Yurii Shyrma (iuriish@yahoo.com)
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "ResultSet.h"
|
#include "ResultSet.h"
|
||||||
#include <ops/declarable/helpers/matrixSetDiag.h>
|
#include <ops/declarable/helpers/matrixSetDiag.h>
|
||||||
|
#include <PointersManager.h>
|
||||||
|
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
namespace helpers {
|
namespace helpers {
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static __global__ void matrixSetDiagKernel(void* outputBuffer, Nd4jLong* outputShape, void const* diagonalBuffer, Nd4jLong* diagonalShape, Nd4jLong lastDimSize, Nd4jLong last2DimSize, Nd4jLong lastSmallDim, Nd4jLong batchSize) {
|
__global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
|
||||||
__shared__ T* z;
|
|
||||||
__shared__ T const* x;
|
// x - input, shape [A,B,C]
|
||||||
__shared__ Nd4jLong outLength, diagonalLen;
|
// y - diagonal, shape [A,B]
|
||||||
|
// z - output, shape [A,B,C]
|
||||||
|
// input and output are the same array (x == z) when zeroPad = true
|
||||||
|
|
||||||
|
const auto x = reinterpret_cast<const T*>(vx);
|
||||||
|
const auto y = reinterpret_cast<const T*>(vy);
|
||||||
|
auto z = reinterpret_cast<T*>(vz);
|
||||||
|
|
||||||
|
__shared__ int xRank; // xRank = zRank, xRank = yRank + 1
|
||||||
|
__shared__ Nd4jLong xLen, *sharedMem; // xLen = zLen
|
||||||
|
__shared__ bool areSameOffsets;
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
z = reinterpret_cast<T*>(outputBuffer);
|
|
||||||
x = reinterpret_cast<T const*>(diagonalBuffer);
|
extern __shared__ unsigned char shmem[];
|
||||||
outLength = shape::length(outputShape);
|
sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
|
||||||
diagonalLen = shape::length(diagonalShape);
|
|
||||||
|
areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo); // shapes are definitely the same, but strides might not
|
||||||
|
|
||||||
|
xRank = shape::rank(xShapeInfo);
|
||||||
|
xLen = shape::length(xShapeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for(int i = blockIdx.x; i < batchSize; i+= gridDim.x )
|
auto coords = sharedMem + threadIdx.x * xRank; // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
|
||||||
for(int j = threadIdx.x; j < lastSmallDim; j += blockDim.x) {
|
const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
// z[i * last2DimSize + j * (lastDimSize + 1)] = x[i * lastSmallDim + j];
|
|
||||||
z[shape::getIndexOffset(i * last2DimSize + j * (lastDimSize + 1), outputShape, outLength)] = x[shape::getIndexOffset(i * lastSmallDim + j, diagonalShape, diagonalLen)];
|
for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
|
||||||
|
|
||||||
|
shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords);
|
||||||
|
|
||||||
|
const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank);
|
||||||
|
const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank);
|
||||||
|
|
||||||
|
// condition to be on diagonal of innermost matrix
|
||||||
|
if(coords[xRank - 2] == coords[xRank - 1])
|
||||||
|
z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)];
|
||||||
|
else
|
||||||
|
z[zOffset] = zeroPad ? static_cast<T>(0) : x[xOffset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Returns a batched matrix tensor with new batched diagonal values.
|
///////////////////////////////////////////////////////////////////
|
||||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static void _matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool zeroPad) {
|
||||||
*output = *input;
|
|
||||||
|
|
||||||
const int lastDimSize = input->sizeAt(-1);
|
|
||||||
const int last2DimSize = input->sizeAt(-1) * input->sizeAt(-2);
|
|
||||||
const int lastSmallDim = diagonal->sizeAt(-1);
|
|
||||||
const int batchSize = input->lengthOf()/last2DimSize;
|
|
||||||
auto stream = context->getCudaStream();
|
|
||||||
dim3 launchDims(256, 512, 8192);
|
|
||||||
matrixSetDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), diagonal->getSpecialBuffer(), diagonal->getSpecialShapeInfo(), lastDimSize, last2DimSize, lastSmallDim, batchSize);
|
|
||||||
//// #pragma omp parallel for if(batchSize > Environment::getInstance()->elementwiseThreshold()) schedule(static)
|
|
||||||
// for(int i = 0; i < batchSize; ++i )
|
|
||||||
// for(int j = 0; j < lastSmallDim; ++j) {
|
|
||||||
// output->p(i*last2DimSize + j*(lastDimSize + 1), diagonal->e<T>(i*lastSmallDim + j));
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
matrixSetDiagCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, zeroPad);
|
||||||
}
|
}
|
||||||
|
|
||||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output) {
|
///////////////////////////////////////////////////////////////////
|
||||||
BUILD_SINGLE_SELECTOR(input->dataType(), _matrixSetDiag, (context, input, diagonal, output), LIBND4J_TYPES);
|
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) {
|
||||||
}
|
|
||||||
|
|
||||||
BUILD_SINGLE_TEMPLATE(template void _matrixSetDiag, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output), LIBND4J_TYPES);
|
const int threadsPerBlock = MAX_NUM_THREADS / 2;
|
||||||
|
const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
|
const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
|
||||||
|
|
||||||
|
PointersManager manager(context, "matrixSetDiag");
|
||||||
|
|
||||||
|
NDArray::prepareSpecialUse({&output}, {&input, &diagonal});
|
||||||
|
BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiagCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), diagonal.getSpecialBuffer(), diagonal.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), zeroPad), LIBND4J_TYPES);
|
||||||
|
NDArray::registerSpecialUse({&output}, {&input, &diagonal});
|
||||||
|
|
||||||
|
manager.synchronize();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,95 +0,0 @@
|
||||||
/*******************************************************************************
|
|
||||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
||||||
*
|
|
||||||
* This program and the accompanying materials are made available under the
|
|
||||||
* terms of the Apache License, Version 2.0 which is available at
|
|
||||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
||||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
||||||
* License for the specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
//
|
|
||||||
// Created by GS <sgazeos@gmail.com> on 3/21/2018.
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "ResultSet.h"
|
|
||||||
#include <ops/declarable/helpers/matrix_diag.h>
|
|
||||||
#include <Status.h>
|
|
||||||
#include <ShapeUtils.h>
|
|
||||||
#include <ShapeUtils.h>
|
|
||||||
#include <TAD.h>
|
|
||||||
#include <cuda_exception.h>
|
|
||||||
#include <helpers/ConstantTadHelper.h>
|
|
||||||
|
|
||||||
namespace nd4j {
|
|
||||||
namespace ops {
|
|
||||||
namespace helpers {
|
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static __global__ void matrixDiagKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
|
|
||||||
Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets,
|
|
||||||
Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) {
|
|
||||||
int totalThreads = blockDim.x;
|
|
||||||
for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) {
|
|
||||||
auto yOffset = tadInputOffsets[i];
|
|
||||||
auto xOffset = tadOutputOffsets[i];
|
|
||||||
for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
|
|
||||||
Nd4jLong coords[2] = {j, j};
|
|
||||||
Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), shape::stride(tadOnlyOutputShapeInfo), coords, 2);
|
|
||||||
//shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)
|
|
||||||
*(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffset) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + shape::getIndexOffset(j, tadOnlyInputShapeInfo, inputLength));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Returns a batched matrix tensor with new batched diagonal values.
|
|
||||||
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static int _matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
|
||||||
cudaStream_t* stream = context->getCudaStream();
|
|
||||||
//auto listOut = output->allTensorsAlongDimension({output->rankOf() - 2, output->rankOf() - 1});
|
|
||||||
//auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 1});
|
|
||||||
|
|
||||||
//auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf();
|
|
||||||
std::vector<int> dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1});
|
|
||||||
const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension});
|
|
||||||
//printf("Repeat delta %lld, numTads %lld\n", repeatDelta, numTads);
|
|
||||||
//tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets;
|
|
||||||
std::vector<int> inputDims({input->rankOf() - 1});
|
|
||||||
std::vector<int> outputDims({output->rankOf() - 2, output->rankOf() - 1});
|
|
||||||
|
|
||||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims);
|
|
||||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims);
|
|
||||||
|
|
||||||
if (!input->isActualOnDeviceSide())
|
|
||||||
input->syncToDevice();
|
|
||||||
|
|
||||||
if (!output->isActualOnDeviceSide())
|
|
||||||
output->syncToDevice();
|
|
||||||
|
|
||||||
// create cuda stream and LaunchContext
|
|
||||||
cudaError_t cudaResult;
|
|
||||||
|
|
||||||
dim3 launchDims(256, 512, 8192);
|
|
||||||
matrixDiagKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), output->getSpecialBuffer(), numTads, input->sizeAt(-1), packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets());
|
|
||||||
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
int matrixDiag(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
|
|
||||||
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiag, (context, input, output), LIBND4J_TYPES);
|
|
||||||
}
|
|
||||||
|
|
||||||
BUILD_SINGLE_TEMPLATE(template int _matrixDiag, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -28,8 +28,7 @@ namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
namespace helpers {
|
namespace helpers {
|
||||||
|
|
||||||
void matrixSetDiag(nd4j::LaunchContext * context, const NDArray* input, const NDArray* diagonal, NDArray* output);
|
void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
/*******************************************************************************
|
|
||||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
||||||
*
|
|
||||||
* This program and the accompanying materials are made available under the
|
|
||||||
* terms of the Apache License, Version 2.0 which is available at
|
|
||||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
||||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
||||||
* License for the specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
//
|
|
||||||
// @author GS <sgazeos@gmail.com>
|
|
||||||
//
|
|
||||||
#ifndef __MATRIX_DIAG_HELPERS__
|
|
||||||
#define __MATRIX_DIAG_HELPERS__
|
|
||||||
#include <op_boilerplate.h>
|
|
||||||
#include <NDArray.h>
|
|
||||||
|
|
||||||
namespace nd4j {
|
|
||||||
namespace ops {
|
|
||||||
namespace helpers {
|
|
||||||
|
|
||||||
int matrixDiag(nd4j::LaunchContext * context, NDArray const* input, NDArray* output);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
|
@ -117,9 +117,9 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) {
|
||||||
|
|
||||||
auto v = result->at(0);
|
auto v = result->at(0);
|
||||||
auto i = result->at(1);
|
auto i = result->at(1);
|
||||||
v->printIndexedBuffer("Values");
|
// v->printIndexedBuffer("Values");
|
||||||
i->printIndexedBuffer("Indices");
|
// i->printIndexedBuffer("Indices");
|
||||||
i->printShapeInfo("Indices shape");
|
// i->printShapeInfo("Indices shape");
|
||||||
ASSERT_TRUE(expV.isSameShape(v));
|
ASSERT_TRUE(expV.isSameShape(v));
|
||||||
ASSERT_TRUE(expV.equalsTo(v));
|
ASSERT_TRUE(expV.equalsTo(v));
|
||||||
|
|
||||||
|
@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) {
|
||||||
auto i = result->at(1);
|
auto i = result->at(1);
|
||||||
auto c = result->at(2);
|
auto c = result->at(2);
|
||||||
|
|
||||||
v->printShapeInfo();
|
// v->printShapeInfo();
|
||||||
v->printIndexedBuffer("Values");
|
// v->printIndexedBuffer("Values");
|
||||||
i->printShapeInfo();
|
// i->printShapeInfo();
|
||||||
i->printIndexedBuffer("Indices");
|
// i->printIndexedBuffer("Indices");
|
||||||
c->printShapeInfo();
|
// c->printShapeInfo();
|
||||||
c->printIndexedBuffer("Counts");
|
// c->printIndexedBuffer("Counts");
|
||||||
|
|
||||||
ASSERT_TRUE(expV.isSameShape(v));
|
ASSERT_TRUE(expV.isSameShape(v));
|
||||||
ASSERT_TRUE(expV.equalsTo(v));
|
ASSERT_TRUE(expV.equalsTo(v));
|
||||||
|
@ -200,11 +200,11 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) {
|
||||||
auto result1 = op.execute({&x}, {1.}, {1});
|
auto result1 = op.execute({&x}, {1.}, {1});
|
||||||
ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
|
ASSERT_EQ(result1->status(), ND4J_STATUS_OK);
|
||||||
auto z1 = result1->at(0);
|
auto z1 = result1->at(0);
|
||||||
z1->printIndexedBuffer("Z1");
|
// z1->printIndexedBuffer("Z1");
|
||||||
auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
|
auto exp1 = x.reduceAlongDims(reduce::Norm2, dims, false, false);
|
||||||
exp1.printIndexedBuffer("EXP1");
|
// exp1.printIndexedBuffer("EXP1");
|
||||||
z1->printShapeInfo("Z1 shape");
|
// z1->printShapeInfo("Z1 shape");
|
||||||
exp1.printShapeInfo("EXP1 shape");
|
// exp1.printShapeInfo("EXP1 shape");
|
||||||
ASSERT_TRUE(exp1.isSameShape(z1));
|
ASSERT_TRUE(exp1.isSameShape(z1));
|
||||||
ASSERT_TRUE(exp1.equalsTo(z1));
|
ASSERT_TRUE(exp1.equalsTo(z1));
|
||||||
|
|
||||||
|
@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) {
|
||||||
|
|
||||||
auto exp = MmulHelper::mmul(&x, &y);
|
auto exp = MmulHelper::mmul(&x, &y);
|
||||||
|
|
||||||
exp->printShapeInfo("exp shape");
|
// exp->printShapeInfo("exp shape");
|
||||||
|
|
||||||
nd4j::ops::batched_gemm op;
|
nd4j::ops::batched_gemm op;
|
||||||
auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
|
auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3});
|
||||||
|
|
|
@ -79,7 +79,7 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_2) {
|
||||||
sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
|
sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true);
|
||||||
k.tickWriteDevice();
|
k.tickWriteDevice();
|
||||||
v.tickWriteDevice();
|
v.tickWriteDevice();
|
||||||
k.printIndexedBuffer("KEYS");
|
// k.printIndexedBuffer("KEYS");
|
||||||
ASSERT_EQ(ek, k);
|
ASSERT_EQ(ek, k);
|
||||||
ASSERT_EQ(ev, v);
|
ASSERT_EQ(ev, v);
|
||||||
}
|
}
|
||||||
|
@ -98,8 +98,8 @@ TEST_F(SortCudaTests, test_tad_sort_by_key_1) {
|
||||||
k.tickWriteDevice();
|
k.tickWriteDevice();
|
||||||
v.tickWriteDevice();
|
v.tickWriteDevice();
|
||||||
|
|
||||||
k.printIndexedBuffer("k");
|
// k.printIndexedBuffer("k");
|
||||||
v.printIndexedBuffer("v");
|
// v.printIndexedBuffer("v");
|
||||||
|
|
||||||
ASSERT_EQ(ek, k);
|
ASSERT_EQ(ek, k);
|
||||||
ASSERT_EQ(ev, v);
|
ASSERT_EQ(ev, v);
|
||||||
|
|
Loading…
Reference in New Issue