Shugeo cuda docs1 (#249)

* Added comments on axis shifts.

* Fixed LUP solver usage. Added helpers doc.

* Switched off OMP for roll and lup. Fixed OMP usage for ClipByGlobalNorm.

* Switched off OMP for ClipByGlobalNorm to reduce OMP ambiguity (see the sketch below).
master
shugeo 2019-09-09 16:27:45 +03:00 committed by raver119
parent b582e69e3b
commit c9f8a904ad
5 changed files with 318 additions and 345 deletions
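The OMP "ambiguity" above refers to accumulating into a shared variable inside a parallel-for without a reduction clause, which is a data race; the hunks below disable those pragmas. Reduction clauses only apply to plain scalars (or user-declared reductions), which is why clipByGlobalNorm_ below also switches its accumulator from an NDArray to a plain T. A minimal sketch of the problem and the standard fix (illustrative, not the nd4j macros):

```cpp
#include <vector>

// Racy: every thread updates `sum` concurrently; results vary run to run.
double sumRacy(const std::vector<double>& xs) {
    double sum = 0.0;
    #pragma omp parallel for
    for (long i = 0; i < (long)xs.size(); ++i)
        sum += xs[i];          // unsynchronized read-modify-write
    return sum;
}

// Fixed: the reduction clause gives each thread a private copy and combines them.
double sumReduced(const std::vector<double>& xs) {
    double sum = 0.0;
    #pragma omp parallel for reduction(+ : sum)
    for (long i = 0; i < (long)xs.size(); ++i)
        sum += xs[i];
    return sum;
}
```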

View File

@@ -50,13 +50,13 @@ namespace helpers {
     int n = inputMatrix->rows();
     invertedMatrix->assign(0.f);
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    // PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n; i++)
         invertedMatrix->p(i, i, 1.0f);
     if (inputMatrix->isIdentityMatrix()) return;
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 1; i < n; i++)
         invertedMatrix->t<T>(i, i - 1) = -inputMatrix->t<T>(i, i - 1);
@@ -83,11 +83,11 @@ namespace helpers {
         return;
     }
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n; i++)
         invertedMatrix->t<T>(i, i) /= inputMatrix->t<T>(i, i);
-    PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold())
     for (int i = 0; i < n - 1; i++)
         invertedMatrix->t<T>(i, i + 1) -= (inputMatrix->t<T>(i, i + 1) * invertedMatrix->t<T>(i + 1, i + 1) / inputMatrix->t<T>(i, i));
@@ -124,7 +124,7 @@ namespace helpers {
     for (int i = 0; i < rowNum; i++) {
         pivotValue = T(0.0);
         pivot = -1;
-        PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue))
+        //PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue))
         for (int rowCounter = i; rowCounter < rowNum; rowCounter++) {
             if (nd4j::math::nd4j_abs(compoundMatrix.t<T>(rowCounter, i)) > pivotValue) {
                 pivotValue = nd4j::math::nd4j_abs(compoundMatrix.t<T>(rowCounter, i));
@@ -140,7 +140,7 @@ namespace helpers {
         for (int j = i + 1; j < rowNum; j++) {
             compoundMatrix.t<T>(j, i) /= compoundMatrix.t<T>(i, i);
-            PRAGMA_OMP_PARALLEL_FOR
+            //PRAGMA_OMP_PARALLEL_FOR
             for (int k = i + 1; k < rowNum; k++) {
                 compoundMatrix.t<T>(j, k) -= compoundMatrix.t<T>(j, i) * compoundMatrix.t<T>(i, k);
             }
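The pragma removed in the pivot-search hunk above guarded a loop where `pivot` and `pivotValue` are shared across iterations, so running it in parallel races on both. For contrast, a race-free OpenMP argmax sketch (hypothetical helper names, not the nd4j code):

```cpp
#include <cmath>
#include <vector>

// Minimal sketch: find the pivot row (largest |value| in a column) under OpenMP.
// `column` stands in for column i of compoundMatrix; names are illustrative only.
static int findPivotRow(const std::vector<double>& column, int begin) {
    double pivotValue = 0.0;
    int pivot = -1;
    #pragma omp parallel
    {
        double localValue = 0.0;   // per-thread maximum
        int localPivot = -1;
        #pragma omp for nowait
        for (int row = begin; row < (int)column.size(); ++row) {
            if (std::fabs(column[row]) > localValue) {
                localValue = std::fabs(column[row]);
                localPivot = row;
            }
        }
        #pragma omp critical       // merge per-thread results, once per thread
        {
            if (localValue > pivotValue) {
                pivotValue = localValue;
                pivot = localPivot;
            }
        }
    }
    return pivot;
}
```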

View File

@@ -43,7 +43,7 @@ namespace helpers {
     int remainShift = fullLen % actualShift;
     // stage 1) swap last actualShift elements with first ones.
-    PRAGMA_OMP_PARALLEL_FOR_IF(actualShift > Environment::getInstance()->elementwiseThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR //_IF(actualShift > Environment::getInstance()->elementwiseThreshold())
     for (int e = 0; e < actualShift; ++e) {
         int sourceIndex = fullLen - actualShift + e;
@@ -56,7 +56,7 @@ namespace helpers {
     }
     // stage 2) swap swapped actualShift elements with rest remainShiftCount times.
-    PRAGMA_OMP_PARALLEL_FOR_IF(shiftCount > Environment::getInstance()->tadThreshold())
+    //PRAGMA_OMP_PARALLEL_FOR //_IF(shiftCount > Environment::getInstance()->tadThreshold())
     for (int count = 1; count < shiftCount; ++count) {
         for (int e = 0; e < actualShift; ++e) {
             int destinationIndex = fullLen - (count + 1) * actualShift + e;
@@ -91,7 +91,7 @@ namespace helpers {
     output->assign(input);
     auto source = output; //input;
-    for (auto i = 0; i < axes.size(); i++) {
+    for (size_t i = 0; i < axes.size(); i++) {
         int axe = axes[i];
         if (axe == source->rankOf() - 1) { // last dimension
             std::unique_ptr<ResultSet> listOfTensors(source->allTensorsAlongDimension({axe}));
@@ -115,7 +115,7 @@ namespace helpers {
     std::unique_ptr<ResultSet> listOfTensors(source->allTensorsAlongDimension({dims}));
     std::unique_ptr<ResultSet> listOfOutTensors(output->allTensorsAlongDimension({dims}));
-    //
+
     int fullLen = listOfTensors->size();
     int sizeAt = input->sizeAt(axe);
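Stage 2's outer loop is order-dependent: iteration `count` swaps against the block that iteration `count - 1` just wrote, so the parallel-for annotation was unsound and is now disabled. For reference, the intended semantics of a 1-D roll match std::rotate; a minimal sketch (illustrative only):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v{0, 1, 2, 3, 4, 5, 6};
    int shift = 2; // roll right by 2 -> {5, 6, 0, 1, 2, 3, 4}

    // std::rotate makes the element at (end - shift) the new front,
    // which is exactly a right roll by `shift`.
    std::rotate(v.begin(), v.end() - shift, v.end());

    for (int x : v) std::printf("%d ", x); // prints: 5 6 0 1 2 3 4
    std::printf("\n");
    return 0;
}
```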

View File

@@ -957,26 +957,27 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output,
 template <typename T>
 static void clipByGlobalNorm_(std::vector<NDArray*> const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector<NDArray*>& outputs, bool isInplace) {
-    NDArray globalNorm = NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
-    PRAGMA_OMP_PARALLEL_FOR
+    T globalNorm = 0; //NDArrayFactory::create<T>(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
+    // PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(sumT : globalNorm)
     for (size_t i = 0; i < inputs.size(); i++) {
         auto input = inputs[i];
         auto l2norm = input->reduceNumber(reduce::Norm2);
-        globalNorm += l2norm * l2norm;
+        globalNorm += l2norm.t<T>(0) * l2norm.t<T>(0);
     }
-    globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr); // = nd4j::math::nd4j_sqrt(globalNorm);
-    outputs[inputs.size()]->p(0, globalNorm);
-    const T factor = clipNorm / globalNorm.e<T>(0);
-    PRAGMA_OMP_PARALLEL_FOR
+    //globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr); // = nd4j::math::nd4j_sqrt(globalNorm);
+    auto normS = nd4j::math::nd4j_sqrt<T,T>(globalNorm);
+    outputs[inputs.size()]->p(0, normS);
+    const T factor = clipNorm / normS;
+    // PRAGMA_OMP_PARALLEL_FOR
     for (size_t e = 0; e < inputs.size(); e++) {
         // all-reduce
         auto input = inputs[e];
         auto output = outputs[e];
-        if (globalNorm.e<double>(0) <= clipNorm) {
+        if (normS <= clipNorm) {
             output->assign(input);
         }
         else {
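The rewritten helper accumulates sqrt(sum_t ||t||^2) in a plain scalar instead of a one-element NDArray, which removes the need for the racy parallel accumulation. A self-contained sketch of the same semantics over raw vectors (illustrative, not the nd4j API):

```cpp
#include <cmath>
#include <vector>

// Clip a list of "tensors" (flat vectors here) by their global L2 norm:
// globalNorm = sqrt(sum over tensors of ||t||^2); if it exceeds clipNorm,
// every tensor is scaled by clipNorm / globalNorm.
void clipByGlobalNorm(std::vector<std::vector<float>>& tensors, float clipNorm) {
    float sumSq = 0.f;
    for (const auto& t : tensors)
        for (float x : t)
            sumSq += x * x;
    const float globalNorm = std::sqrt(sumSq);
    if (globalNorm <= clipNorm) return;   // already within the bound
    const float factor = clipNorm / globalNorm;
    for (auto& t : tensors)
        for (float& x : t)
            x *= factor;
}
```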

View File

@@ -27,11 +27,11 @@ namespace helpers {
 void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) {
     output.resize(axisVector->lengthOf());
-    axisVector->tickReadDevice();
-    axisVector->syncToHost();
+    axisVector->tickReadDevice(); // mark input as read on device
+    axisVector->syncToHost(); // sync to host
     for (int e = 0; e < axisVector->lengthOf(); e++) {
         auto ca = axisVector->e<int>(e);
-        if (ca < 0)
+        if (ca < 0) // shift values on rank for negative vals
             ca += rank;
         output[e] = ca;
@@ -41,7 +41,7 @@ namespace helpers {
 void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) {
     for (int e = 0; e < axisVector.size(); e++) {
         auto a = axisVector[e];
-        if (a < 0)
+        if (a < 0) // shift vals on rank for negative vals
             axisVector[e] = a + rank;
     }
 }
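Both overloads normalize negative axes the NumPy way: an axis a < 0 refers to axis rank + a. A tiny worked example (illustrative helper, not the nd4j function):

```cpp
#include <cassert>
#include <vector>

// Normalize negative axes against a tensor rank, mirroring adjustAxis above.
std::vector<int> normalizeAxes(int rank, std::vector<int> axes) {
    for (auto& a : axes)
        if (a < 0) a += rank;
    return axes;
}

int main() {
    // For a rank-4 tensor, axis -1 is 3 and axis -4 is 0.
    auto axes = normalizeAxes(4, {-1, 1, -4});
    assert(axes == (std::vector<int>{3, 1, 0}));
    return 0;
}
```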

View File

@@ -31,29 +31,9 @@
 namespace nd4j {
 namespace ops {
 namespace helpers {
-//  template <typename T>
-//  static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) {
-//      if (theFirst != theSecond) {
-//          auto start = threadIdx.x + blockIdx.x * blockDim.x;
-//          auto step = blockDim.x * gridDim.x;
-//          for (auto i = start; i < N; i += step) {
-//              Nd4jLong iCoord1[] = {theFirst, i};
-//              Nd4jLong iCoord2[] = {theSecond, i};
-//              auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2);
-//              auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2);
-//              //atomicExch(&matrix[iIndex1], matrix[iIndex2]);
-//              T e0 = matrix[iIndex1];
-//              T e1 = matrix[iIndex2];
-//              matrix[iIndex1] = e0;
-//              matrix[iIndex2] = e1;
-//          }
-//      }
-//  }
-//  BUILD_SINGLE_TEMPLATE(template void swapRows_, (NDArray* matrix, int theFirst, int theSecond), FLOAT_TYPES);
-//
-//  void swapRows(NDArray* matrix, int theFirst, int theSecond) {
-//      BUILD_SINGLE_SELECTOR(matrix->dataType(), swapRows_, (matrix, theFirst, theSecond), FLOAT_TYPES);
-//  }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert the second diagonal for a lower diagonal matrix
 template<typename T>
 static __global__ void
 invertKernelLow(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -71,11 +51,13 @@ namespace helpers {
     auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2);
     auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
+    // invert lower triangular matrix
     inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]);
     // math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]);
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert diagonal vals to upper diagonal matrix
 template<typename T>
 static __global__ void
 upvertKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -90,10 +72,13 @@ namespace helpers {
     auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2);
     // math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]);
+    // invert diagonal elements
     inverted[zIndex] /= input[xIndex];
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// invert upper second diagonal
 template<typename T>
 static __global__ void
 upvertKernelUp(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -120,18 +105,17 @@ namespace helpers {
     for (int i = start; i < n - 1; i += step) {
         Nd4jLong pos[] = {i, i + 1};
-        //Nd4jLong posY[] = {i, i};
         Nd4jLong posX[] = {i + 1, i + 1};
         auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2);
-        // auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2);
-        // auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2);
         auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2);
         auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2);
-        math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]);
+        // invert upper matrix
+        math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]);
         //inputMatrix->t<T>(i, i + 1) * invertedMatrix->t<T>(i + 1, i + 1) / inputMatrix->t<T>(i, i)
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
 template<typename T>
 static __global__ void
 invertLowKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -152,11 +136,14 @@ namespace helpers {
     auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
     auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, 2);
+    // invert non-diagonal elements
     math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]);
             }
         }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// Inversion of upper triangular matrix non-diagonal elements, once the main and second diagonals are already processed
 template<typename T>
 static __global__ void
 invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) {
@@ -183,18 +170,20 @@ namespace helpers {
     Nd4jLong posZ[] = {i, j};
     Nd4jLong posY[] = {k, j};
     Nd4jLong posX[] = {i, k};
-    // Nd4jLong posD[] = {i, i};
+    // inversion with Jordan-Gauss transformation
     auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2);
     auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2);
-    // auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2);
     auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2);
-    math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]); // / input[dIndex]);
-    // printf("(%d, %d) inverted[%lld] = %lf (-inverted[%lld] * input[%lld]\n", blockIdx.x, threadIdx.x, zIndex, inverted[zIndex], yIndex, xIndex);
+    // invert upper non-diagonal elements
+    math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]);
             }
         }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// procedure to invert a lower-triangular matrix.
+// In this case the lower triangular matrix has a main diagonal with general values
+//
 template<typename T>
 static void invertLowerMatrix_(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     int n = inputMatrix->rows();
@@ -204,20 +193,26 @@ namespace helpers {
     auto stream = context->getCudaStream();
-    // invert lower matrix
     // invert main diagonal
     upvertKernel<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
     // invert the second diagonal
     invertKernelLow<T><<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
-    // invertKernelLow<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
+    // invert non-diagonal elements
     invertLowKernel<T><<<n, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// caller for the lower-matrix inversion routine
 void invertLowerMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
     BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
     NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// procedure to invert an upper-triangular matrix.
+// In this case the upper triangular matrix has all ones on its main diagonal
 template<typename T>
 static void invertUpperMatrix_(LaunchContext *context, NDArray* inputMatrix, NDArray* invertedMatrix) {
     int n = inputMatrix->rows();
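For reference, the recurrences these kernels implement, reconstructed from the code above (a sketch, so treat the index bounds as approximate): with the output initialized to the identity, a lower triangular $L$ with a general diagonal is inverted as

```latex
% Inverse M = L^{-1} of a lower triangular matrix L, built in three passes
% (main diagonal, second diagonal, remaining elements), as in the kernels above:
\begin{align*}
  M_{ii}    &= \frac{1}{L_{ii}}, \\
  M_{i,i-1} &= -\frac{L_{i,i-1}}{L_{ii}\,L_{i-1,i-1}}, \\
  M_{ij}    &= -\frac{1}{L_{ii}} \sum_{k=j}^{i-1} L_{ik}\, M_{kj} \qquad (i > j + 1),
\end{align*}
% and for a unit upper triangular U (ones on the main diagonal):
\begin{align*}
  M_{ii} = 1, \qquad
  M_{i,i+1} = -U_{i,i+1}, \qquad
  M_{ij} = -\sum_{k=i+1}^{j} U_{ik}\, M_{kj} \quad (j > i + 1).
\end{align*}
```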
@@ -227,103 +222,60 @@ namespace helpers {
         return;
     }
-    //upvertKernel<T><<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
+    // invert upper matrix
+    // invert the second diagonal
     upvertKernelUp<T><<<1, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(),
                                                inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
-    invertedMatrix->tickWriteDevice();
-    invertedMatrix->printIndexedBuffer("Step1 UP inversion");
+    // invert other elements
     invertUpKernel<T><<<n, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n);
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// inversion of upper triangular matrix - runner routine
 void invertUpperMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) {
     NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
     BUILD_SINGLE_SELECTOR(invertedMatrix->dataType(), invertUpperMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE);
     NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
 }
-//  template <typename T>
-//  static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) {
-//      int swapCount = 0;
-//      for (int i = blockIdx.x; i < rowNum; i += gridDim.x) {
-//          auto pivotValue = T(0.0);
-//          auto pivot = -1;
-//
-//          for (int rowCounter = i; rowCounter < rowNum; rowCounter++) {
-//              Nd4jLong rowCoord[] = {rowCounter, i};
-//              auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2);
-//              if (nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue) {
-//                  pivotValue = nd4j::math::nd4j_abs(compound[rowPos]);
-//                  pivot = rowCounter;
-//              }
-//          }
-//
-//          if (pivotValue != T(0.0)) {
-//              swapRows_<T>(compound, compoundShape, pivot, i, rowNum);
-//              swapRows_<T>(permutation, permutationShape, pivot, i, rowNum);
-//              if (pivot != i)
-//                  swapCount++;
-//
-//              for (int j = i + 1; j < rowNum; j++) {
-//                  Nd4jLong posJIbuf[] = {j, i};
-//                  Nd4jLong posIIbuf[] = {i, i};
-//                  auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2);
-//                  auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2);
-//
-//                  compound[posJI] /= compound[posII];
-//                  for (int k = i + 1; k < rowNum; k++) {
-//                      Nd4jLong posJKbuf[] = {j, k};
-//                      Nd4jLong posIKbuf[] = {i, k};
-//                      auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2);
-//                      auto posIK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2);
-//                      T arg = compound[posJI] * compound[posIK];
-//                      compound[posJK] -= arg;
-//                  }
-//              }
-//          }
-//      }
-//  }
-//  template <typename T, typename F>
+// ------------------------------------------------------------------------------------------------------------------ //
+// determinant kernel - accumulates the product of all values on the main diagonal
 template<typename T>
 static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) {
-    //F tempRes = result[0];
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
     for (auto i = start; i < len; i += step) {
         auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
+        // multiply all diagonal elements
         math::atomics::nd4j_atomicMul(&result[0], compound[pos]);
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// determinant logarithm - accumulates the sum of the logarithms of all values on the main diagonal. All logarithm
+// values should be positive
 template<typename T>
 static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) {
-    // F tempRes = (F)result[0];
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
     for (auto i = start; i < len; i += step) {
         auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2);
+        // sum logs of all diagonal elements
         math::atomics::nd4j_atomicAdd(result, math::nd4j_log<T,T>(math::nd4j_abs(compound[pos])));
     }
-    // __syncthreads();
-    //
-    // if (threadIdx.x == 0) {
-    //     result[0] = (T)math::nd4j_log<F,F>(math::nd4j_abs(tempRes));
-    // }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// kernel to copy a matrix with given shape into a compound tensor at a given pos
+// output - an N-D tensor buffer with rank not less than 2, input - 2D square n x n matrix with n = rowLen
 template<typename T, typename F>
 static __global__ void
 fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
-    __shared__
-    F *matrix;
-    __shared__
-    T *inputBuf;
-    __shared__
-    Nd4jLong inputLen;
-    __shared__
-    Nd4jLong n2;
+    __shared__ F *matrix;
+    __shared__ T *inputBuf;
+    __shared__ Nd4jLong inputLen;
+    __shared__ Nd4jLong n2;
     if (threadIdx.x == 0) {
         matrix = reinterpret_cast<F*>(output);
@@ -332,6 +284,7 @@ namespace helpers {
         n2 = rowLen * rowLen;
     }
     __syncthreads();
+
     auto start = blockIdx.x * blockDim.x + threadIdx.x;
     auto step = blockDim.x * gridDim.x;
@@ -341,10 +294,11 @@ namespace helpers {
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// same as above, but without type conversion
 template<typename T>
 static __global__ void
-returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos,
-             Nd4jLong rowLen) {
+returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) {
     __shared__ T* matrix;
     __shared__ T* outputBuf;
     __shared__ Nd4jLong outputLen;
@@ -362,10 +316,12 @@ namespace helpers {
     for (int k = pos + start, j = start; j < n2; k += step, j += step) {
         auto zIndex = shape::getIndexOffset(k, outputShape, outputLen);
-        outputBuf[zIndex] = (T) matrix[j];
+        outputBuf[zIndex] = matrix[j];
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// kernel to fill up a permutation matrix. The permutation matrix is filled with zeros and ones
 template<typename F>
 static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) {
     F *permutation = reinterpret_cast<F *>(output);
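A permutation matrix has exactly one 1 in each row and column. Given a permutation vector perm where perm[i] names the column holding row i's one, building the dense matrix is a one-liner per row; a host-side sketch (the kernel's `source` array encodes pivots for this purpose, though its exact convention is solver-specific):

```cpp
#include <vector>

// Build a dense n x n permutation matrix from a permutation vector:
// row i gets a 1 in column perm[i], zeros elsewhere.
std::vector<float> permutationMatrix(const std::vector<int>& perm) {
    const int n = (int)perm.size();
    std::vector<float> P(n * n, 0.f);
    for (int i = 0; i < n; ++i)
        P[i * n + perm[i]] = 1.f;
    return P;
}
```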
@@ -380,37 +336,43 @@ namespace helpers {
     }
 }
+// ------------------------------------------------------------------------------------------------------------------ //
+// LUP decomposition runner - using the cuSolver library
+// if a permutation is given, LUP decomposition is used, LU decomposition otherwise
+// L - lower triangular, U - upper triangular, P - permutation matrices
+// PA = LU
+//
+// input - A matrix nxn
+// compound - C matrix L + U - I, i.e. the main diagonal and below hold L, above the main diagonal holds U
 template<typename T>
 static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) {
     auto stream = context->getCudaStream();
     auto n = input->rows();
     cusolverDnHandle_t cusolverH = nullptr;
+    // create solver handle
     cusolverStatus_t status = cusolverDnCreate(&cusolverH);
     if (CUSOLVER_STATUS_SUCCESS != status) {
         throw cuda_exception::build("Cannot create cuSolver handle", status);
     }
+    // set solver stream
     status = cusolverDnSetStream(cusolverH, *stream);
     if (CUSOLVER_STATUS_SUCCESS != status) {
         throw cuda_exception::build("Cannot set up stream for cuda solver", status);
     }
     int lwork = 0;
     int *d_info = nullptr;
+    // allocate memory for the solver info buffer
     auto err = cudaMalloc((void **) &d_info, sizeof(int));
     if (err) {
         throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err);
     }
     DataType dtype = input->dataType();
-    switch (dtype) {
+    switch (dtype) { // there are two cuSolver implementations of LUP decomposition - double and float
     case DataType::DOUBLE: {
         double *d_work = nullptr;
-        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
-        if (err) {
-            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
-                                        err);
-        }
+        // compute internal buffer size
         double *matrix = reinterpret_cast<double *>(input->specialBuffer());
         status = cusolverDnDgetrf_bufferSize(
             cusolverH,
@@ -422,6 +384,13 @@ namespace helpers {
         if (CUSOLVER_STATUS_SUCCESS != status) {
             throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
         }
+        err = cudaMalloc((void **) &d_work, sizeof(double) * lwork);
+        if (err) {
+            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
+                                        err);
+        }
         if (permutation == nullptr)
             status = cusolverDnDgetrf(
                 cusolverH,
@@ -458,11 +427,6 @@ namespace helpers {
     case DataType::FLOAT32: {
         float *matrix = reinterpret_cast<float*>(input->specialBuffer());
         float *d_work = nullptr;
-        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
-        if (err) {
-            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
-                                        err);
-        }
         status = cusolverDnSgetrf_bufferSize(
             cusolverH,
@@ -475,6 +439,12 @@ namespace helpers {
         if (CUSOLVER_STATUS_SUCCESS != status) {
             throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status);
        }
+        err = cudaMalloc((void **) &d_work, sizeof(float) * lwork);
+        if (err) {
+            throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer",
+                                        err);
+        }
         if (permutation == nullptr)
             status = cusolverDnSgetrf(
                 cusolverH,
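The bug these hunks fix is one of ordering: d_work was previously allocated before cusolverDnSgetrf_bufferSize had computed lwork, i.e. while lwork was still zero. A minimal sketch of the correct call sequence for the float path (error handling elided; assumes a valid n x n device matrix d_A in column-major layout):

```cpp
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Sketch: in-place LU factorization of an n x n float matrix with cuSolver.
// Correct order: create handle -> query workspace size -> allocate -> factorize.
void luFactorSketch(float* d_A, int n, cudaStream_t stream) {
    cusolverDnHandle_t handle = nullptr;
    cusolverDnCreate(&handle);
    cusolverDnSetStream(handle, stream);

    int lwork = 0;
    // 1) query the required workspace size first...
    cusolverDnSgetrf_bufferSize(handle, n, n, d_A, n, &lwork);

    float* d_work = nullptr;
    int* d_ipiv = nullptr;
    int* d_info = nullptr;
    // 2) ...then allocate with the now-known lwork
    cudaMalloc(&d_work, sizeof(float) * lwork);
    cudaMalloc(&d_ipiv, sizeof(int) * n);
    cudaMalloc(&d_info, sizeof(int));

    // 3) factorize in place: on exit d_A holds L (unit diagonal implied) and U,
    //    d_ipiv holds the pivot indices, d_info the status
    cusolverDnSgetrf(handle, n, n, d_A, n, d_work, d_ipiv, d_info);

    cudaFree(d_work); cudaFree(d_ipiv); cudaFree(d_info);
    cusolverDnDestroy(handle);
}
```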
@@ -520,9 +490,11 @@ namespace helpers {
     // NDArray::registerSpecialUse({input}, {input});
     input->tickWriteDevice();
 }
+// ------------------------------------------------------------------------------------------------------------------ //
 BUILD_SINGLE_TEMPLATE(template void lup_, (LaunchContext *context, NDArray *input, NDArray *output, NDArray *permutation), FLOAT_NATIVE);
+// ------------------------------------------------------------------------------------------------------------------ //
 template<typename T>
 static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) {
     Nd4jLong n = input->sizeAt(-1);