diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index 1e3c798e2..76817078b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -50,13 +50,13 @@ namespace helpers { int n = inputMatrix->rows(); invertedMatrix->assign(0.f); - PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) + // PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) for (int i = 0; i < n; i++) invertedMatrix->p(i, i, 1.0f); if (inputMatrix->isIdentityMatrix()) return; - PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) + //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) for (int i = 1; i < n; i++) invertedMatrix->t(i, i - 1) = -inputMatrix->t(i, i - 1); @@ -83,11 +83,11 @@ namespace helpers { return; } - PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) + //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) for (int i = 0; i < n; i++) invertedMatrix->t(i, i) /= inputMatrix->t(i, i); - PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) + //PRAGMA_OMP_PARALLEL_FOR_IF(n > Environment::getInstance()->elementwiseThreshold()) for (int i = 0; i < n - 1; i++) invertedMatrix->t(i, i + 1) -= (inputMatrix->t(i, i + 1) * invertedMatrix->t(i + 1, i + 1) / inputMatrix->t(i, i)); @@ -124,7 +124,7 @@ namespace helpers { for(int i = 0; i < rowNum; i++ ) { pivotValue = T(0.0); pivot = -1; - PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue)) + //PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue)) for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) { if (nd4j::math::nd4j_abs(compoundMatrix.t(rowCounter, i)) > pivotValue) { pivotValue = nd4j::math::nd4j_abs(compoundMatrix.t(rowCounter, i)); @@ -140,7 +140,7 @@ namespace helpers { for( int j = i + 1; j < rowNum; j++ ) { compoundMatrix.t(j, i) /= compoundMatrix.t(i, i); - PRAGMA_OMP_PARALLEL_FOR + //PRAGMA_OMP_PARALLEL_FOR for( int k = i + 1; k < rowNum; k++ ) { compoundMatrix.t(j, k) -= compoundMatrix.t(j, i) * compoundMatrix.t(i, k); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp index da3cb3259..b3b65f816 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp @@ -43,7 +43,7 @@ namespace helpers { int remainShift = fullLen % actualShift; // stage 1) swap last actualShift elements with first ones. - PRAGMA_OMP_PARALLEL_FOR_IF(actualShift > Environment::getInstance()->elementwiseThreshold()) + //PRAGMA_OMP_PARALLEL_FOR //_IF(actualShift > Environment::getInstance()->elementwiseThreshold()) for (int e = 0; e < actualShift; ++e) { int sourceIndex = fullLen - actualShift + e; @@ -56,7 +56,7 @@ namespace helpers { } // stage 2) swap swapped actualShift elements with rest remainShiftCount times. 
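// --- Editor's sketch (illustrative only, not part of the patch) ------------------------------
// Stages 1 and 2 of this roll helper implement the shift as repeated block swaps on a flat
// buffer. Assuming a positive shift that moves elements toward higher indices (numpy-style
// roll), the net effect on a plain contiguous buffer is equivalent to the standard rotation
// below (std::vector is used only for this sketch, not the NDArray API):

#include <algorithm>
#include <vector>

static void rollFlatBuffer(std::vector<float>& buf, int actualShift) {
    const int fullLen = static_cast<int>(buf.size());
    // rotating right by actualShift == rotating left by (fullLen - actualShift)
    std::rotate(buf.begin(), buf.begin() + (fullLen - actualShift), buf.end());
}
// ----------------------------------------------------------------------------------------------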
- PRAGMA_OMP_PARALLEL_FOR_IF(shiftCount > Environment::getInstance()->tadThreshold()) + //PRAGMA_OMP_PARALLEL_FOR //_IF(shiftCount > Environment::getInstance()->tadThreshold()) for (int count = 1; count < shiftCount; ++count) { for (int e = 0; e < actualShift; ++e) { int destinationIndex = fullLen - (count + 1) * actualShift + e; @@ -91,7 +91,7 @@ namespace helpers { output->assign(input); auto source = output; //input; - for (auto i = 0; i < axes.size(); i++) { + for (size_t i = 0; i < axes.size(); i++) { int axe = axes[i]; if (axe == source->rankOf() - 1) {// last dimension std::unique_ptr listOfTensors(source->allTensorsAlongDimension({axe})); @@ -115,7 +115,7 @@ namespace helpers { std::unique_ptr listOfTensors(source->allTensorsAlongDimension({dims})); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension({dims})); - + // int fullLen = listOfTensors->size(); int sizeAt = input->sizeAt(axe); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index a92e6713b..71181afe8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -957,26 +957,27 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, template static void clipByGlobalNorm_(std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { - NDArray globalNorm = NDArrayFactory::create(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list])) - PRAGMA_OMP_PARALLEL_FOR + T globalNorm = 0; //NDArrayFactory::create(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list])) +// PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(sumT : globalNorm) for (size_t i = 0; i < inputs.size(); i++) { auto input = inputs[i]; auto l2norm = input->reduceNumber(reduce::Norm2); - globalNorm += l2norm * l2norm; + globalNorm += l2norm.t(0) * l2norm.t(0); } - globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr);// = nd4j::math::nd4j_sqrt(globalNorm); - outputs[inputs.size()]->p(0, globalNorm); + //globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr);// = nd4j::math::nd4j_sqrt(globalNorm); + auto normS = nd4j::math::nd4j_sqrt(globalNorm); + outputs[inputs.size()]->p(0, normS); - const T factor = clipNorm / globalNorm.e(0); + const T factor = clipNorm / normS; - PRAGMA_OMP_PARALLEL_FOR +// PRAGMA_OMP_PARALLEL_FOR for (size_t e = 0; e < inputs.size(); e++) { // all-reduce auto input = inputs[e]; auto output = outputs[e]; - if (globalNorm.e(0) <= clipNorm) { + if (normS <= clipNorm) { output->assign(input); } else { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/axis.cu b/libnd4j/include/ops/declarable/helpers/cuda/axis.cu index a3b2bcd32..1236ae495 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/axis.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/axis.cu @@ -27,11 +27,11 @@ namespace helpers { void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector& output) { output.resize(axisVector->lengthOf()); - axisVector->tickReadDevice(); - axisVector->syncToHost(); + axisVector->tickReadDevice(); // mark input as read on device + axisVector->syncToHost(); // sync to host for (int e = 0; e < axisVector->lengthOf(); e++) { auto ca = axisVector->e(e); - if (ca < 0) + if (ca < 0) // shift values on rank for negative vals ca += rank; output[e] = ca; @@ -41,7 +41,7 @@ namespace helpers { void adjustAxis(Nd4jLong rank, std::vector &axisVector) { for (int 
e = 0; e < axisVector.size(); e++) { auto a = axisVector[e]; - if (a < 0) + if (a < 0) // shift vals on rank for negative vals axisVector[e] = a + rank; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index f11b56745..f0d1df1cc 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -31,34 +31,14 @@ namespace nd4j { namespace ops { namespace helpers { -// template -// static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) { -// if (theFirst != theSecond) { -// auto start = threadIdx.x + blockIdx.x * blockDim.x; -// auto step = blockDim.x * gridDim.x; -// for (auto i = start; i < N; i += step) { -// Nd4jLong iCoord1[] = {theFirst, i}; -// Nd4jLong iCoord2[] = {theSecond, i}; -// auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2); -// auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2); -// //atomicExch(&matrix[iIndex1], matrix[iIndex2]); -// T e0 = matrix[iIndex1]; -// T e1 = matrix[iIndex2]; -// matrix[iIndex1] = e0; -// matrix[iIndex2] = e1; -// } -// } -// } -// BUILD_SINGLE_TEMPLATE(template void swapRows_, (NDArray* matrix, int theFirst, int theSecond), FLOAT_TYPES); -// -// void swapRows(NDArray* matrix, int theFirst, int theSecond) { -// BUILD_SINGLE_SELECTOR(matrix->dataType(), swapRows_, (matrix, theFirst, theSecond), FLOAT_TYPES); -// } + +// ------------------------------------------------------------------------------------------------------------------ // +// invert the second diagonal for lower diagonal matrix template static __global__ void invertKernelLow(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { - T *inverted = reinterpret_cast(invertedBuf); - T *input = reinterpret_cast(inputBuf); + T* inverted = reinterpret_cast(invertedBuf); + T* input = reinterpret_cast(inputBuf); auto start = threadIdx.x + blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; @@ -71,11 +51,13 @@ namespace helpers { auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); + // invert lower triangular matrix inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]); // math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]); } } - +// ------------------------------------------------------------------------------------------------------------------ // +// invert diagonal vals to upper diagonal matrix template static __global__ void upvertKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { @@ -90,10 +72,13 @@ namespace helpers { auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); // math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]); + // invert diagonal elements inverted[zIndex] /= input[xIndex]; } } +// ------------------------------------------------------------------------------------------------------------------ // +// invert upper second diagonal template static 
__global__ void upvertKernelUp(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { @@ -120,18 +105,17 @@ namespace helpers { for (int i = start; i < n - 1; i += step) { Nd4jLong pos[] = {i, i + 1}; - //Nd4jLong posY[] = {i, i}; Nd4jLong posX[] = {i + 1, i + 1}; auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2); -// auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); -// auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2); auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2); + // invert upper matrix math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]); //inputMatrix->t(i, i + 1) * invertedMatrix->t(i + 1, i + 1) / inputMatrix->t(i, i) } } +// ------------------------------------------------------------------------------------------------------------------ // template static __global__ void invertLowKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { @@ -152,11 +136,14 @@ namespace helpers { auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, 2); + // invert non-diagonal elements math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]); } } } +// ------------------------------------------------------------------------------------------------------------------ // +// Invertion of upper triangular matrix non-diagonal elements when main and second diagonals already processed template static __global__ void invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { @@ -183,18 +170,20 @@ namespace helpers { Nd4jLong posZ[] = {i, j}; Nd4jLong posY[] = {k, j}; Nd4jLong posX[] = {i, k}; -// Nd4jLong posD[] = {i, i}; - + // inversion with Joardan Gauss transformation auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2); auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2); - // auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2); - math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]);// / input[dIndex]); -// printf("(%d, %d) inverted[%lld] = %lf (-inverted[%lld] * input[%lld]\n", blockIdx.x, threadIdx.x, zIndex, inverted[zIndex], yIndex, xIndex); + // invert upper non-diagonal elements + math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]); } } } +// ------------------------------------------------------------------------------------------------------------------ // +// procedure to invert lower-triangular matrix. 
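// --- Editor's sketch (illustrative only, not part of the patch) ------------------------------
// Reference math behind the lower-triangular kernels above: for a lower-triangular L with a
// general (non-unit) diagonal, the inverse M satisfies
//   M[i][i] = 1 / L[i][i]
//   M[i][j] = -(1 / L[i][i]) * sum_{k=j..i-1} L[i][k] * M[k][j]   for i > j
// Minimal serial version on a row-major n x n buffer (hypothetical helper, not the CUDA path;
// the strict upper triangle of M is assumed pre-zeroed by the caller):

static void invertLowerReference(const double* L, double* M, int n) {
    for (int i = 0; i < n; ++i)
        for (int j = 0; j <= i; ++j) {
            if (i == j) { M[i * n + i] = 1.0 / L[i * n + i]; continue; }
            double acc = 0.0;
            for (int k = j; k < i; ++k)
                acc += L[i * n + k] * M[k * n + j];   // forward substitution down column j
            M[i * n + j] = -acc / L[i * n + i];
        }
}
// ----------------------------------------------------------------------------------------------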
+// In current case lower triangular matrix has main diagonal with general values +// template static void invertLowerMatrix_(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) { int n = inputMatrix->rows(); @@ -204,20 +193,26 @@ namespace helpers { auto stream = context->getCudaStream(); + // invert lower matrix // invert main diagonal upvertKernel<<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); // invert the second diagonal invertKernelLow<<<1, n, 512, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); -// invertKernelLow<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); + // invert non-diagonal elements invertLowKernel<<>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); } +// ------------------------------------------------------------------------------------------------------------------ // +// caller for invert lower matrix routine void invertLowerMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) { NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE); NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix}); } +// ------------------------------------------------------------------------------------------------------------------ // +// procedure to invert upper-triangular matrix. +// In current case upper triangular matrix has main diagonal with all ones on it. 
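// --- Editor's sketch (illustrative only, not part of the patch) ------------------------------
// Corresponding reference for the unit-diagonal upper-triangular case handled below: with
// U[i][i] == 1, the inverse M satisfies M[i][i] = 1 and, for j > i,
//   M[i][j] = -sum_{k=i+1..j} U[i][k] * M[k][j]
// Minimal serial version on a row-major n x n buffer (hypothetical helper; the strict lower
// triangle of M is assumed pre-zeroed, rows processed bottom-up so each needed M[k][j] with
// k > i is already available):

static void invertUnitUpperReference(const double* U, double* M, int n) {
    for (int i = n - 1; i >= 0; --i) {
        M[i * n + i] = 1.0;
        for (int j = i + 1; j < n; ++j) {
            double acc = 0.0;
            for (int k = i + 1; k <= j; ++k)
                acc += U[i * n + k] * M[k * n + j];   // back substitution along row i
            M[i * n + j] = -acc;
        }
    }
}
// ----------------------------------------------------------------------------------------------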
template static void invertUpperMatrix_(LaunchContext *context, NDArray* inputMatrix, NDArray* invertedMatrix) { int n = inputMatrix->rows(); @@ -227,342 +222,319 @@ namespace helpers { return; } - //upvertKernel<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); + // invert upper matrix + // invert the second diagonal upvertKernelUp<<<1, n, 512, *stream >>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); - invertedMatrix->tickWriteDevice(); - invertedMatrix->printIndexedBuffer("Step1 UP inversion"); + + // invert other elements invertUpKernel<<>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(),inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); } +// ------------------------------------------------------------------------------------------------------------------ // +// invertion of upper triangular matrix - runner routine void invertUpperMatrix(LaunchContext *context, NDArray *inputMatrix, NDArray *invertedMatrix) { NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); BUILD_SINGLE_SELECTOR(invertedMatrix->dataType(), invertUpperMatrix_, (context, inputMatrix, invertedMatrix), FLOAT_NATIVE); NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); } -// template -// static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) { -// int swapCount = 0; -// for(int i = blockIdx.x; i < rowNum; i += gridDim.x ) { -// auto pivotValue = T(0.0); -// auto pivot = -1; -// -// for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) { -// Nd4jLong rowCoord[] = {rowCounter, i}; -// auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2); -// if(nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue ) { -// pivotValue = nd4j::math::nd4j_abs(compound[rowPos]); -// pivot = rowCounter; -// } -// } -// -// if( pivotValue != T(0.0) ) { -// swapRows_(compound, compoundShape, pivot, i, rowNum); -// swapRows_(permutation, permutationShape, pivot, i, rowNum); -// if (pivot != i) -// swapCount++; -// -// for( int j = i + 1; j < rowNum; j++ ) { -// Nd4jLong posJIbuf[] = {j, i}; -// Nd4jLong posIIbuf[] = {i, i}; -// auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2); -// auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2); -// -// compound[posJI] /= compound[posII]; -// for( int k = i + 1; k < rowNum; k++ ) { -// Nd4jLong posJKbuf[] = {j, k}; -// Nd4jLong posIKbuf[] = {i, k}; -// auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2); -// auto posIK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2); -// T arg = compound[posJI] * compound[posIK]; -// compound[posJK] -= arg; -// } -// } -// } -// } -// } +// ------------------------------------------------------------------------------------------------------------------ // + // determinant kernel - accumulation product of all values on the main diagonal + template + static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) { + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + for (auto i = start; i < len; i += step) { + auto pos = i * 
len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2); + // multiply all diagonal elements + math::atomics::nd4j_atomicMul(&result[0], compound[pos]); + } + } -// template - template - static __global__ void determinantKernel(T *compound, T *result, Nd4jLong len) { - //F tempRes = result[0]; +// ------------------------------------------------------------------------------------------------------------------ // + // determinant logarithm - accumulation sum of all logarithm values on the main diagonal. All in logarithic values + // should be positive + template + static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) { + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + for (auto i = start; i < len; i += step) { + auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2); + // sum logs of all diagonal elements + math::atomics::nd4j_atomicAdd(result, math::nd4j_log(math::nd4j_abs(compound[pos]))); + } + } - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; - for (auto i = start; i < len; i += step) { - auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2); - math::atomics::nd4j_atomicMul(&result[0], compound[pos]); - } +// ------------------------------------------------------------------------------------------------------------------ // + // kernel to copy matrix with given shape to compound tensor with given pos + // output - a N-D tensor buffer with rank not less than 2, input - 2D square n x n matrix with n = rowLen + template + static __global__ void + fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) { + __shared__ F *matrix; + __shared__ T *inputBuf; + __shared__ Nd4jLong inputLen; + __shared__ Nd4jLong n2; + + if (threadIdx.x == 0) { + matrix = reinterpret_cast(output); + inputBuf = reinterpret_cast(input); + inputLen = shape::length(inputShape); + n2 = rowLen * rowLen; + } + __syncthreads(); + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + + for (int k = pos + start, j = start; j < n2; k += step, j += step) { + auto xIndex = shape::getIndexOffset(k, inputShape, inputLen); + matrix[j] = (F) inputBuf[xIndex]; + } + } + +// ------------------------------------------------------------------------------------------------------------------ // +// same as above, but without type conversion + template + static __global__ void + returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) { + __shared__ T* matrix; + __shared__ T* outputBuf; + __shared__ Nd4jLong outputLen; + __shared__ Nd4jLong n2; + + if (threadIdx.x == 0) { + matrix = reinterpret_cast(input); + outputBuf = reinterpret_cast(output); + outputLen = shape::length(inputShape); + n2 = rowLen * rowLen; + } + __syncthreads(); + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + + for (int k = pos + start, j = start; j < n2; k += step, j += step) { + auto zIndex = shape::getIndexOffset(k, outputShape, outputLen); + outputBuf[zIndex] = matrix[j]; + } + } + +// ------------------------------------------------------------------------------------------------------------------ // + // fill up permutaion matrix kernel. 
Permutation matrix filled with zeros and ones + template + static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) { + F *permutation = reinterpret_cast(output); + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + for (auto i = start; i < rowNum; i += step) { + int val = source[i] - 1; + Nd4jLong posF[] = {i, val}; + auto pos = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), posF, 2); + permutation[pos] = F(1.f); + } + } + +// ------------------------------------------------------------------------------------------------------------------ // + // LUP decomposition runner - using CUBLAS SOLVER + // if permutation is given, then using LUP decomposition, LU decomposition otherwise + // L - lower triangular, U - upper triangular, P - permutation matricies + // PA = LU + // + // input - A matrix nxn + // compound - C matrix L + U - I, or main diagonal and lower - L matrix, from the 2nd diagonal - U matrix + template + static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) { + auto stream = context->getCudaStream(); + auto n = input->rows(); + cusolverDnHandle_t cusolverH = nullptr; + // create solver handle + cusolverStatus_t status = cusolverDnCreate(&cusolverH); + if (CUSOLVER_STATUS_SUCCESS != status) { + throw cuda_exception::build("Cannot create cuSolver handle", status); + } + // set solver stream + status = cusolverDnSetStream(cusolverH, *stream); + if (CUSOLVER_STATUS_SUCCESS != status) { + throw cuda_exception::build("Cannot set up stream for cuda solver", status); + } + int lwork = 0; + int *d_info = nullptr; + // allocate memory for permutation vector + auto err = cudaMalloc((void **) &d_info, sizeof(int)); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err); } - template - static __global__ void determinantLogKernel(T *compound, T *result, Nd4jLong len) { -// F tempRes = (F)result[0]; + DataType dtype = input->dataType(); + switch (dtype) { // there are two implementations with cublas for LUP decomposition - double and float - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; - for (auto i = start; i < len; i += step) { - auto pos = i * len + i; //shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), di, 2); - math::atomics::nd4j_atomicAdd(result, math::nd4j_log(math::nd4j_abs(compound[pos]))); - } -// __syncthreads(); -// -// if (threadIdx.x == 0) { -// result[0] = (T)math::nd4j_log(math::nd4j_abs(tempRes)); -// } - } + case DataType::DOUBLE: { + double *d_work = nullptr; + // compute internal buffer size + double *matrix = reinterpret_cast(input->specialBuffer()); + status = cusolverDnDgetrf_bufferSize( + cusolverH, + n, + n, + matrix, + n, + &lwork); + if (CUSOLVER_STATUS_SUCCESS != status) { + throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status); + } - template - static __global__ void - fillMatrix(void *output, Nd4jLong *outShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, Nd4jLong rowLen) { - __shared__ - F *matrix; - __shared__ - T *inputBuf; - __shared__ - Nd4jLong inputLen; - __shared__ - Nd4jLong n2; + err = cudaMalloc((void **) &d_work, sizeof(float) * lwork); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer", + err); + } - if (threadIdx.x == 0) { - matrix = reinterpret_cast(output); - inputBuf = reinterpret_cast(input); - 
inputLen = shape::length(inputShape); - n2 = rowLen * rowLen; - } - __syncthreads(); - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; - - for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto xIndex = shape::getIndexOffset(k, inputShape, inputLen); - matrix[j] = (F) inputBuf[xIndex]; - } - } - - template - static __global__ void - returnMatrix(void *output, Nd4jLong *outputShape, void *input, Nd4jLong *inputShape, Nd4jLong pos, - Nd4jLong rowLen) { - __shared__ T *matrix; - __shared__ T *outputBuf; - __shared__ Nd4jLong outputLen; - __shared__ Nd4jLong n2; - - if (threadIdx.x == 0) { - matrix = reinterpret_cast(input); - outputBuf = reinterpret_cast(output); - outputLen = shape::length(inputShape); - n2 = rowLen * rowLen; - } - __syncthreads(); - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; - - for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto zIndex = shape::getIndexOffset(k, outputShape, outputLen); - outputBuf[zIndex] = (T) matrix[j]; - } - } - - template - static __global__ void fillUpPermutation(void *output, Nd4jLong *shape, int *source, int rowNum) { - F *permutation = reinterpret_cast(output); - - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; - for (auto i = start; i < rowNum; i += step) { - int val = source[i] - 1; - Nd4jLong posF[] = {i, val}; - auto pos = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), posF, 2); - permutation[pos] = F(1.f); - } - } - - template - static void lup_(LaunchContext *context, NDArray *input, NDArray *compound, NDArray *permutation) { - auto stream = context->getCudaStream(); - auto n = input->rows(); - cusolverDnHandle_t cusolverH = nullptr; - cusolverStatus_t status = cusolverDnCreate(&cusolverH); - if (CUSOLVER_STATUS_SUCCESS != status) { - throw cuda_exception::build("Cannot create cuSolver handle", status); - } - status = cusolverDnSetStream(cusolverH, *stream); - if (CUSOLVER_STATUS_SUCCESS != status) { - throw cuda_exception::build("Cannot set up stream for cuda solver", status); - } - int lwork = 0; - int *d_info = nullptr; - - auto err = cudaMalloc((void **) &d_info, sizeof(int)); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver info buffer", err); - } - - DataType dtype = input->dataType(); - switch (dtype) { - - case DataType::DOUBLE: { - double *d_work = nullptr; - err = cudaMalloc((void **) &d_work, sizeof(float) * lwork); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer", - err); - } - double *matrix = reinterpret_cast(input->specialBuffer()); - status = cusolverDnDgetrf_bufferSize( + if (permutation == nullptr) + status = cusolverDnDgetrf( cusolverH, n, n, matrix, n, - &lwork); - if (CUSOLVER_STATUS_SUCCESS != status) { - throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status); - } - if (permutation == nullptr) - status = cusolverDnDgetrf( - cusolverH, - n, - n, - matrix, - n, - d_work, - nullptr, - d_info); - else { - NDArray permutVector('c', {n}, nd4j::DataType::INT32, context); - int *permutationBuf = reinterpret_cast(permutVector.specialBuffer()); - status = cusolverDnDgetrf( - cusolverH, - n, - n, - matrix, - n, - d_work, - permutationBuf, - d_info); - fillUpPermutation << < n, n, 1024, *stream >> > - (permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); - 
permutation->tickWriteDevice(); - } - err = cudaFree(d_work); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer", - err); - } - } - break; - case DataType::FLOAT32: { - float *matrix = reinterpret_cast(input->specialBuffer()); - float *d_work = nullptr; - err = cudaMalloc((void **) &d_work, sizeof(float) * lwork); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer", - err); - } - - status = cusolverDnSgetrf_bufferSize( + d_work, + nullptr, + d_info); + else { + NDArray permutVector('c', {n}, nd4j::DataType::INT32, context); + int *permutationBuf = reinterpret_cast(permutVector.specialBuffer()); + status = cusolverDnDgetrf( cusolverH, n, n, matrix, n, - &lwork); - if (CUSOLVER_STATUS_SUCCESS != status) { - throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status); - } - - if (permutation == nullptr) - status = cusolverDnSgetrf( - cusolverH, - n, - n, - matrix, - n, - d_work, - nullptr, - d_info); - else { - NDArray permutVector('c', {n}, nd4j::DataType::INT32, context); - int *permutationBuf = reinterpret_cast(permutVector.specialBuffer()); - status = cusolverDnSgetrf( - cusolverH, - n, - n, - matrix, - n, - d_work, - permutationBuf, - d_info); - fillUpPermutation <<< n, n, 128, *stream >> > - (permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); - permutation->tickWriteDevice(); - } - err = cudaFree(d_work); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer", - err); - } - + d_work, + permutationBuf, + d_info); + fillUpPermutation << < n, n, 1024, *stream >> > + (permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); + permutation->tickWriteDevice(); + } + err = cudaFree(d_work); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer", + err); } } - if (CUSOLVER_STATUS_SUCCESS != status) { - throw cuda_exception::build("helpers::lup_: Cannot make LU decomposition", status); + break; + case DataType::FLOAT32: { + float *matrix = reinterpret_cast(input->specialBuffer()); + float *d_work = nullptr; + + status = cusolverDnSgetrf_bufferSize( + cusolverH, + n, + n, + matrix, + n, + &lwork); + if (CUSOLVER_STATUS_SUCCESS != status) { + throw cuda_exception::build("helpers::lup_: Cannot create cuSolver handle", status); + } + + err = cudaMalloc((void **) &d_work, sizeof(float) * lwork); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot allocate memory for solver data buffer", + err); + } + + if (permutation == nullptr) + status = cusolverDnSgetrf( + cusolverH, + n, + n, + matrix, + n, + d_work, + nullptr, + d_info); + else { + NDArray permutVector('c', {n}, nd4j::DataType::INT32, context); + int *permutationBuf = reinterpret_cast(permutVector.specialBuffer()); + status = cusolverDnSgetrf( + cusolverH, + n, + n, + matrix, + n, + d_work, + permutationBuf, + d_info); + fillUpPermutation <<< n, n, 128, *stream >> > + (permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); + permutation->tickWriteDevice(); + } + err = cudaFree(d_work); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver data buffer", + err); + } + } - err = cudaFree(d_info); - if (err) { - throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver info buffer", err); - } - cusolverDnDestroy(cusolverH); + } + if 
(CUSOLVER_STATUS_SUCCESS != status) { + throw cuda_exception::build("helpers::lup_: Cannot make LU decomposition", status); + } + err = cudaFree(d_info); + if (err) { + throw cuda_exception::build("helpers::lup_: Cannot deallocate memory for solver info buffer", err); + } + cusolverDnDestroy(cusolverH); // NDArray::registerSpecialUse({input}, {input}); - input->tickWriteDevice(); - } + input->tickWriteDevice(); + } +// ------------------------------------------------------------------------------------------------------------------ // - BUILD_SINGLE_TEMPLATE(template void lup_,(LaunchContext * context, NDArray * input, NDArray * output, NDArray * permutation), FLOAT_NATIVE); + BUILD_SINGLE_TEMPLATE(template void lup_,(LaunchContext * context, NDArray * input, NDArray * output, NDArray * permutation), FLOAT_NATIVE); - template - static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { - Nd4jLong n = input->sizeAt(-1); - Nd4jLong n2 = n * n; - std::vector dims(); - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {input->rankOf() - 2, input->rankOf() - 1}); - //auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {output->rankOf() - 1}); +// ------------------------------------------------------------------------------------------------------------------ // + template + static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + Nd4jLong n = input->sizeAt(-1); + Nd4jLong n2 = n * n; + std::vector dims(); + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {input->rankOf() - 2, input->rankOf() - 1}); + //auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {output->rankOf() - 1}); // DataType dtype = input->dataType(); // if (dtype != DataType::DOUBLE) // dtype = DataType::FLOAT32; - auto matrix = NDArrayFactory::create(input->ordering(), {n, n}, DataTypeUtils::fromT(), context); //, block.getWorkspace()); - auto det = NDArrayFactory::create(1); - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input}); - dim3 launchDims(256, 256, 1024); - output->assign(1.f); - for (int e = 0; e < output->lengthOf(); e++) { - Nd4jLong pos = e * n2; + auto matrix = NDArrayFactory::create(input->ordering(), {n, n}, DataTypeUtils::fromT(), context); //, block.getWorkspace()); + auto det = NDArrayFactory::create(1); + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input}); + dim3 launchDims(256, 256, 1024); + output->assign(1.f); + for (int e = 0; e < output->lengthOf(); e++) { + Nd4jLong pos = e * n2; // if (matrix.dataType() == input->dataType()) - fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); + fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); // else // fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); // if (matrix.dataType() == input->dataType()) - lup_(context, &matrix, nullptr, nullptr); + lup_(context, &matrix, nullptr, nullptr); // else // lup_(context, &matrix, nullptr, nullptr); - auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); - auto inputBuf = reinterpret_cast(matrix.specialBuffer()); - auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; + auto offset = 
shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); + auto inputBuf = reinterpret_cast(matrix.specialBuffer()); + auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; // if (matrix.dataType() == input->dataType()) - determinantKernel << < launchDims.x, launchDims.y, launchDims.z, *stream >> > - (inputBuf, outputBuf, n); + determinantKernel << < launchDims.x, launchDims.y, launchDims.z, *stream >> > + (inputBuf, outputBuf, n); // else // determinantKernel<<>> (inputBuf, outputBuf, n); - } - NDArray::registerSpecialUse({output}, {input}); - - return Status::OK(); } + NDArray::registerSpecialUse({output}, {input}); + + return Status::OK(); + } int determinant(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { NDArray::prepareSpecialUse({output}, {input});
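// --- Editor's sketch (illustrative only, not part of the patch) ------------------------------
// What determinantKernel accumulates for each n x n sub-matrix: after the LU factorisation the
// determinant is the product of the diagonal of the compound LU matrix (L carries a unit
// diagonal, so only U's diagonal contributes). No pivoting sign is applied in this path, since
// lup_ is invoked here without a permutation. Serial reference on a row-major buffer:

static double determinantFromLU(const double* lu, int n) {
    double det = 1.0;
    for (int i = 0; i < n; ++i)
        det *= lu[i * n + i];   // product of the diagonal after factorisation
    return det;
}
// ----------------------------------------------------------------------------------------------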