Shugeo cuda doc2 (#255)

* Added comments to tileKernel routine.

* Refactored kernel and added doc to it.

* Refactored setDiagonal kernel and added doc for it.

* Added doc for tnse cuda helpers.

* Added doc for diag kernels.

* Added doc for kernel.

* Refactored code with fake quantization.

* Added docs for image resize and crop kernels.

* Added docs for image suppression helpers.

* Added docs to matrix_band helpers.

* Added docs for matrix_diag_part and nth_element helpers.

* Fixed syntax error and refactored getIndexOffset usage.
master
shugeo 2019-09-11 21:04:43 +03:00 committed by raver119
parent 589401477d
commit e1a7460f8e
13 changed files with 336 additions and 263 deletions

View File

@ -23,15 +23,28 @@
namespace nd4j { namespace nd4j {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// set up given value to upper diagonal given
// buffer - input buffer
// shape - input shape
// value - given value
// diagonal - given upper diagonal (acceptable negative values also, 0 - the main diagonal)
// row, cols - height and width of given matrix (MxN, rows = M, cols = N)
//
template <typename T> template <typename T>
static __global__ void setDiagValueUpperKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows, static __global__ void setDiagValueUpperKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows,
Nd4jLong cols) { Nd4jLong cols) {
Nd4jLong rank = shape::rank(shape); __shared__ Nd4jLong rank;
int totalThreads = blockDim.x; __shared__ T* array;
T* array = reinterpret_cast<T*>(buffer);
if (0 == threadIdx.x) {
rank = shape::rank(shape);
array = reinterpret_cast<T *>(buffer);
}
__syncthreads();
for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) { for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) {
for (int j = threadIdx.x; j < cols; j += totalThreads) { for (int j = threadIdx.x; j < cols; j += blockDim.x) {
Nd4jLong coords[2] = {i, j}; Nd4jLong coords[2] = {i, j};
Nd4jLong xOffset = shape::getOffset(shape, coords); Nd4jLong xOffset = shape::getOffset(shape, coords);
if (i + diagonal <= j) if (i + diagonal <= j)
@ -40,6 +53,13 @@ namespace nd4j {
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// set up given value to lower given diagonal
// buffer - input buffer
// shape - input shape
// value - given value
// diagonal - given lower diagonal (acceptable negative values also, 0 - the main diagonal)
// row, cols - height and width of given matrix (MxN, rows = M, cols = N)
//
template <typename T> template <typename T>
static __global__ void setDiagValueLowerKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows, Nd4jLong cols) { static __global__ void setDiagValueLowerKernel(void* buffer, Nd4jLong* shape, T value, int diagonal, Nd4jLong rows, Nd4jLong cols) {
@ -96,7 +116,4 @@ namespace nd4j {
int diagonal, Nd4jLong rows, Nd4jLong cols, cudaStream_t& stream), LIBND4J_TYPES); int diagonal, Nd4jLong rows, Nd4jLong cols, cudaStream_t& stream), LIBND4J_TYPES);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
} }

View File

@ -23,22 +23,31 @@
namespace nd4j { namespace nd4j {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// kernel to swap two NDArrays vals as linear sequences
// input - theSecondBuffer/Shape from input NDArray
// output - theFirstBuffer/Shape from input NDArray
template <typename T> template <typename T>
static __global__ void swapUnsafeKernel(void* theFirstBuffer, Nd4jLong* theFirstShape, void* theSecondBuffer, Nd4jLong* theSecondShape) { static __global__ void swapUnsafeKernel(void* theFirstBuffer, Nd4jLong* theFirstShape, void* theSecondBuffer, Nd4jLong* theSecondShape) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto tid = blockIdx.x * blockDim.x + threadIdx.x;
int totalThreads = gridDim.x * blockDim.x; int totalThreads = gridDim.x * blockDim.x;
Nd4jLong resultLength = shape::length(theFirstShape);
//const auto resultLength = shape::length(outputShape); __shared__ Nd4jLong resultLength;
// if (shape::order(outputShape) == 'c') { // ews == 1 always here __shared__ T* input;
__shared__ T* output;
if (0 == threadIdx.x) {
resultLength = shape::length(theFirstShape);
input = reinterpret_cast<T*>(theSecondBuffer);
output = reinterpret_cast<T*>(theFirstBuffer);
}
__syncthreads();
for (int i = tid; i < resultLength; i += totalThreads) { for (int i = tid; i < resultLength; i += totalThreads) {
auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1; auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1;
auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1; auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1;
//if (shape::order(theFirstShape) ==)
auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape); auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape);
auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape); auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape);
T temp = *(reinterpret_cast<T*>(theFirstBuffer) + xOffset); nd4j::math::nd4j_swap(output[xOffset], input[yOffset]);
*(reinterpret_cast<T*>(theFirstBuffer) + xOffset) = *(reinterpret_cast<T*>(theSecondBuffer) + yOffset);
*(reinterpret_cast<T*>(theSecondBuffer) + yOffset) = temp;
} }
} }

View File

@ -33,16 +33,19 @@ namespace nd4j {
return shape::length(shapeInfo); return shape::length(shapeInfo);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// tileKernel:
// input: (inputBuffer and inputShape) - NDArray buffer and shape to tile
// output: (outputBuffer and outputShape) - NDArray to tile input
// resultLength - length for output array
template<typename T> template<typename T>
static __global__ void static __global__ void
tileKernel(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, tileKernel(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape,
Nd4jLong resultLength) { Nd4jLong resultLength) {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Original code to transform in cuda-based // Original code to transform in cuda-based
auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto tid = blockIdx.x * blockDim.x + threadIdx.x; // copy linear sequence of elements, so one-level threading
int totalThreads = gridDim.x * blockDim.x; int totalThreads = gridDim.x * blockDim.x;
//const auto resultLength = shape::length(outputShape);
if (shape::order(outputShape) == 'c') { // ews == 1 always here if (shape::order(outputShape) == 'c') { // ews == 1 always here
for (int i = tid; i < resultLength; i += totalThreads) { for (int i = tid; i < resultLength; i += totalThreads) {
auto yOffset = _subArrayOffset(i, outputShape, inputShape); auto yOffset = _subArrayOffset(i, outputShape, inputShape);
@ -60,6 +63,7 @@ namespace nd4j {
BUILD_SINGLE_TEMPLATE(template __global__ void tileKernel,(void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template __global__ void tileKernel,(void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T> template<typename T>
void tileKernelH(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, cudaStream_t *stream) { void tileKernelH(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, cudaStream_t *stream) {
dim3 launchDims(256, 512, 8192); dim3 launchDims(256, 512, 8192);
@ -68,6 +72,8 @@ namespace nd4j {
BUILD_SINGLE_TEMPLATE(template void tileKernelH, (void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength, cudaStream_t *stream), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void tileKernelH, (void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength, cudaStream_t *stream), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// enhancement for tileKernel to different input and output data types: X - output type, Y - input type
template<typename X, typename Y> template<typename X, typename Y>
static __global__ void static __global__ void
tileKernelDouble(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, Nd4jLong ews) { tileKernelDouble(void const *inputBuffer, Nd4jLong *inputShape, void *outputBuffer, Nd4jLong *outputShape, Nd4jLong resultLength, Nd4jLong ews) {

View File

@ -23,7 +23,12 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count rows kernel - count input pRows and pCols and put result onto pRowCounts
// pRowCounts - array of ints, with length N
// pRows - array of ints with length N, vals from 0 to N-1
// pCols - array of ints with length < N and vals between 0 and max(pRows)
//
static __global__ void countRowsKernel(int* pRowCounts, int const* pRows, int const* pCols, Nd4jLong N) { static __global__ void countRowsKernel(int* pRowCounts, int const* pRows, int const* pCols, Nd4jLong N) {
auto start = blockIdx.x * blockDim.x; auto start = blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x; auto step = blockDim.x * gridDim.x;
@ -32,19 +37,22 @@ namespace helpers {
int end = pRows[n + 1];//rowP->e<int>(n + 1); int end = pRows[n + 1];//rowP->e<int>(n + 1);
for (int i = begin; i < end; i++) { for (int i = begin; i < end; i++) {
bool present = false; bool present = false;
// loop between near pRows
for (int m = pRows[pCols[i]]; m < pRows[pCols[i] + 1]; m++) for (int m = pRows[pCols[i]]; m < pRows[pCols[i] + 1]; m++)
if (pCols[m] == n) { if (pCols[m] == n) { // mark index as existed with columns array
present = true; present = true;
break; break;
} }
atomicAdd(&pRowCounts[n], 1); atomicAdd(&pRowCounts[n], 1);
if (!present) if (!present) // increment row counter for given index
atomicAdd(&pRowCounts[pCols[i]], 1); atomicAdd(&pRowCounts[pCols[i]], 1);
} }
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// row counter caller
Nd4jLong barnes_row_count(const NDArray* rowP, const NDArray* colP, Nd4jLong N, NDArray& rowCounts) { Nd4jLong barnes_row_count(const NDArray* rowP, const NDArray* colP, Nd4jLong N, NDArray& rowCounts) {
int* pRowCounts = reinterpret_cast<int*>(rowCounts.specialBuffer()); int* pRowCounts = reinterpret_cast<int*>(rowCounts.specialBuffer());
@ -58,12 +66,18 @@ namespace helpers {
return numElements; return numElements;
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// extend symRowP with pRowCounts array vals
// pRowCounts - int array with length N
// symRowP - int array with length N+1
// N - given array length
//
static __global__ void fillUpsymRow(int const* pRowCounts, int* symRowP, int N) { static __global__ void fillUpsymRow(int const* pRowCounts, int* symRowP, int N) {
auto start = blockIdx.x * blockDim.x + threadIdx.x; auto start = blockIdx.x * blockDim.x + threadIdx.x;
auto step = blockDim.x * gridDim.x; auto step = blockDim.x * gridDim.x;
for (int n = start; n < N + 1; n += step) { for (int n = start; n < N + 1; n += step) { // to avoid race condition use shift only for given index
symRowP[n] = 0; symRowP[n] = 0;
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
atomicAdd(&symRowP[n], pRowCounts[i]); atomicAdd(&symRowP[n], pRowCounts[i]);
@ -71,6 +85,17 @@ namespace helpers {
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// symmetrize routine kernel
// pRows - rows buffer (ints)
// pCols - column buffer (ints) with vals between 0 and max(pRows)
// pVals - values vector (floats)
// symRowP - ints, shifted pRows
// symColP - ints, shifted pCols,
// offset - ints, shitfs
// pOutput - result matrix (floats)
// N - pRows length
//
template <typename T> template <typename T>
static __global__ void symmetrizeKernel(int const* pRows, int const* pCols, T const* pVals, int* symRowP, int* symColP, int* offset, T* pOutput, int N) { static __global__ void symmetrizeKernel(int const* pRows, int const* pCols, T const* pVals, int* symRowP, int* symColP, int* offset, T* pOutput, int N) {
auto start = blockIdx.x * blockDim.x + threadIdx.x; auto start = blockIdx.x * blockDim.x + threadIdx.x;
@ -86,7 +111,6 @@ namespace helpers {
int start = pRows[colPI]; int start = pRows[colPI];
int end = pRows[colPI + 1]; int end = pRows[colPI + 1];
//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(offset))
for (int m = start; m < end; m++) { for (int m = start; m < end; m++) {
if (pCols[m] == n) { if (pCols[m] == n) {
present = true; present = true;
@ -101,14 +125,10 @@ namespace helpers {
// If (colP[i], n) is not present, there is no addition involved // If (colP[i], n) is not present, there is no addition involved
if (!present) { if (!present) {
//int colPI = pCols[i];
//if (n <= colPI) {
symColP[symRowP[n] + offset[n]] = colPI; symColP[symRowP[n] + offset[n]] = colPI;
symColP[symRowP[pCols[i]] + offset[colPI]] = n; symColP[symRowP[pCols[i]] + offset[colPI]] = n;
pOutput[symRowP[n] + offset[n]] = pVals[i]; pOutput[symRowP[n] + offset[n]] = pVals[i];
pOutput[symRowP[colPI] + offset[colPI]] = pVals[i]; pOutput[symRowP[colPI] + offset[colPI]] = pVals[i];
//}
} }
// Update offsets // Update offsets
if (!present || (present && n <= colPI)) { if (!present || (present && n <= colPI)) {
@ -119,16 +139,18 @@ namespace helpers {
} }
} }
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// symmetrize algorithm itself
//
template <typename T> template <typename T>
static void barnes_symmetrize_(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) { static void barnes_symmetrize_(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) {
int const* pRows = reinterpret_cast<int const*>(rowP->getSpecialBuffer()); int const* pRows = reinterpret_cast<int const*>(rowP->getSpecialBuffer());
int* symRowP = reinterpret_cast<int*>(outputRows->specialBuffer()); int* symRowP = reinterpret_cast<int*>(outputRows->specialBuffer());
int* pRowCounts = reinterpret_cast<int*>(rowCounts->specialBuffer()); int* pRowCounts = reinterpret_cast<int*>(rowCounts->specialBuffer());
auto stream = outputCols->getContext()->getCudaStream(); auto stream = outputCols->getContext()->getCudaStream();
// fill up syRowP array
fillUpsymRow<<<1, N, 128, *stream>>>(pRowCounts, symRowP, N); fillUpsymRow<<<1, N, 128, *stream>>>(pRowCounts, symRowP, N);
outputRows->syncToHost(); outputRows->syncToHost();
// outputRows->printBuffer("output rows"); // outputRows->printBuffer("output rows");
@ -140,15 +162,23 @@ namespace helpers {
//std::vector<int> rowCountsV = rowCounts->getBufferAsVector<int>(); //std::vector<int> rowCountsV = rowCounts->getBufferAsVector<int>();
auto offsetArr = NDArrayFactory::create<int>('c', {N}); auto offsetArr = NDArrayFactory::create<int>('c', {N});
int* offset = reinterpret_cast<int*>(offsetArr.specialBuffer()); int* offset = reinterpret_cast<int*>(offsetArr.specialBuffer());
// symmetrize itself
symmetrizeKernel<T><<<1, 1, 1024, *stream>>>(pRows, pCols, pVals, symRowP, symColP, offset, pOutput, N); symmetrizeKernel<T><<<1, 1, 1024, *stream>>>(pRows, pCols, pVals, symRowP, symColP, offset, pOutput, N);
//PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset))
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// symmetrize caller and adoption
//
void barnes_symmetrize(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) { void barnes_symmetrize(const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts) {
BUILD_SINGLE_SELECTOR(valP->dataType(), barnes_symmetrize_, (rowP, colP, valP, N, outputRows, outputCols, outputVals, rowCounts), NUMERIC_TYPES); BUILD_SINGLE_SELECTOR(valP->dataType(), barnes_symmetrize_, (rowP, colP, valP, N, outputRows, outputCols, outputVals, rowCounts), NUMERIC_TYPES);
*outputVals /= 2.0; *outputVals /= 2.0;
} }
BUILD_SINGLE_TEMPLATE(template void barnes_symmetrize_, (const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts), NUMERIC_TYPES); BUILD_SINGLE_TEMPLATE(template void barnes_symmetrize_, (const NDArray* rowP, const NDArray* colP, const NDArray* valP, Nd4jLong N, NDArray* outputRows, NDArray* outputCols, NDArray* outputVals, NDArray* rowCounts), NUMERIC_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge forces implementation
//
template <typename T> template <typename T>
static __global__ void edgeForcesKernel(int const* pRows, int const* pCols, T const* dataP, T const* vals, T* outputP, int N, int colCount, int rowSize) { static __global__ void edgeForcesKernel(int const* pRows, int const* pCols, T const* dataP, T const* vals, T* outputP, int N, int colCount, int rowSize) {
// std::vector<T> buffer(colCount); // std::vector<T> buffer(colCount);
@ -172,10 +202,12 @@ namespace helpers {
for (int k = 0; k < colCount; k++) for (int k = 0; k < colCount; k++)
math::atomics::nd4j_atomicAdd(&outputP[shift + k], T((dataP[shift + k] - thisSlice[k]) * res)); math::atomics::nd4j_atomicAdd(&outputP[shift + k], T((dataP[shift + k] - thisSlice[k]) * res));
} }
//atomicAdd(&shift, colCount);
} }
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge forces algorithm
//
}
template <typename T> template <typename T>
static void barnes_edge_forces_(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output) { static void barnes_edge_forces_(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output) {
NDArray::prepareSpecialUse({output}, {data, rowP, colP, valP, valP}); NDArray::prepareSpecialUse({output}, {data, rowP, colP, valP, valP});
@ -191,18 +223,22 @@ namespace helpers {
edgeForcesKernel<T><<<1, 128, 1024, *stream>>>(pRows, pCols, dataP, vals, outputP, N, colCount, rowSize); edgeForcesKernel<T><<<1, 128, 1024, *stream>>>(pRows, pCols, dataP, vals, outputP, N, colCount, rowSize);
NDArray::registerSpecialUse({output}, {rowP, colP, valP, data}); NDArray::registerSpecialUse({output}, {rowP, colP, valP, data});
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge forces caller
//
void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) { void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) {
// Loop over all edges in the graph // Loop over all edges in the graph
BUILD_SINGLE_SELECTOR(output->dataType(), barnes_edge_forces_, (rowP, colP, valP, N, &data, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(output->dataType(), barnes_edge_forces_, (rowP, colP, valP, N, &data, output), FLOAT_TYPES);
} }
BUILD_SINGLE_TEMPLATE(template void barnes_edge_forces_, (const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output), FLOAT_TYPES); BUILD_SINGLE_TEMPLATE(template void barnes_edge_forces_, (const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output), FLOAT_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gains - run a function T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps));
// for all members in input and put all in output
//
template <typename T> template <typename T>
void barnes_gains_(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) { void barnes_gains_(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) {
auto gainsInternal = LAMBDA_TTT(x, grad, eps) { auto gainsInternal = LAMBDA_TTT(x, grad, eps) {
// return T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps));
//return T((x + 2.) * nd4j::math::nd4j_sign<T,T>(grad) == nd4j::math::nd4j_sign<T,T>(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign<T,T>(grad) == nd4j::math::nd4j_sign<T,T>(eps));
T res = nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps) ? x + T(.2) : x * T(.8); T res = nd4j::math::nd4j_sign<T,T>(grad) != nd4j::math::nd4j_sign<T,T>(eps) ? x + T(.2) : x * T(.8);
if(res < .01) res = .01; if(res < .01) res = .01;
return res; return res;
@ -211,14 +247,20 @@ namespace helpers {
input->applyTriplewiseLambda(gradX, epsilon, gainsInternal, output); input->applyTriplewiseLambda(gradX, epsilon, gainsInternal, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gains caller
void barnes_gains(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) { void barnes_gains(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), barnes_gains_, (input, gradX, epsilon, output), NUMERIC_TYPES); BUILD_SINGLE_SELECTOR(input->dataType(), barnes_gains_, (input, gradX, epsilon, output), NUMERIC_TYPES);
} }
BUILD_SINGLE_TEMPLATE(template void barnes_gains_, (NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output), NUMERIC_TYPES); BUILD_SINGLE_TEMPLATE(template void barnes_gains_, (NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output), NUMERIC_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// cell contains - check cells for given point
//
bool cell_contains(NDArray* corner, NDArray* width, NDArray* point, Nd4jLong dimension) { bool cell_contains(NDArray* corner, NDArray* width, NDArray* point, Nd4jLong dimension) {
auto cornerMinusWidth = *corner - *width; auto cornerMinusWidth = *corner - *width;
auto cornerPlusWidth = *corner + *width; auto cornerPlusWidth = *corner + *width;
// executes on host side, so sync all to host memory
cornerMinusWidth.syncToHost(); cornerMinusWidth.syncToHost();
cornerPlusWidth.syncToHost(); cornerPlusWidth.syncToHost();
for (Nd4jLong i = 0; i < dimension; i++) { for (Nd4jLong i = 0; i < dimension; i++) {

View File

@ -24,7 +24,14 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// diag functor cuda kernel
// outputBuffer - output tensor buffer
// outputShape - output tensor shape
// inputBuffer - input tensor buffer - this tensor should be placed on diagonal position of output
// inputShape - input tensor shape
// inputLength - length for input tensor
//
template <typename T> template <typename T>
static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong inputLength) { static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong inputLength) {
__shared__ T *z; __shared__ T *z;
@ -41,12 +48,22 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
for (int t = tid; t < inputLength; t += step) {
for (int t = tid; t < inputLength; t += step) { // for all vals in input, put all on diagonal position to output
z[shape::getIndexOffset(t * (inputLength + 1), outputShape)] = x[shape::getIndexOffset(t, inputShape)]; //tX]; z[shape::getIndexOffset(t * (inputLength + 1), outputShape)] = x[shape::getIndexOffset(t, inputShape)]; //tX];
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// diag part functor cuda kernel
// outputBuffer - output tensor buffer - linear sequence of diagonal values
// outputShape - output tensor shape
// inputBuffer - input tensor buffer - this tensor should be placed on diagonal position of output
// inputShape - input tensor shape
// outputLength - given length of output
// inputLength - given length for input tensor
//
template <typename T> template <typename T>
static __global__ void diagPartFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong outputLength, Nd4jLong inputLength) { static __global__ void diagPartFunctorKernel(void* outputBuffer, Nd4jLong* outputShape, void const* inputBuffer, Nd4jLong* inputShape, Nd4jLong outputLength, Nd4jLong inputLength) {
__shared__ T *z; __shared__ T *z;
@ -61,10 +78,11 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
const auto step = gridDim.x * blockDim.x; const auto step = gridDim.x * blockDim.x;
Nd4jLong i = threadIdx.x * (outputLength + 1); Nd4jLong i = threadIdx.x * (outputLength + 1); // pos to diagonal value
for (int t = tid; t < outputLength && i < inputLength; t += step) { for (int t = tid; t < outputLength && i < inputLength; t += step) { // loop by output, but input matrix may not be square
z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)]; //tX]; // put diagonal val from input onto output
i += outputLength + 1; z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)];
i += outputLength + 1; // shift to next diagonal value
} }
} }
@ -81,6 +99,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
diagFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), inputLength); diagFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), inputLength);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// diagFunctor - caller for diag functor processor
void diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { void diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
auto xType = input->dataType(); auto xType = input->dataType();
@ -89,6 +109,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output);, LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output);, LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// diagPartFunctor - caller for diag part functor kernel
template <typename T> template <typename T>
void _diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) { void _diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) {
const int outLen = output->lengthOf(); const int outLen = output->lengthOf();
@ -102,7 +124,8 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha
diagPartFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), outLen, inLen); diagPartFunctorKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), outLen, inLen);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// diagPartFunctor - caller for diag part functor processor
void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) { void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) {
auto zType = output->dataType(); auto zType = output->dataType();
BUILD_SINGLE_SELECTOR(zType, _diagPartFunctor, (context, input, output), NUMERIC_TYPES); BUILD_SINGLE_SELECTOR(zType, _diagPartFunctor, (context, input, output), NUMERIC_TYPES);

View File

@ -28,69 +28,31 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// template <typename T> // extract patches kernel
// static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) { // - theSame - SAME or VALID - output format
// //globalExtractPatches_(void *vinput, Nd4jLong *xTadShape, Nd4jLong *xTadOffsets, void *voutput, Nd4jLong *zTadShape, Nd4jLong *zTadOffsets, const int numTads, const int sizeRow, const int sizeCol, const int stradeRow, const int stradeCol, const int rateRow, const int rateCol, const bool theSame, const int lastDim, const int rowDim, const int colDim) { // - batchCount - batches - the first dimension of input
// const int warpSize = lastDim; // - sizeRow, sizeCol - rows and cols sizes for batch
// const int tid = blockIdx.x * gridDim.x + threadIdx.x; // - rowDim, colDim - rows and cols dimensions for input patches
// const int warpIdx = tid / warpSize; // - outRowDim, outColDim - rows and cols dimensions for output patches
// const int warpPos = tid % warpSize; // - strideRow, strideCol - step between input elements with patches
// const int numWarps = 1; //(gridDim.x * blockDim.x) / warpSize; // - rateRow, rateCol - counts for input patches
// const int patchLength = shape::length(outTadShape); // - rowCast, colCast - shifts for output placement (1 or 0)
// - lastDim - last dimension of input/output
// - input - input tensor buffer
// - patchShape - input patch TAD shape
// - inputOffsets - input TAD offsets
// - output - output tensor buffer
// - outTadShape - output TAD shape
// - outputOffsets - output TAD offsets
// //
// auto xShape = shape::shapeOf(patchShape);
// auto xStride = shape::stride(patchShape);
// auto xRank = shape::rank(patchShape);
//
// for (int e = 0; e < batchCount; e += numWarps) {
// auto patch = input + inputOffsets[e];
// auto matrix = output + outputOffsets[e];
// int iter = 0;
//
// for (Nd4jLong i = 0; i < outRowDim; i++) {
// for (Nd4jLong j = 0; j < outColDim; j++) {
// Nd4jLong pos = 0;
// //for (Nd4jLong k = 0; k < outputLastDim; k++) {
// auto rowStart = i * strideRow - (theSame?rowCast:0);
// auto colStart = j * strideCol - (theSame?colCast:0);
// auto rowEnd = rowStart + sizeRow * rateRow;
// auto colEnd = colStart + sizeCol * rateCol;
// if (!theSame) {
// rowEnd = math::nd4j_min(int(rowStart + sizeRow * rateRow), rowDim);
// colEnd = math::nd4j_min(int(colStart + sizeCol * rateCol), colDim);
// }
// //auto pixel = 0LL;
// for (auto row = rowStart; row < rowEnd; row += rateRow)
// for (auto col = colStart; col < colEnd; col += rateCol)
// for (auto pixel = 0; pixel < lastDim; pixel++) {
// Nd4jLong zPos[] = {i, j, pos};
// Nd4jLong xPos[] = {row, col, pixel};
// auto zIndex = shape::getOffset(outTadShape, zPos);
// auto xIndex = shape::getOffset(patchShape, xPos);
// if (theSame) { // SAME case
// if (row >= 0 && col >= 0 && row < rowDim && col < colDim)
// matrix[zIndex] = patch[xIndex]; //outMatrix->p<T>(i, j, pos, patch->e<T>(row, col, pixel));
// //pos++;
// }
// else { // VALID case
// matrix[zIndex] = patch[xIndex]; //outMatrix->p<T>(i, j, pos++, patch->e<T>(row, col, pixel));
// }
// pos++;
// }
// }
// }
// __syncthreads();
// }
// }
template <typename T> template <typename T>
static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) { static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) {
auto start = threadIdx.x + blockIdx.x * blockDim.x; auto start = threadIdx.x + blockIdx.x * blockDim.x;
auto step = blockDim.x * gridDim.x; auto step = blockDim.x * gridDim.x;
// batch input by 3 last dims and extrapole input onto output with outColDim/outRowDim
for (Nd4jLong batch = start; batch < batchCount; batch += step) { for (Nd4jLong batch = start; batch < batchCount; batch += step) {
auto patch = input + inputOffsets[batch];// listOfMatricies->at(batch); auto patch = input + inputOffsets[batch];// listOfMatricies->at(batch);
auto outMatrix = output + outputOffsets[batch]; //listOfOutputs->at(batch); auto outMatrix = output + outputOffsets[batch]; //listOfOutputs->at(batch);
@ -98,7 +60,6 @@ namespace helpers {
for (Nd4jLong i = 0; i < outRowDim; i++) { for (Nd4jLong i = 0; i < outRowDim; i++) {
for (Nd4jLong j = 0; j < outColDim; j++) { for (Nd4jLong j = 0; j < outColDim; j++) {
Nd4jLong pos = 0; Nd4jLong pos = 0;
//for (Nd4jLong k = 0; k < outputLastDim; k++) {
auto rowStart = i * strideRow - (theSame?rowCast:0); auto rowStart = i * strideRow - (theSame?rowCast:0);
auto colStart = j * strideCol - (theSame?colCast:0); auto colStart = j * strideCol - (theSame?colCast:0);
auto rowEnd = rowStart + sizeRow * rateRow; auto rowEnd = rowStart + sizeRow * rateRow;
@ -107,13 +68,14 @@ namespace helpers {
rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, Nd4jLong (rowDim)); rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, Nd4jLong (rowDim));
colEnd = math::nd4j_min(colStart + sizeCol * rateCol, Nd4jLong (colDim)); colEnd = math::nd4j_min(colStart + sizeCol * rateCol, Nd4jLong (colDim));
} }
//auto pixel = 0LL;
for (auto row = rowStart; row < rowEnd; row += rateRow) for (auto row = rowStart; row < rowEnd; row += rateRow) {
for (auto col = colStart; col < colEnd; col += rateCol) for (auto col = colStart; col < colEnd; col += rateCol) {
for (auto pixel = 0; pixel < lastDim; pixel++) { for (auto pixel = 0; pixel < lastDim; pixel++) {
Nd4jLong zPos[] = {i, j, pos}; Nd4jLong zPos[] = {i, j, pos};
Nd4jLong xPos[] = {row, col, pixel}; Nd4jLong xPos[] = {row, col, pixel};
bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); bool setUp =
(theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame);
if (setUp) { // VALID or SAME cases if (setUp) { // VALID or SAME cases
outMatrix[shape::getOffset(outTadShape, zPos)] = patch[shape::getOffset(patchShape, xPos)]; outMatrix[shape::getOffset(outTadShape, zPos)] = patch[shape::getOffset(patchShape, xPos)];
@ -123,9 +85,12 @@ namespace helpers {
} }
} }
} }
}
}
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
static void _extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int strideRow, int strideCol, int rateRow, int rateCol, bool theSame){ static void _extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int strideRow, int strideCol, int rateRow, int rateCol, bool theSame){
NDArray::prepareSpecialUse({output}, {images}); NDArray::prepareSpecialUse({output}, {images});
@ -141,36 +106,29 @@ namespace helpers {
Nd4jLong colDim = images->sizeAt(2); Nd4jLong colDim = images->sizeAt(2);
Nd4jLong outRowDim = output->sizeAt(1); Nd4jLong outRowDim = output->sizeAt(1);
Nd4jLong outColDim = output->sizeAt(2); Nd4jLong outColDim = output->sizeAt(2);
auto rowCast = 1; //(sizeRow - 1)*rateRow < outRowDim/sizeRow ?0:1;///(ksize * lastDim > rowDim * ksizeColsEffective + lastDim?1:0); auto rowCast = 1;
auto colCast = 1; //colDim / ksizeColsEffective +2 <= sizeCol?0:1;//(ksize * lastDim > ksizeRowsEffective * colDim + lastDim?1:0); auto colCast = 1;
// validate shifts
if (sizeRow * rateRow < 3) if (sizeRow * rateRow < 3)
rowCast = 0; rowCast = 0;
if (sizeCol * rateCol < 3) if (sizeCol * rateCol < 3)
colCast = 0; colCast = 0;
//images->tickReadDevice();
//if (images->isActualOnDeviceSide())
//images->syncToDevice();
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(images->getShapeInfo(), restDims.data(), restDims.size()); auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(images->getShapeInfo(), restDims.data(), restDims.size());
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), restDims.data(), restDims.size()); auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), restDims.data(), restDims.size());
int batchCount = packX.numberOfTads(); //'listOfMatricies->size(); //lengthOf() / ksize; int batchCount = packX.numberOfTads();
//printf("Batch Count is %d\n", batchCount);
//shape::printShapeInfo(packX.primaryShapeInfo());
//NDArray::prepareSpecialUse({output}, {images});
PointersManager manager(context, "helpers::extractPatches"); PointersManager manager(context, "helpers::extractPatches");
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();
auto imagesBuffer = reinterpret_cast<T*>(images->specialBuffer()); auto imagesBuffer = reinterpret_cast<T*>(images->specialBuffer());
auto outputBuffer = reinterpret_cast<T*>(output->specialBuffer()); auto outputBuffer = reinterpret_cast<T*>(output->specialBuffer());
//images->printIndexedBuffer("INPUT");
// globalExtractPatchesKernel<T><<<512, 512, 1024, *context->getCudaStream()>>>(theSame, batchCount, sizeRow, sizeCol,
globalExtractPatchesKernel<T><<<128, 128, 1024, *stream>>>(theSame, batchCount, sizeRow, sizeCol, globalExtractPatchesKernel<T><<<128, 128, 1024, *stream>>>(theSame, batchCount, sizeRow, sizeCol,
rowDim, colDim, outRowDim, outColDim, strideRow, strideCol, rateRow, rateCol, rowCast, colCast, lastDim, rowDim, colDim, outRowDim, outColDim, strideRow, strideCol, rateRow, rateCol, rowCast, colCast, lastDim,
imagesBuffer, packX.specialShapeInfo(), packX.specialOffsets(), outputBuffer, packZ.specialShapeInfo(), imagesBuffer, packX.specialShapeInfo(), packX.specialOffsets(), outputBuffer, packZ.specialShapeInfo(),
packZ.specialOffsets()); packZ.specialOffsets());
//extractPatchesKernel<T><<<batchCount, 512, 1024, *stream>>>(theSame, batchCount, sizeRow, sizeCol, rowDim, colDim, outRowDim, outColDim, stradeRow, stradeCol, rateRow, rateCol, rowCast, colCast, lastDim, imagesBuffer, packX.specialShapeInfo(), packX.platformOffsets(), outputBuffer, packZ.specialShapeInfo(), packZ.platformOffsets());
//output->tickWriteDevice();
//output->printIndexedBuffer("OUTPUT");
manager.synchronize(); manager.synchronize();
NDArray::registerSpecialUse({output}, {images}); NDArray::registerSpecialUse({output}, {images});
} }
@ -183,76 +141,6 @@ namespace helpers {
BUILD_SINGLE_SELECTOR(xType, _extractPatches, (context, images, output, sizeRow, sizeCol, stradeRow, stradeCol, rateRow, rateCol, theSame), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(xType, _extractPatches, (context, images, output, sizeRow, sizeCol, stradeRow, stradeCol, rateRow, rateCol, theSame), LIBND4J_TYPES);
} }
// std::vector<int> restDims({1, 2, 3}); // the first and the last dims
// std::unique_ptr<ResultSet> listOfMatricies(images->allTensorsAlongDimension(restDims));
// std::unique_ptr<ResultSet> listOfOutputs(output->allTensorsAlongDimension(restDims));
// // 3D matricies - 2D matricies of vectors (if last dim is greater than 1)
// //int e = 0;
// const int ksizeRowsEffective = sizeRow + (sizeRow - 1) * (rateRow - 1);
// const int ksizeColsEffective = sizeCol + (sizeCol - 1) * (rateCol - 1);
// const int ksize = ksizeRowsEffective * ksizeColsEffective;
// int batchCount = listOfMatricies->size(); //lengthOf() / ksize;
// Nd4jLong lastDim = images->sizeAt(3);
// Nd4jLong outLastDim = output->sizeAt(3);
// Nd4jLong rowDim = images->sizeAt(1);
// Nd4jLong colDim = images->sizeAt(2);
// Nd4jLong outRowDim = output->sizeAt(1);
// Nd4jLong outColDim = output->sizeAt(2);
// auto rowCast = 1; //(sizeRow - 1)*rateRow < outRowDim/sizeRow ?0:1;///(ksize * lastDim > rowDim * ksizeColsEffective + lastDim?1:0);
// auto colCast = 1; //colDim / ksizeColsEffective +2 <= sizeCol?0:1;//(ksize * lastDim > ksizeRowsEffective * colDim + lastDim?1:0);
// if (sizeRow * rateRow < 3)
// rowCast = 0;
// if (sizeCol * rateCol < 3)
// colCast = 0;
// //Nd4jLong outputLastDim = output->sizeAt(3);
// PRAGMA_OMP_PARALLEL_FOR
// for (Nd4jLong batch = 0; batch < batchCount; batch++) {
// auto patch = listOfMatricies->at(batch);
// auto outMatrix = listOfOutputs->at(batch);
// //auto patchBorder = patch->sizeAt(0);
// if (theSame) { // SAME case
// for (Nd4jLong i = 0; i < outRowDim; i++) {
// for (Nd4jLong j = 0; j < outColDim; j++) {
// Nd4jLong pos = 0;
// //for (Nd4jLong k = 0; k < outputLastDim; k++) {
// auto rowStart = i * strideRow - rowCast;
// auto colStart = j * strideCol - colCast;
// auto rowEnd = rowStart + sizeRow * rateRow;
// auto colEnd = colStart + sizeCol * rateCol;
// auto pixel = 0LL;
// for (auto row = rowStart; row < rowEnd; row += rateRow)
// for (auto col = colStart; col < colEnd; col += rateCol)
// for (auto pixel = 0; pixel < lastDim; pixel++) {
// if (row >=0 && col >= 0 && row < rowDim && col < colDim)
// outMatrix->p<T>(i, j, pos, patch->e<T>(row, col, pixel));
// pos++;
// }
// //}
// }
// }
//
// } else { // VALID case
// for (Nd4jLong i = 0; i < outRowDim; i++) {
// for (Nd4jLong j = 0; j < outColDim; j++) {
// Nd4jLong pos = 0;
// //for (Nd4jLong k = 0; k < outputLastDim; k++) {
// auto rowStart = i * strideRow;
// auto colStart = j * strideCol;
// auto rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim);
// auto colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim);
// auto pixel = 0LL;
// for (auto row = rowStart; row < rowEnd; row += rateRow)
// for (auto col = colStart; col < colEnd; col += rateCol)
// for (auto pixel = 0; pixel < lastDim; pixel++)
// outMatrix->p<T>(i,j,pos++, patch->e<T>(row, col, pixel));
// //}
// }
// }
// }
// }
//
//
//
} }
} }
} }

View File

@ -24,16 +24,26 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fakeQuantWithMinMaxVars_
// input - input tensor
// min - min scalar tensor
// max - max scalar tensor
// numBits - (default 16bit)
// narrowed - shrink is true
// output - output tensor
//
template <typename T> template <typename T>
void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
int lowIntBound = narrowed?1:0; int lowIntBound = narrowed?1:0;
int upperIntBound = 1 << numBits - 1; int upperIntBound = 1 << numBits - 1;
min->syncToHost();
max->syncToHost();
const float quant_min_float = static_cast<float>(lowIntBound); const float quant_min_float = static_cast<float>(lowIntBound);
const float quant_max_float = static_cast<float>(upperIntBound); const float quant_max_float = static_cast<float>(upperIntBound);
T scale = (max->t<T>(0) - min->t<T>(0)) / (quant_max_float - quant_min_float); T scale = (max->t<T>(0) - min->t<T>(0)) / (quant_max_float - quant_min_float);
const T zero_point_from_min = quant_min_float - min->e<T>(0) / scale; const T zero_point_from_min = quant_min_float - min->t<T>(0) / scale;
const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound, const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound,
quant_min_float, upperIntBound, quant_min_float, upperIntBound,
quant_max_float] { quant_max_float] {
@ -48,46 +58,34 @@ namespace helpers {
auto nudged_min = (quant_min_float - nudged_zero_point) * (scale); auto nudged_min = (quant_min_float - nudged_zero_point) * (scale);
auto nudged_max = (quant_max_float - nudged_zero_point) * (scale); auto nudged_max = (quant_max_float - nudged_zero_point) * (scale);
//input->applyScalar(scalar::CompareAndSet, nudged_max, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min);
//input->applyScalar(scalar::CompareAndSet, nudged_min, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min);
auto wiseMax = LAMBDA_T(x, nudged_min) { auto wiseMax = LAMBDA_T(x, nudged_min) {
if (x < nudged_min) { if (x < nudged_min) {
return nudged_min; return nudged_min;
} }
return x; return x;
}; };
auto wiseMin = LAMBDA_T(x, nudged_max) { auto wiseMin = LAMBDA_T(x, nudged_max) {
if (x > nudged_max) { if (x > nudged_max) {
return nudged_max; return nudged_max;
} }
return x; return x;
}; };
auto scaleTensor(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace());
auto clamped(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); auto scaleTensor(*input);
auto clamped(*input);
scaleTensor.assign(scale); scaleTensor.assign(scale);
input->applyLambda(wiseMin, &clamped); input->applyLambda(wiseMin, &clamped);
// const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
clamped.applyLambda(wiseMax, output); clamped.applyLambda(wiseMax, output);
// const auto clamped_shifted = clamped - nudged_min;
*output -= nudged_min; *output -= nudged_min;
// auto nudgedScale = scale;
(*output) /= scaleTensor; (*output) /= scaleTensor;
(*output) += T(0.5f); (*output) += T(0.5f);
output->applyTransform(transform::Floor, nullptr, nullptr); output->applyTransform(transform::Floor, nullptr, nullptr);
(*output) *= scaleTensor; (*output) *= scaleTensor;
(*output) += nudged_min; (*output) += nudged_min;
//output->printIndexedBuffer("FAKE QUANTED");
/*
const auto nudged_scale_repl = inputs.constant(nudged_scale);
const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
const auto clamped_shifted = clamped - nudged_min;
*output = (clamped_shifted / nudged_scale_repl + 0.5f).floor() *
nudged_scale_repl +
nudged_min;
*/
} }
void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {

View File

@ -32,7 +32,13 @@ namespace helpers {
// https://en.wikipedia.org/wiki/Bilinear_interpolation) // https://en.wikipedia.org/wiki/Bilinear_interpolation)
double interpolarValue; double interpolarValue;
}; };
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeInterpolationWeights kernel
// outSize - output length
// inSize - input size
// scale - input scale
// interporationData - result
//
static __global__ void computeInterpolationWeights(Nd4jLong outSize, static __global__ void computeInterpolationWeights(Nd4jLong outSize,
Nd4jLong inSize, Nd4jLong inSize,
double scale, double scale,
@ -54,21 +60,26 @@ namespace helpers {
} }
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resize image with bilinear interpolation algorithm
//
static void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, static void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight,
Nd4jLong outWidth, Nd4jLong channels, Nd4jLong outWidth, Nd4jLong channels,
BilinearInterpolationData* xs_, BilinearInterpolationData* xs_,
BilinearInterpolationData* ys_, BilinearInterpolationData* ys_,
NDArray* output); NDArray* output);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resize image with bilinear interpolation algorithm kernel
//
template <typename T> template <typename T>
static __global__ void resizeImageKernel(T const* input, Nd4jLong const* inputShape, T* outputYptr, Nd4jLong* outputShape, Nd4jLong batchSize, static __global__ void resizeImageKernel(T const* input, Nd4jLong const* inputShape, T* outputYptr, Nd4jLong* outputShape, Nd4jLong batchSize,
Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, Nd4jLong inRowSize, Nd4jLong outRowSize, Nd4jLong inBatchNumValues, Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, Nd4jLong inRowSize, Nd4jLong outRowSize, Nd4jLong inBatchNumValues,
BilinearInterpolationData* xs_, BilinearInterpolationData* ys_) { BilinearInterpolationData* xs_, BilinearInterpolationData* ys_) {
if (blockIdx.x < batchSize) { if (blockIdx.x < batchSize) { // blockIdx.x as batch index
auto pX = input + blockIdx.x * inBatchNumValues; auto pX = input + blockIdx.x * inBatchNumValues;
//auto pZ = output_y_ptr;
auto channelStart = blockIdx.z * blockDim.z + threadIdx.z; auto channelStart = blockIdx.z * blockDim.z + threadIdx.z;
auto step = blockDim.z * gridDim.z; auto step = blockDim.z * gridDim.z;
for (Nd4jLong y = threadIdx.x; y < outHeight; y += blockDim.x) { for (Nd4jLong y = threadIdx.x; y < outHeight; y += blockDim.x) {
@ -80,6 +91,7 @@ namespace helpers {
auto xsBottom = xs_[x].bottomIndex; auto xsBottom = xs_[x].bottomIndex;
auto xsTop = xs_[x].topIndex; auto xsTop = xs_[x].topIndex;
auto xVal = xs_[x].interpolarValue; auto xVal = xs_[x].interpolarValue;
// process interpolation for all channels
for (int c = channelStart; c < channels; c += step) { for (int c = channelStart; c < channels; c += step) {
double topLeft(ys_input_lower_ptr[xsBottom + c]); double topLeft(ys_input_lower_ptr[xsBottom + c]);
double topRight(ys_input_lower_ptr[xsTop + c]); double topRight(ys_input_lower_ptr[xsTop + c]);
@ -87,13 +99,15 @@ namespace helpers {
double bottomRight(ys_input_upper_ptr[xsTop + c]); double bottomRight(ys_input_upper_ptr[xsTop + c]);
double top = topLeft + (topRight - topLeft) * xVal; double top = topLeft + (topRight - topLeft) * xVal;
double bottom = bottomLeft + (bottomRight - bottomLeft) * xVal; double bottom = bottomLeft + (bottomRight - bottomLeft) * xVal;
pZ[x * channels + c] = top + (bottom - top) * yVal; pZ[x * channels + c] = T(top + (bottom - top) * yVal);
} }
} }
} }
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resize image with
template <typename T> template <typename T>
static void resizeImage_(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, static void resizeImage_(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight,
Nd4jLong outWidth, Nd4jLong channels, Nd4jLong outWidth, Nd4jLong channels,
@ -111,6 +125,7 @@ namespace helpers {
outWidth, outHeight, channels, inRowSize, outRowSize, inBatchNumValues, xs_, ys_); outWidth, outHeight, channels, inRowSize, outRowSize, inBatchNumValues, xs_, ys_);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
static int resizeBilinearFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) { static int resizeBilinearFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
const Nd4jLong batchSize = images->sizeAt(0); const Nd4jLong batchSize = images->sizeAt(0);
@ -174,6 +189,10 @@ namespace helpers {
return Status::OK(); return Status::OK();
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resize by interpolation nearest neighbor algorithm kernel
//
template <typename T> template <typename T>
static __global__ void resizeNeighborKernel(T const* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, static __global__ void resizeNeighborKernel(T const* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape,
Nd4jLong batchSize, Nd4jLong inWidth, Nd4jLong inHeight, Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, double widthScale, double heightScale, bool center) { Nd4jLong batchSize, Nd4jLong inWidth, Nd4jLong inHeight, Nd4jLong outWidth, Nd4jLong outHeight, Nd4jLong channels, double widthScale, double heightScale, bool center) {
@ -206,6 +225,9 @@ namespace helpers {
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resizeNeighborFunctor - main algorithm by nearest neighbor
//
template <typename T> template <typename T>
int resizeNeighborFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) { int resizeNeighborFunctor_(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
const Nd4jLong batchSize = images->sizeAt(0); const Nd4jLong batchSize = images->sizeAt(0);
@ -243,10 +265,11 @@ namespace helpers {
batchSize, inWidth, inHeight, outWidth, outHeight, channels, widthScale, heightScale, center); batchSize, inWidth, inHeight, outWidth, outHeight, channels, widthScale, heightScale, center);
NDArray::registerSpecialUse({output}, {images}); NDArray::registerSpecialUse({output}, {images});
return ND4J_STATUS_OK;
return Status::OK(); return Status::OK();
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// resizeImage - resize bilinear algorithm caller
//
void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight,
Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_,
BilinearInterpolationData* ys_, NDArray* output) { BilinearInterpolationData* ys_, NDArray* output) {
@ -257,21 +280,25 @@ namespace helpers {
Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth,
Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, NDArray* output), LIBND4J_TYPES); Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, NDArray* output), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int resizeBilinearFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) { int resizeBilinearFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
BUILD_SINGLE_SELECTOR(images->dataType(), return resizeBilinearFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(images->dataType(), return resizeBilinearFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES);
} }
BUILD_SINGLE_TEMPLATE(template int resizeBilinearFunctor_, (nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template int resizeBilinearFunctor_, (nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int resizeNeighborFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) { int resizeNeighborFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, bool center, NDArray* output) {
BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (context, images, width, height, center, output), LIBND4J_TYPES);
} }
BUILD_SINGLE_TEMPLATE(template int resizeNeighborFunctor_, (nd4j::LaunchContext* context, NDArray const* images, BUILD_SINGLE_TEMPLATE(template int resizeNeighborFunctor_, (nd4j::LaunchContext* context, NDArray const* images,
int width, int height, bool center, NDArray* output), LIBND4J_TYPES); int width, int height, bool center, NDArray* output), LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// --------------------------------------------------------------------------------------------------------------- // // --------------------------------------------------------------------------------------------------------------- //
// Crop and Resize helper implementation // Crop and Resize helper implementation
// --------------------------------------------------------------------------------------------------------------- // // --------------------------------------------------------------------------------------------------------------- //
/////// // cropAndResize kernel
//
template <typename T, typename Z, typename I> template <typename T, typename Z, typename I>
static __global__ void cropAndResizeKernel(T const *images, Nd4jLong* imagesShape, Z const* boxes, Nd4jLong* boxesShape, static __global__ void cropAndResizeKernel(T const *images, Nd4jLong* imagesShape, Z const* boxes, Nd4jLong* boxesShape,
I const* indices, Nd4jLong* indexShape, I const* cropSize, Nd4jLong* cropShape, int method, I const* indices, Nd4jLong* indexShape, I const* cropSize, Nd4jLong* cropShape, int method,
@ -297,7 +324,6 @@ namespace helpers {
Z heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / Z(cropHeight - 1) : Z(0); Z heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / Z(cropHeight - 1) : Z(0);
Z widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / Z(cropWidth - 1) : Z(0); Z widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / Z(cropWidth - 1) : Z(0);
// PRAGMA_OMP_PARALLEL_FOR_SIMD
for (int y = threadIdx.x; y < cropHeight; y += blockDim.x) { for (int y = threadIdx.x; y < cropHeight; y += blockDim.x) {
const float inY = (cropHeight > 1) const float inY = (cropHeight > 1)
? y1 * (imageHeight - 1) + y * heightScale ? y1 * (imageHeight - 1) + y * heightScale
@ -315,6 +341,7 @@ namespace helpers {
} }
continue; continue;
} }
if (method == 0 /* bilinear */) { if (method == 0 /* bilinear */) {
const int topYIndex = nd4j::math::p_floor(inY); const int topYIndex = nd4j::math::p_floor(inY);
const int bottomYIndex = nd4j::math::p_ceil(inY); const int bottomYIndex = nd4j::math::p_ceil(inY);
@ -355,7 +382,6 @@ namespace helpers {
Nd4jLong zPos[] = {b, y, x, d}; Nd4jLong zPos[] = {b, y, x, d};
auto zIndex = shape::getOffset(outputShape, zPos); auto zIndex = shape::getOffset(outputShape, zPos);
output[zIndex] = Z(top + (bottom - top) * y_lerp); output[zIndex] = Z(top + (bottom - top) * y_lerp);
// crops->p(b, y, x, d, top + (bottom - top) * y_lerp);
} }
} }
} else { // method is "nearest neighbor" } else { // method is "nearest neighbor"
@ -383,7 +409,6 @@ namespace helpers {
auto zIndex = shape::getOffset(outputShape, zPos); auto zIndex = shape::getOffset(outputShape, zPos);
auto xIndex = shape::getOffset(imagesShape, xPos); auto xIndex = shape::getOffset(imagesShape, xPos);
output[zIndex] = images[xIndex]; output[zIndex] = images[xIndex];
// crops->p(b, y, x, d, images->e<T>(bIn, closestYIndex, closestXIndex, d));
} }
} }
} }
@ -392,6 +417,17 @@ namespace helpers {
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// cropAndResizeFunctor main algorithm
// context - launch context
// images - batch of images (4D tensor - [batch, width, height, pixels])
// boxes - 2D tensor with boxes for crop
// indices - 2D int tensor with indices of boxes to crop
// cropSize - 2D int tensor with crop box sizes
// method - (one of 0 - bilinear, 1 - nearest)
// extrapolationVal - double value of extrapolation
// crops - output (4D tensor - [batch, outWidth, outHeight, pixels])
//
template <typename T, typename Z, typename I> template <typename T, typename Z, typename I>
static void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, static void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices,
NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) {
@ -416,6 +452,7 @@ namespace helpers {
NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize}); NDArray::registerSpecialUse({crops}, {images, boxes, indices, cropSize});
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) {
BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_,
(context, images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); (context, images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES);

View File

@ -26,7 +26,16 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// needToSuppressWithThreshold - predicate for suppression
// boxes - boxes tensor buffer
// boxesShape boxes tensor shape
// previousIndex - index for current pos value
// nextIndex - index for neighbor pos value
// threshold - threashold value to suppress
//
// return value: true, if threshold is overcome, false otherwise
//
template <typename T> template <typename T>
static __device__ bool needToSuppressWithThreshold(T* boxes, Nd4jLong* boxesShape, int previousIndex, int nextIndex, T threshold) { static __device__ bool needToSuppressWithThreshold(T* boxes, Nd4jLong* boxesShape, int previousIndex, int nextIndex, T threshold) {
Nd4jLong previous0[] = {previousIndex, 0}; Nd4jLong previous0[] = {previousIndex, 0};
@ -38,6 +47,8 @@ namespace helpers {
Nd4jLong next2[] = {nextIndex, 2}; Nd4jLong next2[] = {nextIndex, 2};
Nd4jLong next3[] = {nextIndex, 3}; Nd4jLong next3[] = {nextIndex, 3};
// we have rectangle with given max values. Compute vexes of rectangle first
T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]);
T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]);
T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]);
@ -47,11 +58,14 @@ namespace helpers {
T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]);
T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]);
// compute areas for comparation
T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev);
T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext);
// of course, areas should be positive
if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false; if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false;
// compute intersection of rectangles
T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext); T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext);
T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext); T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext);
T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext); T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext);
@ -60,9 +74,15 @@ namespace helpers {
nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) *
nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f));
T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea); T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea);
// final check
return intersectionValue > threshold; return intersectionValue > threshold;
}; }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// shouldSelectKernel - compute status for all selected rectangles (boxes)
//
// we compute boolean flag as shared uint32 and return it on final only for the first thread
//
template <typename T, typename I> template <typename T, typename I>
static __global__ void shouldSelectKernel(T* boxesBuf, Nd4jLong* boxesShape, I* indexBuf, I* selectedIndicesData, double threshold, int numSelected, int i, bool* shouldSelect) { static __global__ void shouldSelectKernel(T* boxesBuf, Nd4jLong* boxesShape, I* indexBuf, I* selectedIndicesData, double threshold, int numSelected, int i, bool* shouldSelect) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto tid = blockIdx.x * blockDim.x + threadIdx.x;
@ -76,15 +96,20 @@ namespace helpers {
if (shouldSelectShared) { if (shouldSelectShared) {
if (needToSuppressWithThreshold(boxesBuf, boxesShape, indexBuf[i], if (needToSuppressWithThreshold(boxesBuf, boxesShape, indexBuf[i],
indexBuf[selectedIndicesData[j]], T(threshold))) indexBuf[selectedIndicesData[j]], T(threshold)))
atomicCAS(&shouldSelectShared, 1, 0); atomicCAS(&shouldSelectShared, 1, 0); // exchange only when need to suppress
} }
} }
__syncthreads(); __syncthreads();
// final move: collect result
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
*shouldSelect = shouldSelectShared > 0; *shouldSelect = shouldSelectShared > 0;
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// indices - type depended, indicesLong - type defined (only 64bit integers)
//
template <typename I> template <typename I>
static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) { static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) {
I* indexBuf = reinterpret_cast<I*>(indices); I* indexBuf = reinterpret_cast<I*>(indices);
@ -96,7 +121,9 @@ namespace helpers {
for (auto i = tid; i < len; i += step) for (auto i = tid; i < len; i += step)
indexBuf[i] = (I)srcBuf[i]; indexBuf[i] = (I)srcBuf[i];
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// nonMaxSuppressionV2 algorithm - given from TF NonMaxSuppressionV2 implementation
//
template <typename T, typename I> template <typename T, typename I>
static void nonMaxSuppressionV2_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) { static void nonMaxSuppressionV2_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) {
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();
@ -109,8 +136,7 @@ namespace helpers {
Nd4jPointer extras[2] = {nullptr, stream}; Nd4jPointer extras[2] = {nullptr, stream};
sortByValue(extras, indices->buffer(), indices->shapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), scores.buffer(), scores.shapeInfo(), scores.specialBuffer(), scores.specialShapeInfo(), true); sortByValue(extras, indices->buffer(), indices->shapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), scores.buffer(), scores.shapeInfo(), scores.specialBuffer(), scores.specialShapeInfo(), true);
// TO DO: sort indices using scales as value row
//std::sort(indices.begin(), indices.end(), [scales](int i, int j) {return scales->e<T>(i) > scales->e<T>(j);});
auto indexBuf = reinterpret_cast<I*>(indices->specialBuffer()); auto indexBuf = reinterpret_cast<I*>(indices->specialBuffer());
NDArray selectedIndices = NDArrayFactory::create<I>('c', {output->lengthOf()}); NDArray selectedIndices = NDArrayFactory::create<I>('c', {output->lengthOf()});
@ -154,6 +180,7 @@ namespace helpers {
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void nonMaxSuppressionV2(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) { void nonMaxSuppressionV2(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) {
BUILD_DOUBLE_SELECTOR(boxes->dataType(), output->dataType(), nonMaxSuppressionV2_, (context, boxes, scales, maxSize, threshold, output), FLOAT_TYPES, INDEXING_TYPES); BUILD_DOUBLE_SELECTOR(boxes->dataType(), output->dataType(), nonMaxSuppressionV2_, (context, boxes, scales, maxSize, threshold, output), FLOAT_TYPES, INDEXING_TYPES);

View File

@ -26,6 +26,7 @@ namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
linkage void cubeDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { linkage void cubeDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
auto functor = LAMBDA_TT(x, y){ auto functor = LAMBDA_TT(x, y){
@ -35,10 +36,12 @@ namespace helpers {
input->applyPairwiseLambda(epsilon, functor, output); input->applyPairwiseLambda(epsilon, functor, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
BUILD_SINGLE_SELECTOR(theFirst->dataType(), cubeDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(theFirst->dataType(), cubeDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//return (x >= X(0.f) ? y: -y); //return (x >= X(0.f) ? y: -y);
template <typename T> template <typename T>
linkage void reduceNorm1_(NDArray* input, NDArray* epsilon, NDArray* output) { linkage void reduceNorm1_(NDArray* input, NDArray* epsilon, NDArray* output) {
@ -49,10 +52,12 @@ namespace helpers {
input->applyPairwiseLambda(epsilon, functor, output); input->applyPairwiseLambda(epsilon, functor, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
BUILD_SINGLE_SELECTOR(theFirst->dataType(), reduceNorm1_, (theFirst, theSecond, theOutput), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(theFirst->dataType(), reduceNorm1_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
linkage void sigmCrossEntropy_(NDArray* logits, NDArray* labels, NDArray* output) { linkage void sigmCrossEntropy_(NDArray* logits, NDArray* labels, NDArray* output) {
@ -63,10 +68,12 @@ namespace helpers {
logits->applyPairwiseLambda(labels, functor, output); logits->applyPairwiseLambda(labels, functor, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) {
BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropy_, (logits, labels, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropy_, (logits, labels, output), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
linkage void sigmCrossEntropyGrad_(NDArray* logits, NDArray* labels, NDArray* output) { linkage void sigmCrossEntropyGrad_(NDArray* logits, NDArray* labels, NDArray* output) {
@ -80,14 +87,15 @@ namespace helpers {
logits->applyPairwiseLambda(labels, functor, output); logits->applyPairwiseLambda(labels, functor, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) {
BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropyGrad_, (logits, labels, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropyGrad_, (logits, labels, output), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// X f = (X) 1.0f + nd4j::math::nd4j_abs<X>(d1); // X f = (X) 1.0f + nd4j::math::nd4j_abs<X>(d1);
// return (X) d2 * ((X) 1.0f / (f * f)); // return (X) d2 * ((X) 1.0f / (f * f));
//
template <typename T> template <typename T>
linkage void softSignDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { linkage void softSignDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
auto functor = LAMBDA_TT(x, y){ auto functor = LAMBDA_TT(x, y){
@ -98,10 +106,12 @@ namespace helpers {
input->applyPairwiseLambda(epsilon, functor, output); input->applyPairwiseLambda(epsilon, functor, output);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
BUILD_SINGLE_SELECTOR(theFirst->dataType(), softSignDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(theFirst->dataType(), softSignDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
linkage void softPlusDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { linkage void softPlusDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
auto functor = LAMBDA_TT(x, y){ auto functor = LAMBDA_TT(x, y){
@ -115,10 +125,11 @@ namespace helpers {
void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) {
BUILD_SINGLE_SELECTOR(theFirst->dataType(), softPlusDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(theFirst->dataType(), softPlusDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// ///
/// \param theFirst /// \param input
/// \param theSecond /// \param epsilon
/// \param theOutput /// \param output
template <typename T> template <typename T>
linkage void sigmoidDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { linkage void sigmoidDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) {
auto functor = LAMBDA_TT(x, y){ auto functor = LAMBDA_TT(x, y){
@ -146,6 +157,7 @@ namespace helpers {
BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardSigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardSigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
linkage void logSumExp_(NDArray* input, NDArray* axis, NDArray* output) { linkage void logSumExp_(NDArray* input, NDArray* axis, NDArray* output) {
// reduce along axis with // reduce along axis with
@ -178,15 +190,17 @@ namespace helpers {
output->applyTransform(transform::Log, nullptr, nullptr); output->applyTransform(transform::Log, nullptr, nullptr);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) { void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, axis, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, axis, output), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) { void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, subtrah, axis, output), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, subtrah, axis, output), FLOAT_TYPES);
} }
////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
void weightedCrossEntropyWithLogitsFunctor_(NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { void weightedCrossEntropyWithLogitsFunctor_(NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) {
@ -220,7 +234,7 @@ namespace helpers {
const_cast<NDArray*>(input)->applyTriplewiseLambda(const_cast<NDArray*>(targets), targetTensor.get(), mainRoutineT2, output); const_cast<NDArray*>(input)->applyTriplewiseLambda(const_cast<NDArray*>(targets), targetTensor.get(), mainRoutineT2, output);
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) {
NDArray::prepareSpecialUse({output}, {targets, input, weights}); NDArray::prepareSpecialUse({output}, {targets, input, weights});
@ -229,7 +243,6 @@ void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArra
NDArray::registerSpecialUse({output}, {targets, input, weights}); NDArray::registerSpecialUse({output}, {targets, input, weights});
} }
} }
} }
} }

View File

@ -26,7 +26,22 @@
namespace nd4j { namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// matrix band kernel
//
// inputBuffer - buffer of input tensor
// inputShape - shape of input tensor
// outputBuffer - buffer of output tensor
// outputShape - shape of output tensor
// lowerBand - lower band of matrix
// upperBand - upper band of matrix
// tadOnlyInputShapeInfo - TAD shape for input
// tadInputOffsets - TAD offsets for input
// tadOnlyOutputShapeInfo - TAD output shape
// tadOutputOffsets - TAD output offsets
// numTads - number of subarrays
// inputLength - input subarray length
//
template <typename T> template <typename T>
static __global__ void matrixBandKernel(void* inputBuffer, Nd4jLong* inputShape, static __global__ void matrixBandKernel(void* inputBuffer, Nd4jLong* inputShape,
void* outputBuffer, Nd4jLong* outputShape, Nd4jLong lowerBand, Nd4jLong upperBand, Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong* tadInputOffsets, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong lowerBand, Nd4jLong upperBand, Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong* tadInputOffsets,
@ -42,7 +57,7 @@ namespace helpers {
Nd4jLong coords[2] = {i, j}; Nd4jLong coords[2] = {i, j};
Nd4jLong tadOffsetOut = shape::getOffset(tadOnlyOutputShapeInfo, coords); Nd4jLong tadOffsetOut = shape::getOffset(tadOnlyOutputShapeInfo, coords);
Nd4jLong tadOffsetIn = shape::getOffset(tadOnlyInputShapeInfo, coords); Nd4jLong tadOffsetIn = shape::getOffset(tadOnlyInputShapeInfo, coords);
//shape::getIndexOffset(j, tadOnlyOutputShapeInfo)
if (i >= j) { // check lower diagonals if (i >= j) { // check lower diagonals
if (lowerBand > 0) { if (lowerBand > 0) {
if ((i - j) > lowerBand) if ((i - j) > lowerBand)
@ -59,16 +74,14 @@ namespace helpers {
*(reinterpret_cast<T *>(outputBuffer) + xOffset + tadOffsetOut) = *( *(reinterpret_cast<T *>(outputBuffer) + xOffset + tadOffsetOut) = *(
reinterpret_cast<T const *>(inputBuffer) + yOffset + tadOffsetIn); reinterpret_cast<T const *>(inputBuffer) + yOffset + tadOffsetIn);
} }
// if ((i >= j) && (i - j) <= lowerBand && (j - i) <= upperBand) // with in band
// *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffsetOut) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + tadOffsetIn);
//else
// *(reinterpret_cast<T*>(outputBuffer) + xOffset + tadOffsetOut) = T(0);
} }
} }
} }
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// matrixBandPart_ - main algorithm caller
//
template <typename T> template <typename T>
void matrixBandPart_(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { void matrixBandPart_(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) {
dim3 launchDims(256, 512, 8192); dim3 launchDims(256, 512, 8192);
@ -82,17 +95,14 @@ namespace helpers {
const Nd4jLong numTads = packX.numberOfTads(); const Nd4jLong numTads = packX.numberOfTads();
if (!input->isActualOnDeviceSide()) NDArray::prepareSpecialUse({output}, {input});
input->syncToDevice();
if (!input->isActualOnDeviceSide())
input->syncToDevice();
matrixBandKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(), matrixBandKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(input->getSpecialBuffer(),
input->getSpecialShapeInfo(), output->getSpecialBuffer(), output->getSpecialShapeInfo(), input->getSpecialShapeInfo(), output->getSpecialBuffer(), output->getSpecialShapeInfo(),
lowerBand, upperBand, packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets(), numTads, input->lengthOf()); lowerBand, upperBand, packX.specialShapeInfo(), packX.specialOffsets(), packZ.specialShapeInfo(), packZ.specialOffsets(), numTads, input->lengthOf());
NDArray::registerSpecialUse({output}, {input});
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) {
BUILD_SINGLE_SELECTOR(input->dataType(), matrixBandPart_, (context, input, output, lowerBand, upperBand), FLOAT_TYPES); BUILD_SINGLE_SELECTOR(input->dataType(), matrixBandPart_, (context, input, output, lowerBand, upperBand), FLOAT_TYPES);
} }

View File

@ -31,6 +31,8 @@ namespace nd4j {
namespace ops { namespace ops {
namespace helpers { namespace helpers {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// put diagonals from input batched matricies to output batched vectors
template <typename T> template <typename T>
static __global__ void matrixDiagPartKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, static __global__ void matrixDiagPartKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength,
Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets,
@ -42,7 +44,6 @@ namespace helpers {
for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) { for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) {
Nd4jLong coords[2] = {j, j}; Nd4jLong coords[2] = {j, j};
Nd4jLong tadOffset = shape::getOffset(tadOnlyInputShapeInfo, coords); Nd4jLong tadOffset = shape::getOffset(tadOnlyInputShapeInfo, coords);
//shape::getIndexOffset(j, tadOnlyOutputShapeInfo)
*(reinterpret_cast<T*>(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo)) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + tadOffset); *(reinterpret_cast<T*>(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo)) = *(reinterpret_cast<T const*>(inputBuffer) + yOffset + tadOffset);
} }
} }
@ -51,6 +52,7 @@ namespace helpers {
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Returns a batched matrix tensor with new batched diagonal values. // Returns a batched matrix tensor with new batched diagonal values.
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag
//
template <typename T> template <typename T>
int _matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { int _matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();
@ -86,6 +88,9 @@ namespace helpers {
return Status::OK(); return Status::OK();
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// caller for _matrixDiagPart
//
int matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { int matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) {
BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiagPart, (context, input, output), LIBND4J_TYPES); BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiagPart, (context, input, output), LIBND4J_TYPES);
} }

View File

@ -57,7 +57,7 @@ namespace helpers {
Nd4jPointer params[2]; Nd4jPointer params[2];
params[0] = context; params[0] = context;
params[1] = context->getCudaStream(); params[1] = context->getCudaStream();
// Nth element in sorted sequence : basic algorithm sort and retrieve nth element in sorted
if (input->isVector()) { if (input->isVector()) {
sort(params, nullptr, sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), reverse); sort(params, nullptr, sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), reverse);
@ -71,9 +71,7 @@ namespace helpers {
auto pTadShape = packX.specialShapeInfo(); auto pTadShape = packX.specialShapeInfo();
auto pTadShapeH = packX.primaryShapeInfo(); auto pTadShapeH = packX.primaryShapeInfo();
auto pTadOffsets = packX.specialOffsets(); auto pTadOffsets = packX.specialOffsets();
// auto pLastDimData = (int*) manager.replicatePointer(lastDims.data(), lastDims.size() * sizeof(int));
sortTad(params, sortedVals.buffer(), sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), lastDims.data(), lastDims.size(), pTadShape, pTadOffsets, reverse); sortTad(params, sortedVals.buffer(), sortedVals.shapeInfo(), sortedVals.specialBuffer(), sortedVals.specialShapeInfo(), lastDims.data(), lastDims.size(), pTadShape, pTadOffsets, reverse);
// manager.synchronize();
sortedVals.tickWriteDevice(); sortedVals.tickWriteDevice();
sortedVals.syncToHost(); sortedVals.syncToHost();
auto stream = context->getCudaStream(); auto stream = context->getCudaStream();