cavis/libnd4j/include/loops/cuda/specials/concatKernel.cu

272 lines
11 KiB
Plaintext

/* ******************************************************************************
*
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
// @author Yurii Shyrma, created on 15.11.2018
//
#include <loops/special_kernels.h>
namespace sd {
///////////////////////////////////////////////////////////////////////
template<typename T>
__device__ void concatKernel(int numArrays,
Nd4jPointer *data, Nd4jPointer *inputShapeInfos,
void *vz, Nd4jLong *resultShapeInfo,
Nd4jPointer *tadPointers, Nd4jPointer *offsetPointers,
Nd4jLong *zTadShape, Nd4jLong *zOffsets) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int zRank = shape::rank(resultShapeInfo);
auto result = reinterpret_cast<T*>(vz);
auto dataT = reinterpret_cast<T **>(data);
auto shapeInfoPointers = reinterpret_cast<Nd4jLong **>(inputShapeInfos);
auto tadShapes = reinterpret_cast<Nd4jLong **>(tadPointers);
auto tadOffsets = reinterpret_cast<Nd4jLong **>(offsetPointers);
//if (threadIdx.x == 0 && blockIdx.x == 0) {
// shape::printShapeInfoLinear("zTadShape", zTadShape);
//}
//__shared__ int tDim[1];
__shared__ int baseIdx;
__shared__ int yLength;
__shared__ char yOrder;
__shared__ int yEWS;
char zOrder = shape::order(resultShapeInfo);
int zEWS = shape::elementWiseStride(resultShapeInfo);
int tadEWS = shape::elementWiseStride(zTadShape);
int zLength = shape::length(resultShapeInfo);
__shared__ int arrOffset;
__shared__ int numTads;
if (shape::isVector(resultShapeInfo)) {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Vector here\n");
if (zEWS >= 1) {
for (int r = blockIdx.x; r < numArrays; r += gridDim.x) {
if(shape::isVector(shapeInfoPointers[r]) || shape::order(shapeInfoPointers[r]) == shape::order(resultShapeInfo)) {
yLength = shape::length(shapeInfoPointers[r]);
yEWS = shape::elementWiseStride(shapeInfoPointers[r]);
// FIXME: this is bad
__shared__ int baseIdx;
if (threadIdx.x == 0) {
baseIdx = 0;
for (int f = 0; f < r; f++) {
baseIdx += shape::length(shapeInfoPointers[f]);
}
}
__syncthreads();
for (int i = threadIdx.x; i < yLength && baseIdx + i < zLength; i += blockDim.x) {
result[baseIdx + i * zEWS] = dataT[r][i * yEWS];
}
__syncthreads();
} else {
if (tid == 0)
printf("Non-matched order for vector\n");
}
}
} else {
if (tid == 0)
printf("Vector Non-1 zEWS\n");
}
return;
}
bool _vec = shape::isVector(resultShapeInfo);
// TODO: to be pulled into separate kernel. matrix concatenation
for (int r = 0; r < numArrays; r ++) {
auto currentShape = shapeInfoPointers[r];
auto currentData = dataT[r];
auto currentTad = tadShapes[r];
auto currentOffsets = tadOffsets[r];
if (threadIdx.x == 0) {
yLength = shape::length(currentTad);
yOrder = shape::order(currentTad);
yEWS = shape::elementWiseStride(currentTad);
numTads = shape::length(currentShape) / yLength;
arrOffset = 0;
for (int f = 0; f < r; f++) {
arrOffset += shape::length(tadShapes[f]);
}
//if (threadIdx.x == 0 && blockIdx.x == 0) {
// shape::printShapeInfoLinear("currentTad", currentTad);
//}
}
__syncthreads();
if (yLength == 1 && _vec) {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch 0\n");
// edge case, each thread will handle it's own tad then
for (int j = tid; j < numTads; j += blockDim.x * gridDim.x) {
Nd4jLong inputOffset = currentOffsets[j];
Nd4jLong resultOffset = zOffsets[j];
T *dataTAD = currentData + inputOffset;
T *resultTAD = result + resultOffset;
int sub[MAX_RANK];
shape::index2coords(arrOffset, zTadShape, sub);
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
resultTAD += baseOffset;
auto yRank = shape::rank(currentTad);
auto tadRank = shape::rank(zTadShape);
shape::index2coords(0, currentTad, sub);
auto yOffset = shape::getOffset(currentTad, sub);
resultOffset = shape::getOffset(zTadShape, sub);
resultTAD[resultOffset] = dataTAD[yOffset];
}
} else {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch 1\n");
for (int j = blockIdx.x; j < numTads; j += gridDim.x) {
auto inputOffset = currentOffsets[j];
auto resultOffset = zOffsets[j];
auto dataTAD = currentData + inputOffset;
auto resultTAD = result + resultOffset;
int sub[MAX_RANK];
shape::index2coords(arrOffset, zTadShape, sub);
Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
resultTAD += baseOffset;
if (zOrder == yOrder && yEWS > 0 && tadEWS > 0) {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch A\n");
for (int i = threadIdx.x; i < yLength; i += blockDim.x) {
resultTAD[i * tadEWS] = dataTAD[i * yEWS];
}
} else {
if(tadEWS > 0 && shape::order(resultShapeInfo) == shape::order(currentTad)) {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch B\n");
if (threadIdx.x == 0) {
baseIdx = 0;
for (int f = 0; f < r; f++) {
baseIdx += shape::length(shapeInfoPointers[f]);
}
//printf("R: %i; baseIdx: %i;\n", baseIdx);
}
__syncthreads();
if (numTads == 1) {
for(int k = threadIdx.x; k < yLength; k+= blockDim.x) {
resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
}
} else {
int yIdx[MAX_RANK];
auto yRank = shape::rank(currentTad);
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
shape::index2coords(i, currentTad, yIdx);
auto yOffset = shape::getOffset(currentTad, yIdx);
resultTAD[baseIdx + i * tadEWS] = dataTAD[yOffset];
}
}
__syncthreads();
} else {
//if (threadIdx.x == 0 && blockIdx.x == 0)
// printf("Branch C; yLength: %i;\n", yLength);
int zIdx[MAX_RANK];
int yIdx[MAX_RANK];
auto yRank = shape::rank(currentTad);
auto tadRank = shape::rank(zTadShape);
for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
shape::index2coords(i, currentTad, yIdx);
shape::index2coords(i, zTadShape, zIdx);
auto yOffset = shape::getOffset(currentTad, yIdx);
auto resultOffset = shape::getOffset(zTadShape, zIdx);
resultTAD[resultOffset] = dataTAD[yOffset];
}
}
}
__syncthreads();
}
}
__syncthreads();
}
}
///////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void execConcatKernel(int numArrays,
Nd4jPointer *data, Nd4jPointer *inputShapeInfos,
void *vz, Nd4jLong *zShapeInfo,
Nd4jPointer *tadPointers, Nd4jPointer *offsetPointers,
Nd4jLong *zTadShape,
Nd4jLong *zOffsets) {
concatKernel<T>(numArrays, data, inputShapeInfos, vz, zShapeInfo, tadPointers, offsetPointers, zTadShape,
zOffsets);
}
///////////////////////////////////////////////////////////////////////
template<typename T>
__host__ void concatKernelGeneric(dim3 &launchDims, cudaStream_t *stream,
int numArrays,
Nd4jPointer *data, Nd4jPointer *inputShapeInfos,
void *vz, Nd4jLong *zShapeInfo,
Nd4jPointer *tadPointers, Nd4jPointer *offsetPointers,
Nd4jLong *zTadShape,
Nd4jLong *zOffsets) {
execConcatKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(numArrays, data, inputShapeInfos, vz, zShapeInfo, tadPointers, offsetPointers, zTadShape, zOffsets);
sd::DebugHelper::checkErrorCode(stream, "concatGenericLegacy(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT concatKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, int numArrays, Nd4jPointer * data, Nd4jPointer * inputShapeInfos, void * vz, Nd4jLong *zShapeInfo, Nd4jPointer * tadPointers, Nd4jPointer * offsetPointers, Nd4jLong * zTadShape, Nd4jLong * zOffsets), LIBND4J_TYPES);
}