542 lines
20 KiB
Plaintext
542 lines
20 KiB
Plaintext
/* ******************************************************************************
|
|
*
|
|
*
|
|
* This program and the accompanying materials are made available under the
|
|
* terms of the Apache License, Version 2.0 which is available at
|
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
*
|
|
* See the NOTICE file distributed with this work for additional
|
|
* information regarding copyright ownership.
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
* License for the specific language governing permissions and limitations
|
|
* under the License.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
******************************************************************************/
|
|
|
|
//
|
|
//
|
|
//
|
|
|
|
#include <loops/type_conversions.h>
|
|
#include <types/types.h>
|
|
#include <helpers/DebugHelper.h>
|
|
|
|
namespace sd {
|
|
template<typename S, typename T>
|
|
void TypeCast::convertGenericCuda(Nd4jPointer *extras, void *dx, Nd4jLong N, void *dz) {
|
|
auto stream = reinterpret_cast<cudaStream_t *>(&extras[1]);
|
|
|
|
sd::convertKernel<S, T><<<256, 1024, 1024, *stream>>>(dx, N, dz);
|
|
sd::DebugHelper::checkErrorCode(stream, "convertGeneric(...) failed");
|
|
};
|
|
|
|
|
|
template<typename S, typename T>
|
|
__device__ void convertKernelGeneric(S *x, Nd4jLong N, T *z) {
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
for (Nd4jLong i = tid; i < N; i+= blockDim.x * gridDim.x) {
|
|
// despite it's stupid, it simplifies conversion to bottom dtypes
|
|
// FIXME: get rid of through-float though
|
|
z[i] = static_cast<T>(static_cast<float>(x[i]));
|
|
}
|
|
};
|
|
|
|
|
|
// Define this to more rigorously avoid bank conflicts, even at the lower (root) levels of the tree
|
|
//#define ZERO_BANK_CONFLICTS
|
|
|
|
#ifdef ZERO_BANK_CONFLICTS
|
|
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS))
|
|
#else
|
|
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
|
|
#endif
|
|
|
|
#ifdef CHECK_BANK_CONFLICTS
|
|
#define TEMP(index) CUT_BANK_CHECKER(temp, index)
|
|
#else
|
|
#define TEMP(index) temp[index]
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
template <bool isNP2>
|
|
__device__ void loadSharedChunkFromMem(int *s_data, const int *g_idata, int n, int baseIndex, int& ai, int& bi, int& mem_ai, int& mem_bi, int& bankOffsetA, int& bankOffsetB) {
|
|
int thid = threadIdx.x;
|
|
mem_ai = baseIndex + threadIdx.x;
|
|
mem_bi = mem_ai + blockDim.x;
|
|
|
|
ai = thid;
|
|
bi = thid + blockDim.x;
|
|
|
|
// compute spacing to avoid bank conflicts
|
|
bankOffsetA = CONFLICT_FREE_OFFSET(ai);
|
|
bankOffsetB = CONFLICT_FREE_OFFSET(bi);
|
|
|
|
// Cache the computational window in shared memory
|
|
// pad values beyond n with zeros
|
|
s_data[ai + bankOffsetA] = g_idata[mem_ai];
|
|
|
|
if (isNP2) { // compile-time decision
|
|
s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
|
|
} else {
|
|
s_data[bi + bankOffsetB] = g_idata[mem_bi];
|
|
}
|
|
}
|
|
|
|
template <bool isNP2>
|
|
__device__ void storeSharedChunkToMem(int* g_odata, int* s_data, int n, int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA, int bankOffsetB) {
|
|
__syncthreads();
|
|
|
|
// write results to global memory
|
|
g_odata[mem_ai] = s_data[ai + bankOffsetA];
|
|
if (isNP2) { // compile-time decision
|
|
if (bi < n)
|
|
g_odata[mem_bi] = s_data[bi + bankOffsetB];
|
|
} else {
|
|
g_odata[mem_bi] = s_data[bi + bankOffsetB];
|
|
}
|
|
}
|
|
|
|
template <bool storeSum>
|
|
__device__ void clearLastElement(int* s_data, int *g_blockSums, int blockIndex) {
|
|
if (threadIdx.x == 0)
|
|
{
|
|
int index = (blockDim.x << 1) - 1;
|
|
index += CONFLICT_FREE_OFFSET(index);
|
|
|
|
if (storeSum) { // compile-time decision
|
|
// write this block's total sum to the corresponding index in the blockSums array
|
|
g_blockSums[blockIndex] = s_data[index];
|
|
}
|
|
|
|
// zero the last element in the scan so it will propagate back to the front
|
|
s_data[index] = 0;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
__device__ unsigned int buildSum(int *s_data) {
|
|
unsigned int thid = threadIdx.x;
|
|
unsigned int stride = 1;
|
|
|
|
// build the sum in place up the tree
|
|
for (int d = blockDim.x; d > 0; d >>= 1) {
|
|
__syncthreads();
|
|
|
|
if (thid < d) {
|
|
int i = __mul24(__mul24(2, stride), thid);
|
|
int ai = i + stride - 1;
|
|
int bi = ai + stride;
|
|
|
|
ai += CONFLICT_FREE_OFFSET(ai);
|
|
bi += CONFLICT_FREE_OFFSET(bi);
|
|
|
|
s_data[bi] += s_data[ai];
|
|
}
|
|
|
|
stride *= 2;
|
|
}
|
|
|
|
return stride;
|
|
}
|
|
|
|
__device__ void scanRootToLeaves(int *s_data, unsigned int stride) {
|
|
unsigned int thid = threadIdx.x;
|
|
|
|
// traverse down the tree building the scan in place
|
|
for (int d = 1; d <= blockDim.x; d *= 2) {
|
|
stride >>= 1;
|
|
|
|
__syncthreads();
|
|
|
|
if (thid < d) {
|
|
int i = __mul24(__mul24(2, stride), thid);
|
|
int ai = i + stride - 1;
|
|
int bi = ai + stride;
|
|
|
|
ai += CONFLICT_FREE_OFFSET(ai);
|
|
bi += CONFLICT_FREE_OFFSET(bi);
|
|
|
|
float t = s_data[ai];
|
|
s_data[ai] = s_data[bi];
|
|
s_data[bi] += t;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <bool storeSum>
|
|
__device__ void prescanBlock(int *data, int blockIndex, int *blockSums) {
|
|
int stride = buildSum(data); // build the sum in place up the tree
|
|
clearLastElement<storeSum>(data, blockSums,
|
|
(blockIndex == 0) ? blockIdx.x : blockIndex);
|
|
scanRootToLeaves(data, stride); // traverse down tree to build the scan
|
|
}
|
|
|
|
|
|
template <bool storeSum, bool isNP2>
|
|
__global__ void prescan(int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex) {
|
|
int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
|
|
extern __shared__ int s_data[];
|
|
|
|
// load data into shared memory
|
|
loadSharedChunkFromMem<isNP2>(reinterpret_cast<int *>(s_data), g_idata, n, (baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)):baseIndex, ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
|
|
|
|
// scan the data in each block
|
|
prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
|
|
|
|
// write results to device memory
|
|
storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
|
|
}
|
|
|
|
|
|
__global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex) {
|
|
__shared__ float uni;
|
|
if (threadIdx.x == 0)
|
|
uni = uniforms[blockIdx.x + blockOffset];
|
|
|
|
unsigned int address = __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
|
|
|
|
__syncthreads();
|
|
|
|
// note two adds per thread
|
|
g_data[address] += uni;
|
|
g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
|
|
}
|
|
|
|
/*
|
|
* This kernel does prefix sum in parallel, to calculate offsets for each block
|
|
*/
|
|
template<typename T>
|
|
__device__ inline void encoderKernelP2Generic(void *dx, Nd4jLong n, void *dz) {
|
|
// TODO: to be remove
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/*
|
|
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
|
|
*/
|
|
template<typename T>
|
|
__global__ static void execEncoderKernelP1(const void *dx, Nd4jLong N, void *dz, float threshold) {
|
|
auto x = reinterpret_cast<const T *> (dx);
|
|
auto z = reinterpret_cast<int *> (dz);
|
|
|
|
//basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
|
|
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
|
|
int bp=__syncthreads_count(pass);
|
|
|
|
if (threadIdx.x == 0) {
|
|
// saving out per-block passes
|
|
z[blockIdx.x+1] = bp;
|
|
|
|
// saving out sum
|
|
atomicAdd(&z[0], bp);
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *dz, float threshold) {
|
|
|
|
execEncoderKernelP1<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz, threshold);
|
|
sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed");
|
|
}
|
|
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES);
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/*
|
|
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
|
|
*
|
|
* Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
|
|
*/
|
|
template<typename T>
|
|
__global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
|
|
auto x = reinterpret_cast<T *> (dx);
|
|
auto z = reinterpret_cast<int *> (dz);
|
|
|
|
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
extern __shared__ int warpTotals[];
|
|
|
|
// fetch block offset only once
|
|
__shared__ float threshold;
|
|
__shared__ FloatBits fb;
|
|
__shared__ int bo;
|
|
__shared__ int limit;
|
|
if (threadIdx.x == 0) {
|
|
limit = z[0];
|
|
fb.i_ = z[2];
|
|
threshold = fb.f_;
|
|
bo = offsets[blockIdx.x];
|
|
}
|
|
__syncthreads();
|
|
|
|
// out-of-limit threads do not play here
|
|
auto value = tid < N ? x[tid] : (T) 0.f;
|
|
|
|
// out-of-limit threads just declare they have no changes
|
|
auto pred = tid >= N ? 0 : sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
|
|
auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS
|
|
auto t_i = threadIdx.x % warpSize; // thread index within a warp
|
|
unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required)
|
|
|
|
int b = __ballot_sync(t_m, pred); // balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
|
|
auto t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
|
|
|
|
if (t_i == warpSize - 1)
|
|
warpTotals[w_i] = t_u + pred;
|
|
|
|
__syncthreads();
|
|
|
|
|
|
int w_i_u = 0;
|
|
for (int j = 0; j <= 5; j++) {
|
|
unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
|
|
w_i_u += (__popc(b_j) << j);
|
|
}
|
|
|
|
// we just ignore all results coming from non-0 threads
|
|
if (w_i == 0 && t_i < blockDim.x / warpSize)
|
|
warpTotals[t_i] = w_i_u;
|
|
|
|
__syncthreads();
|
|
|
|
|
|
// pred is always false if we're out-of-limits
|
|
if (pred) {
|
|
int idx = t_u + warpTotals[w_i] + bo + 4;
|
|
if (idx < limit + 4) {
|
|
z[idx] = value > static_cast<T>(0.0f) ? tid + 1 : -(tid + 1);
|
|
x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
|
|
}
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz) {
|
|
execEncoderKernelP3<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, offsets, N, dz);
|
|
sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed");
|
|
}
|
|
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES);
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/*
|
|
* This kernel handles decode from sparse threshold array, to dense array
|
|
*
|
|
* PLEASE NOTE: Z is expected to be memset to 0
|
|
*/
|
|
template<typename T>
|
|
__global__ static void execDecoderKernel(const void *dx, Nd4jLong N, void *dz) {
|
|
auto x = reinterpret_cast<const int *> (dx);
|
|
auto z = reinterpret_cast<T *> (dz);
|
|
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
__shared__ float threshold;
|
|
__shared__ int limit;
|
|
|
|
__shared__ FloatBits fb;
|
|
if (threadIdx.x == 0) {
|
|
limit = x[0];
|
|
fb.i_ = x[2];
|
|
threshold = fb.f_;
|
|
}
|
|
__syncthreads();
|
|
|
|
for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
|
|
int el = x[e+4];
|
|
int ael = sd::math::nd4j_abs<int>(el) - 1;
|
|
|
|
// TODO: investigate, if += would work better here, as in "decoded accumulation"
|
|
z[ael] += el > 0 ? threshold : -threshold;
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *dz) {
|
|
|
|
execDecoderKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz);
|
|
sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed");
|
|
}
|
|
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *dz), FLOAT_TYPES);
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
|
|
auto dx = reinterpret_cast<T *>(vdx);
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
T off(0.0f);
|
|
__shared__ int counter;
|
|
__shared__ int *shmem;
|
|
__shared__ T *vals;
|
|
if (threadIdx.x == 0){
|
|
extern __shared__ char mem[];
|
|
shmem = reinterpret_cast<int*>(mem);
|
|
vals = reinterpret_cast<T *>(shmem + blockDim.x);
|
|
counter = 0;
|
|
}
|
|
__syncthreads();
|
|
|
|
Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
|
|
Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
|
|
|
|
for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
|
|
// all threads in block reading stuff
|
|
T val = i < N ? dx[i] : off;
|
|
T abs = sd::math::nd4j_abs<T>(val);
|
|
|
|
int byteId = i / 16 + 4;
|
|
int bitId = i % 16;
|
|
|
|
shmem[threadIdx.x] = 0;
|
|
vals[threadIdx.x] = val;
|
|
|
|
if (abs >= static_cast<T>(threshold) && i < N) {
|
|
shmem[threadIdx.x] = 1 << (bitId);
|
|
atomicAdd(&counter, 1);
|
|
if (val < static_cast<T>(0.0f)) {
|
|
shmem[threadIdx.x] |= 1 << (bitId + 16);
|
|
vals[threadIdx.x] += static_cast<T>(threshold);
|
|
} else {
|
|
vals[threadIdx.x] -= static_cast<T>(threshold);
|
|
}
|
|
} else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
|
|
atomicAdd(&counter, 1);
|
|
shmem[threadIdx.x] = 1 << (bitId + 16);
|
|
|
|
vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
|
|
}
|
|
__syncthreads();
|
|
|
|
if (threadIdx.x % 16 == 0 && i < N) {
|
|
int byte = 0;
|
|
for (int e = 0; e < 16; e++) {
|
|
if (i + e >= N)
|
|
continue;
|
|
|
|
byte |= shmem[threadIdx.x + e];
|
|
}
|
|
dz[byteId] = byte;
|
|
}
|
|
__syncthreads();
|
|
|
|
if (i < N)
|
|
dx[i] = vals[threadIdx.x];
|
|
}
|
|
__syncthreads();
|
|
|
|
if (threadIdx.x == 0) {
|
|
atomicAdd(scalar, counter);
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
|
|
|
|
execCudaEncodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, N, dz, scalar, reductionBuffer, threshold);
|
|
sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed");
|
|
}
|
|
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES);
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__global__ static void execCudaDecodeBitmapKernel(const void *dx, Nd4jLong N, void *vdz) {
|
|
auto dz = static_cast<T*>(vdz);
|
|
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
__shared__ T *shmem;
|
|
__shared__ FloatBits fb;
|
|
__shared__ float threshold;
|
|
__shared__ const int *x;
|
|
if (threadIdx.x == 0){
|
|
extern __shared__ char mem[];
|
|
shmem = reinterpret_cast<T*>(mem);
|
|
x = reinterpret_cast<const int *>(dx);
|
|
fb.i_ = x[2];
|
|
threshold = fb.f_;
|
|
}
|
|
__syncthreads();
|
|
|
|
int lim = N / 16 + 5;
|
|
for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
|
|
int byteId = i / 16 + 4;
|
|
// printf("I: [%i]; byteId: [%i]\n", i, byteId);
|
|
|
|
shmem[threadIdx.x] = dz[i];
|
|
__syncthreads();
|
|
|
|
if (threadIdx.x % 16 == 0) {
|
|
int byte = x[byteId];
|
|
|
|
for (int e = 0; e < 16; e++) {
|
|
if (i + e >= N)
|
|
continue;
|
|
|
|
int bitId = (i + e) % 16;
|
|
|
|
bool hasBit = (byte & 1 << (bitId) ) != 0;
|
|
bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
|
|
|
|
if (hasBit) {
|
|
if (hasSign)
|
|
shmem[threadIdx.x + bitId] -= threshold;
|
|
else
|
|
shmem[threadIdx.x + bitId] += threshold;
|
|
} else if (hasSign) {
|
|
shmem[threadIdx.x + bitId] -= threshold / 2;
|
|
}
|
|
}
|
|
}
|
|
__syncthreads();
|
|
|
|
dz[i] = shmem[threadIdx.x];
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
template<typename T>
|
|
__host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *vdz) {
|
|
|
|
execCudaDecodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, vdz);
|
|
sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed");
|
|
}
|
|
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, const void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES);
|
|
|
|
|
|
template <bool storeSum, bool isNP2>
|
|
__host__ void prescanLauncher(dim3 &blocks, dim3 &threads, int shmem, cudaStream_t *stream, int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex) {
|
|
//printf("Prescan grid: <%i/%i/%i>; threads: <%i/%i/%i>; shareMemSize: %i\n", blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shmem);
|
|
shmem = sd::math::nd4j_max<int>(shmem, 16384);
|
|
prescan<storeSum, isNP2><<<blocks, threads, shmem, *stream>>>(g_odata, g_idata, g_blockSums, n, blockIndex, baseIndex);
|
|
};
|
|
|
|
template <typename S, typename T>
|
|
__global__ void convertKernel(void *dx, Nd4jLong N, void *dz) {
|
|
auto x = reinterpret_cast<S *>(dx);
|
|
auto z = reinterpret_cast<T *>(dz);
|
|
|
|
sd::convertKernelGeneric(x, N, z);
|
|
}
|
|
|
|
|
|
#define LIBND4J_BOOLS_LOCAL \
|
|
(randomName0, 0), \
|
|
(randomName1, 1)
|
|
|
|
BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGenericCuda, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES_EXTENDED, LIBND4J_TYPES_EXTENDED);
|
|
BUILD_DOUBLE_TEMPLATE(template void prescanLauncher, (dim3 &blocks, dim3 &threads, int shmem, cudaStream_t *stream, int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex), LIBND4J_BOOLS_LOCAL, LIBND4J_BOOLS_LOCAL);
|
|
|
|
#undef LIBND4J_BOOLS_LOCAL
|
|
} |