[WIP] minor fixes (#447)
* couple of tests disabled
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* few syncs removed, some logging added
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* some logging added
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* some logging added
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* fix min num_threads
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
* fixed wrong release function for scalarPointer
  Signed-off-by: raver119@gmail.com <raver119@gmail.com>
master
parent
b786418c5d
commit
f0adb6f788
|
@ -88,7 +88,7 @@ namespace sd {
|
||||||
cudaFree(_allocationPointer);
|
cudaFree(_allocationPointer);
|
||||||
|
|
||||||
if (_scalarPointer != nullptr)
|
if (_scalarPointer != nullptr)
|
||||||
cudaFree(_scalarPointer);
|
cudaFreeHost(_scalarPointer);
|
||||||
|
|
||||||
if (_allocationPointer != nullptr)
|
if (_allocationPointer != nullptr)
|
||||||
cudaFree(_reductionPointer);
|
cudaFree(_reductionPointer);
|
||||||
|
|
|
@ -243,9 +243,6 @@ __host__ void ReduceBoolFunction<X,Z>::intermediateXD(dim3 launchDims, cudaStrea
|
||||||
int *dimension, int dimensionLength,
|
int *dimension, int dimensionLength,
|
||||||
void *reductionPointer,
|
void *reductionPointer,
|
||||||
const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
|
const Nd4jLong *tadShapeInfo, const Nd4jLong *tadOffsets) {
|
||||||
|
|
||||||
nd4j_printf("Step A%i\n", -1);
|
|
||||||
|
|
||||||
if(shape::isEmpty(hXShapeInfo)) {
|
if(shape::isEmpty(hXShapeInfo)) {
|
||||||
|
|
||||||
if(shape::isEmpty(hZShapeInfo))
|
if(shape::isEmpty(hZShapeInfo))
|
||||||
|
|
|
@ -515,8 +515,8 @@ BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &
|
||||||
|
|
||||||
template <bool storeSum, bool isNP2>
|
template <bool storeSum, bool isNP2>
|
||||||
__host__ void prescanLauncher(dim3 &blocks, dim3 &threads, int shmem, cudaStream_t *stream, int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex) {
|
__host__ void prescanLauncher(dim3 &blocks, dim3 &threads, int shmem, cudaStream_t *stream, int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex) {
|
||||||
|
//printf("Prescan grid: <%i/%i/%i>; threads: <%i/%i/%i>; shareMemSize: %i\n", blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shmem);
|
||||||
prescan<storeSum, isNP2><<<blocks, threads, shmem, *stream>>>(g_odata, g_idata, g_blockSums, n, blockIndex, baseIndex);
|
prescan<storeSum, isNP2><<<blocks, threads, shmem, *stream>>>(g_odata, g_idata, g_blockSums, n, blockIndex, baseIndex);
|
||||||
sd::DebugHelper::checkErrorCode(stream, "prescan(...) failed");
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename S, typename T>
|
template <typename S, typename T>
|
||||||
|
|
|
@ -41,8 +41,12 @@ namespace sd {
|
||||||
else
|
else
|
||||||
numThreads = sd::floorPow2(numElements);
|
numThreads = sd::floorPow2(numElements);
|
||||||
|
|
||||||
|
numThreads = sd::math::nd4j_max<int>(1, numThreads);
|
||||||
|
|
||||||
int numEltsPerBlock = numThreads * 2;
|
int numEltsPerBlock = numThreads * 2;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// if this is a non-power-of-2 array, the last block will be non-full
|
// if this is a non-power-of-2 array, the last block will be non-full
|
||||||
// compute the smallest power of 2 able to compute its scan.
|
// compute the smallest power of 2 able to compute its scan.
|
||||||
int numEltsLastBlock =
|
int numEltsLastBlock =
|
||||||
|
@ -102,8 +106,6 @@ namespace sd {
|
||||||
} else {
|
} else {
|
||||||
sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
|
sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
|
static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
|
||||||
|
|
|
@ -119,7 +119,7 @@ TEST_F(CudaBasicsTests1, TestPairwise_1) {
|
||||||
z.tickWriteHost();
|
z.tickWriteHost();
|
||||||
|
|
||||||
for (int e = 0; e < z.lengthOf(); e++) {
|
for (int e = 0; e < z.lengthOf(); e++) {
|
||||||
nd4j_printf("step %i\n", e);
|
//nd4j_printf("step %i\n", e);
|
||||||
ASSERT_NEAR(exp.e<double>(e), z.e<double>(e), 1e-5);
|
ASSERT_NEAR(exp.e<double>(e), z.e<double>(e), 1e-5);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2822,7 +2822,7 @@ TEST_F(CudaBasicsTests1, execSummaryStats_2) {
|
||||||
// delete cuda stream
|
// delete cuda stream
|
||||||
cudaResult = cudaStreamDestroy(stream); ASSERT_EQ(0, cudaResult);
|
cudaResult = cudaStreamDestroy(stream); ASSERT_EQ(0, cudaResult);
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(CudaBasicsTests1, execSummaryStats_3) {
|
TEST_F(CudaBasicsTests1, execSummaryStats_3) {
|
||||||
|
|
||||||
|
@ -2876,6 +2876,7 @@ TEST_F(CudaBasicsTests1, execSummaryStats_3) {
|
||||||
// delete cuda stream
|
// delete cuda stream
|
||||||
cudaResult = cudaStreamDestroy(stream); ASSERT_EQ(0, cudaResult);
|
cudaResult = cudaStreamDestroy(stream); ASSERT_EQ(0, cudaResult);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
TEST_F(CudaBasicsTests1, execSummaryStatsScalar_1) {
|
TEST_F(CudaBasicsTests1, execSummaryStatsScalar_1) {
|
||||||
|
|
|
@ -1054,6 +1054,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test8) {
|
||||||
ASSERT_TRUE(testData.equalsTo(result));
|
ASSERT_TRUE(testData.equalsTo(result));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
TEST_F(DeclarableOpsTests11, ImageResizeArea_Test1) {
|
TEST_F(DeclarableOpsTests11, ImageResizeArea_Test1) {
|
||||||
|
|
||||||
NDArray input = NDArrayFactory::create<double>('c', {1, 3, 3, 4});
|
NDArray input = NDArrayFactory::create<double>('c', {1, 3, 3, 4});
|
||||||
|
@ -1114,6 +1115,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test1) {
|
||||||
ASSERT_TRUE(expected.equalsTo(result));
|
ASSERT_TRUE(expected.equalsTo(result));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST_F(DeclarableOpsTests11, ImageResizeArea_Test2) {
|
TEST_F(DeclarableOpsTests11, ImageResizeArea_Test2) {
|
||||||
|
|
||||||
NDArray input = NDArrayFactory::create<float>('c', {1, 3, 3, 1});
|
NDArray input = NDArrayFactory::create<float>('c', {1, 3, 3, 1});
|
||||||
|
@ -1530,6 +1532,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test15) {
|
||||||
ASSERT_TRUE(expected.isSameShape(result));
|
ASSERT_TRUE(expected.isSameShape(result));
|
||||||
ASSERT_TRUE(expected.equalsTo(result));
|
ASSERT_TRUE(expected.equalsTo(result));
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
TEST_F(DeclarableOpsTests11, summaryStatsData_test1) {
|
TEST_F(DeclarableOpsTests11, summaryStatsData_test1) {
|
||||||
|
|
Loading…
Reference in New Issue