diff --git a/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp b/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp
index b3063c75d..529446e12 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp
@@ -49,7 +49,7 @@ CUSTOM_OP_IMPL(histogram_fixed_width, 2, 1, false, 0, 0) {
 
 DECLARE_TYPES(histogram_fixed_width) {
     getOpDescriptor()
             ->setAllowedInputTypes(nd4j::DataType::ANY)
-            ->setAllowedOutputTypes({ALL_INTS});
+            ->setAllowedOutputTypes({ALL_INDICES});
 }
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu
index ebde4909c..317f1d857 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu
@@ -27,16 +27,16 @@ namespace ops {
 namespace helpers {
 
 ///////////////////////////////////////////////////////////////////
-template <typename T>
+template <typename X, typename Z>
 __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* xShapeInfo,
                                                       void* vz, const Nd4jLong* zShapeInfo,
-                                                const T leftEdge, const T rightEdge) {
+                                                const X leftEdge, const X rightEdge) {
 
-    const T* x = reinterpret_cast<const T*>(vx);
-    Nd4jLong* z = reinterpret_cast<Nd4jLong*>(vz);
+    const auto x = reinterpret_cast<const X*>(vx);
+          auto z = reinterpret_cast<Z*>(vz);
 
     __shared__ Nd4jLong xLen, zLen, totalThreads, nbins;
-    __shared__ T binWidth, secondEdge, lastButOneEdge;
+    __shared__ X binWidth, secondEdge, lastButOneEdge;
 
     if (threadIdx.x == 0) {
 
@@ -55,7 +55,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong*
 
     for (Nd4jLong i = tid; i < xLen; i += totalThreads) {
 
-        const T value = x[shape::getIndexOffset(i, xShapeInfo, xLen)];
+        const X value = x[shape::getIndexOffset(i, xShapeInfo, xLen)];
 
         Nd4jLong zIndex;
 
@@ -66,18 +66,18 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong*
         else
            zIndex = static_cast<Nd4jLong>((value - leftEdge) / binWidth);
 
-        nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo, nbins)], 1LL);
+        nd4j::math::atomics::nd4j_atomicAdd<Z>(&z[shape::getIndexOffset(zIndex, zShapeInfo, nbins)], 1);
     }
 }
 
///////////////////////////////////////////////////////////////////
-template <typename T>
+template <typename X, typename Z>
 __host__ static void histogramFixedWidthCudaLauncher(const cudaStream_t *stream, const NDArray& input, const NDArray& range, NDArray& output) {
 
-    const T leftEdge = range.e<T>(0);
-    const T rightEdge = range.e<T>(1);
+    const X leftEdge = range.e<X>(0);
+    const X rightEdge = range.e<X>(1);
 
-    histogramFixedWidthCuda<T><<<512, MAX_NUM_THREADS / 2, 512, *stream>>>(input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftEdge, rightEdge);
+    histogramFixedWidthCuda<X, Z><<<256, 256, 1024, *stream>>>(input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftEdge, rightEdge);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -89,7 +89,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con
     PointersManager manager(context, "histogramFixedWidth");
 
     NDArray::prepareSpecialUse({&output}, {&input});
-    BUILD_SINGLE_SELECTOR(input.dataType(), histogramFixedWidthCudaLauncher, (context->getCudaStream(), input, range, output), LIBND4J_TYPES);
+    BUILD_DOUBLE_SELECTOR(input.dataType(), output.dataType(), histogramFixedWidthCudaLauncher, (context->getCudaStream(), input, range, output), LIBND4J_TYPES, INDEXING_TYPES);
     NDArray::registerSpecialUse({&output}, {&input});
 
     manager.synchronize();
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java
index 8fbf0a000..0ec1876ca 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java
@@ -312,7 +312,7 @@ public class AtomicAllocator implements Allocator {
     @Override
     public Pointer getPointer(INDArray array, CudaContext context) {
         //  DataBuffer buffer = array.data().originalDataBuffer() == null ? array.data() : array.data().originalDataBuffer();
-        if (array.isEmpty())
+        if (array.isEmpty() || array.isS())
             return null;
 
         return memoryHandler.getDevicePointer(array.data(), context);
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/flow/impl/SynchronousFlowController.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/flow/impl/SynchronousFlowController.java
index fb4510f1b..d81de381a 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/flow/impl/SynchronousFlowController.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/flow/impl/SynchronousFlowController.java
@@ -172,7 +172,7 @@ public class SynchronousFlowController implements FlowController {
 
         val cId = allocator.getDeviceId();
 
-        if (result != null && !result.isEmpty()) {
+        if (result != null && !result.isEmpty() && !result.isS()) {
             Nd4j.getCompressor().autoDecompress(result);
             prepareDelayedMemory(result);
             val pointData = allocator.getAllocationPoint(result);
@@ -198,7 +198,8 @@ public class SynchronousFlowController implements FlowController {
             return context;
 
         for (INDArray operand : operands) {
-            if (operand == null || operand.isEmpty())
+            // empty or String arrays can be skipped
+            if (operand == null || operand.isEmpty() || operand.isS())
                 continue;
 
             Nd4j.getCompressor().autoDecompress(operand);
diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java
index 26d363f32..cf779f537 100644
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java
@@ -100,7 +100,7 @@ public class CudaOpContext extends BaseOpContext implements OpContext {
     @Override
     public Pointer contextPointer() {
         for (val v:fastpath_in.values()) {
-            if (v.isEmpty())
+            if (v.isEmpty() || v.isS())
                 continue;
 
             AtomicAllocator.getInstance().getAllocationPoint(v).tickHostRead();
@@ -111,7 +111,7 @@
         }
 
         for (val v:fastpath_out.values()) {
-            if (v.isEmpty())
+            if (v.isEmpty() || v.isS())
                 continue;
 
             AtomicAllocator.getInstance().getAllocationPoint(v).tickHostRead();