diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java index bae025caf..cc85e4b47 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java @@ -23,9 +23,13 @@ import org.deeplearning4j.optimize.solvers.accumulation.EncodedGradientsAccumula import org.deeplearning4j.optimize.solvers.accumulation.EncodingHandler; import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.FixedThresholdAlgorithm; import org.junit.Test; +import org.nd4j.linalg.api.concurrency.AffinityManager; import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.util.PrintAffinity; import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.nativeblas.OpaqueDataBuffer; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; /** @@ -93,12 +97,13 @@ public class EncodedGradientsAccumulatorTest extends BaseDL4JTest { } - EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, null, false); + EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, Integer.MAX_VALUE, false); for (int e = 10; e < numParams / 5; e++) { - INDArray encoded = handler.encodeUpdates(0, 0, getGradients(numParams, e, 2e-3)); + val gradients = getGradients(numParams, e, 2e-3); + val encoded = handler.encodeUpdates(0, 0, gradients); - // log.info("enc len: {}", encoded.data().length()); + assertNotNull("Failed with e == " + e, encoded); int encFormat = encoded.data().getInt(3); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java index 146fba9bb..900951e51 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java @@ -69,7 +69,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist protected ThreadLocal index = new ThreadLocal<>(); protected long initialMemory = 100 * 1024 * 1024L; protected int queueSize = 5; - protected Double boundary = 1.0; + protected Integer boundary = Integer.MAX_VALUE; protected boolean encodingDebugMode; protected IndexedTail externalSource; @@ -101,11 +101,11 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist } public EncodedGradientsAccumulator(int parties, ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, boolean encodingDebugMode) { - this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, 1.0, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, 1.0, encodingDebugMode); + this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, Integer.MAX_VALUE, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, Integer.MAX_VALUE, encodingDebugMode); } public EncodedGradientsAccumulator(int parties, @NonNull MessageHandler handler, long initialMemory, - int queueSize, Double boundary, boolean encodingDebugMode) { + int queueSize, Integer boundary, boolean encodingDebugMode) { this.parties = parties; this.handler = handler; this.initialMemory = initialMemory; @@ -551,7 +551,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist protected long initialMemory = DEFAULT_INITIAL_MEMORY; protected int queueSize = 5; protected MessageHandler handler; - protected Double boundary = null; + protected int boundary = Integer.MAX_VALUE; protected boolean encodingDebugMode; /** @@ -598,15 +598,12 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist /** * This method enables optional limit for max number of updates per message * - * Default value: 1.0 (no limit) + * Default value: Integer.MAX_VALUE (no limit) * @param boundary positive value in range 0..1 * @return */ - public Builder updatesBoundary(double boundary) { - if (boundary >= 1.0) - return this; - - if (boundary <= 0.0) + public Builder updatesBoundary(int boundary) { + if (boundary <= 0) throw new DL4JInvalidConfigException("Boundary should have positive value"); this.boundary = boundary; diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java index 24a46117a..c451ecd6a 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java @@ -16,6 +16,7 @@ package org.deeplearning4j.optimize.solvers.accumulation; +import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.shade.guava.util.concurrent.AtomicDouble; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; @@ -24,7 +25,6 @@ import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgori import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgorithmReducer; import org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.compression.NDArrayCompressor; import org.nd4j.linalg.exception.ND4JIllegalStateException; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.ops.transforms.Transforms; @@ -54,9 +54,8 @@ public class EncodingHandler implements MessageHandler { protected ThresholdAlgorithm initialThresholdAlgorithm; protected ResidualPostProcessor initialResidualPostProcessor; - protected Double boundary; + protected Integer boundary; protected boolean encodingDebugMode; - protected NDArrayCompressor compressor; protected AtomicInteger atomicBoundary = new AtomicInteger(-1); protected ThreadLocal thresholdAlgorithm = new ThreadLocal<>(); @@ -73,20 +72,16 @@ public class EncodingHandler implements MessageHandler { protected final AtomicLong lastThresholdLogTime = new AtomicLong(); public EncodingHandler(final ThresholdAlgorithm thresholdAlgorithm, final ResidualPostProcessor residualPostProcessor, - Double boundary, boolean encodingDebugMode){ + Integer boundary, boolean encodingDebugMode){ this.initialThresholdAlgorithm = thresholdAlgorithm; this.initialResidualPostProcessor = residualPostProcessor; - this.boundary = boundary; + this.boundary = boundary == null ? Integer.MAX_VALUE : boundary; this.encodingDebugMode = encodingDebugMode; } @Override public void initialize(@NonNull GradientsAccumulator accumulator) { this.accumulator = accumulator; - - compressor = Nd4j.getCompressor().getCompressor("THRESHOLD"); - if (compressor == null) - throw new ND4JIllegalStateException("Can't find Threshold compressor implementation!"); } public INDArray encodeUpdates(int iteration, int epoch, INDArray updates) { @@ -135,14 +130,13 @@ public class EncodingHandler implements MessageHandler { iterations.get().incrementAndGet(); if (boundary != null && atomicBoundary.get() < 0) - atomicBoundary.compareAndSet(-1, (int) (updates.length() * boundary)); + atomicBoundary.compareAndSet(-1, (int) (updates.length() / 16) ); INDArray encoded; if (!bitmapMode.get().get()) { //Sparse updates - encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), - boundary == null ? null : atomicBoundary.get()); + encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), boundary == null ? null : atomicBoundary.get()); // updates were TOO sparse, nothing to share here if (encoded == null) { @@ -157,17 +151,14 @@ public class EncodingHandler implements MessageHandler { } - double encLen = encoded.data().getInt(0); + double encLen = encoded.length(); // if updates are too dense - we fallback to bitmap encoding if (encLen >= (updates.length() / 16)) { log.debug("Switching back to bitmapEncoding: iteration {}, epoch {}, threshold {}, encoded length {}", iteration, epoch, currThreshold, encLen); bitmapMode.get().set(true); - DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5); - encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer()); - - Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get()); + encoded = Nd4j.getExecutioner().bitmapEncode(updates, currentThreshold.get().get()); applyPostProcessor(iteration, epoch, currThreshold, updates); lastSparsityRatio.set(null); @@ -186,8 +177,7 @@ public class EncodingHandler implements MessageHandler { } } else { //Dense bitmap updates - DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5); - encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer()); + encoded = Nd4j.create(DataType.INT32, updates.length() / 16 + 5); long values = Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get()); diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java index 50bd6f34e..8d303d391 100644 --- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java +++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java @@ -910,7 +910,7 @@ public class ParallelWrapper implements AutoCloseable { Preconditions.checkState(thresholdAlgorithm != null, "Cannot use SHARED_GRADIENTS training mode without setting a threshold algorithm"); this.trainerContext = new SymmetricTrainerContext(); if (this.accumulator == null) { - log.info("Creating new GradientsAccumulator instance with threshold of [5e-4"); + log.info("Creating new GradientsAccumulator instance with default threshold of [5e-4]"); this.accumulator = new EncodedGradientsAccumulator(workers, thresholdAlgorithm, residualPostProcessor, false); } } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java index 1560d38cd..c2fc97658 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java @@ -45,7 +45,7 @@ public class WiredEncodingHandler extends EncodingHandler { * @param thresholdAlgorithm threshold algorithm to use * @param boundary */ - public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) { + public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) { super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode); } diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java index 3f892cfe7..130526658 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java +++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java @@ -44,7 +44,7 @@ public class WiredEncodingHandler extends EncodingHandler { * * @param thresholdAlgorithm The threshold algorithm to use */ - public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) { + public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) { super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode); } diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index 63a83c05b..9610d2890 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -49,12 +49,12 @@ elseif(WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc") else() - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true") - set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true") + set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC") endif() else() - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true") - set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true") + set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC") if (SD_CPU) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address") @@ -221,21 +221,16 @@ include_directories(${FLATBUFFERS_PATH}/include) configure_file(include/config.h.in include/config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) -if (NOT DEFINED ENV{CLION_IDE}) - message("NOT CLION") - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) - add_subdirectory(blas) - if(SD_BUILD_TESTS) - # tests are always compiled with all ops included - set(SD_ALL_OPS true) - set(SD_BUILD_MINIFIER true) - add_subdirectory(tests_cpu) - endif() -endif () -if ($ENV{CLION_IDE}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +add_subdirectory(blas) +if(SD_BUILD_TESTS) + # tests are always compiled with all ops included + set(SD_ALL_OPS true) + set(SD_BUILD_MINIFIER true) add_subdirectory(tests_cpu) -endif () +endif() + if (MSVC_DEV) set(SD_BUILD_MINIFIER false) diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index a793063bc..a12b70194 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -120,8 +120,13 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # using GCC - SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") + SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -fmax-errors=2 ") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,$ORIGIN/") + + if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32)) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") + endif() endif() @@ -361,11 +366,6 @@ elseif(SD_CPU) endif() endif() - if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32)) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic") - SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") - endif() - install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu) endif() diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp index 2d0ae1144..51339abf1 100644 --- a/libnd4j/include/execution/impl/Threads.cpp +++ b/libnd4j/include/execution/impl/Threads.cpp @@ -571,7 +571,7 @@ namespace samediff { // create temporary array int64_t intermediatery[256]; - auto span = delta / numThreads; + auto span = (numElements / numThreads) - (numElements % numThreads); // execute threads in parallel for (uint32_t e = 0; e < numThreads; e++) { @@ -615,7 +615,7 @@ namespace samediff { // create temporary array double intermediatery[256]; - auto span = delta / numThreads; + auto span = (numElements / numThreads) - (numElements % numThreads); // execute threads in parallel for (uint32_t e = 0; e < numThreads; e++) { diff --git a/libnd4j/include/legacy/NativeOps.h b/libnd4j/include/legacy/NativeOps.h index 665909719..97aa5a402 100755 --- a/libnd4j/include/legacy/NativeOps.h +++ b/libnd4j/include/legacy/NativeOps.h @@ -1432,18 +1432,6 @@ ND4J_EXPORT void tear(Nd4jPointer *extraPointers, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -ND4J_EXPORT Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold); -ND4J_EXPORT void decodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo); - - -ND4J_EXPORT void encodeThresholdP1(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold); -ND4J_EXPORT void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz); -ND4J_EXPORT void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, int *offsets, Nd4jLong N, int *dz); - - -ND4J_EXPORT void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo); - - ND4J_EXPORT void sort(Nd4jPointer *extraPointers, void *x, Nd4jLong *xShapeInfo, void *dx, Nd4jLong *dxShapeInfo, diff --git a/libnd4j/include/legacy/cpu/NativeOps.cpp b/libnd4j/include/legacy/cpu/NativeOps.cpp index 2b394f840..5e810c6de 100644 --- a/libnd4j/include/legacy/cpu/NativeOps.cpp +++ b/libnd4j/include/legacy/cpu/NativeOps.cpp @@ -1436,28 +1436,6 @@ void enableP2P(bool enable) { // no-op } - - -void encodeThresholdP1(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) { - // TODO: to be implemented -} - - -void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *hX, Nd4jLong N, int *dz) { - // TODO: to be implemented -} - - -void encodeThresholdP3(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){ - // offsets won't be used here - - // TODO: to be implemented -} - -void decodeThreshold(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo){ - // TODO: to be implemented -} - bool isP2PAvailable() { // always TRUE for cpu backend return true; @@ -1467,10 +1445,6 @@ void checkP2P() { // no-op } -void decodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo) { - NativeOpExecutioner::decodeBitmap(hX, N, dz, hZShapeInfo); -} - template void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZShapeInfo, int N, int *shuffleMap, Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) { @@ -1859,12 +1833,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, } } -Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) { - return NativeOpExecutioner::encodeBitmap(hX, hXShapeInfo, N, dz, threshold); -} - - - Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) { auto hZ = new Nd4jLong[2];errno = 0; try { diff --git a/libnd4j/include/legacy/cuda/NativeOps.cu b/libnd4j/include/legacy/cuda/NativeOps.cu index 1d6a90126..0f2abca27 100755 --- a/libnd4j/include/legacy/cuda/NativeOps.cu +++ b/libnd4j/include/legacy/cuda/NativeOps.cu @@ -2197,76 +2197,6 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed"); } - -void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) { - try { - cudaStream_t *stream = reinterpret_cast(extras[1]); - - int blockSize = 1024; - int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); - - dim3 launchDims(numBlocks, blockSize, 1024); - auto xType = sd::ArrayOptions::dataType(hXShapeInfo); - BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES); - - sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed"); - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } -} - - - -void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz) { - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - //encoderKernelP2Float<<>>(dx, N, dz); - prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0); - sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed"); - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } -} - -void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){ - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - - int blockSize = 1024; - int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); - - dim3 launchDims(numBlocks, blockSize, 4096); - auto xType = sd::ArrayOptions::dataType(hXShapeInfo); - BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES); - - sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed"); - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } -} - -void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo){ - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - - // we probably want to have smaller blocks here, memory writes are misaligned anyway - int blockSize = 128; - int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); - - dim3 launchDims(numBlocks, blockSize, 1024); - auto zType = sd::ArrayOptions::dataType(zShapeInfo); - BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES); - - sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed"); - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } -} - //////////////////////////////////////////////////////////////////////// void execReduce3All(Nd4jPointer *extraPointers, int opNum, @@ -2603,55 +2533,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, Nd4jLong *indices, void *values, throw std::runtime_error("sortCooIndices:: Not implemented yet"); } - -Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, - void *dx, Nd4jLong *hXShapeInfo, - Nd4jLong N, - int *dz, - float threshold) { - try { - - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int *resultPointer = reinterpret_cast(extraPointers[2]); - int *reductionPointer = reinterpret_cast(extraPointers[3]); - - dim3 launchDims(512, 512, 32768); - auto xType = sd::ArrayOptions::dataType(hXShapeInfo); - BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric, - (launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold), - LIBND4J_TYPES); - - sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed"); - - Nd4jLong dZ = (Nd4jLong) resultPointer[0]; - resultPointer[0] = 0; - - return dZ; - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - return 0; - } -} - - -void decodeBitmap(Nd4jPointer *extraPointers, - void *dx, - Nd4jLong N, - void *dz, Nd4jLong *zShapeInfo) { - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - dim3 launchDims(512, 512, 16384); - auto xType = sd::ArrayOptions::dataType(zShapeInfo); - BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES); - - sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed"); - } catch (std::exception &e) { - sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } -} - Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) { return nullptr; } diff --git a/libnd4j/include/legacy/impl/Environment.cpp b/libnd4j/include/legacy/impl/Environment.cpp index fae3a28dc..b19a7147b 100644 --- a/libnd4j/include/legacy/impl/Environment.cpp +++ b/libnd4j/include/legacy/impl/Environment.cpp @@ -207,7 +207,7 @@ namespace sd { } void Environment::setMaxSpecialyMemory(uint64_t maxBytes) { - _maxTotalSpecialMemory; + _maxTotalSpecialMemory = maxBytes; } void Environment::setMaxDeviceMemory(uint64_t maxBytes) { diff --git a/libnd4j/include/loops/cuda/type_conversions.cu b/libnd4j/include/loops/cuda/type_conversions.cu index 8c38561f4..0f753bc56 100644 --- a/libnd4j/include/loops/cuda/type_conversions.cu +++ b/libnd4j/include/loops/cuda/type_conversions.cu @@ -217,10 +217,27 @@ namespace sd { } ////////////////////////////////////////////////////////////////////////// +/* + * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge. + */ template __global__ static void execEncoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) { + auto x = reinterpret_cast (dx); + auto z = reinterpret_cast (dz); - encoderKernelP1(dx, N, dz, threshold); + //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data + Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; + + int pass = tid < N && sd::math::nd4j_abs(x[tid]) >= static_cast(threshold) ? 1 : 0; + int bp=__syncthreads_count(pass); + + if (threadIdx.x == 0) { + // saving out per-block passes + z[blockIdx.x+1] = bp; + + // saving out sum + atomicAdd(&z[0], bp); + } } ////////////////////////////////////////////////////////////////////////// @@ -230,13 +247,74 @@ __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, voi execEncoderKernelP1<<>>(dx, N, dz, threshold); sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed"); } -BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// +/* + * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge. + * + * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm + */ template __global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) { + auto x = reinterpret_cast (dx); + auto z = reinterpret_cast (dz); - encoderKernelP3(dx, offsets, N, dz); + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ int warpTotals[]; + + // fetch block offset only once + __shared__ float threshold; + __shared__ FloatBits fb; + __shared__ int bo; + __shared__ int limit; + if (threadIdx.x == 0) { + limit = z[0]; + fb.i_ = z[2]; + threshold = fb.f_; + bo = offsets[blockIdx.x]; + } + __syncthreads(); + + // out-of-limit threads do not play here + auto value = tid < N ? x[tid] : (T) 0.f; + + // out-of-limit threads just declare they have no changes + auto pred = tid >= N ? 0 : sd::math::nd4j_abs(value) >= static_cast(threshold) ? 1 : 0; + auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS + auto t_i = threadIdx.x % warpSize; // thread index within a warp + unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required) + + int b = __ballot_sync(t_m, pred); // balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp + auto t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX + + if (t_i == warpSize - 1) + warpTotals[w_i] = t_u + pred; + + __syncthreads(); + + + int w_i_u = 0; + for (int j = 0; j <= 5; j++) { + unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets + w_i_u += (__popc(b_j) << j); + } + + // we just ignore all results coming from non-0 threads + if (w_i == 0 && t_i < blockDim.x / warpSize) + warpTotals[t_i] = w_i_u; + + __syncthreads(); + + + // pred is always false if we're out-of-limits + if (pred) { + int idx = t_u + warpTotals[w_i] + bo + 4; + if (idx < limit + 4) { + z[idx] = value > static_cast(0.0f) ? tid + 1 : -(tid + 1); + x[tid] = value > static_cast(0.0f) ? x[tid] - threshold : x[tid] + threshold; + } + } } ////////////////////////////////////////////////////////////////////////// @@ -245,13 +323,38 @@ __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, voi execEncoderKernelP3<<>>(dx, offsets, N, dz); sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed"); } -BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// +/* + * This kernel handles decode from sparse threshold array, to dense array + * + * PLEASE NOTE: Z is expected to be memset to 0 +*/ template __global__ static void execDecoderKernel(void *dx, Nd4jLong N, void *dz) { + auto x = reinterpret_cast (dx); + auto z = reinterpret_cast (dz); - decoderKernel(dx, N, dz); + int tid = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float threshold; + __shared__ int limit; + + __shared__ FloatBits fb; + if (threadIdx.x == 0) { + limit = x[0]; + fb.i_ = x[2]; + threshold = fb.f_; + } + __syncthreads(); + + for (int e = tid; e < limit; e += blockDim.x * gridDim.x) { + int el = x[e+4]; + int ael = sd::math::nd4j_abs(el) - 1; + + // TODO: investigate, if += would work better here, as in "decoded accumulation" + z[ael] += el > 0 ? threshold : -threshold; + } } ////////////////////////////////////////////////////////////////////////// @@ -261,14 +364,78 @@ __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void execDecoderKernel<<>>(dx, N, dz); sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed"); } -BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// template __global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) { + auto dx = reinterpret_cast(vdx); + int tid = blockIdx.x * blockDim.x + threadIdx.x; - cudaEncodeBitmapKernel(vdx, N, dz, scalar, reductionBuffer, threshold); + T off(0.0f); + __shared__ int counter; + __shared__ int *shmem; + __shared__ T *vals; + if (threadIdx.x == 0){ + extern __shared__ char mem[]; + shmem = reinterpret_cast(mem); + vals = reinterpret_cast(shmem + blockDim.x); + counter = 0; + } + __syncthreads(); + + Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x); + Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder); + + for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) { + // all threads in block reading stuff + T val = i < N ? dx[i] : off; + T abs = sd::math::nd4j_abs(val); + + int byteId = i / 16 + 4; + int bitId = i % 16; + + shmem[threadIdx.x] = 0; + vals[threadIdx.x] = val; + + if (abs >= static_cast(threshold) && i < N) { + shmem[threadIdx.x] = 1 << (bitId); + atomicAdd(&counter, 1); + if (val < static_cast(0.0f)) { + shmem[threadIdx.x] |= 1 << (bitId + 16); + vals[threadIdx.x] += static_cast(threshold); + } else { + vals[threadIdx.x] -= static_cast(threshold); + } + } else if (abs >= static_cast(threshold) / static_cast(2.0f) && val < static_cast(0.0f) && i < N) { + atomicAdd(&counter, 1); + shmem[threadIdx.x] = 1 << (bitId + 16); + + vals[threadIdx.x] += static_cast(threshold) / static_cast(2.0f); + } + __syncthreads(); + + if (threadIdx.x % 16 == 0 && i < N) { + int byte = 0; + for (int e = 0; e < 16; e++) { + if (i + e >= N) + continue; + + byte |= shmem[threadIdx.x + e]; + } + dz[byteId] = byte; + } + __syncthreads(); + + if (i < N) + dx[i] = vals[threadIdx.x]; + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(scalar, counter); + } } ////////////////////////////////////////////////////////////////////////// @@ -278,14 +445,62 @@ __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo execCudaEncodeBitmapKernel<<>>(vdx, N, dz, scalar, reductionBuffer, threshold); sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed"); } -BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// template __global__ static void execCudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) { + auto dz = static_cast(vdz); - cudaDecodeBitmapKernel(dx, N, vdz); + int tid = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ T *shmem; + __shared__ FloatBits fb; + __shared__ float threshold; + __shared__ int *x; + if (threadIdx.x == 0){ + extern __shared__ char mem[]; + shmem = reinterpret_cast(mem); + x = reinterpret_cast(dx); + fb.i_ = x[2]; + threshold = fb.f_; + } + __syncthreads(); + + int lim = N / 16 + 5; + for (int i = tid; i < N; i += blockDim.x * gridDim.x) { + int byteId = i / 16 + 4; +// printf("I: [%i]; byteId: [%i]\n", i, byteId); + + shmem[threadIdx.x] = dz[i]; + __syncthreads(); + + if (threadIdx.x % 16 == 0) { + int byte = x[byteId]; + + for (int e = 0; e < 16; e++) { + if (i + e >= N) + continue; + + int bitId = (i + e) % 16; + + bool hasBit = (byte & 1 << (bitId) ) != 0; + bool hasSign = (byte & 1 << (bitId + 16) ) != 0; + + if (hasBit) { + if (hasSign) + shmem[threadIdx.x + bitId] -= threshold; + else + shmem[threadIdx.x + bitId] += threshold; + } else if (hasSign) { + shmem[threadIdx.x + bitId] -= threshold / 2; + } + } + } + __syncthreads(); + + dz[i] = shmem[threadIdx.x]; + } } ////////////////////////////////////////////////////////////////////////// @@ -295,7 +510,7 @@ __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo execCudaDecodeBitmapKernel<<>>(dx, N, vdz); sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed"); } -BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES); template diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index 16914bd86..8d6729a94 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -106,8 +106,14 @@ namespace sd { auto l = static_cast(N); z[1] = l; +#ifdef _OPENMP int threads = OmpLaunchHelper::betterThreads(N); - int span = OmpLaunchHelper::betterSpan(N, threads); + auto span = OmpLaunchHelper::betterSpan(N, threads); +#else + int threads = 1; + auto span = N; +#endif + T tt = static_cast(threshold); T mtt = -tt; @@ -209,21 +215,23 @@ PRAGMA_OMP_ATOMIC_ARGS(write) samediff::Threads::parallel_for(func, 0, N); }; + template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); - template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); + template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); + template void TypeCast::convertToThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertToThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertToThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); - template void TypeCast::convertToThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); + template void TypeCast::convertToThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); + template void TypeCast::convertFromQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertFromQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertFromQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); - template void TypeCast::convertFromQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); + template void TypeCast::convertToQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertToQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); template void TypeCast::convertToQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); - template void TypeCast::convertToQuantized(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); #ifndef __CLION_IDE__ BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES) diff --git a/libnd4j/include/loops/type_conversions.h b/libnd4j/include/loops/type_conversions.h index ff5ac5400..3d113eefc 100644 --- a/libnd4j/include/loops/type_conversions.h +++ b/libnd4j/include/loops/type_conversions.h @@ -110,29 +110,6 @@ namespace sd { } #ifdef __CUDACC__ - /* - * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge. - */ - template - __device__ inline void encoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) { - auto x = reinterpret_cast (dx); - auto z = reinterpret_cast (dz); - - //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data - Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; - - int pass = tid < N && sd::math::nd4j_abs(x[tid]) >= static_cast(threshold) ? 1 : 0; - int bp=__syncthreads_count(pass); - - if (threadIdx.x == 0) { - // saving out per-block passes - z[blockIdx.x+1] = bp; - - // saving out sum - atomicAdd(&z[0], bp); - } - } - __device__ __inline__ int pow2i (int e){ return 1< __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold); -/* - * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge. - * - * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm - */ - template - __device__ inline void encoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) { - T *x = reinterpret_cast (dx); - int *z = reinterpret_cast (dz); - - Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; - extern __shared__ int warpTotals[]; - - // fetch block offset only once - __shared__ float threshold; - __shared__ FloatBits fb; - __shared__ int bo; - __shared__ int limit; - if (threadIdx.x == 0) { - limit = z[0]; - fb.i_ = z[2]; - threshold = fb.f_; - bo = offsets[blockIdx.x]; - } - __syncthreads(); - - if (tid < N) { - T value = x[tid]; - int pred = sd::math::nd4j_abs(value) >= static_cast(threshold) ? 1 : 0; - int w_i = threadIdx.x/warpSize; //warp index - int w_l = tid % warpSize;//thread index within a warp - unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required) - - int b = __ballot_sync(t_m, pred); //balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp - int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX - - if(w_l==warpSize-1){ - warpTotals[w_i]=t_u+pred; - } -// __syncthreads(); // Eliminated due RTX20xx specific - - if(w_i==0 && w_l static_cast(0.0f) ? tid+1 : -(tid + 1); - x[tid] = value > static_cast(0.0f) ? x[tid] - threshold : x[tid] + threshold; - } - } - } - } template __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz); - /* -* This kernel handles decode from sparse threshold array, to dense array - * - * PLEASE NOTE: Z is expected to be memset to 0 -*/ - template - __device__ inline void decoderKernel(void *dx, Nd4jLong N, void *dz) { - auto x = reinterpret_cast (dx); - auto z = reinterpret_cast (dz); - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float threshold; - __shared__ int limit; - - __shared__ FloatBits fb; - if (threadIdx.x == 0) { - limit = x[0]; - fb.i_ = x[2]; - threshold = fb.f_; - } - __syncthreads(); - - for (int e = tid; e < limit; e += blockDim.x * gridDim.x) { - int el = x[e+4]; - int ael = sd::math::nd4j_abs(el) - 1; - - // TODO: investigate, if += would work better here, as in "decoded accumulation" - z[ael] += el > 0 ? threshold : -threshold; - } - } - template __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz); - -////////////////////////////////////////////////////////////////////////// - template - __device__ inline void cudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) { - - auto dx = reinterpret_cast(vdx); - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - T off(0.0f); - __shared__ int counter; - __shared__ int *shmem; - __shared__ T *vals; - if (threadIdx.x == 0){ - extern __shared__ char mem[]; - shmem = reinterpret_cast(mem); - vals = reinterpret_cast(shmem + blockDim.x); - counter = 0; - } - __syncthreads(); - - Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x); - Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder); - - for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) { - // all threads in block reading stuff - T val = i < N ? dx[i] : off; - T abs = sd::math::nd4j_abs(val); - - int byteId = i / 16 + 4; - int bitId = i % 16; - - shmem[threadIdx.x] = 0; - vals[threadIdx.x] = val; - - if (abs >= static_cast(threshold) && i < N) { - shmem[threadIdx.x] = 1 << (bitId); - atomicAdd(&counter, 1); - if (val < static_cast(0.0f)) { - shmem[threadIdx.x] |= 1 << (bitId + 16); - vals[threadIdx.x] += static_cast(threshold); - } else { - vals[threadIdx.x] -= static_cast(threshold); - } - } else if (abs >= static_cast(threshold) / static_cast(2.0f) && val < static_cast(0.0f) && i < N) { - atomicAdd(&counter, 1); - shmem[threadIdx.x] = 1 << (bitId + 16); - - vals[threadIdx.x] += static_cast(threshold) / static_cast(2.0f); - } - __syncthreads(); - - if (threadIdx.x % 16 == 0 && i < N) { - int byte = 0; - for (int e = 0; e < 16; e++) { - if (i + e >= N) - continue; - - byte |= shmem[threadIdx.x + e]; - } - dz[byteId] = byte; - } - __syncthreads(); - - if (i < N) - dx[i] = vals[threadIdx.x]; - } - __syncthreads(); - - if (threadIdx.x == 0) { - atomicAdd(scalar, counter); - } - } - template __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold); -////////////////////////////////////////////////////////////////////////// - template - __device__ inline void cudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) { - - auto dz = static_cast(vdz); - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ T *shmem; - __shared__ FloatBits fb; - __shared__ float threshold; - __shared__ int *x; - if (threadIdx.x == 0){ - extern __shared__ char mem[]; - shmem = reinterpret_cast(mem); - x = reinterpret_cast(dx); - fb.i_ = x[2]; - threshold = fb.f_; - } - __syncthreads(); - - int lim = N / 16 + 5; - for (int i = tid; i < N; i += blockDim.x * gridDim.x) { - int byteId = i / 16 + 4; -// printf("I: [%i]; byteId: [%i]\n", i, byteId); - - shmem[threadIdx.x] = dz[i]; - __syncthreads(); - - if (threadIdx.x % 16 == 0) { - int byte = x[byteId]; - - for (int e = 0; e < 16; e++) { - if (i + e >= N) - continue; - - int bitId = (i + e) % 16; - - bool hasBit = (byte & 1 << (bitId) ) != 0; - bool hasSign = (byte & 1 << (bitId + 16) ) != 0; - - if (hasBit) { - if (hasSign) - shmem[threadIdx.x + bitId] -= threshold; - else - shmem[threadIdx.x + bitId] += threshold; - } else if (hasSign) { - shmem[threadIdx.x + bitId] -= threshold / 2; - } - } - } - __syncthreads(); - - dz[i] = shmem[threadIdx.x]; - } - } template __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz); - // __global__ void cudaEncodeBitmapFloat(float *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold); - - // __global__ void cudaEncodeBitmapDouble(double *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold); - - // __global__ void cudaEncodeBitmapHalf(float16 *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold); - - // __global__ void cudaDecodeBitmapFloat(void *dx, Nd4jLong N, float *dz); - - // __global__ void cudaDecodeBitmapDouble(void *dx, Nd4jLong N, double *dz); - - // __global__ void cudaDecodeBitmapHalf(void *dx, Nd4jLong N, float16 *dz); - - // __global__ void encoderKernelP1Float(void *dx, Nd4jLong N, void *dz, float threshold); - - // __global__ void encoderKernelP1Double(void *dx, Nd4jLong N, void *dz, float threshold); - - // __global__ void encoderKernelP1Half(void *dx, Nd4jLong N, void *dz, float threshold); - - // __global__ void encoderKernelP2Float(int *dx, Nd4jLong N, int *dz); - - // __global__ void encoderKernelP3Float(void *dx, int *offsets, Nd4jLong N, void *dz); - - // __global__ void encoderKernelP3Double(void *dx, int *offsets, Nd4jLong N, void *dz); - - // __global__ void encoderKernelP3Half(void *dx, int *offsets, Nd4jLong N, void *dz); - - // __global__ void decoderKernelFloat(void *dx, Nd4jLong N, void *dz); - - // __global__ void decoderKernelDouble(void *dx, Nd4jLong N, void *dz); - - // __global__ void decoderKernelHalf(void *dx, Nd4jLong N, void *dz); - __global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex); template diff --git a/libnd4j/include/ops/declarable/CustomOperations.h b/libnd4j/include/ops/declarable/CustomOperations.h index f98deb784..8aa612c7b 100644 --- a/libnd4j/include/ops/declarable/CustomOperations.h +++ b/libnd4j/include/ops/declarable/CustomOperations.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp index 370aa50c6..c9d8c9476 100644 --- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp @@ -138,9 +138,9 @@ DECLARE_SHAPE_FN(matmul) { ////////////////////////////////////////////////////////////////////// DECLARE_TYPES(matmul) { getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_FLOATS}) - ->setAllowedOutputTypes(0, {ALL_FLOATS}); + ->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS}) + ->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS}); } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp b/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp new file mode 100644 index 000000000..d7fab45a6 --- /dev/null +++ b/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author George A. Shulinok +// + +#include +#include +#include + +#if NOT_EXCLUDED(OP_decode_bitmap) +namespace sd { + namespace ops { + CUSTOM_OP_IMPL(decode_bitmap, 2, 1, true, 0, 0) { + auto encoded = INPUT_VARIABLE(1); + auto updates = OUTPUT_VARIABLE(0); + + helpers::decodeBitmap(block.launchContext(), encoded, updates); + return Status::OK(); + } + + DECLARE_SHAPE_FN(decode_bitmap) { + auto weights = INPUT_VARIABLE(0); + + return SHAPELIST(weights->shapeInfo()); + } + + DECLARE_TYPES(decode_bitmap) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, DataType::INT32) + ->setAllowedOutputTypes({ALL_FLOATS}); + } + } +} +#endif + +#if NOT_EXCLUDED(OP_encode_bitmap) +namespace sd { + namespace ops { + CUSTOM_OP_IMPL(encode_bitmap, 1, 3, true, 1, 0) { + auto input = INPUT_VARIABLE(0); + auto encoded = OUTPUT_NULLIFIED(1); + auto counter = OUTPUT_NULLIFIED(2); + + float threshold = T_ARG(0); + + encoded->p(0, (int) input->lengthOf()); + encoded->p(1, (int) input->lengthOf()); + encoded->p(2, reinterpret_cast(&threshold)[0]); + encoded->p(3, 1); // flag for BITMAP_ENCODING + + auto result = helpers::encodeBitmap(block.launchContext(), input, encoded, threshold); + counter->p(0, result); + counter->syncToDevice(); + + return Status::OK(); + } + + DECLARE_SHAPE_FN(encode_bitmap) { + auto input = inputShape->at(0); + + auto outputLength = shape::length(input) / 16 + 5; + auto encodedShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outputLength, DataType::INT32); + auto encodedCounter = ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT32); + return SHAPELIST(input, encodedShape, encodedCounter); + } + + DECLARE_TYPES(encode_bitmap) { + getOpDescriptor() + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, DataType::INT32) + ->setAllowedInputTypes(2, DataType::INT32); + } + } +} +#endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/compression/threshold.cpp b/libnd4j/include/ops/declarable/generic/compression/threshold.cpp new file mode 100644 index 000000000..9512621e8 --- /dev/null +++ b/libnd4j/include/ops/declarable/generic/compression/threshold.cpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +namespace sd { + namespace ops { + CUSTOM_OP_IMPL(encode_threshold, 1, 2, true, 1, 0) { + auto x = INPUT_VARIABLE(0); + auto updated = OUTPUT_VARIABLE(0); + auto encoded = OUTPUT_NULLIFIED(1); + + float threshold = T_ARG(0); + + REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max(), 0, "encode_threshold: gradients array must have length <= MAX_INT"); + REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "encode_threshold: array for encoded updates can't have less than 4 elements"); +// REQUIRE_TRUE(x->platformBuffer() == updated->platformBuffer(), 0, "encode_threshold: gradients array must be the same at input and output"); + + // filling header bytes + encoded->p(0, encoded->lengthOf() - 4); + encoded->p(1, (int) x->lengthOf()); + encoded->p(2, reinterpret_cast(&threshold)[0]); + encoded->p(3, 0); // flag for FLEXIBLE_ENCODING + + // if there's no updates to process - just skip execution + if (encoded->lengthOf() == 4) + return Status::OK(); + + helpers::thresholdEncode(*x, *encoded, threshold); + + return Status::OK(); + } + + DECLARE_SHAPE_FN(encode_threshold) { + auto x = INPUT_VARIABLE(0); + // we have limit option here + int boundary = block.numI() > 0 ? I_ARG(0) : DataTypeUtils::max(); + float threshold = T_ARG(0); + + REQUIRE_TRUE(boundary >= 0, 0, "encode_threshold: boundary must be positive"); + REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max(), 0, "encode_threshold: gradients array must have length <= MAX_INT"); + + // we must calculate number of elements that >= threshold + auto elements = sd::math::nd4j_min(helpers::thresholdEstimate(*x, threshold), boundary); + if (elements < 2) + elements = 0; + + // result array must have 4 additional int elements for header + return SHAPELIST(x->shapeInfo(), sd::ConstantShapeHelper::getInstance()->vectorShapeInfo(elements + 4, DataType::INT32)); + } + + DECLARE_TYPES(encode_threshold) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}) + ->setAllowedOutputTypes(1, DataType::INT32); + } + + CUSTOM_OP_IMPL(decode_threshold, 2, 1, true, 0, 0) { + auto weights = INPUT_VARIABLE(0); + auto encoded = INPUT_VARIABLE(1); + auto updates = OUTPUT_VARIABLE(0); + + REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "decode_threshold: encoded array can't have length < 4"); + REQUIRE_TRUE(updates->lengthOf() == encoded->e(1), 0, "decode_threshold: updates array must have length equal to [%i]", encoded->e(1)); + REQUIRE_TRUE(encoded->e(3) == 0, 0, "decode_threshold: encoded array doesn't look like threshold-encoded"); + + helpers::thresholdDecode(*encoded, *updates); + + return Status::OK(); + } + + DECLARE_SHAPE_FN(decode_threshold) { + auto weights = inputShape->at(0); + return SHAPELIST(weights); + } + + DECLARE_TYPES(decode_threshold) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, DataType::INT32) + ->setAllowedOutputTypes(0,{ALL_FLOATS}); + } + } +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/headers/compression.h b/libnd4j/include/ops/declarable/headers/compression.h new file mode 100644 index 000000000..9c177f8a4 --- /dev/null +++ b/libnd4j/include/ops/declarable/headers/compression.h @@ -0,0 +1,62 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// +#ifndef SD_HEADERS_COMPRESSION_H +#define SD_HEADERS_COMPRESSION_H + +#include + +namespace sd { + namespace ops { + + /** + * encode_bitmap - reinterpret 3D float tensor into uint8_t vector with length N. + * + * Input: + * 0 - 3D float tensor with shape {height, width, channels} + * + * Output: + * 0 - 1D uint8_t tensor with shape {N} + */ + #if NOT_EXCLUDED(OP_encode_bitmap) + DECLARE_CUSTOM_OP(encode_bitmap, 1, 3, true, 1, 0); + #endif + + /** + * decode_bitmap - reinterpret uint8_t linear tensor as data to float tensor with shape + * + * Input: + * 0 - uint8_t vector with length N ( shape {N}) + * + * Output: + * 0 - 3D tensor with shape {height, width, channels} + * + */ + #if NOT_EXCLUDED(OP_decode_bitmap) + DECLARE_CUSTOM_OP(decode_bitmap, 2, 1, true, 0, 0); + #endif + + + DECLARE_CUSTOM_OP(encode_threshold, 2, 1, true, 1, 0); + DECLARE_CUSTOM_OP(decode_threshold, 2, 1, true, 0, 0); + } +} + +#endif // SD_HEADERS_COMPRESSION_H \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/compression.h b/libnd4j/include/ops/declarable/helpers/compression.h new file mode 100644 index 000000000..10eecb09f --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/compression.h @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright (c) 2020 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// +#ifndef __COMPRESSION_H_HELPERS__ +#define __COMPRESSION_H_HELPERS__ +#include +#include + +namespace sd { +namespace ops { +namespace helpers { + + void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output); + Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold); +} +} +} +#endif diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp new file mode 100644 index 000000000..eac25e772 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp @@ -0,0 +1,37 @@ +/******************************************************************************* + * Copyright (c) 2020 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// +#include +#include + +namespace sd { +namespace ops { +namespace helpers { + + void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) { + NativeOpExecutioner::decodeBitmap(input->buffer(), output->lengthOf(), output->buffer(), output->shapeInfo()); + } + + + Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) { + return NativeOpExecutioner::encodeBitmap(input->buffer(), input->shapeInfo(), input->lengthOf(), output->bufferAsT(), threshold); + } +} +} +} diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp new file mode 100644 index 000000000..b1f8f3b42 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp @@ -0,0 +1,62 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +namespace sd { + namespace ops { + namespace helpers { + template + static int32_t thresholdEstimate_(const NDArray &updates, const float threshold) { + auto N = updates.lengthOf(); + const auto buffer = updates.bufferAsT(); + + auto func = PRAGMA_REDUCE_LONG { + int64_t cnt = 0; + for (auto e = start; e < stop; e++) { + auto v = sd::math::nd4j_abs(buffer[e]); + if (v >= threshold) + cnt++; + } + + return cnt; + }; + + return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N); + } + + int32_t thresholdEstimate(const NDArray &updates, const float threshold) { + BUILD_SINGLE_SELECTOR(updates.dataType(), return thresholdEstimate_, (updates, threshold), FLOAT_TYPES); + + return 0; + } + + void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) { + BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertToThreshold, (nullptr, updates.buffer(), updates.lengthOf(), encoded.buffer()), FLOAT_TYPES); + } + + void thresholdDecode(const NDArray &encoded, NDArray &updates) { + BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertFromThreshold, (nullptr, encoded.getBuffer(), updates.lengthOf(), updates.buffer()), FLOAT_TYPES); + } + } + } +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu b/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu new file mode 100644 index 000000000..ecc1a348e --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu @@ -0,0 +1,66 @@ +/******************************************************************************* + * Copyright (c) 2020 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author sgazeos@gmail.com +// +#include +#include +#include + +namespace sd { +namespace ops { +namespace helpers { + void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input}); + + dim3 launchDims(512, 512, 16384); + auto xType = output->dataType(); + BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, input->specialBuffer(), output->lengthOf(), output->specialBuffer()), FLOAT_TYPES); + + sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed"); + + NDArray::registerSpecialUse({output}, {input}); + } + + Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) { + auto stream = LaunchContext::defaultContext()->getCudaStream(); + int *resultPointer = reinterpret_cast(LaunchContext::defaultContext()->getScalarPointer()); + int *reductionPointer = reinterpret_cast(LaunchContext::defaultContext()->getReductionPointer()); + + // nullify result pointer before use + resultPointer[0] = 0; + + NDArray::prepareSpecialUse({},{output, input}); + + dim3 launchDims(512, 512, 32768); + auto xType = input->dataType(); + BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric, + (launchDims, stream, input->specialBuffer(), input->lengthOf(), reinterpret_cast(output->specialBuffer()), resultPointer, reductionPointer, threshold), + FLOAT_TYPES); + + sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed"); + + Nd4jLong dZ = (Nd4jLong) resultPointer[0]; + resultPointer[0] = 0; + + NDArray::registerSpecialUse({output, input}, {}); + return dZ; + } +} +} +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu b/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu new file mode 100644 index 000000000..138970816 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu @@ -0,0 +1,231 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include +#include + +namespace sd { + namespace ops { + namespace helpers { + void prescanArrayRecursive(int** g_scanBlockSums, int *dZ, int *dX, int numElements, int level) { + auto stream = LaunchContext::defaultContext()->getCudaStream(); + + + int blockSize = 512; // max size of the thread blocks + int numBlocks = sd::math::nd4j_max(1, static_cast(ceil(static_cast(numElements) / (2.f * blockSize)))); + int numThreads; + + if (numBlocks > 1) + numThreads = blockSize; + else if (sd::isPowerOfTwo(numElements)) + numThreads = numElements / 2; + else + numThreads = sd::floorPow2(numElements); + + int numEltsPerBlock = numThreads * 2; + + // if this is a non-power-of-2 array, the last block will be non-full + // compute the smallest power of 2 able to compute its scan. + int numEltsLastBlock = + numElements - (numBlocks-1) * numEltsPerBlock; + int numThreadsLastBlock = sd::math::nd4j_max(1, numEltsLastBlock / 2); + int np2LastBlock = 0; + int sharedMemLastBlock = 0; + + if (numEltsLastBlock != numEltsPerBlock) { + np2LastBlock = 1; + + if(!isPowerOfTwo(numEltsLastBlock)) + numThreadsLastBlock = floorPow2(numEltsLastBlock); + + unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS; + sharedMemLastBlock = sizeof(int) * (2 * numThreadsLastBlock + extraSpace); + } + + // padding space is used to avoid shared memory bank conflicts + int extraSpace = numEltsPerBlock / NUM_BANKS; + int sharedMemSize = sizeof(int) * (numEltsPerBlock + extraSpace); + + // setup execution parameters + // if NP2, we process the last block separately + dim3 grid(sd::math::nd4j_max(1, numBlocks - np2LastBlock), 1, 1); + dim3 threads(numThreads, 1, 1); + dim3 gridOnes(1, 1, 1); + dim3 threadsOnes(numThreadsLastBlock, 1, 1); + + if (sharedMemSize < 2048) + sharedMemSize = 2048; + + if (sharedMemLastBlock < 2048) + sharedMemLastBlock = 2048; + + // execute the scan + if (numBlocks > 1) { + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0); + if (np2LastBlock) { + sd::prescanLauncher(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); + } + + // After scanning all the sub-blocks, we are mostly done. But now we + // need to take all of the last values of the sub-blocks and scan those. + // This will give us a new value that must be sdded to each block to + // get the final results. + // recursive (CPU) call + prescanArrayRecursive(g_scanBlockSums, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1); + + sd::uniformAdd<<>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0); + + if (np2LastBlock) { + sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); + } + } else if (isPowerOfTwo(numElements)) { + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0); + } else { + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0); + } + + sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed"); + } + + static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) { + auto stream = LaunchContext::defaultContext()->getCudaStream(); + + prescanArrayRecursive(reinterpret_cast(prs), dz, dx + 1, (int) N, 0); + sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed"); + } + + static void encodeThresholdP3_(void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){ + auto stream = LaunchContext::defaultContext()->getCudaStream(); + + int blockSize = 512; + int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); + + dim3 launchDims(numBlocks, blockSize, 8192); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), FLOAT_TYPES); + + sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed"); + } + + + static NDArray thresholdEstimate_(const NDArray &updates, const float threshold) { + const int numThreads = 512; + const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0); + + auto tmp = NDArrayFactory::create('c', {numBlocks + 1}); + + dim3 launchDims(numBlocks, numThreads, 1024); + auto xType = updates.dataType(); + + NDArray::prepareSpecialUse({&tmp}, {&updates}); + BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, LaunchContext::defaultContext()->getCudaStream(), updates.getSpecialBuffer(), updates.lengthOf(), tmp.specialBuffer(), threshold), FLOAT_TYPES); + NDArray::registerSpecialUse({&tmp}, {&updates}); + + return std::move(tmp); + } + + int32_t thresholdEstimate(const NDArray &updates, const float threshold) { + return thresholdEstimate_(updates, threshold).e(0); + } + + void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) { + // we need these blocks in order to know, how many "updates" will be processed by each GPU block + auto blocks = thresholdEstimate_(updates, threshold); + + const int numThreads = 512; + const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0); + + const int prefixThreads = 512; + int numElts = numBlocks; + int level = 0; + + // here we just calculate number of sumBlock arrays + do { + int numPrefixBlocks = sd::math::nd4j_max(1, sd::math::nd4j_ceil((float) numElts / (2.0f * prefixThreads))); + if (numBlocks > 1) { + level++; + } + numElts = numPrefixBlocks; + } while (numElts > 1); + + + + std::vector tempArrays(level); + std::vector pointers(level); + + level = 0; + numElts = numBlocks; + + do { + int numPrefixBlocks = sd::math::nd4j_max(1, sd::math::nd4j_ceil((float) numElts / (2.0f * prefixThreads))); + if (numPrefixBlocks > 1) { + tempArrays[level] = std::move(NDArrayFactory::create('c', {numPrefixBlocks})); + pointers[level] = tempArrays[level++].specialBuffer(); + } + numElts = numPrefixBlocks; + } while (numElts > 1); + + PointersManager pm(LaunchContext::defaultContext(), "thresholdEncode"); + auto dptr = pm.replicatePointer(pointers.data(), pointers.size() * 8); + auto offsets = NDArrayFactory::create('c', {numBlocks}); + + // we want to check, if we're hiting external limit on number of encoded elements + auto numMatches = blocks.e(0); + if (numMatches > encoded.lengthOf() - 4) { + blocks.p(0, encoded.lengthOf() - 4); + blocks.syncToDevice(); + } + + NDArray::prepareSpecialUse({}, {&encoded, &updates}); + + // filling offsets + encodeThresholdP2Int_(reinterpret_cast(dptr), + reinterpret_cast(blocks.getSpecialBuffer()), + numBlocks, + reinterpret_cast(offsets.getSpecialBuffer())); + + NDArray::registerSpecialUse({&blocks, &offsets}, {}); + pm.synchronize(); + + + encodeThresholdP3_(updates.specialBuffer(), + updates.shapeInfo(), + reinterpret_cast(offsets.getSpecialBuffer()), + updates.lengthOf(), + reinterpret_cast(encoded.specialBuffer())); + + pm.synchronize(); + + NDArray::registerSpecialUse({&encoded, &updates}, {}); + } + + void thresholdDecode(const NDArray &encoded, NDArray &updates) { + dim3 launchDims(128, 512, 512); + auto xType = updates.dataType(); + + NDArray::prepareSpecialUse({&updates}, {&encoded}); + BUILD_SINGLE_SELECTOR(xType, decoderKernelGeneric, (launchDims, LaunchContext::defaultContext()->getCudaStream(), encoded.getSpecialBuffer(), updates.lengthOf(), updates.specialBuffer()), FLOAT_TYPES); + NDArray::registerSpecialUse({&updates}, {&encoded}); + } + } + } +} diff --git a/libnd4j/include/ops/declarable/helpers/threshold.h b/libnd4j/include/ops/declarable/helpers/threshold.h new file mode 100644 index 000000000..21ac0c820 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/threshold.h @@ -0,0 +1,37 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SD_THRESHOLD_H +#define SD_THRESHOLD_H + +#include + +namespace sd { + namespace ops { + namespace helpers { + int32_t thresholdEstimate(const NDArray &updates, float threshold); + + void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold); + void thresholdDecode(const NDArray &encoded, NDArray &updates); + } + } +} + +#endif //SD_THRESHOLD_H diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp index ed86315f7..c9a48f049 100644 --- a/libnd4j/include/ops/impl/specials_single.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -557,21 +557,26 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) fb.i_ = x[2]; float threshold = fb.f_; + auto pPos = -1; auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { + const auto v = x[e]; for (int bitId = 0; bitId < 16; bitId++) { - bool hasBit = (x[e] & 1 << (bitId)) != 0; - bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; + bool hasBit = (v & 1 << (bitId)) != 0; + bool hasSign = (v & 1 << (bitId + 16)) != 0; + auto cPos = (e - 4) * 16 + bitId; if (hasBit) { if (hasSign) - dz[(e - 4) * 16 + bitId] -= static_cast(threshold); + dz[cPos] -= static_cast(threshold); else - dz[(e - 4) * 16 + bitId] += static_cast(threshold); + dz[cPos] += static_cast(threshold); } else if (hasSign) { - dz[(e - 4) * 16 + bitId] -= static_cast(threshold / 2); + dz[cPos] -= static_cast(threshold / 2); } + + pPos = cPos; } } }; @@ -582,17 +587,21 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) template Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); + const T two(2.0f); + const T zero(0.0f); + const T t(threshold); + const T thalf = t / two; -//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) - auto func = PRAGMA_REDUCE_LONG { + //auto func = PRAGMA_REDUCE_LONG { Nd4jLong retVal = 0L; - for (auto x = start; x < stop; x += increment) { + PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:retVal) + for (auto x = 0; x < N; x += 16) { int byte = 0; int byteId = x / 16 + 4; for (int f = 0; f < 16; f++) { - Nd4jLong e = x + f; + auto e = x + f; if (e >= N) continue; @@ -602,19 +611,19 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) int bitId = e % 16; - if (abs >= (T) threshold) { + if (abs >= t) { byte |= 1 << (bitId); retVal++; - if (val < (T) 0.0f) { + if (val < zero) { byte |= 1 << (bitId + 16); - dx[e] += static_cast(threshold); + dx[e] += t; } else { - dx[e] -= static_cast(threshold); + dx[e] -= t; } - } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { + } else if (abs >= thalf && val < zero) { byte |= 1 << (bitId + 16); - dx[e] += static_cast(threshold / 2); + dx[e] += thalf; retVal++; } @@ -624,8 +633,9 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) } return retVal; - }; - return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); + //}; + + //return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } } diff --git a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu index bd024ef3b..f8248e7ea 100644 --- a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu +++ b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu @@ -57,7 +57,7 @@ static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) { multiplyKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - sd::cuda_exception::build("multiply failed", err); + throw sd::cuda_exception::build("multiply failed", err); } template @@ -80,7 +80,7 @@ static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) { sumKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - sd::cuda_exception::build("sum failed", err); + throw sd::cuda_exception::build("sum failed", err); } template @@ -103,7 +103,7 @@ static void subLauncher(void *vbuffer, uint64_t length, void *vresult) { subKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - sd::cuda_exception::build("sub failed", err); + throw sd::cuda_exception::build("sub failed", err); } template @@ -126,7 +126,7 @@ static void divLauncher(void *vbuffer, uint64_t length, void *vresult) { divKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - sd::cuda_exception::build("div failed", err); + throw sd::cuda_exception::build("div failed", err); } static void multiplyHost(NDArray &input, NDArray &output) { diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 07f473dbc..870178152 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -42,18 +42,18 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true") + set(CMAKE_CXX_FLAGS " -fPIC -D__APPLE_OS__=true") elseif(WIN32) if (SD_CPU) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3") endif() if (SD_CPU AND LINUX) - set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") + set(CMAKE_CXX_FLAGS " -fPIC") endif() else() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") - set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") + set(CMAKE_CXX_FLAGS " -fPIC") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() @@ -82,12 +82,12 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # using GCC - SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -endif() + SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=2") -if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW)) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic") - SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") + if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW)) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") + endif() endif() IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux") diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp index f48e3d946..641728ad3 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace sd; @@ -39,6 +40,195 @@ public: } }; +TEST_F(DeclarableOpsTests19, test_threshold_encode_1) { + auto x = NDArrayFactory::create('c', {3}, {1.5, 2.5, -3.5}); + auto exp_encoded = NDArrayFactory::create('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3}); + auto exp_gradients = NDArrayFactory::create('c', {3}, {1.0, 2.0, -3.0}); + + sd::ops::encode_threshold op; + auto result = op.evaluate({&x}, {0.5}); + + auto gradients = result.at(0); + auto encoded = result.at(1); + + //encoded->printIndexedBuffer("ENC"); + + ASSERT_EQ(exp_encoded, *encoded); + ASSERT_EQ(exp_gradients, x); + + // FIXME: we need to add a way to declare individual inplace outputs + //ASSERT_EQ(exp_gradients, *gradients); +} + +TEST_F(DeclarableOpsTests19, test_threshold_encode_2) { + for (int length = 5; length < 35; length++) { + auto x = NDArrayFactory::create('c', {10000}); + auto exp_gradients = NDArrayFactory::create('c', {10000}); + + for (int e = 0; e < length; e++) { + x.p(e, 2e-3); + exp_gradients.p(e, 1e-3); + } + + sd::ops::encode_threshold op; + auto result = op.evaluate({&x}, {1e-3}); + + auto encoded = result.at(1); + + ASSERT_EQ(length + 4, encoded->lengthOf()); + ASSERT_EQ(exp_gradients, x); + } +} + +TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_1) { + auto x = NDArrayFactory::create('c', {6}); + x = 1.0f; + + sd::ops::encode_threshold op; + auto result = op.evaluate({&x}, {1.0}, {3}); + + auto gradients = result.at(0); + auto encoded = result.at(1); + + ASSERT_EQ(7, encoded->lengthOf()); + ASSERT_EQ(3, x.sumNumber().e(0)); +} + +TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_2) { + auto x = NDArrayFactory::create('c', {1000}); + x = 1.0f; + + sd::ops::encode_threshold op; + auto result = op.evaluate({&x}, {1.0}, {100}); + + auto gradients = result.at(0); + auto encoded = result.at(1); + + ASSERT_EQ(104, encoded->lengthOf()); + + ASSERT_EQ(900, x.sumNumber().e(0)); +} + +TEST_F(DeclarableOpsTests19, test_threshold_decode_1) { + auto x = NDArrayFactory::create('c', {3}, {1.0, 2.0, -3.0}); + auto y = NDArrayFactory::create('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3}); + auto exp_gradients = NDArrayFactory::create('c', {3}, {1.5, 2.5, -3.5}); + + sd::ops::decode_threshold op; + auto status = op.execute({&x, &y}, {&x}); + ASSERT_EQ(Status::OK(), status); + ASSERT_EQ(exp_gradients, x); +} + +TEST_F(DeclarableOpsTests19, test_bitmap_encode_1) { + auto initial = NDArrayFactory::create('c', {6}, {0.0f, 0.0f, 1e-3f, -1e-3f, 0.0f, 0.0f}); + auto exp_0 = initial.like(); + auto exp_1 = initial.dup(); + auto exp_c = NDArrayFactory::create(2L); + + sd::ops::encode_bitmap enc; + auto enc_result = enc.evaluate({&initial}, {1e-3f}); + ASSERT_EQ(Status::OK(), enc_result.status()); + + //initial.printIndexedBuffer("initial"); + ASSERT_EQ(exp_0, initial); + + auto encoded = enc_result.at(1); + auto counter = enc_result.at(2); + + //encoded->printIndexedBuffer("encoded"); + + ASSERT_EQ(exp_c, *counter); + + sd::ops::decode_bitmap dec; + auto status = dec.execute({&initial, encoded}, {&initial}); + ASSERT_EQ(Status::OK(), status); + + + //initial.printIndexedBuffer(); + + ASSERT_EQ(exp_1, initial); +} + +TEST_F(DeclarableOpsTests19, test_bitmap_encode_decode) { + auto initial = NDArrayFactory::create('c', {256000}); + initial = 1.0f; + auto exp = initial.dup(); + auto neg = initial.like(); + neg = 0.5f; + + sd::ops::encode_bitmap enc; + auto enc_result = enc.evaluate({&initial}, {0.5f}); + auto encoded = enc_result.at(1); + + // checking equality of all encoded bits + for (int e = 5; e < encoded->lengthOf() - 1; e++) { + if (encoded->e(e) != encoded->e(e - 1)) + nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e(e)); + } + + ASSERT_NE(exp, initial); + ASSERT_EQ(neg, initial); + + sd::ops::decode_bitmap dec; + auto status = dec.execute({&initial, encoded}, {&initial}); + ASSERT_EQ(Status::OK(), status); + + // checking equality of all dedoded bits + for (int e = 0; e < initial.lengthOf(); e++) { + auto f = initial.e(e); + if (f != 1.0f) + nd4j_printf("initial[%i] = %f\n", e, f); + } + + + ASSERT_EQ(exp, initial); +} + +TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) { + auto initial = NDArrayFactory::create('c', {256000}); + initial = 1.0f; + auto exp = initial.dup(); + auto neg = initial.like(); + neg = 0.5f; + + sd::ops::encode_threshold enc; + auto enc_result = enc.evaluate({&initial}, {0.5f}); + auto encoded = enc_result.at(1); + + ASSERT_EQ(256000 + 4, encoded->lengthOf()); + ASSERT_NE(exp, initial); + + for (int e = 0; e < initial.lengthOf(); e++) { + auto f = initial.e(e); + if (f != 0.5f) { + nd4j_printf("initial[%i] = %f\n", e, f); + throw std::runtime_error(""); + } + } + ASSERT_EQ(neg, initial); + + // checking equality of all encoded bits + //for (int e = 5; e < encoded->lengthOf() - 1; e++) { + //if (encoded->e(e) != encoded->e(e - 1) + 1) + //nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e(e)); + //} + + sd::ops::decode_threshold dec; + auto status = dec.execute({&initial, encoded}, {&initial}); + ASSERT_EQ(Status::OK(), status); + + // checking equality of all dedoded bits + for (int e = 0; e < initial.lengthOf(); e++) { + auto f = initial.e(e); + if (f != 1.0f) + nd4j_printf("initial[%i] = %f\n", e, f); + } + + ASSERT_EQ(exp, initial); +} + + TEST_F(DeclarableOpsTests19, test_matmul_ccc) { auto x = NDArrayFactory::create('c', {10, 10}); auto y = NDArrayFactory::create('c', {10, 10}); diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java index 0c6724e9a..a053a40ab 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java @@ -73,6 +73,10 @@ public class ImportClassMapping { org.nd4j.linalg.api.ops.impl.broadcast.BroadcastRSubOp.class, org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp.class, org.nd4j.linalg.api.ops.impl.broadcast.BroadcastTo.class, + org.nd4j.linalg.api.ops.compression.EncodeBitmap.class, + org.nd4j.linalg.api.ops.compression.DecodeBitmap.class, + org.nd4j.linalg.api.ops.compression.EncodeThreshold.class, + org.nd4j.linalg.api.ops.compression.DecodeThreshold.class, org.nd4j.linalg.api.ops.impl.shape.Create.class, org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastEqualTo.class, org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastGreaterThan.class, diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java new file mode 100644 index 000000000..954fc76c1 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java @@ -0,0 +1,55 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.api.ops.compression; + +import lombok.NonNull; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.factory.Nd4j; + +import java.util.Arrays; +import java.util.List; + +/** + * Bitmap decoding op wrapper. Used in gradients sharing. + * @author raver119@gmail.com + */ +public class DecodeBitmap extends DynamicCustomOp { + + public DecodeBitmap() { + // + } + + public DecodeBitmap(@NonNull INDArray encoded, @NonNull INDArray updates) { + addInputArgument(updates, encoded); + addOutputArgument(updates); + + // this op ALWAYS modifies updates array + setInPlace(true); + } + + @Override + public String opName() { + return "decode_bitmap"; + } + + @Override + public List calculateOutputDataTypes(List dataTypes) { + return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32); + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java new file mode 100644 index 000000000..c4eed6f24 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java @@ -0,0 +1,54 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.api.ops.compression; + +import lombok.NonNull; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; + +import java.util.Arrays; +import java.util.List; + +/** + * Sparse threshold decoding op wrapper. Used in gradients sharing. + * @author raver119@gmail.com + */ +public class DecodeThreshold extends DynamicCustomOp { + + public DecodeThreshold() { + // + } + + public DecodeThreshold(@NonNull INDArray encoded, @NonNull INDArray updates) { + addInputArgument(updates, encoded); + addOutputArgument(updates); + + // this op ALWAYS modifies updates array + setInPlace(true); + } + + @Override + public String opName() { + return "decode_threshold"; + } + + @Override + public List calculateOutputDataTypes(List dataTypes) { + return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32); + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java new file mode 100644 index 000000000..683237c40 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java @@ -0,0 +1,64 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.api.ops.compression; + +import lombok.NonNull; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.factory.Nd4j; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Bitmap encoding op wrapper. Used in gradients sharing. + * @author raver119@gmail.com + */ +public class EncodeBitmap extends DynamicCustomOp { + protected float threshold = 1e-3f; + + public EncodeBitmap() { + // + } + + public EncodeBitmap(@NonNull INDArray updates, float threshold) { + this(updates, Nd4j.create(DataType.INT32, updates.length() / 16 + 5), Nd4j.scalar(DataType.INT32, 0), threshold); + } + + public EncodeBitmap(@NonNull INDArray updates, @NonNull INDArray encoded, @NonNull INDArray counter, float threshold) { + addInputArgument(updates); + addOutputArgument(updates, encoded, counter); + addTArgument(threshold); + + this.threshold = threshold; + + // this op ALWAYS modifies updates array + setInPlace(true); + } + + @Override + public String opName() { + return "encode_bitmap"; + } + + @Override + public List calculateOutputDataTypes(List dataTypes) { + return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32, DataType.INT32); + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java new file mode 100644 index 000000000..621b459e9 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java @@ -0,0 +1,63 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +package org.nd4j.linalg.api.ops.compression; + +import lombok.NonNull; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.factory.Nd4j; + +import java.util.Arrays; +import java.util.List; + +/** + * Sparse threshold encoding op wrapper. Used in gradients sharing. + * @author raver119@gmail.com + */ +public class EncodeThreshold extends DynamicCustomOp { + protected float threshold = 1e-3f; + protected int boundary = Integer.MAX_VALUE; + + public EncodeThreshold() { + // + } + + public EncodeThreshold(@NonNull INDArray updates, float threshold) { + this(updates, threshold, Integer.MAX_VALUE); + } + + public EncodeThreshold(@NonNull INDArray updates, float threshold, @NonNull Integer boundary) { + addInputArgument(updates); + + addTArgument(threshold); + addIArgument(boundary.intValue()); + + this.threshold = threshold; + this.boundary = boundary; + } + + @Override + public String opName() { + return "encode_threshold"; + } + + @Override + public List calculateOutputDataTypes(List dataTypes) { + return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32); + } +} diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java index e65bf4860..9f4217b68 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java @@ -30,6 +30,10 @@ import org.nd4j.linalg.api.ndarray.INDArrayStatistics; import org.nd4j.linalg.api.ops.*; import org.nd4j.linalg.api.ops.aggregates.Aggregate; import org.nd4j.linalg.api.ops.aggregates.Batch; +import org.nd4j.linalg.api.ops.compression.DecodeBitmap; +import org.nd4j.linalg.api.ops.compression.DecodeThreshold; +import org.nd4j.linalg.api.ops.compression.EncodeBitmap; +import org.nd4j.linalg.api.ops.compression.EncodeThreshold; import org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate; import org.nd4j.linalg.api.ops.impl.summarystats.Variance; import org.nd4j.linalg.api.rng.Random; @@ -685,38 +689,41 @@ public abstract class DefaultOpExecutioner implements OpExecutioner { @Override public INDArray thresholdEncode(INDArray input, double threshold) { - throw new UnsupportedOperationException("Not yet implemented"); + return thresholdEncode(input, threshold, Integer.MAX_VALUE); } @Override public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) { - throw new UnsupportedOperationException("Not yet implemented"); + val result = Nd4j.exec(new EncodeThreshold(input, (float) threshold, boundary))[1]; + + return result.getInt(0) > 0 ? result : null; } @Override public INDArray thresholdDecode(INDArray encoded, INDArray target) { - throw new UnsupportedOperationException("Not yet implemented"); + Nd4j.exec(new DecodeThreshold(encoded, target)); + return target; } @Override public long bitmapEncode(INDArray indArray, INDArray target, double threshold) { - throw new UnsupportedOperationException("Not yet implemented"); + val results = Nd4j.exec(new EncodeBitmap(indArray, target, Nd4j.scalar(0), (float) threshold)); + + // return number of elements taht were compressed + return results[2].getInt(0); } @Override public INDArray bitmapEncode(INDArray indArray, double threshold) { - DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(indArray.length() / 16 + 5); - - INDArray ret = Nd4j.createArrayFromShapeBuffer(buffer, indArray.shapeInfoDataBuffer()); - - bitmapEncode(indArray, ret, threshold); - - return ret; + val array = Nd4j.create(DataType.INT32, indArray.length() / 16 + 5); + bitmapEncode(indArray, array, threshold); + return array; } @Override public INDArray bitmapDecode(INDArray encoded, INDArray target) { - throw new UnsupportedOperationException("Not yet implemented"); + Nd4j.exec(new DecodeBitmap(encoded, target)); + return target; } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index a82d1b756..c7789d7dc 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1004,20 +1004,6 @@ public interface NativeOps { @Cast("Nd4jLong *") LongPointer tadShapeInfo, @Cast("Nd4jLong *") LongPointer tadOffsets); - - long encodeBitmap(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold); - - void decodeBitmap(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo); - - - void encodeThresholdP1(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold); - - void encodeThresholdP2Int(PointerPointer extraPointers, IntPointer dx, long N, IntPointer dz); - - void encodeThresholdP3(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, IntPointer offsets, long N, IntPointer dz); - - void decodeThreshold(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo); - void sort(PointerPointer extraPointers, Pointer x, @Cast("Nd4jLong *") LongPointer xShapeInfo, Pointer dx, @Cast("Nd4jLong *") LongPointer dxShapeInfo, diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java deleted file mode 100644 index c59bc10ad..000000000 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java +++ /dev/null @@ -1,100 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -package org.nd4j.linalg.jcublas.compression; - -import org.bytedeco.javacpp.IntPointer; -import org.nd4j.linalg.api.buffer.DataBuffer; -import org.nd4j.linalg.api.buffer.DataTypeEx; -import org.nd4j.linalg.api.concurrency.AffinityManager; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.compression.CompressedDataBuffer; -import org.nd4j.linalg.compression.CompressionDescriptor; -import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.indexing.conditions.Conditions; - -/** - * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation. - * It encodes data as delta between zero and abs threshold. - * - * Unlike CudaThreshold codec, CudaFlexibleThreshold tries to target specified sparsity/density updates ratio via topN approach - * - * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO! - * - * @author raver119@gmail.com - */ -public class CudaFlexibleThreshold extends CudaThreshold { - - public CudaFlexibleThreshold() { - super(); - this.threshold = 0.1f; - } - - /** - * This method returns compression descriptor. It should be unique for any compressor implementation - * - * @return - */ - @Override - public String getDescriptor() { - return "FTHRESHOLD"; - } - - /** - * This method allows you to configure desired sparsity/density ratio for updates. Pass it as float/double value - * - * Default value: 0.1 - * @param vars - */ - @Override - public void configure(Object... vars) { - super.configure(vars); - } - - - @Override - public DataBuffer compress(DataBuffer buffer) { - INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new long[]{1, buffer.length()}, buffer.dataType())); - double max = temp.amaxNumber().doubleValue(); - - int cntAbs = temp.scan(Conditions.absGreaterThanOrEqual(max - (max * threshold))).intValue(); - - long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType()); - int compressedLength = cntAbs + 3; - // first 3 elements contain header - IntPointer pointer = new IntPointer(compressedLength); - pointer.put(0, cntAbs); - pointer.put(1, (int) buffer.length()); - pointer.put(2, Float.floatToIntBits(threshold)); // please note, this value will be ovewritten anyway - - CompressionDescriptor descriptor = new CompressionDescriptor(); - descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT) - descriptor.setOriginalLength(originalLength); - descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType())); - descriptor.setNumberOfElements(buffer.length()); - - descriptor.setCompressionAlgorithm(getDescriptor()); - descriptor.setCompressionType(getCompressionType()); - - CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor); - - Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.FTHRESHOLD, pointer, buffer.length()); - - Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST); - - return cbuff; - } -} diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java deleted file mode 100644 index f9cbb1794..000000000 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java +++ /dev/null @@ -1,271 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -package org.nd4j.linalg.jcublas.compression; - -import lombok.Getter; -import lombok.Setter; -import lombok.extern.slf4j.Slf4j; -import lombok.val; -import org.apache.commons.math3.util.FastMath; -import org.bytedeco.javacpp.*; -import org.nd4j.compression.impl.AbstractCompressor; -import org.nd4j.jita.allocator.impl.AtomicAllocator; -import org.nd4j.linalg.api.buffer.DataBuffer; -import org.nd4j.linalg.api.buffer.DataType; -import org.nd4j.linalg.api.buffer.DataTypeEx; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.compression.CompressedDataBuffer; -import org.nd4j.linalg.compression.CompressionType; -import org.nd4j.linalg.exception.ND4JIllegalStateException; -import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.jcublas.context.CudaContext; -import org.nd4j.nativeblas.NativeOpsHolder; - -import java.util.ArrayList; -import java.util.List; - -/** - * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation. - * It encodes data as delta between zero and abs threshold. - * - * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO! - * - * @author raver119@gmail.com - */ -@Slf4j -public class CudaThreshold extends AbstractCompressor { - @Getter @Setter protected float threshold = 1e-3f; - - /** - * This method returns compression descriptor. It should be unique for any compressor implementation - * - * @return - */ - @Override - public String getDescriptor() { - return "THRESHOLD"; - } - - /** - * This method allows you to configure threshold for delta extraction. Pass it as float/double value - * - * Default value: 1e-3 - * @param vars - */ - @Override - public void configure(Object... vars) { - if (vars[0] instanceof Number) { - Number t = (Number) vars[0]; - threshold = FastMath.abs(t.floatValue()); - log.info("Setting threshold to [{}]", threshold); - } else { - throw new ND4JIllegalStateException("Threshold value should be Number"); - } - } - - @Override - public INDArray compress(INDArray array) { - //logger.info("Threshold [{}] compression", threshold); - - Nd4j.getExecutioner().commit(); - //Nd4j.getAffinityManager().ensureLocation(array, AffinityManager.Location.HOST); - - DataBuffer buffer = compress(array.data()); - if (buffer == null) - return null; - - INDArray dup = Nd4j.createArrayFromShapeBuffer(buffer, array.shapeInfoDataBuffer()); - dup.markAsCompressed(true); - - return dup; - } - - @Override - public CompressionType getCompressionType() { - return CompressionType.LOSSLESS; - } - - @Override - public DataBuffer decompress(DataBuffer buffer, DataType type) { - if (buffer.dataType() != DataType.INT) - throw new UnsupportedOperationException(); - - long compressedLength = buffer.getInt(0); - long originalLength = buffer.getInt(1); - - DataBuffer result = Nd4j.createBuffer(type, originalLength, false); - - val context = AtomicAllocator.getInstance().getDeviceContext(); - - PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream()); - - //log.info("DEC Source length: {}", buffer.length()); - //log.info("DEC Source: {}", Arrays.toString(buffer.asInt())); - - //NativeOpsHolder.getInstance().getDeviceNativeOps().decodeThresholdFloat(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, (FloatPointer) AtomicAllocator.getInstance().getPointer(result)); - AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite(); - - //DataBuffer result = Nd4j.getNDArrayFactory().convertDataEx(DataTypeEx.THRESHOLD, buffer, getGlobalTypeEx()); - - return result; - } - - @Override - public DataBuffer compress(DataBuffer buffer) { - - int numThreads = 1024; - int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1)); - - val context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext(); - - DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream()); - - - //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), threshold); - AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite(); - - - int numMatches = blocksBuffer.getInt(0); - - //log.info("Totals: {}", numMatches); -/* - - log.info("Number of blocks for compression: {}", numBlocks); - log.info("BlocksCounts: {}", Arrays.toString(blocksBuffer.asInt())); -*/ - DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(3+numMatches, false) : Nd4j.getDataBufferFactory().createInt(3+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - encodedBuffer.put(0, numMatches); - encodedBuffer.put(1, (int) buffer.length()); - encodedBuffer.put(2, Float.floatToIntBits(threshold)); - AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickHostWrite(); - - // FIXME: make it parallel via some kernel, because it can be pretty big array here, i.e. for 150m original array, offsets can - /* - int prevSum = 0; - for (int e = 0; e < numBlocks; e++) { - int prevVal = offsetsBuffer.getInt(e + 1); - offsetsBuffer.put(e + 1, prevSum); - prevSum += prevVal; - } - */ - - int prefixThreads = 512; - int numElts = numBlocks; - int level = 0; - List buffers = new ArrayList<>(); - - // here we just calculate number of sumBlock arrays - do { - int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads))); - if (numBlocks > 1) { - level++; - } - numElts = numPrefixBlocks; - } while (numElts > 1); - - long[] pointers = new long[level]; - - level = 0; - numElts = numBlocks; - - // allocating temp buffers for prefux sum - DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - do { - int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads))); - if (numPrefixBlocks > 1) { - DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - buffers.add(bf); - - pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address(); - } - numElts = numPrefixBlocks; - } while (numElts > 1); - - - AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0); - - extras.put(2, AtomicAllocator.getInstance().getPointer(tempX)); - - DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - - NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) ); - AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite(); - - //log.info("Offsets: {}", Arrays.toString(offsetsBuffer.asInt())); - //log.info("Target: {}", Arrays.toString(encodedBuffer.asInt())); - - - - //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3Float(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer)); - AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite(); - AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite(); - - //log.info("Encoded: {}", Arrays.toString(encodedBuffer.asInt())); - - extras.address(); - tempX.address(); - - return encodedBuffer; - - /* - INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new int[]{1, (int) buffer.length()})); - MatchCondition condition = new MatchCondition(temp, Conditions.absGreaterThanOrEqual(threshold)); - int cntAbs = Nd4j.getExecutioner().exec(condition, Integer.MAX_VALUE).getInt(0); - - - //log.info("density ratio: {}", String.format("%.2f", cntAbs * 100.0f / buffer.length())); - - if (cntAbs == 0) - return null; - - long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType()); - int compressedLength = cntAbs + 3; - // first 3 elements contain header - IntPointer pointer = new IntPointer(compressedLength); - pointer.put(0, cntAbs); - pointer.put(1, (int) buffer.length()); - pointer.put(2, Float.floatToIntBits(threshold)); - - CompressionDescriptor descriptor = new CompressionDescriptor(); - descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT) - descriptor.setOriginalLength(originalLength); - descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType())); - descriptor.setNumberOfElements(buffer.length()); - - descriptor.setCompressionAlgorithm(getDescriptor()); - descriptor.setCompressionType(getCompressionType()); - - - - CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor); - - Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, pointer, buffer.length()); - - Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST); - - return cbuff; - */ - } - - @Override - protected CompressedDataBuffer compressPointer(DataTypeEx srcType, Pointer srcPointer, int length, int elementSize) { - throw new UnsupportedOperationException(); - } -} diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java index 76430a50e..afca1daa5 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java @@ -31,6 +31,7 @@ import org.nd4j.jita.allocator.tad.DeviceTADManager; import org.nd4j.jita.conf.CudaEnvironment; import org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.concurrency.AffinityManager; import org.nd4j.linalg.api.environment.Nd4jEnvironment; import org.nd4j.linalg.api.memory.pointers.PagedPointer; import org.nd4j.linalg.api.ndarray.INDArray; @@ -1674,224 +1675,6 @@ public class CudaExecutioner extends DefaultOpExecutioner { ctx.syncSpecialStream(); } - @Override - public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) { - DataBuffer buffer = input.data(); - - int numThreads = 1024; - int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1)); - - val context = AtomicAllocator.getInstance().getDeviceContext(); - - DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - - if (extraz.get() == null) - extraz.set(new PointerPointer(32)); - - val extras = extraz.get().put(1, context.getOldStream()); - - ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial(); - - - NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, - AtomicAllocator.getInstance().getPointer(buffer), - (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), - buffer.length(), - (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), - (float) threshold); - - AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite(); - - - int numMatches = blocksBuffer.getInt(0); - - // special case here, nothing to update - if (numMatches < 2) - return null; - - if (boundary != null && numMatches > boundary) { - numMatches = boundary; - blocksBuffer.put(0, numMatches); - } - - DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+numMatches, false) : Nd4j.getDataBufferFactory().createInt(4+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - encodedBuffer.put(0, numMatches); - encodedBuffer.put(1, (int) buffer.length()); - encodedBuffer.put(2, Float.floatToIntBits((float) threshold)); - - encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING); - - ((BaseCudaDataBuffer) encodedBuffer).getOpaqueDataBuffer().syncToSpecial(); - - - int prefixThreads = 512; - int numElts = numBlocks; - int level = 0; - List buffers = new ArrayList<>(); - - // here we just calculate number of sumBlock arrays - do { - int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads))); - if (numBlocks > 1) { - level++; - } - numElts = numPrefixBlocks; - } while (numElts > 1); - - long[] pointers = new long[level]; - - level = 0; - numElts = numBlocks; - - // allocating temp buffers for prefux sum - DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - do { - int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads))); - if (numPrefixBlocks > 1) { - DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - buffers.add(bf); - - pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address(); - } - numElts = numPrefixBlocks; - } while (numElts > 1); - - - AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0); - - extras.put(2, AtomicAllocator.getInstance().getPointer(tempX)); - - DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - - NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) ); - AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite(); - - - NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3(extras, AtomicAllocator.getInstance().getPointer(buffer), (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer)); - - AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite(); - AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite(); - - return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer()); - } - - - @Override - public INDArray thresholdEncode(INDArray input, double threshold) { - return thresholdEncode(input, threshold, null); - } - - @Override - public INDArray thresholdDecode(INDArray encoded, INDArray target) { - DataBuffer buffer = encoded.data(); - - if (buffer.dataType() != DataType.INT) - throw new UnsupportedOperationException(); - - long compressedLength = buffer.getInt(0); - long originalLength = buffer.getInt(1); - - if (target.length() != originalLength) - throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]"); - - DataBuffer result = target.data(); - - val context = AtomicAllocator.getInstance().getDeviceContext(); - - if (extraz.get() == null) - extraz.set(new PointerPointer(32)); - - PointerPointer extras = extraz.get().put(1, context.getOldStream()); - - nativeOps.decodeThreshold(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, AtomicAllocator.getInstance().getPointer(result), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer())); - - if (nativeOps.lastErrorCode() != 0) - throw new RuntimeException(nativeOps.lastErrorMessage()); - - AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite(); - - return target; - } - - - @Override - public long bitmapEncode(INDArray indArray, INDArray target, double threshold) { - long length = indArray.length(); - long tLen = target.data().length(); - - if (tLen != (length / 16 + 5)) - throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5)); - - if (target.data().dataType() != DataType.INT) - throw new ND4JIllegalStateException("Target array should have INT dataType"); - - DataBuffer buffer = target.data(); - buffer.put(0, (int) length); - buffer.put(1, (int) length); - buffer.put(2, Float.floatToIntBits((float) threshold)); - - // format id - buffer.put(3, ThresholdCompression.BITMAP_ENCODING); - - val context = AtomicAllocator.getInstance().getDeviceContext(); - - if (extraz.get() == null) - extraz.set(new PointerPointer(32)); - - - PointerPointer extras = extraz.get().put( - AtomicAllocator.getInstance().getHostPointer(indArray), - context.getOldStream(), - context.getBufferScalar(), - context.getBufferReduction() - ); - - - val src = AtomicAllocator.getInstance().getPointer(indArray, context); - val dst = (IntPointer) AtomicAllocator.getInstance().getPointer(buffer, context); - ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial(); - - long val = nativeOps.encodeBitmap(extras, - src, (LongPointer) AtomicAllocator.getInstance().getHostPointer(indArray.shapeInfoDataBuffer()), - length, - dst, - (float) threshold); - - if (nativeOps.lastErrorCode() != 0) - throw new RuntimeException(nativeOps.lastErrorMessage()); - - AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite(); - - return val; - } - - @Override - public INDArray bitmapDecode(INDArray encoded, INDArray target) { - - val context = AtomicAllocator.getInstance().getDeviceContext(); - - if (extraz.get() == null) - extraz.set(new PointerPointer(32)); - - - PointerPointer extras = extraz.get().put( - AtomicAllocator.getInstance().getHostPointer(target), - context.getOldStream(), - context.getBufferScalar(), - context.getBufferReduction()); - - nativeOps.decodeBitmap(extras, AtomicAllocator.getInstance().getPointer(encoded.data(), context), target.length(), AtomicAllocator.getInstance().getPointer(target, context), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer())); - - if (nativeOps.lastErrorCode() != 0) - throw new RuntimeException(nativeOps.lastErrorMessage()); - - return target; - } - - @Override public synchronized Map getCustomOperations() { if(customOps == null) { @@ -1974,6 +1757,11 @@ public class CudaExecutioner extends DefaultOpExecutioner { val inputArgs = opContext != null ? opContext.getInputArrays() : op.inputArguments(); int cnt= 0; for (val in: inputArgs) { + // TODO: once we implement Context-based shape function call this method should be removed + val loc = Nd4j.getAffinityManager().getActiveLocation(in); + if (loc != AffinityManager.Location.DEVICE && loc != AffinityManager.Location.EVERYWHERE) + Nd4j.getAffinityManager().ensureLocation(in, AffinityManager.Location.DEVICE); + // NOT A TYPO: shape functions work on host side only if (!in.isEmpty()) { inputBuffers.put(cnt, in.data().addressPointer()); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index 1bf3ec2d6..f86b3ac08 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -2804,30 +2804,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong*") long[] tadShapeInfo, @Cast("Nd4jLong*") long[] tadOffsets); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo); - - -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold); -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold); -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz); - - -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo); -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo); -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo); - - public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo, Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo, @@ -10712,6 +10688,7 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include // #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor index 5c829e57a..72e314156 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor @@ -13,5 +13,3 @@ # # SPDX-License-Identifier: Apache-2.0 ################################################################################ - -org.nd4j.linalg.jcublas.compression.CudaThreshold \ No newline at end of file diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java index 96def59d7..508144f26 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java @@ -1376,142 +1376,6 @@ public class NativeOpExecutioner extends DefaultOpExecutioner { } } - @Override - public INDArray thresholdEncode(INDArray input, double threshold) { - return thresholdEncode(input, threshold, null); - } - - @Override - public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) { - - //val condition = new MatchCondition(input, Conditions.absGreaterThanOrEqual(threshold)); - //long t1 = System.currentTimeMillis(); - int cntAbs = loop.estimateThreshold(null, - input.data().addressPointer(), - (LongPointer) input.shapeInfoDataBuffer().addressPointer(), - (int) input.length(), - (float) threshold); - //long t2 = System.currentTimeMillis(); - - if (loop.lastErrorCode() != 0) - throw new RuntimeException(loop.lastErrorMessage()); - - if (cntAbs < 2) - return null; - - if (boundary != null) - cntAbs = Math.min(cntAbs, boundary); - - //log.info("S: {}; T: {}", cntAbs, t2 - t1); - - DataBuffer buffer = input.data(); - - long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType()); - int compressedLength = cntAbs + 4; - // first 3 elements contain header - - DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+cntAbs, false) : Nd4j.getDataBufferFactory().createInt(4+cntAbs, false, Nd4j.getMemoryManager().getCurrentWorkspace()); - - encodedBuffer.put(0, cntAbs); - encodedBuffer.put(1, (int) buffer.length()); - encodedBuffer.put(2, Float.floatToIntBits((float) threshold)); - - // format id - encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING); - - CompressionDescriptor descriptor = new CompressionDescriptor(); - descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT) - descriptor.setOriginalLength(originalLength); - descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType())); - descriptor.setNumberOfElements(buffer.length()); - - descriptor.setCompressionAlgorithm("THRESHOLD"); - descriptor.setCompressionType(CompressionType.LOSSLESS); - - //CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor); - - Nd4j.getNDArrayFactory().convertDataEx(AbstractCompressor.getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, encodedBuffer.addressPointer(), buffer.length()); - - Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST); - - return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer()); - } - - @Override - public INDArray thresholdDecode(INDArray encoded, INDArray target) { - DataBuffer buffer = encoded.data(); - - if (buffer.dataType() != DataType.INT) - throw new ND4JIllegalStateException("thresholdEncoded array should have dataType of INT"); - - long compressedLength = buffer.getInt(0); - long originalLength = buffer.getInt(1); - float threshold = buffer.getInt(2); - - if (target.length() != originalLength) - throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]"); - - DataTypeEx typeDst = AbstractCompressor.getBufferTypeEx(target.data()); - - loop.convertTypes(null, DataTypeEx.THRESHOLD.ordinal(), buffer.addressPointer(), target.length(), typeDst.ordinal(), target.data().addressPointer()); - - if (loop.lastErrorCode() != 0) - throw new RuntimeException(loop.lastErrorMessage()); - - return target; - } - - - @Override - public long bitmapEncode(INDArray indArray, INDArray target, double threshold) { - long length = indArray.length(); - long tLen = target.data().length(); - - if (tLen != (length / 16 + 5)) - throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5)); - - if (target.data().dataType() != DataType.INT) - throw new ND4JIllegalStateException("Target array should have INT dataType"); - - DataBuffer buffer = target.data(); - - buffer.put(0, (int) length); - buffer.put(1, (int) length); - buffer.put(2, Float.floatToIntBits((float) threshold)); - - // format id - buffer.put(3, ThresholdCompression.BITMAP_ENCODING); - - long affected = loop.encodeBitmap(null, - indArray.data().addressPointer(), - (LongPointer) indArray.shapeInfoDataBuffer().addressPointer(), - length, - (IntPointer) buffer.addressPointer(), - (float) threshold); - - if (loop.lastErrorCode() != 0) - throw new RuntimeException(loop.lastErrorMessage()); - - return affected; - } - - @Override - public INDArray bitmapDecode(INDArray encoded, INDArray target) { - - loop.decodeBitmap(null, - encoded.data().addressPointer(), - target.length(), - target.data().addressPointer(), - (LongPointer) target.shapeInfoDataBuffer().addressPointer() - ); - - if (loop.lastErrorCode() != 0) - throw new RuntimeException(loop.lastErrorMessage()); - - return target; - } - - @Override public synchronized Map getCustomOperations() { if (customOps == null) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index fd9ffc7c1..875fdfb3a 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -2808,30 +2808,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jLong*") long[] tadShapeInfo, @Cast("Nd4jLong*") long[] tadOffsets); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold); -public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo); -public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo); - - -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold); -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold); -public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz); -public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz); -public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz); - - -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo); -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo); -public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo); - - public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo, Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo, @@ -12463,6 +12439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include // #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java index d98c1b3bc..77df067ca 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java @@ -8315,6 +8315,85 @@ public class Nd4jTestsC extends BaseNd4jTest { assertArrayEquals(new long[]{bS, oH, oW, oC}, ret[0].shape()); } + @Test + public void testMatmulMethod_8() { + val x = Nd4j.create(DataType.INT8, 3, 5).assign(1); + val y = Nd4j.create(DataType.INT8, 5, 3).assign(1); + val e = Nd4j.create(DataType.INT8, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_7() { + val x = Nd4j.create(DataType.INT16, 3, 5).assign(1); + val y = Nd4j.create(DataType.INT16, 5, 3).assign(1); + val e = Nd4j.create(DataType.INT16, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_1() { + val x = Nd4j.create(DataType.INT32, 3, 5).assign(1); + val y = Nd4j.create(DataType.INT32, 5, 3).assign(1); + val e = Nd4j.create(DataType.INT32, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_2() { + val x = Nd4j.create(DataType.INT64, 3, 5).assign(1); + val y = Nd4j.create(DataType.INT64, 5, 3).assign(1); + val e = Nd4j.create(DataType.INT64, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_6() { + val x = Nd4j.create(DataType.UINT8, 3, 5).assign(1); + val y = Nd4j.create(DataType.UINT8, 5, 3).assign(1); + val e = Nd4j.create(DataType.UINT8, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_5() { + val x = Nd4j.create(DataType.UINT16, 3, 5).assign(1); + val y = Nd4j.create(DataType.UINT16, 5, 3).assign(1); + val e = Nd4j.create(DataType.UINT16, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_3() { + val x = Nd4j.create(DataType.UINT32, 3, 5).assign(1); + val y = Nd4j.create(DataType.UINT32, 5, 3).assign(1); + val e = Nd4j.create(DataType.UINT32, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } + + @Test + public void testMatmulMethod_4() { + val x = Nd4j.create(DataType.UINT64, 3, 5).assign(1); + val y = Nd4j.create(DataType.UINT64, 5, 3).assign(1); + val e = Nd4j.create(DataType.UINT64, 3, 3).assign(5); + + val z = x.mmul(y); + assertEquals(e, z); + } @Override public char ordering() { diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java index 3967a2c95..5742ab53f 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java @@ -17,6 +17,7 @@ package org.nd4j.linalg.compression; import lombok.extern.slf4j.Slf4j; +import lombok.val; import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; @@ -44,7 +45,6 @@ import static org.junit.Assert.*; /** * @author raver119@gmail.com */ -@Ignore @Slf4j @RunWith(Parameterized.class) public class CompressionTests extends BaseNd4jTest { @@ -140,40 +140,6 @@ public class CompressionTests extends BaseNd4jTest { } - @Test - public void testThresholdCompressionZ() { - INDArray initial = Nd4j.create(1, 16384); - for (int i = 0; i < 96; i++) - initial.putScalar(i * 20, 1.0f); - - - INDArray exp = Nd4j.create(1, 16384); - for (int i = 0; i < 96; i++) - exp.putScalar(i * 20, 0.1f); - - INDArray exp_d = Nd4j.create(1, 16384); - for (int i = 0; i < 96; i++) - exp_d.putScalar(i * 20, 0.9f); - - NDArrayCompressor compressor = Nd4j.getCompressor().getCompressor("THRESHOLD"); - compressor.configure(0.9); - - INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 0.9); - - assertEquals(exp, initial); - - log.info("Compressed length: {}", compressed.data().length()); - // log.info("Compressed: {}", Arrays.toString(compressed.data().asInt())); - - INDArray decompressed = Nd4j.create(1, initial.length()); - Nd4j.getExecutioner().thresholdDecode(compressed, decompressed); - - log.info("Decompressed length: {}", decompressed.length()); - - assertEquals(exp_d, decompressed); - } - - @Ignore @Test public void testThresholdCompression0() { @@ -296,6 +262,23 @@ public class CompressionTests extends BaseNd4jTest { @Test public void testThresholdCompression5() { + INDArray initial = Nd4j.ones(10); + INDArray exp_0 = initial.dup(); + + Nd4j.getExecutioner().commit(); + + //Nd4j.getCompressor().getCompressor("THRESHOLD").configure(1e-3); + INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 1.0f, 3); + + assertEquals(7, compressed.data().length()); + + assertNotEquals(exp_0, initial); + + assertEquals(7, initial.sumNumber().doubleValue(), 0.01); + } + + @Test + public void testThresholdCompression5_1() { INDArray initial = Nd4j.ones(1000); INDArray exp_0 = initial.dup(); @@ -435,8 +418,8 @@ public class CompressionTests extends BaseNd4jTest { INDArray exp_0 = Nd4j.create(new float[] {0.0f, -1e-4f, 0.0f, 0.0f, 0.0f, 0.0f}); INDArray exp_1 = Nd4j.create(new float[] {0.0f, -5e-4f, 1e-3f, -1e-3f, 0.0f, 0.0f}); - DataBuffer ib = Nd4j.getDataBufferFactory().createInt(5); - INDArray enc = Nd4j.createArrayFromShapeBuffer(ib, initial.shapeInfoDataBuffer()); + + INDArray enc = Nd4j.create(DataType.INT32, initial.length() / 16 + 5); long elements = Nd4j.getExecutioner().bitmapEncode(initial, enc, 1e-3); log.info("Encoded: {}", Arrays.toString(enc.data().asInt())); @@ -471,7 +454,7 @@ public class CompressionTests extends BaseNd4jTest { @Test public void testBitmapEncoding5() { Nd4j.getRandom().setSeed(119); - INDArray initial = Nd4j.rand(new int[]{1, 10000}, -1, -0.5, Nd4j.getRandom()); + INDArray initial = Nd4j.rand(new int[]{10000}, -1, -0.5, Nd4j.getRandom()); INDArray exp_0 = initial.dup().addi(1e-1); INDArray exp_1 = initial.dup(); @@ -486,7 +469,7 @@ public class CompressionTests extends BaseNd4jTest { @Test public void testBitmapEncoding6() { Nd4j.getRandom().setSeed(119); - INDArray initial = Nd4j.rand(new int[]{1, 100000}, -1, 1, Nd4j.getRandom()); + INDArray initial = Nd4j.rand(new int[]{10000}, -1, 1, Nd4j.getRandom()); INDArray exp_1 = initial.dup(); INDArray enc = Nd4j.getExecutioner().bitmapEncode(initial, 1e-3); @@ -494,6 +477,11 @@ public class CompressionTests extends BaseNd4jTest { Nd4j.getExecutioner().bitmapDecode(enc, initial); + val f0 = exp_1.toFloatVector(); + val f1 = initial.toFloatVector(); + + assertArrayEquals(f0, f1, 1e-5f); + assertEquals(exp_1, initial); }