compression ops (#436)

* Added declarations for decode/encode_bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added implementation for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added helpers for encode/decode bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored encodingBitmap helper. Signed-off-by: shugeo <sgazeos@gmail.com> * threshold encode/decode skeleton * helper skeleton * minor import fix * encoder shape fn & op impl * thresholdEncode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thresholdDecode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Only cosmetical changes. Signed-off-by: shugeo <sgazeos@gmail.com> * placeholder Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added cuda implementation for bitmap decode helper. Signed-off-by: shugeo <sgazeos@gmail.com> * cuda thresholdEstimate Signed-off-by: raver119@gmail.com <raver119@gmail.com> * cuda thresholdDecode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - nano cmakelist update (get rid of Clion section) - fixed forgotten throw in AtomicTests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thesholdEncode cuda impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added tests for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed tests for encode/decode bitmaps. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored decode/encode helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed crashes with bitmap decode/encode helpers. 
Signed-off-by: shugeo <sgazeos@gmail.com> * bitmap encode/decode CPU Signed-off-by: raver119@gmail.com <raver119@gmail.com> * bitmap encode/decode CUDA Signed-off-by: raver119@gmail.com <raver119@gmail.com> * C API removed for threshold/bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeBitmap/DecodeBitmap Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * few more tests for threshold encoding Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor test tweak Signed-off-by: raver119@gmail.com <raver119@gmail.com> * two special tests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeBitmap CPU fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * parallel_long/parallel_double proper spans fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeThreshold CUDA fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * nano fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * grid tweaks Signed-off-by: raver119@gmail.com <raver119@gmail.com> * RTX adaptation for thresholdEncode Signed-off-by: raver119 <raver119@gmail.com> * don't allow threshold encoding for length < 2 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * get rid of NDArrayCompressor in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor update of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor tweak of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - matmul allows integer data types use - EncodingHandler boundary default value - few tests for integer matmul Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor fix of CUDA bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary 
changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * re-enable CUDA deallocator Signed-off-by: raver119@gmail.com <raver119@gmail.com> * threshold encoder fix for systems without omp Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - encode_threshold now requires non-negative boundary - minor tweak in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * restore parallelism in decode_bitmap Signed-off-by: raver119@gmail.com <raver119@gmail.com> * fall back to omp for encode_bitmap cpu Signed-off-by: raver119@gmail.com <raver119@gmail.com> * single time casts Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - additional test for encode_threshold - sync buffers to device before calling for shape function Signed-off-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: shugeo <sgazeos@gmail.com>
2020-05-08 20:59:39 +03:00 · 2020-05-08 20:59:39 +03:00 · 0613485654
commit 0613485654
parent f1232f8221
47 changed files with 1617 additions and 1387 deletions
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java
@ -23,9 +23,13 @@ import org.deeplearning4j.optimize.solvers.accumulation.EncodedGradientsAccumula
 import org.deeplearning4j.optimize.solvers.accumulation.EncodingHandler;
 import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.FixedThresholdAlgorithm;
 import org.junit.Test;
 import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.util.PrintAffinity;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.nativeblas.OpaqueDataBuffer;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 /**
@ -93,12 +97,13 @@ public class EncodedGradientsAccumulatorTest extends BaseDL4JTest {
        }
-        EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, null, false);
+        EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, Integer.MAX_VALUE, false);
        for (int e = 10; e < numParams / 5; e++) {
-            INDArray encoded = handler.encodeUpdates(0, 0, getGradients(numParams, e, 2e-3));
+            val gradients = getGradients(numParams, e, 2e-3);
            val encoded = handler.encodeUpdates(0, 0, gradients);
-            //  log.info("enc len: {}", encoded.data().length());
+            assertNotNull("Failed with e == " + e, encoded);
            int encFormat = encoded.data().getInt(3);
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java
@ -69,7 +69,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
    protected ThreadLocal<Integer> index = new ThreadLocal<>();
    protected long initialMemory = 100 * 1024 * 1024L;
    protected int queueSize = 5;
-    protected Double boundary = 1.0;
+    protected Integer boundary = Integer.MAX_VALUE;
    protected boolean encodingDebugMode;
    protected IndexedTail externalSource;
@ -101,11 +101,11 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
    }
    public EncodedGradientsAccumulator(int parties, ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, boolean encodingDebugMode) {
-        this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, 1.0, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, 1.0, encodingDebugMode);
+        this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, Integer.MAX_VALUE, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, Integer.MAX_VALUE, encodingDebugMode);
    }
    public EncodedGradientsAccumulator(int parties, @NonNull MessageHandler handler, long initialMemory,
-                    int queueSize, Double boundary, boolean encodingDebugMode) {
+                    int queueSize, Integer boundary, boolean encodingDebugMode) {
        this.parties = parties;
        this.handler = handler;
        this.initialMemory = initialMemory;
@ -551,7 +551,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
        protected long initialMemory = DEFAULT_INITIAL_MEMORY;
        protected int queueSize = 5;
        protected MessageHandler handler;
-        protected Double boundary = null;
+        protected int boundary = Integer.MAX_VALUE;
        protected boolean encodingDebugMode;
        /**
@ -598,15 +598,12 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
        /**
         * This method enables optional limit for max number of updates per message
         *
-         * Default value: 1.0 (no limit)
+         * Default value: Integer.MAX_VALUE (no limit)
         * @param boundary positive maximum number of updates per message
         * @return this builder
         */
-        public Builder updatesBoundary(double boundary) {
+        public Builder updatesBoundary(int boundary) {
-            if (boundary >= 1.0)
+            if (boundary <= 0)
                return this;
            if (boundary <= 0.0)
                throw new DL4JInvalidConfigException("Boundary should have positive value");
            this.boundary = boundary;
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java
@ -16,6 +16,7 @@
 package org.deeplearning4j.optimize.solvers.accumulation;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.shade.guava.util.concurrent.AtomicDouble;
 import lombok.NonNull;
 import lombok.extern.slf4j.Slf4j;
@ -24,7 +25,6 @@ import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgori
 import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgorithmReducer;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.compression.NDArrayCompressor;
 import org.nd4j.linalg.exception.ND4JIllegalStateException;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.ops.transforms.Transforms;
@ -54,9 +54,8 @@ public class EncodingHandler implements MessageHandler {
    protected ThresholdAlgorithm initialThresholdAlgorithm;
    protected ResidualPostProcessor initialResidualPostProcessor;
-    protected Double boundary;
+    protected Integer boundary;
    protected boolean encodingDebugMode;
    protected NDArrayCompressor compressor;
    protected AtomicInteger atomicBoundary = new AtomicInteger(-1);
    protected ThreadLocal<ThresholdAlgorithm> thresholdAlgorithm = new ThreadLocal<>();
@ -73,20 +72,16 @@ public class EncodingHandler implements MessageHandler {
    protected final AtomicLong lastThresholdLogTime = new AtomicLong();
    public EncodingHandler(final ThresholdAlgorithm thresholdAlgorithm, final ResidualPostProcessor residualPostProcessor,
-                           Double boundary, boolean encodingDebugMode){
+                           Integer boundary, boolean encodingDebugMode){
        this.initialThresholdAlgorithm = thresholdAlgorithm;
        this.initialResidualPostProcessor = residualPostProcessor;
-        this.boundary = boundary;
+        this.boundary = boundary == null ? Integer.MAX_VALUE : boundary;
        this.encodingDebugMode = encodingDebugMode;
    }
    /**
     * Stores the given accumulator on this handler and looks up the compressor named "THRESHOLD".
     *
     * @param accumulator accumulator to keep a reference to; annotated {@code @NonNull}
     * @throws ND4JIllegalStateException if the "THRESHOLD" compressor lookup returns null
     */
    @Override
    public void initialize(@NonNull GradientsAccumulator accumulator) {
        this.accumulator = accumulator;
        compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
        if (compressor == null)
            throw new ND4JIllegalStateException("Can't find Threshold compressor implementation!");
    }
    public INDArray encodeUpdates(int iteration, int epoch, INDArray updates) {
@ -135,14 +130,13 @@ public class EncodingHandler implements MessageHandler {
        iterations.get().incrementAndGet();
        if (boundary != null && atomicBoundary.get() < 0)
-            atomicBoundary.compareAndSet(-1, (int) (updates.length() * boundary));
+            atomicBoundary.compareAndSet(-1, (int) (updates.length() / 16) );
        INDArray encoded;
        if (!bitmapMode.get().get()) {
            //Sparse updates
-            encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(),
+            encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), boundary == null ? null : atomicBoundary.get());
                    boundary == null ? null : atomicBoundary.get());
            // updates were TOO sparse, nothing to share here
            if (encoded == null) {
@ -157,17 +151,14 @@ public class EncodingHandler implements MessageHandler {
            }
-            double encLen = encoded.data().getInt(0);
+            double encLen = encoded.length();
            // if updates are too dense - we fallback to bitmap encoding
            if (encLen >= (updates.length() / 16)) {
                log.debug("Switching back to bitmapEncoding: iteration {}, epoch {}, threshold {}, encoded length {}", iteration, epoch, currThreshold, encLen);
                bitmapMode.get().set(true);
-                DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
+                encoded = Nd4j.getExecutioner().bitmapEncode(updates, currentThreshold.get().get());
                encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
                Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
                applyPostProcessor(iteration, epoch, currThreshold, updates);
                lastSparsityRatio.set(null);
@ -186,8 +177,7 @@ public class EncodingHandler implements MessageHandler {
            }
        } else {
            //Dense bitmap updates
-            DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
+            encoded = Nd4j.create(DataType.INT32, updates.length() / 16 + 5);
            encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
            long values = Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
--- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
+++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
@ -910,7 +910,7 @@ public class ParallelWrapper implements AutoCloseable {
                    Preconditions.checkState(thresholdAlgorithm != null, "Cannot use SHARED_GRADIENTS training mode without setting a threshold algorithm");
                    this.trainerContext = new SymmetricTrainerContext();
                    if (this.accumulator == null) {
-                        log.info("Creating new GradientsAccumulator instance with threshold of [5e-4");
+                        log.info("Creating new GradientsAccumulator instance with default threshold of [5e-4]");
                        this.accumulator = new EncodedGradientsAccumulator(workers, thresholdAlgorithm, residualPostProcessor,  false);
                    }
                }
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java
@ -45,7 +45,7 @@ public class WiredEncodingHandler extends EncodingHandler {
     * @param thresholdAlgorithm threshold algorithm to use
     * @param boundary boundary value passed through unchanged to the EncodingHandler constructor
     */
-    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
+    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
        super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
    }
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java
@ -44,7 +44,7 @@ public class WiredEncodingHandler extends EncodingHandler {
     *
     * @param thresholdAlgorithm The threshold algorithm to use
     */
-    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
+    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
        super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
    }
--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@ -49,12 +49,12 @@ elseif(WIN32)
        set(CMAKE_CXX_FLAGS_RELEASE  "-D_RELEASE=true")
        set(CMAKE_CXX_FLAGS_DEBUG  "  /FS /EHsc")
    else()
-        set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
+        set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -D_RELEASE=true")
-        set(CMAKE_CXX_FLAGS_DEBUG  " -g -O2 -fPIC -fmax-errors=2")
+        set(CMAKE_CXX_FLAGS_DEBUG  " -g -O2 -fPIC")
    endif()
 else()
-    set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
+    set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -D_RELEASE=true")
-    set(CMAKE_CXX_FLAGS_DEBUG  " -g -O0 -fPIC -fmax-errors=2")
+    set(CMAKE_CXX_FLAGS_DEBUG  " -g -O0 -fPIC")
    if (SD_CPU)
        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
@ -221,21 +221,16 @@ include_directories(${FLATBUFFERS_PATH}/include)
 configure_file(include/config.h.in include/config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 if (NOT DEFINED ENV{CLION_IDE})
    message("NOT CLION")
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
    add_subdirectory(blas)
    if(SD_BUILD_TESTS)
        # tests are always compiled with all ops included
        set(SD_ALL_OPS true)
        set(SD_BUILD_MINIFIER true)
        add_subdirectory(tests_cpu)
    endif()
 endif ()
-if ($ENV{CLION_IDE})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 add_subdirectory(blas)
 if(SD_BUILD_TESTS)
    # tests are always compiled with all ops included
    set(SD_ALL_OPS true)
    set(SD_BUILD_MINIFIER true)
    add_subdirectory(tests_cpu)
-endif ()
+endif()
 if (MSVC_DEV)
    set(SD_BUILD_MINIFIER false)
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@ -120,8 +120,13 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
    set( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    # using GCC
-    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
+    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -fmax-errors=2 ")
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,$ORIGIN/")
    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
    endif()
 endif()
@ -361,11 +366,6 @@ elseif(SD_CPU)
        endif()
    endif()
    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
    endif()
    install(TARGETS ${SD_LIBRARY_NAME} DESTINATION  .)
    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
 endif()
--- a/libnd4j/include/execution/impl/Threads.cpp
+++ b/libnd4j/include/execution/impl/Threads.cpp
@ -571,7 +571,7 @@ namespace samediff {
        // create temporary array
        int64_t intermediatery[256];
-        auto span = delta / numThreads;
+        auto span = (numElements / numThreads) - (numElements % numThreads);
        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
@ -615,7 +615,7 @@ namespace samediff {
        // create temporary array
        double intermediatery[256];
-        auto span = delta / numThreads;
+        auto span = (numElements / numThreads) - (numElements % numThreads);
        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
--- a/libnd4j/include/legacy/NativeOps.h
+++ b/libnd4j/include/legacy/NativeOps.h
@ -1432,18 +1432,6 @@ ND4J_EXPORT void tear(Nd4jPointer *extraPointers,
                        Nd4jLong *tadShapeInfo,
                        Nd4jLong *tadOffsets);
 ND4J_EXPORT Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
 ND4J_EXPORT void decodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
 ND4J_EXPORT void encodeThresholdP1(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
 ND4J_EXPORT void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz);
 ND4J_EXPORT void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, int *offsets, Nd4jLong N, int *dz);
 ND4J_EXPORT void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
 ND4J_EXPORT void sort(Nd4jPointer *extraPointers,
        void *x, Nd4jLong *xShapeInfo,
        void *dx, Nd4jLong *dxShapeInfo,
--- a/libnd4j/include/legacy/cpu/NativeOps.cpp
+++ b/libnd4j/include/legacy/cpu/NativeOps.cpp
@ -1436,28 +1436,6 @@ void enableP2P(bool enable) {
    // no-op
 }
 // CPU backend stub for the encodeThresholdP1 native call.
 // The body is empty, so every argument is ignored and dz is left untouched.
 void encodeThresholdP1(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
    // TODO: to be implemented
 }
 // CPU backend stub for the encodeThresholdP2Int native call.
 // The body is empty, so every argument is ignored and dz is left untouched.
 void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *hX, Nd4jLong N, int *dz) {
    // TODO: to be implemented
 }
 // CPU backend stub for the encodeThresholdP3 native call.
 // The body is empty, so every argument (including offsets) is ignored and dz is left untouched.
 void encodeThresholdP3(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
    // offsets won't be used here
    // TODO: to be implemented
 }
 // CPU backend stub for the decodeThreshold native call.
 // The body is empty, so every argument is ignored and dz is left untouched.
 void decodeThreshold(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo){
    // TODO: to be implemented
 }
 bool isP2PAvailable() {
    // always TRUE for cpu backend
    return true;
@ -1467,10 +1445,6 @@ void checkP2P() {
    // no-op
 }
 // Thin wrapper that forwards hX, N, dz and hZShapeInfo to NativeOpExecutioner::decodeBitmap.
 // extraPointers is accepted but not used here.
 void decodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo) {
    NativeOpExecutioner::decodeBitmap(hX, N, dz, hZShapeInfo);
 }
 template<typename T>
 void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZShapeInfo, int N, int *shuffleMap, Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) {
@ -1859,12 +1833,6 @@ void sortCooIndices(Nd4jPointer *extraPointers,
    }
 }
 // Thin wrapper that forwards hX, hXShapeInfo, N, dz and threshold to NativeOpExecutioner::encodeBitmap
 // and returns that call's result unchanged. extraPointers is accepted but not used here.
 Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
    return NativeOpExecutioner::encodeBitmap(hX, hXShapeInfo, N, dz, threshold);
 }
 Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
    auto hZ = new Nd4jLong[2];errno = 0;
 try {
--- a/libnd4j/include/legacy/cuda/NativeOps.cu
+++ b/libnd4j/include/legacy/cuda/NativeOps.cu
@ -2197,76 +2197,6 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement
    sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
 }
 // Host-side entry point that invokes encoderKernelP1Generic on the CUDA stream read from extras[1].
 // The data type is taken from hXShapeInfo; BUILD_SINGLE_SELECTOR presumably dispatches on it.
 // A std::exception raised inside is caught and recorded on the default LaunchContext's error
 // reference (error code 1 plus the exception message) instead of being propagated.
 void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extras[1]);
        int blockSize = 1024;
        // integer ceiling of N / blockSize: one extra block when N is not a multiple of blockSize
        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
        dim3 launchDims(numBlocks, blockSize, 1024);
        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
        BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed");
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
    }
 }
 // Host-side entry point that calls prescanArrayRecursive with dz, dx + 1 (the first int of dx is
 // skipped), N narrowed to int, and level 0. The CUDA stream from extraPointers[1] is used only for
 // the error check afterwards.
 // A std::exception raised inside is caught and recorded on the default LaunchContext's error
 // reference (error code 1 plus the exception message) instead of being propagated.
 void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz) {
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
        //encoderKernelP2Float<<<numBlocks, blockSize , 1024 * sizeof(float), *stream>>>(dx, N, dz);
        prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0);
        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
    }
 }
 // Host-side entry point that invokes encoderKernelP3Generic on the CUDA stream read from
 // extraPointers[1], passing dx, offsets, N and dz through.
 // The data type is taken from hXShapeInfo; BUILD_SINGLE_SELECTOR presumably dispatches on it.
 // A std::exception raised inside is caught and recorded on the default LaunchContext's error
 // reference (error code 1 plus the exception message) instead of being propagated.
 void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
        int blockSize = 1024;
        // integer ceiling of N / blockSize: one extra block when N is not a multiple of blockSize
        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
        dim3 launchDims(numBlocks, blockSize, 4096);
        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
        BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
    }
 }
 // Host-side entry point that invokes decoderKernelGeneric on the CUDA stream read from
 // extraPointers[1], passing dx, N and dz through.
 // The data type is taken from zShapeInfo (the output side); BUILD_SINGLE_SELECTOR presumably
 // dispatches on it.
 // A std::exception raised inside is caught and recorded on the default LaunchContext's error
 // reference (error code 1 plus the exception message) instead of being propagated.
 void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo){
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
        // we probably want to have smaller blocks here, memory writes are misaligned anyway
        int blockSize = 128;
        // integer ceiling of N / blockSize: one extra block when N is not a multiple of blockSize
        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
        dim3 launchDims(numBlocks, blockSize, 1024);
        auto zType = sd::ArrayOptions::dataType(zShapeInfo);
        BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed");
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
    }
 }
 ////////////////////////////////////////////////////////////////////////
 void execReduce3All(Nd4jPointer *extraPointers,
 									int opNum,
@ -2603,55 +2533,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, Nd4jLong *indices, void *values,
 	throw std::runtime_error("sortCooIndices:: Not implemented yet");
 }
 Nd4jLong encodeBitmap(Nd4jPointer *extraPointers,
 								void *dx, Nd4jLong *hXShapeInfo,
 								Nd4jLong N,
 								int *dz,
 								float threshold) {
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
        int *resultPointer = reinterpret_cast<int *>(extraPointers[2]);
        int *reductionPointer = reinterpret_cast<int *>(extraPointers[3]);
        dim3 launchDims(512, 512, 32768);
        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
        BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
                              (launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold),
                              LIBND4J_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
        Nd4jLong dZ = (Nd4jLong) resultPointer[0];
        resultPointer[0] = 0;
        return dZ;
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
        return 0;
    }
 }
 void decodeBitmap(Nd4jPointer *extraPointers,
 							void *dx,
 							Nd4jLong N,
 							void *dz, Nd4jLong *zShapeInfo) {
    try {
        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
        dim3 launchDims(512, 512, 16384);
        auto xType = sd::ArrayOptions::dataType(zShapeInfo);
        BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
    } catch (std::exception &e) {
        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
    }
 }
 Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
 	return nullptr;
 }
--- a/libnd4j/include/legacy/impl/Environment.cpp
+++ b/libnd4j/include/legacy/impl/Environment.cpp
@ -207,7 +207,7 @@ namespace sd {
    }
    void Environment::setMaxSpecialyMemory(uint64_t maxBytes) {
-        _maxTotalSpecialMemory;
+        _maxTotalSpecialMemory = maxBytes;
    }
    void Environment::setMaxDeviceMemory(uint64_t maxBytes) {
--- a/libnd4j/include/loops/cuda/type_conversions.cu
+++ b/libnd4j/include/loops/cuda/type_conversions.cu
@ -217,10 +217,27 @@ namespace sd {
    }
 //////////////////////////////////////////////////////////////////////////
 /*
 * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
 */
 template<typename T>
 __global__ static void execEncoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
        auto x = reinterpret_cast<T *> (dx);
        auto z = reinterpret_cast<int *> (dz);
-    encoderKernelP1<T>(dx, N, dz, threshold);
+        //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
        Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
        int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
        int bp=__syncthreads_count(pass);
        if (threadIdx.x == 0) {
            // saving out per-block passes
            z[blockIdx.x+1] = bp;
            // saving out sum
            atomicAdd(&z[0], bp);
        }
 }
 //////////////////////////////////////////////////////////////////////////
@ -230,13 +247,74 @@ __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, voi
    execEncoderKernelP1<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz, threshold);
        sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES);
 //////////////////////////////////////////////////////////////////////////
 /*
 * PLEASE NOTE: This kernel does not loop over the data: every thread handles at most one element, so the grid has to be large enough to cover all N elements.
 *
 * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
 */
 // Kernel for phase three of threshold encoding (stream compaction).
 // Every thread owns at most one element x[tid]. Elements whose absolute value reaches the
 // threshold get their signed 1-based index written into z (after a 4-int header) and have
 // the threshold subtracted from / added to them in x.
 //   dx      - input values (T), updated in place for encoded elements
 //   offsets - per-block start offset into the encoded area (read at offsets[blockIdx.x])
 //   N       - number of input elements
 //   dz      - int buffer; z[0] is read as the entry limit, z[2] as the threshold bits
 template<typename T>
 __global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
        auto x = reinterpret_cast<T *> (dx);
        auto z = reinterpret_cast<int *> (dz);
-    encoderKernelP3<T>(dx, offsets, N, dz);
+        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
        // dynamically sized shared buffer holding one running total per warp
        // NOTE(review): it is indexed with t_i (0..warpSize-1) below - confirm the launch
        // always provides at least warpSize ints of dynamic shared memory
        extern __shared__ int warpTotals[];
        // fetch block offset only once
        __shared__ float threshold;
        __shared__ FloatBits fb;
        __shared__ int bo;
        __shared__ int limit;
        // the first thread of the block reads the header values for everyone
        if (threadIdx.x == 0) {
            limit = z[0];
            // threshold is stored as an int in z[2] and read back through FloatBits::f_
            fb.i_ = z[2];
            threshold = fb.f_;
            bo = offsets[blockIdx.x];
        }
        __syncthreads();
        // out-of-limit threads do not play here
        auto value = tid < N ? x[tid] : (T) 0.f;
        // out-of-limit threads just declare they have no changes
        auto pred = tid >= N ? 0 : sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
        auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS
        auto t_i = threadIdx.x % warpSize; // thread index within a warp
        unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required)
        // NOTE(review): CUDA documents the first argument of __ballot_sync as the set of
        // participating lanes; here it is a per-lane prefix mask that excludes the calling
        // lane - confirm this usage is intended
        int b = __ballot_sync(t_m, pred); // balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
        auto t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
        // the last lane of each warp records the warp's total (its prefix count plus its own vote)
        if (t_i == warpSize - 1)
            warpTotals[w_i] = t_u + pred;
        __syncthreads();
        // rebuild a prefix sum over the per-warp totals, one binary digit (j = 0..5) at a time
        int w_i_u = 0;
        for (int j = 0; j <= 5; j++) {
            unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
            w_i_u += (__popc(b_j) << j);
        }
        // we just ignore all results coming from non-0 threads
        if (w_i == 0 && t_i < blockDim.x / warpSize)
            warpTotals[t_i] = w_i_u;
        __syncthreads();
        // pred is always false if we're out-of-limits
        if (pred) {
            // position = prefix inside the warp + prefix of the warp + block offset + 4 header ints
            int idx = t_u + warpTotals[w_i] + bo + 4;
            // entries that would land beyond the limit read from z[0] are dropped
            if (idx < limit + 4) {
                // the sign of the stored 1-based index carries the sign of the value
                z[idx] = value > static_cast<T>(0.0f) ? tid + 1 : -(tid + 1);
                x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
            }
        }
 }
 //////////////////////////////////////////////////////////////////////////
@ -245,13 +323,38 @@ __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, voi
    execEncoderKernelP3<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, offsets, N, dz);
        sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES);
 //////////////////////////////////////////////////////////////////////////
 /*
 *   This kernel handles decode from sparse threshold array, to dense array
 *
 *   PLEASE NOTE: Z is expected to be memset to 0
 */
 // Kernel that applies a threshold-encoded (sparse) buffer to a dense array.
 //   dx - encoded int buffer: x[0] is the number of entries, x[2] the threshold bits,
 //        entries start at x[4]
 //   N  - not referenced in this kernel body
 //   dz - dense output (T); each entry adds +threshold or -threshold to one element
 template<typename T>
 __global__ static void execDecoderKernel(void *dx, Nd4jLong N, void *dz) {
        auto x = reinterpret_cast<int *> (dx);
        auto z = reinterpret_cast<T *> (dz);
-    decoderKernel<T>(dx, N, dz);
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        __shared__ float threshold;
        __shared__ int limit;
        __shared__ FloatBits fb;
        // the first thread of the block reads the header values for everyone
        if (threadIdx.x == 0) {
            limit = x[0];
            fb.i_ = x[2];
            threshold = fb.f_;
        }
        __syncthreads();
        // entries are distributed over all threads of the grid
        for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
            int el = x[e+4];
            // entries hold a signed 1-based index: the magnitude selects the element, the sign the direction
            int ael = sd::math::nd4j_abs<int>(el) - 1;
            // TODO: investigate, if += would work better here, as in "decoded accumulation"
            // NOTE(review): the statement below already accumulates with +=, so the TODO above looks stale
            z[ael] += el > 0 ? threshold : -threshold;
        }
 }
 //////////////////////////////////////////////////////////////////////////
@ -261,14 +364,78 @@ __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void
    execDecoderKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz);
    sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), FLOAT_TYPES);
 //////////////////////////////////////////////////////////////////////////
 // Kernel that bitmap-encodes an array against a threshold.
 // Each int of dz (starting at index 4) describes 16 consecutive elements: bit k (0..15) marks
 // element k as encoded, bit k + 16 carries its sign flag. Encoded elements are adjusted in place.
 //   vdx             - input values (T), updated in place
 //   N               - number of input elements
 //   dz              - output bitmap ints
 //   scalar          - receives (atomically) the number of encoded elements per block
 //   reductionBuffer - not referenced in this kernel body
 //   threshold       - encoding threshold
 template<typename T>
 __global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
        auto dx = reinterpret_cast<T *>(vdx);
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    cudaEncodeBitmapKernel<T>(vdx, N, dz, scalar, reductionBuffer, threshold);
+        T off(0.0f);
        __shared__ int counter;
        __shared__ int *shmem;
        __shared__ T *vals;
        // the first thread carves the dynamic shared buffer into blockDim.x ints (bit flags)
        // followed by the per-thread values, and resets the block counter
        if (threadIdx.x == 0){
            extern __shared__ char mem[];
            shmem = reinterpret_cast<int*>(mem);
            vals = reinterpret_cast<T *>(shmem + blockDim.x);
            counter = 0;
        }
        __syncthreads();
        // the loop limit is padded beyond N so that every thread of the grid runs the same
        // number of iterations and therefore reaches every __syncthreads() below
        Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
        Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
        for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
            // all threads in block reading stuff
            T val = i < N ? dx[i] : off;
            T abs = sd::math::nd4j_abs<T>(val);
            // 16 elements per output int; the first 4 ints of dz are skipped
            int byteId = i / 16 + 4;
            int bitId = i % 16;
            shmem[threadIdx.x] = 0;
            vals[threadIdx.x] = val;
            if (abs >= static_cast<T>(threshold) && i < N) {
                // full step: set the flag bit and move the value towards zero by threshold
                shmem[threadIdx.x] = 1 << (bitId);
                atomicAdd(&counter, 1);
                if (val < static_cast<T>(0.0f)) {
                    shmem[threadIdx.x] |= 1 << (bitId + 16);
                    vals[threadIdx.x] += static_cast<T>(threshold);
                } else {
                    vals[threadIdx.x] -= static_cast<T>(threshold);
                }
            } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
                // half step: only negative values get it; sign bit set without the flag bit
                atomicAdd(&counter, 1);
                shmem[threadIdx.x] = 1 << (bitId + 16);
                vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
            }
            __syncthreads();
            // every 16th thread merges the flags of its 16-element group into one output int
            if (threadIdx.x % 16 == 0 && i < N) {
                int byte = 0;
                for (int e = 0; e < 16; e++) {
                    if (i + e >= N)
                        continue;
                    byte |= shmem[threadIdx.x + e];
                }
                dz[byteId] = byte;
            }
            __syncthreads();
            // write the adjusted value back
            if (i < N)
                dx[i] = vals[threadIdx.x];
        }
        __syncthreads();
        // publish this block's count of encoded elements
        if (threadIdx.x == 0) {
            atomicAdd(scalar, counter);
        }
 }
 //////////////////////////////////////////////////////////////////////////
@ -278,14 +445,62 @@ __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
    execCudaEncodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, N, dz, scalar, reductionBuffer, threshold);
    sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES);
 //////////////////////////////////////////////////////////////////////////
 // Kernel that applies a bitmap-encoded buffer to a dense array in place.
 //   dx  - encoded int buffer; x[2] holds the threshold bits, bitmap ints start at index 4
 //   N   - number of elements in the dense array
 //   vdz - dense array (T) that receives the decoded updates
 template<typename T>
 __global__ static void execCudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
        auto dz = static_cast<T*>(vdz);
-     cudaDecodeBitmapKernel<T>(dx, N, vdz);
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        __shared__ T *shmem;
        __shared__ FloatBits fb;
        __shared__ float threshold;
        __shared__ int *x;
        // the first thread sets up the shared pointers and reads the threshold for the block
        if (threadIdx.x == 0){
            extern __shared__ char mem[];
            shmem = reinterpret_cast<T*>(mem);
            x = reinterpret_cast<int *>(dx);
            fb.i_ = x[2];
            threshold = fb.f_;
        }
        __syncthreads();
        // NOTE(review): lim is computed but never used in this kernel
        int lim = N / 16 + 5;
        // NOTE(review): the loop bound is per-thread (i < N) while the body contains
        // __syncthreads(); unlike the bitmap encoder kernel the range is not padded - confirm
        // all threads of a block always run the same number of iterations
        for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
            // 16 elements per bitmap int; the first 4 ints of x are skipped
            int byteId = i / 16 + 4;
 //        printf("I: [%i]; byteId: [%i]\n", i, byteId);
            shmem[threadIdx.x] = dz[i];
            __syncthreads();
            // every 16th thread decodes the bitmap int of its 16-element group
            if (threadIdx.x % 16 == 0) {
                int byte = x[byteId];
                for (int e = 0; e < 16; e++) {
                    if (i + e >= N)
                        continue;
                    int bitId = (i + e) % 16;
                    // low 16 bits: "encoded" flag, high 16 bits: sign flag
                    bool hasBit = (byte & 1 << (bitId) ) != 0;
                    bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
                    if (hasBit) {
                        if (hasSign)
                            shmem[threadIdx.x + bitId] -= threshold;
                        else
                            shmem[threadIdx.x + bitId] += threshold;
                    } else if (hasSign) {
                        // sign flag without the encoded flag: half-threshold negative step
                        shmem[threadIdx.x + bitId] -= threshold / 2;
                    }
                }
            }
            __syncthreads();
            dz[i] = shmem[threadIdx.x];
        }
 }
 //////////////////////////////////////////////////////////////////////////
@ -295,7 +510,7 @@ __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
    execCudaDecodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, vdz);
    sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES);
    template <bool storeSum, bool isNP2>
--- a/libnd4j/include/loops/impl/type_conversions.cpp
+++ b/libnd4j/include/loops/impl/type_conversions.cpp
@ -106,8 +106,14 @@ namespace sd {
        auto l = static_cast<int>(N);
        z[1] = l;
 #ifdef _OPENMP
        int threads = OmpLaunchHelper::betterThreads(N);
-        int span = OmpLaunchHelper::betterSpan(N, threads);
+        auto span = OmpLaunchHelper::betterSpan(N, threads);
 #else
        int threads = 1;
        auto span = N;
 #endif
        T tt = static_cast<T>(threshold);
        T mtt = -tt;
@ -209,21 +215,23 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
        samediff::Threads::parallel_for(func,  0, N);
    };
    // Explicit instantiations of the threshold converters.
    // The changed lines replace a repeated <double> instantiation with <bfloat16>.
    template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
+    template void TypeCast::convertFromThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
+    template void TypeCast::convertToThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    // Explicit instantiations of the quantized converters, one per element type.
    // <double> used to be listed twice for each converter; a repeated explicit instantiation
    // adds nothing and is rejected by strict compilers, so each type now appears exactly once.
    template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
 #ifndef __CLION_IDE__
    BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES)
--- a/libnd4j/include/loops/type_conversions.h
+++ b/libnd4j/include/loops/type_conversions.h
@ -110,29 +110,6 @@ namespace sd {
    }
 #ifdef __CUDACC__
    /*
 * PLEASE NOTE: This kernel does not loop over the data: every thread handles at most one element, so the grid has to be large enough to cover all N elements.
 */
    // Phase one of threshold encoding: counts, per block, how many elements have an absolute
    // value of at least `threshold`. The block's count goes to slot blockIdx.x + 1 of dz and
    // is also added atomically to the grand total kept in slot 0.
    template<typename T>
    __device__ inline void encoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
        const T *values = reinterpret_cast<T *>(dx);
        int *counts = reinterpret_cast<int *>(dz);

        // one element per thread; threads past the end of the data vote "no"
        const Nd4jLong globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
        int eligible = 0;
        if (globalIdx < N && sd::math::nd4j_abs<T>(values[globalIdx]) >= static_cast<T>(threshold))
            eligible = 1;

        // block-wide barrier that also returns the number of "yes" votes in the block
        const int blockPasses = __syncthreads_count(eligible);

        // only the first thread of the block publishes the results
        if (threadIdx.x != 0)
            return;

        counts[blockIdx.x + 1] = blockPasses;
        atomicAdd(&counts[0], blockPasses);
    }
    // Returns the int with only bit `e` set, i.e. 1 shifted left by e positions.
    __device__ __inline__ int pow2i (int e){
        const int lowestBit = 1;
        return lowestBit << e;
    }
@ -140,274 +117,21 @@ namespace sd {
    template<typename T>
    __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold);
 /*
 * PLEASE NOTE: This kernel does not loop over the data: every thread handles at most one element, so the grid has to be large enough to cover all N elements.
 *
 * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
 */
    // Phase three of threshold encoding (stream compaction), device-side body.
    // Threads with tid < N test their element against the threshold; passing elements get
    // their signed 1-based index written into z (after a 4-int header) and are adjusted in x.
    //   dx      - input values (T), updated in place for encoded elements
    //   offsets - per-block start offset into the encoded area (read at offsets[blockIdx.x])
    //   N       - number of input elements
    //   dz      - int buffer; z[0] is read as the entry limit, z[2] as the threshold bits
    template<typename T>
    __device__ inline void encoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
        T *x = reinterpret_cast<T *> (dx);
        int *z = reinterpret_cast<int *> (dz);
        Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
        // dynamically sized shared buffer holding one running total per warp
        extern __shared__ int warpTotals[];
        // fetch block offset only once
        __shared__ float threshold;
        __shared__ FloatBits fb;
        __shared__ int bo;
        __shared__ int limit;
        // the first thread of the block reads the header values for everyone
        if (threadIdx.x == 0) {
            limit = z[0];
            // threshold is stored as an int in z[2] and read back through FloatBits::f_
            fb.i_ = z[2];
            threshold = fb.f_;
            bo = offsets[blockIdx.x];
        }
        __syncthreads();
        // NOTE(review): everything below, including the warp votes, runs only for threads
        // with tid < N, so a warp straddling N votes with only part of its lanes
        if (tid < N) {
            T value = x[tid];
            int pred = sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
            int w_i = threadIdx.x/warpSize; //warp index
            int w_l = tid % warpSize;//thread index within a warp
            unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required)
            int b   = __ballot_sync(t_m, pred); //balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
            int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
            // the last lane of each warp records the warp's total
            if(w_l==warpSize-1){
                warpTotals[w_i]=t_u+pred;
            }
            // NOTE(review): with the barrier below commented out, warp 0 reads warpTotals
            // entries written by other warps without any block-level synchronization
 //            __syncthreads(); // Eliminated due RTX20xx specific
            // lanes of warp 0 turn the per-warp totals into a prefix sum, one binary digit at a time
            if(w_i==0 && w_l<blockDim.x/warpSize){
                int w_i_u=0;
                for(int j=0;j<=5;j++){
                    unsigned int b_j =__ballot_sync(t_m, warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
                    w_i_u += (__popc(b_j) << j);
                    //printf("indice %i t_m=%i,j=%i,b_j=%i,w_i_u=%i\n",w_l,t_m,j,b_j,w_i_u);
                }
                warpTotals[w_l]=w_i_u;
            }
 //            __syncthreads();  // Eliminated due RTX20xx specific
            if(pred){
                // position = prefix inside the warp + prefix of the warp + block offset + 4 header ints
                int idx = t_u + warpTotals[w_i] + bo + 4;
                // entries that would land beyond the limit read from z[0] are dropped
                if (idx < limit + 4) {
                    // the sign of the stored 1-based index carries the sign of the value
                    z[idx]= value > static_cast<T>(0.0f) ? tid+1 : -(tid + 1);
                    x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
                }
            }
        }
    }
    template<typename T>
    __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz);
    /*
 *   This kernel handles decode from sparse threshold array, to dense array
 *
 *   PLEASE NOTE: Z is expected to be memset to 0
 */
    // Applies a threshold-encoded buffer to a dense array.
    // The encoded buffer starts with a 4-int header (slot 0: number of entries, slot 2: the
    // threshold stored as int bits); every following entry is a signed 1-based element index
    // whose sign selects whether the threshold is added or subtracted.
    template<typename T>
    __device__ inline void decoderKernel(void *dx, Nd4jLong N, void *dz) {
        const int *encoded = reinterpret_cast<int *>(dx);
        T *dense = reinterpret_cast<T *>(dz);

        __shared__ float threshold;
        __shared__ int limit;
        __shared__ FloatBits fb;

        // the first thread of the block decodes the header for everyone
        if (threadIdx.x == 0) {
            limit = encoded[0];
            fb.i_ = encoded[2];
            threshold = fb.f_;
        }
        __syncthreads();

        // entries are spread over all threads of the grid
        int e = blockIdx.x * blockDim.x + threadIdx.x;
        while (e < limit) {
            const int entry = encoded[e + 4];
            const int target = sd::math::nd4j_abs<int>(entry) - 1;
            if (entry > 0)
                dense[target] += threshold;
            else
                dense[target] += -threshold;
            e += blockDim.x * gridDim.x;
        }
    }
    template<typename T>
    __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz);
 //////////////////////////////////////////////////////////////////////////    
    // Bitmap-encodes an array against a threshold (device-side body).
    // Each int of dz (starting at index 4) describes 16 consecutive elements: bit k (0..15) marks
    // element k as encoded, bit k + 16 carries its sign flag. Encoded elements are adjusted in place.
    //   vdx             - input values (T), updated in place
    //   N               - number of input elements
    //   dz              - output bitmap ints
    //   scalar          - receives (atomically) the number of encoded elements per block
    //   reductionBuffer - not referenced in this function body
    //   threshold       - encoding threshold
    template<typename T>
    __device__ inline void cudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
        auto dx = reinterpret_cast<T *>(vdx);
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        T off(0.0f);
        __shared__ int counter;
        __shared__ int *shmem;
        __shared__ T *vals;
        // the first thread carves the dynamic shared buffer into blockDim.x ints (bit flags)
        // followed by the per-thread values, and resets the block counter
        if (threadIdx.x == 0){
            extern __shared__ char mem[];
            shmem = reinterpret_cast<int*>(mem);
            vals = reinterpret_cast<T *>(shmem + blockDim.x);
            counter = 0;
        }
        __syncthreads();
        // the loop limit is padded beyond N so that every thread of the grid runs the same
        // number of iterations and therefore reaches every __syncthreads() below
        Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
        Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
        for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
            // all threads in block reading stuff
            T val = i < N ? dx[i] : off;
            T abs = sd::math::nd4j_abs<T>(val);
            // 16 elements per output int; the first 4 ints of dz are skipped
            int byteId = i / 16 + 4;
            int bitId = i % 16;
            shmem[threadIdx.x] = 0;
            vals[threadIdx.x] = val;
            if (abs >= static_cast<T>(threshold) && i < N) {
                // full step: set the flag bit and move the value towards zero by threshold
                shmem[threadIdx.x] = 1 << (bitId);
                atomicAdd(&counter, 1);
                if (val < static_cast<T>(0.0f)) {
                    shmem[threadIdx.x] |= 1 << (bitId + 16);
                    vals[threadIdx.x] += static_cast<T>(threshold);
                } else {
                    vals[threadIdx.x] -= static_cast<T>(threshold);
                }
            } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
                // half step: only negative values get it; sign bit set without the flag bit
                atomicAdd(&counter, 1);
                shmem[threadIdx.x] = 1 << (bitId + 16);
                vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
            }
            __syncthreads();
            // every 16th thread merges the flags of its 16-element group into one output int
            if (threadIdx.x % 16 == 0 && i < N) {
                int byte = 0;
                for (int e = 0; e < 16; e++) {
                    if (i + e >= N)
                        continue;
                    byte |= shmem[threadIdx.x + e];
                }
                dz[byteId] = byte;
            }
            __syncthreads();
            // write the adjusted value back
            if (i < N)
                dx[i] = vals[threadIdx.x];
        }
        __syncthreads();
        // publish this block's count of encoded elements
        if (threadIdx.x == 0) {
            atomicAdd(scalar, counter);
        }
    }
    template<typename T>
    __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
 //////////////////////////////////////////////////////////////////////////
    // Applies a bitmap-encoded buffer to a dense array in place (device-side body).
    //   dx  - encoded int buffer; x[2] holds the threshold bits, bitmap ints start at index 4.
    //         Each bitmap int describes 16 elements: bit k (0..15) is the "encoded" flag of
    //         element k, bit k + 16 its sign flag.
    //   N   - number of elements in the dense array
    //   vdz - dense array (T) that receives the decoded updates
    //
    // The loop range is padded to a multiple of the grid size, the same way the bitmap encoder
    // does it, so that every thread of a block reaches each __syncthreads() the same number of
    // times; threads whose index falls beyond N only take part in the barriers.
    template<typename T>
    __device__ inline void cudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
        auto dz = static_cast<T*>(vdz);
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        __shared__ T *shmem;
        __shared__ FloatBits fb;
        __shared__ float threshold;
        __shared__ int *x;
        // the first thread sets up the shared pointers and reads the threshold for the block
        if (threadIdx.x == 0){
            extern __shared__ char mem[];
            shmem = reinterpret_cast<T*>(mem);
            x = reinterpret_cast<int *>(dx);
            fb.i_ = x[2];
            threshold = fb.f_;
        }
        __syncthreads();
        // 64-bit stride and index: N is 64-bit, and an int index could overflow near INT_MAX
        const Nd4jLong stride = static_cast<Nd4jLong>(blockDim.x) * gridDim.x;
        // smallest multiple of stride that is >= N
        const Nd4jLong loopLimit = ((N + stride - 1) / stride) * stride;
        for (Nd4jLong i = tid; i < loopLimit; i += stride) {
            const bool inRange = i < N;
            // 16 elements per bitmap int; the first 4 ints of x are skipped
            int byteId = i / 16 + 4;
            if (inRange)
                shmem[threadIdx.x] = dz[i];
            __syncthreads();
            // every 16th thread decodes the bitmap int of its 16-element group
            if (threadIdx.x % 16 == 0 && inRange) {
                int byte = x[byteId];
                for (int e = 0; e < 16; e++) {
                    if (i + e >= N)
                        continue;
                    int bitId = (i + e) % 16;
                    bool hasBit = (byte & 1 << (bitId) ) != 0;
                    bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
                    if (hasBit) {
                        if (hasSign)
                            shmem[threadIdx.x + bitId] -= threshold;
                        else
                            shmem[threadIdx.x + bitId] += threshold;
                    } else if (hasSign) {
                        // sign flag without the encoded flag: half-threshold negative step
                        shmem[threadIdx.x + bitId] -= threshold / 2;
                    }
                }
            }
            __syncthreads();
            if (inRange)
                dz[i] = shmem[threadIdx.x];
        }
    }
    template<typename T>
    __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz);
    // __global__ void cudaEncodeBitmapFloat(float *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
    // __global__ void cudaEncodeBitmapDouble(double *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
    // __global__ void cudaEncodeBitmapHalf(float16 *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
    // __global__ void cudaDecodeBitmapFloat(void *dx, Nd4jLong N, float *dz);
    // __global__ void cudaDecodeBitmapDouble(void *dx, Nd4jLong N, double *dz);
    // __global__ void cudaDecodeBitmapHalf(void *dx, Nd4jLong N, float16 *dz);
    // __global__ void encoderKernelP1Float(void *dx, Nd4jLong N, void *dz, float threshold);
    // __global__ void encoderKernelP1Double(void *dx, Nd4jLong N, void *dz, float threshold);
    // __global__ void encoderKernelP1Half(void *dx, Nd4jLong N, void *dz, float threshold);
    // __global__ void encoderKernelP2Float(int *dx, Nd4jLong N, int *dz);
    // __global__ void encoderKernelP3Float(void *dx, int *offsets, Nd4jLong N, void *dz);
    // __global__ void encoderKernelP3Double(void *dx, int *offsets, Nd4jLong N, void *dz);
    // __global__ void encoderKernelP3Half(void *dx, int *offsets, Nd4jLong N, void *dz);
    // __global__ void decoderKernelFloat(void *dx, Nd4jLong N, void *dz);
    // __global__ void decoderKernelDouble(void *dx, Nd4jLong N, void *dz);
    // __global__ void decoderKernelHalf(void *dx, Nd4jLong N, void *dz);
    __global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex);
    template <bool storeSum, bool isNP2>
--- a/libnd4j/include/ops/declarable/CustomOperations.h
+++ b/libnd4j/include/ops/declarable/CustomOperations.h
@ -25,6 +25,7 @@
 #include <ops/declarable/headers/boolean.h>
 #include <ops/declarable/headers/broadcastable.h>
 #include <ops/declarable/headers/convo.h>
 #include <ops/declarable/headers/compression.h>
 #include <ops/declarable/headers/list.h>
 #include <ops/declarable/headers/recurrent.h>
 #include <ops/declarable/headers/transforms.h>
--- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp
+++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp
@ -138,9 +138,9 @@ DECLARE_SHAPE_FN(matmul) {
 //////////////////////////////////////////////////////////////////////
 // Declares the data types matmul accepts for inputs 0 and 1 and for output 0.
 // The changed lines extend each list from ALL_FLOATS to ALL_FLOATS plus ALL_INTS.
 DECLARE_TYPES(matmul) {
    getOpDescriptor()
-            ->setAllowedInputTypes(0, {ALL_FLOATS})
+            ->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS})
-            ->setAllowedInputTypes(1, {ALL_FLOATS})
+            ->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS})
-            ->setAllowedOutputTypes(0, {ALL_FLOATS});
+            ->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS});
 }
 //////////////////////////////////////////////////////////////////////
--- a/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp
+++ b/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp
@ -0,0 +1,92 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author George A. Shulinok <sgazeos@gmail.com>
 //
 #include <system/op_boilerplate.h>
 #include <ops/declarable/CustomOperations.h>
 #include <ops/declarable/helpers/compression.h>
 #if NOT_EXCLUDED(OP_decode_bitmap)
 namespace sd {
    namespace ops {
        // decode_bitmap: hands the bitmap-encoded buffer (input 1) and output 0 to
        // helpers::decodeBitmap. Input 0 only provides the shape of the output.
        CUSTOM_OP_IMPL(decode_bitmap, 2, 1, true, 0, 0) {
            auto bitmap = INPUT_VARIABLE(1);
            auto target = OUTPUT_VARIABLE(0);
            helpers::decodeBitmap(block.launchContext(), bitmap, target);
            return Status::OK();
        }
        // The single output has the shape of input 0.
        DECLARE_SHAPE_FN(decode_bitmap) {
            return SHAPELIST(INPUT_VARIABLE(0)->shapeInfo());
        }
        // Input 0 and the output are floating point; the encoded input is INT32.
        DECLARE_TYPES(decode_bitmap) {
            auto descriptor = getOpDescriptor();
            descriptor->setAllowedInputTypes(0, {ALL_FLOATS});
            descriptor->setAllowedInputTypes(1, DataType::INT32);
            descriptor->setAllowedOutputTypes({ALL_FLOATS});
        }
    }
 }
 #endif
 #if NOT_EXCLUDED(OP_encode_bitmap)
 namespace sd {
    namespace ops {
        // encode_bitmap: bitmap-encodes input 0 against the threshold given as T_ARG(0).
        // Outputs:
        //   0 - array with the shape of the input
        //   1 - INT32 vector holding a 4-int header followed by the bitmap
        //   2 - INT32 scalar receiving the value returned by helpers::encodeBitmap
        CUSTOM_OP_IMPL(encode_bitmap, 1, 3, true, 1, 0) {
            auto input = INPUT_VARIABLE(0);
            auto encoded = OUTPUT_NULLIFIED(1);
            auto counter = OUTPUT_NULLIFIED(2);
            float threshold = T_ARG(0);
            // header: [0] and [1] carry the input length, [2] the raw bits of the float
            // threshold, [3] the encoding flag
            encoded->p(0, (int) input->lengthOf());
            encoded->p(1, (int) input->lengthOf());
            encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
            encoded->p(3, 1); // flag for BITMAP_ENCODING
            auto result = helpers::encodeBitmap(block.launchContext(), input, encoded, threshold);
            counter->p(0, result);
            counter->syncToDevice();
            return Status::OK();
        }
        // Output 1 needs one int per 16 input elements plus room for the header,
        // hence length / 16 + 5; output 2 is a scalar.
        DECLARE_SHAPE_FN(encode_bitmap) {
            auto input = inputShape->at(0);
            auto outputLength = shape::length(input) / 16 + 5;
            auto encodedShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outputLength, DataType::INT32);
            auto encodedCounter = ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT32);
            return SHAPELIST(input, encodedShape, encodedCounter);
        }
        // The op has a single input and three outputs, so the INT32 constraints for
        // indices 1 and 2 belong to the outputs (encoded buffer and counter), not to inputs.
        DECLARE_TYPES(encode_bitmap) {
            getOpDescriptor()
                    ->setAllowedInputTypes(sd::DataType::ANY)
                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(1, DataType::INT32)
                    ->setAllowedOutputTypes(2, DataType::INT32);
        }
    }
 }
 #endif
--- a/libnd4j/include/ops/declarable/generic/compression/threshold.cpp
+++ b/libnd4j/include/ops/declarable/generic/compression/threshold.cpp
@ -0,0 +1,104 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author raver119@gmail.com
 //
 #include <system/op_boilerplate.h>
 #include <ops/declarable/CustomOperations.h>
 #include <ops/declarable/helpers/threshold.h>
 namespace sd {
    namespace ops {
        /**
         * encode_threshold: fills the 4-int header of output 1 and, when that
         * output has room for at least one encoded element, runs the threshold
         * encoding helper on input 0.
         */
        CUSTOM_OP_IMPL(encode_threshold, 1, 2, true, 1, 0) {
            auto gradients = INPUT_VARIABLE(0);
            // NOTE: a check that input and output 0 share the same platform buffer used to live here and is disabled
            auto residuals = OUTPUT_VARIABLE(0);
            auto packed = OUTPUT_NULLIFIED(1);
            float threshold = T_ARG(0);

            REQUIRE_TRUE(gradients->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
            REQUIRE_TRUE(packed->lengthOf() >= 4, 0, "encode_threshold: array for encoded updates can't have less than 4 elements");

            // number of slots left for encoded elements once the header is accounted for
            const auto capacity = packed->lengthOf() - 4;

            // header: [capacity, source length, threshold bit pattern, encoding flag]
            packed->p(0, capacity);
            packed->p(1, (int) gradients->lengthOf());
            packed->p(2, *reinterpret_cast<int *>(&threshold));
            packed->p(3, 0); // flag for FLEXIBLE_ENCODING

            // nothing but the header fits - there are no updates to process
            if (capacity > 0)
                helpers::thresholdEncode(*gradients, *packed, threshold);

            return Status::OK();
        }
        /**
         * Output shapes of encode_threshold:
         *   0 - same shape info as the input
         *   1 - INT32 vector with (number of encoded elements + 4 header ints)
         *
         * The number of encoded elements is the helper's estimate for the given
         * threshold, capped by the optional integer argument 0 ("boundary").
         * Estimates below 2 are treated as "nothing to encode".
         */
        DECLARE_SHAPE_FN(encode_threshold) {
            auto x = INPUT_VARIABLE(0);
            // we have limit option here
            int boundary = block.numI() > 0 ? I_ARG(0) : DataTypeUtils::max<int>();
            float threshold = T_ARG(0);

            // zero is accepted by the check, so the message says "non-negative" rather than "positive"
            REQUIRE_TRUE(boundary >= 0, 0, "encode_threshold: boundary must be non-negative");
            REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");

            // we must calculate number of elements that >= threshold
            auto elements = sd::math::nd4j_min<int>(helpers::thresholdEstimate(*x, threshold), boundary);
            if (elements < 2)
                elements = 0;

            // result array must have 4 additional int elements for header
            return SHAPELIST(x->shapeInfo(), sd::ConstantShapeHelper::getInstance()->vectorShapeInfo(elements + 4, DataType::INT32));
        }
        // Data type constraints for encode_threshold:
        //   input 0  - any floating point type
        //   output 0 - any floating point type
        //   output 1 - INT32 (header + encoded elements)
        DECLARE_TYPES(encode_threshold) {
            getOpDescriptor()
                    ->setAllowedInputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(1, DataType::INT32);
        }
        /**
         * decode_threshold: validates the header of the encoded array (input 1)
         * and applies it to output 0 through the threshold decoding helper.
         *
         * Header checks: at least 4 ints, element 1 equals the length of the
         * output array, element 3 (encoding flag) equals 0.
         */
        CUSTOM_OP_IMPL(decode_threshold, 2, 1, true, 0, 0) {
            // input 0 is not read directly here: only the output array and the encoded input are used
            auto encoded = INPUT_VARIABLE(1);
            auto updates = OUTPUT_VARIABLE(0);

            REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "decode_threshold: encoded array can't have length < 4");
            REQUIRE_TRUE(updates->lengthOf() == encoded->e<int>(1), 0, "decode_threshold: updates array must have length equal to [%i]", encoded->e<int>(1));
            REQUIRE_TRUE(encoded->e<int>(3) == 0, 0, "decode_threshold: encoded array doesn't look like threshold-encoded");

            helpers::thresholdDecode(*encoded, *updates);

            return Status::OK();
        }
        // The single output of decode_threshold reuses the shape info of input 0.
        DECLARE_SHAPE_FN(decode_threshold) {
            return SHAPELIST(inputShape->at(0));
        }
        // Data type constraints for decode_threshold:
        //   input 0  - any floating point type
        //   input 1  - INT32 (encoded array)
        //   output 0 - any floating point type
        DECLARE_TYPES(decode_threshold) {
            getOpDescriptor()
                    ->setAllowedInputTypes(0, {ALL_FLOATS})
                    ->setAllowedInputTypes(1, DataType::INT32)
                    ->setAllowedOutputTypes(0,{ALL_FLOATS});
        }
    }
 }
--- a/libnd4j/include/ops/declarable/headers/compression.h
+++ b/libnd4j/include/ops/declarable/headers/compression.h
@ -0,0 +1,62 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 //  @author sgazeos@gmail.com
 //
 #ifndef SD_HEADERS_COMPRESSION_H
 #define SD_HEADERS_COMPRESSION_H
 #include <ops/declarable/headers/common.h>
 namespace sd {
    namespace ops {
        /**
         * encode_bitmap - bitmap encoding of an array against a threshold.
         *
         * Input:
         *      0 - array to be encoded
         *
         * T args:
         *      0 - threshold
         *
         * Output:
         *      0 - array with the same shape as input 0
         *      1 - INT32 vector with (input length / 16 + 5) elements: 4-int header followed by encoded data
         *      2 - INT32 scalar holding the value returned by the encoding helper
         */
        #if NOT_EXCLUDED(OP_encode_bitmap)
        DECLARE_CUSTOM_OP(encode_bitmap, 1, 3, true, 1, 0);
        #endif
        /**
         *  decode_bitmap - counterpart of encode_bitmap.
         *
         *  Input:
         *      0 - floating point array
         *      1 - INT32 array (presumably the encoded vector produced by encode_bitmap)
         *
         *  Output:
         *      0 - floating point array with the same shape as input 0
         *
         */
        #if NOT_EXCLUDED(OP_decode_bitmap)
        DECLARE_CUSTOM_OP(decode_bitmap, 2, 1, true, 0, 0);
        #endif
        /**
         * encode_threshold - threshold encoding of a floating point array.
         *
         * Input:
         *      0 - floating point array to be encoded
         *
         * T args:
         *      0 - threshold
         *
         * I args (optional):
         *      0 - boundary: upper limit for the number of encoded elements
         *
         * Output:
         *      0 - floating point array with the same shape as input 0
         *      1 - INT32 vector: 4-int header followed by encoded elements
         */
        DECLARE_CUSTOM_OP(encode_threshold, 1, 2, true, 1, 0);
        /**
         * decode_threshold - counterpart of encode_threshold.
         *
         * Input:
         *      0 - floating point array
         *      1 - INT32 threshold-encoded vector (at least 4 elements)
         *
         * Output:
         *      0 - floating point array with the same shape as input 0
         */
        DECLARE_CUSTOM_OP(decode_threshold, 2, 1, true, 0, 0);
    }
 }
 #endif // SD_HEADERS_COMPRESSION_H
--- a/libnd4j/include/ops/declarable/helpers/compression.h
+++ b/libnd4j/include/ops/declarable/helpers/compression.h
@ -0,0 +1,34 @@
 /*******************************************************************************
 * Copyright (c) 2020 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 //  @author sgazeos@gmail.com
 //
 #ifndef __COMPRESSION_H_HELPERS__
 #define __COMPRESSION_H_HELPERS__
 #include <system/op_boilerplate.h>
 #include <array/NDArray.h>
 namespace sd {
 namespace ops {
 namespace helpers {
    /**
     * Bitmap decoding helper: reads the encoded data from `input` and writes into `output`.
     *
     * @param context launch context supplied by the calling op
     * @param input   array holding the encoded data
     * @param output  array that receives the decoded values
     */
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output);
    /**
     * Bitmap encoding helper: encodes `input` against `threshold` into `output`.
     *
     * @param context   launch context supplied by the calling op
     * @param input     array to encode
     * @param output    array that receives the encoded data (accessed as int buffer)
     * @param threshold encoding threshold
     * @return value reported by the backend encoder; the encode_bitmap op stores it in its counter output
     */
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold);
 }
 }
 }
 #endif
--- a/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp
@ -0,0 +1,37 @@
 /*******************************************************************************
 * Copyright (c) 2020 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 //  @author sgazeos@gmail.com
 //
 #include <ops/declarable/helpers/compression.h>
 #include <execution/Threads.h>
 namespace sd {
 namespace ops {
 namespace helpers {
    // CPU bitmap decode: forwards the raw buffers to NativeOpExecutioner. `context` is not used here.
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
        auto encodedBuffer = input->buffer();
        const auto targetLength = output->lengthOf();

        NativeOpExecutioner::decodeBitmap(encodedBuffer, targetLength, output->buffer(), output->shapeInfo());
    }
    // CPU bitmap encode: forwards the raw buffers to NativeOpExecutioner and returns its result. `context` is not used here.
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
        auto bitmap = output->bufferAsT<int>();

        return NativeOpExecutioner::encodeBitmap(input->buffer(), input->shapeInfo(), input->lengthOf(), bitmap, threshold);
    }
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp
@ -0,0 +1,62 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author raver119@gmail.com
 //
 #include <ops/declarable/helpers/threshold.h>
 #include <execution/Threads.h>
 #include <helpers/threshold.h>
 namespace sd {
    namespace ops {
        namespace helpers {
            /**
             * Counts the elements of `updates` whose absolute value is >= `threshold`.
             * The count is computed per span via Threads::parallel_long and the
             * partial counts are summed.
             */
            template <typename T>
            static int32_t thresholdEstimate_(const NDArray &updates, const float threshold) {
                auto length = updates.lengthOf();
                const auto values = updates.bufferAsT<T>();

                // worker: number of matching elements within [start, stop)
                auto countSpan = PRAGMA_REDUCE_LONG {
                    int64_t matches = 0;
                    for (auto i = start; i < stop; i++) {
                        if (sd::math::nd4j_abs<T>(values[i]) >= threshold)
                            matches++;
                    }

                    return matches;
                };

                return samediff::Threads::parallel_long(countSpan, LAMBDA_AL { return _old + _new; }, 0, length);
            }
            /**
             * Dispatches thresholdEstimate_ on the data type of `updates` (FLOAT_TYPES only).
             *
             * @return the count produced by thresholdEstimate_, or 0 when the selector does not return
             */
            int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
                BUILD_SINGLE_SELECTOR(updates.dataType(), return thresholdEstimate_, (updates, threshold), FLOAT_TYPES);
                // fallback for the case where the selector above did not return
                return 0;
            }
            /**
             * CPU threshold encoding: dispatches sd::TypeCast::convertToThreshold on the data
             * type of `updates`, passing the raw buffers of `updates` and `encoded`.
             *
             * NOTE: the `threshold` argument is not forwarded here; the calling op writes the
             * threshold into the header of `encoded` before invoking this helper.
             */
            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
                BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertToThreshold, (nullptr, updates.buffer(), updates.lengthOf(), encoded.buffer()), FLOAT_TYPES);
            }
            /**
             * CPU threshold decoding: dispatches sd::TypeCast::convertFromThreshold on the data
             * type of `updates`, reading from the buffer of `encoded` and writing into `updates`.
             */
            void thresholdDecode(const NDArray &encoded, NDArray &updates) {
                BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertFromThreshold, (nullptr, encoded.getBuffer(), updates.lengthOf(), updates.buffer()), FLOAT_TYPES);
            }
        }
    }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu
@ -0,0 +1,66 @@
 /*******************************************************************************
 * Copyright (c) 2020 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 //  @author sgazeos@gmail.com
 //
 #include <ops/declarable/helpers/compression.h>
 #include <loops/type_conversions.h>
 #include <helpers/DebugHelper.h>
 namespace sd {
 namespace ops {
 namespace helpers {
    /**
     * CUDA bitmap decode: launches cudaDecodeBitmapGeneric for the data type of `output`
     * on the stream of `context`, reading the device buffer of `input` and writing the
     * device buffer of `output`.
     */
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
        auto stream = context->getCudaStream();
        // `output` is handed over as the array to be written, `input` as the array to be read
        NDArray::prepareSpecialUse({output}, {input});
        dim3 launchDims(512, 512, 16384);
        auto xType = output->dataType();
        BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, input->specialBuffer(), output->lengthOf(), output->specialBuffer()), FLOAT_TYPES);
        sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
        NDArray::registerSpecialUse({output}, {input});
    }
    /**
     * CUDA bitmap encode: launches cudaEncodeBitmapGeneric for the data type of `input`
     * and returns the counter the kernel leaves in the context's scalar pointer.
     *
     * The stream and scratch pointers are taken from the `context` passed by the caller
     * (the same way decodeBitmap does), instead of unconditionally using the default context.
     *
     * @param context   launch context supplying the stream, scalar and reduction pointers
     * @param input     array to encode
     * @param output    array receiving the encoded data (device buffer accessed as int*)
     * @param threshold encoding threshold
     * @return counter value read back from the scalar pointer after the kernel finished
     */
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
        auto stream = context->getCudaStream();
        int *resultPointer = reinterpret_cast<int *>(context->getScalarPointer());
        int *reductionPointer = reinterpret_cast<int *>(context->getReductionPointer());

        // nullify result pointer before use
        resultPointer[0] = 0;

        // both arrays are passed in the second list here and registered in the first list after the launch
        NDArray::prepareSpecialUse({},{output, input});

        dim3 launchDims(512, 512, 32768);
        auto xType = input->dataType();
        BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
                              (launchDims, stream, input->specialBuffer(), input->lengthOf(), reinterpret_cast<int*>(output->specialBuffer()), resultPointer, reductionPointer, threshold),
                              FLOAT_TYPES);

        sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");

        // read the counter and reset the shared scalar for the next user
        Nd4jLong dZ = (Nd4jLong) resultPointer[0];
        resultPointer[0] = 0;

        NDArray::registerSpecialUse({output, input}, {});
        return dZ;
    }
 }
 }
 }
--- a/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu
@ -0,0 +1,231 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author raver119@gmail.com
 //
 #include <ops/declarable/helpers/threshold.h>
 #include <loops/type_conversions.h>
 #include <helpers/PointersManager.h>
 #include <vector>
 namespace sd {
    namespace ops {
        namespace helpers {
            /**
             * Launches the prescan kernels (sd::prescanLauncher) over `numElements` ints read
             * from `dX` and written to `dZ`. When more than one thread block is needed, the
             * per-block sums are stored in g_scanBlockSums[level], scanned by a recursive call
             * with level + 1, and then applied to `dZ` with sd::uniformAdd.
             *
             * @param g_scanBlockSums per-level scratch arrays for block sums
             * @param dZ              destination buffer
             * @param dX              source buffer
             * @param numElements     number of ints to process
             * @param level           current recursion level (index into g_scanBlockSums)
             */
            void prescanArrayRecursive(int** g_scanBlockSums, int *dZ, int *dX, int numElements, int level) {
                auto stream = LaunchContext::defaultContext()->getCudaStream();
                int blockSize = 512; // max size of the thread blocks
                int numBlocks = sd::math::nd4j_max<int>(1, static_cast<int>(ceil(static_cast<float>(numElements) / (2.f * blockSize))));
                int numThreads;
                if (numBlocks > 1)
                    numThreads = blockSize;
                else if (sd::isPowerOfTwo(numElements))
                    numThreads = numElements / 2;
                else
                    numThreads = sd::floorPow2(numElements);
                // every thread handles two elements
                int numEltsPerBlock = numThreads * 2;
                // if this is a non-power-of-2 array, the last block will be non-full
                // compute the smallest power of 2 able to compute its scan.
                int numEltsLastBlock =
                        numElements - (numBlocks-1) * numEltsPerBlock;
                int numThreadsLastBlock = sd::math::nd4j_max<int>(1, numEltsLastBlock / 2);
                int np2LastBlock = 0;
                int sharedMemLastBlock = 0;
                if (numEltsLastBlock != numEltsPerBlock) {
                    np2LastBlock = 1;
                    if(!isPowerOfTwo(numEltsLastBlock))
                        numThreadsLastBlock = floorPow2(numEltsLastBlock);
                    unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
                    sharedMemLastBlock = sizeof(int) * (2 * numThreadsLastBlock + extraSpace);
                }
                // padding space is used to avoid shared memory bank conflicts
                int extraSpace = numEltsPerBlock / NUM_BANKS;
                int sharedMemSize = sizeof(int) * (numEltsPerBlock + extraSpace);
                // setup execution parameters
                // if NP2, we process the last block separately
                dim3 grid(sd::math::nd4j_max<int>(1, numBlocks - np2LastBlock), 1, 1);
                dim3 threads(numThreads, 1, 1);
                dim3 gridOnes(1, 1, 1);
                dim3 threadsOnes(numThreadsLastBlock, 1, 1);
                // both shared memory sizes are raised to at least 2048 bytes
                if (sharedMemSize < 2048)
                    sharedMemSize = 2048;
                if (sharedMemLastBlock < 2048)
                    sharedMemLastBlock = 2048;
                // execute the scan
                if (numBlocks > 1) {
                    sd::prescanLauncher<true, false>(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0);
                    if (np2LastBlock) {
                        sd::prescanLauncher<true, true>(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
                    }
                    // After scanning all the sub-blocks, we are mostly done.  But now we
                    // need to take all of the last values of the sub-blocks and scan those.
                    // This will give us a new value that must be added to each block to
                    // get the final results.
                    // recursive (CPU) call
                    prescanArrayRecursive(g_scanBlockSums, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1);
                    sd::uniformAdd<<<grid, threads, 1024, *stream>>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0);
                    if (np2LastBlock) {
                        sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
                    }
                } else if (isPowerOfTwo(numElements)) {
                    sd::prescanLauncher<false, false>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0);
                } else {
                    sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
                }
                sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
            }
            /**
             * Second encoding phase: runs prescanArrayRecursive over `N` ints of `dx`
             * (starting at dx + 1, i.e. skipping the first element) and writes the result to `dz`.
             *
             * @param prs per-level scratch buffers, passed through as int**
             * @param dx  source buffer; element 0 is skipped
             *            (NOTE(review): presumably it holds the total match count - confirm against the P1 kernel)
             * @param N   number of elements to process
             * @param dz  destination buffer
             */
            static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
                auto stream = LaunchContext::defaultContext()->getCudaStream();
                prescanArrayRecursive(reinterpret_cast<int**>(prs), dz, dx + 1, (int) N, 0);
                sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
            }
            /**
             * Third encoding phase: launches encoderKernelP3Generic for the data type found
             * in `hXShapeInfo`, using one launch block per 512 elements (rounded up).
             *
             * @param dx          device buffer of the array being encoded
             * @param hXShapeInfo shape info used only to determine the data type
             * @param offsets     device buffer with the offsets computed by the previous phase
             * @param N           number of elements in `dx`
             * @param dz          device buffer receiving the encoded data
             */
            static void encodeThresholdP3_(void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
                auto stream = LaunchContext::defaultContext()->getCudaStream();
                int blockSize = 512;
                // ceil(N / blockSize)
                int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
                dim3 launchDims(numBlocks, blockSize, 8192);
                auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
                BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), FLOAT_TYPES);
                sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
            }
            /**
             * First encoding phase: launches encoderKernelP1Generic over `updates` and returns
             * the int array it fills. The array has (numBlocks + 1) elements, where numBlocks is
             * the number of 512-element chunks in `updates` (rounded up); callers read element 0
             * as the overall estimate.
             */
            static NDArray thresholdEstimate_(const NDArray &updates, const float threshold) {
                const int numThreads = 512;
                // ceil(length / numThreads)
                const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);

                auto tmp = NDArrayFactory::create<int>('c', {numBlocks + 1});

                dim3 launchDims(numBlocks, numThreads, 1024);
                auto xType = updates.dataType();

                NDArray::prepareSpecialUse({&tmp}, {&updates});
                BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, LaunchContext::defaultContext()->getCudaStream(), updates.getSpecialBuffer(), updates.lengthOf(), tmp.specialBuffer(), threshold), FLOAT_TYPES);
                NDArray::registerSpecialUse({&tmp}, {&updates});

                // returning the local by name lets the compiler elide the copy or move it implicitly;
                // an explicit std::move here would only inhibit copy elision
                return tmp;
            }
            /**
             * Returns element 0 of the array produced by thresholdEstimate_ for `updates` and `threshold`.
             */
            int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
                return thresholdEstimate_(updates, threshold).e<int>(0);
            }
            /**
             * CUDA threshold encoding of `updates` into `encoded`.
             *
             * Steps:
             *   1. thresholdEstimate_ produces the per-block counters ("blocks")
             *   2. scratch arrays for the multi-level scan are allocated
             *   3. encodeThresholdP2Int_ turns the counters into offsets
             *   4. encodeThresholdP3_ writes the encoded data using those offsets
             *
             * If the estimated number of matches exceeds the room available in `encoded`
             * (its length minus the 4 header ints), element 0 of the counters is capped.
             */
            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
                // we need these blocks in order to know, how many "updates" will be processed by each GPU block
                auto blocks = thresholdEstimate_(updates, threshold);

                const int numThreads = 512;
                const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);

                const int prefixThreads = 512;
                int numElts = numBlocks;
                int level = 0;

                // here we just calculate number of sumBlock arrays
                // (this is an upper bound: the loop below may fill fewer slots)
                do {
                    int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
                    if (numBlocks > 1) {
                        level++;
                    }
                    numElts = numPrefixBlocks;
                } while (numElts > 1);

                std::vector<NDArray> tempArrays(level);
                std::vector<Nd4jPointer> pointers(level);

                level = 0;
                numElts = numBlocks;

                do {
                    int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
                    if (numPrefixBlocks > 1) {
                        tempArrays[level] = NDArrayFactory::create<int>('c', {numPrefixBlocks});
                        // the increment is kept in its own statement: reading and modifying `level`
                        // within a single assignment expression would make the target index ill-defined
                        pointers[level] = tempArrays[level].specialBuffer();
                        level++;
                    }
                    numElts = numPrefixBlocks;
                } while (numElts > 1);

                PointersManager pm(LaunchContext::defaultContext(), "thresholdEncode");
                auto dptr = pm.replicatePointer(pointers.data(), pointers.size() * sizeof(Nd4jPointer));
                auto offsets = NDArrayFactory::create<int>('c', {numBlocks});

                // we want to check, if we're hitting external limit on number of encoded elements
                auto numMatches = blocks.e<int>(0);
                if (numMatches > encoded.lengthOf() - 4) {
                    blocks.p(0, encoded.lengthOf() - 4);
                    blocks.syncToDevice();
                }

                NDArray::prepareSpecialUse({}, {&encoded, &updates});

                // filling offsets
                encodeThresholdP2Int_(reinterpret_cast<void **>(dptr),
                                      reinterpret_cast<int*>(blocks.getSpecialBuffer()),
                                      numBlocks,
                                      reinterpret_cast<int*>(offsets.getSpecialBuffer()));

                NDArray::registerSpecialUse({&blocks, &offsets}, {});
                pm.synchronize();

                encodeThresholdP3_(updates.specialBuffer(),
                                   updates.shapeInfo(),
                                   reinterpret_cast<int*>(offsets.getSpecialBuffer()),
                                   updates.lengthOf(),
                                   reinterpret_cast<int*>(encoded.specialBuffer()));

                pm.synchronize();

                NDArray::registerSpecialUse({&encoded, &updates}, {});
            }
            /**
             * CUDA threshold decoding: launches decoderKernelGeneric for the data type of
             * `updates` on the default context's stream, reading the device buffer of
             * `encoded` and writing the device buffer of `updates`.
             */
            void thresholdDecode(const NDArray &encoded, NDArray &updates) {
                dim3 launchDims(128, 512, 512);
                auto xType = updates.dataType();
                NDArray::prepareSpecialUse({&updates}, {&encoded});
                BUILD_SINGLE_SELECTOR(xType, decoderKernelGeneric, (launchDims, LaunchContext::defaultContext()->getCudaStream(), encoded.getSpecialBuffer(), updates.lengthOf(), updates.specialBuffer()), FLOAT_TYPES);
                NDArray::registerSpecialUse({&updates}, {&encoded});
            }
        }
    }
 }
--- a/libnd4j/include/ops/declarable/helpers/threshold.h
+++ b/libnd4j/include/ops/declarable/helpers/threshold.h
@ -0,0 +1,37 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author raver119@gmail.com
 //
 #ifndef SD_THRESHOLD_H
 #define SD_THRESHOLD_H
 #include <ops/declarable/helpers/helpers.h>
 namespace sd {
    namespace ops {
        namespace helpers {
            /**
             * Estimates how many elements of `updates` will be encoded for the given `threshold`.
             * The encode_threshold shape function uses this value to size the encoded output.
             */
            int32_t thresholdEstimate(const NDArray &updates, float threshold);
            /**
             * Threshold-encodes `updates` into `encoded`.
             * The calling op fills the 4-int header of `encoded` before invoking this helper.
             */
            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold);
            /**
             * Decodes the threshold-encoded array `encoded` into `updates`.
             */
            void thresholdDecode(const NDArray &encoded, NDArray &updates);
        }
    }
 }
 #endif //SD_THRESHOLD_H
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@ -557,21 +557,26 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
        fb.i_ = x[2];
        float threshold = fb.f_;
        auto pPos = -1;
        auto func = PRAGMA_THREADS_FOR {
            for (auto e = start; e < stop; e++) {
                const auto v = x[e];
                for (int bitId = 0; bitId < 16; bitId++) {
-                    bool hasBit = (x[e] & 1 << (bitId)) != 0;
+                    bool hasBit = (v & 1 << (bitId)) != 0;
-                    bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
+                    bool hasSign = (v & 1 << (bitId + 16)) != 0;
                    auto cPos = (e - 4) * 16 + bitId;
                    if (hasBit) {
                        if (hasSign)
-                            dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold);
+                            dz[cPos] -= static_cast<T>(threshold);
                        else
-                            dz[(e - 4) * 16 + bitId] += static_cast<T>(threshold);
+                            dz[cPos] += static_cast<T>(threshold);
                    } else if (hasSign) {
-                        dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold / 2);
+                        dz[cPos] -= static_cast<T>(threshold / 2);
                    }
                    pPos = cPos;
                }
            }
        };
@ -582,17 +587,21 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
    template<typename T>
    Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
        auto dx = reinterpret_cast<T *>(vx);
        const T two(2.0f);
        const T zero(0.0f);
        const T t(threshold);
        const T thalf = t / two;
-//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
+        //auto func = PRAGMA_REDUCE_LONG {
        auto func = PRAGMA_REDUCE_LONG {
            Nd4jLong retVal = 0L;
-            for (auto x = start; x < stop; x += increment) {
+            PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:retVal)
            for (auto x = 0; x < N; x += 16) {
                int byte = 0;
                int byteId = x / 16 + 4;
                for (int f = 0; f < 16; f++) {
-                    Nd4jLong e = x + f;
+                    auto e = x + f;
                    if (e >= N)
                        continue;
@ -602,19 +611,19 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
                    int bitId = e % 16;
-                    if (abs >= (T) threshold) {
+                    if (abs >= t) {
                        byte |= 1 << (bitId);
                        retVal++;
-                        if (val < (T) 0.0f) {
+                        if (val < zero) {
                            byte |= 1 << (bitId + 16);
-                            dx[e] += static_cast<T>(threshold);
+                            dx[e] += t;
                        } else {
-                            dx[e] -= static_cast<T>(threshold);
+                            dx[e] -= t;
                        }
-                    } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) {
+                    } else if (abs >= thalf && val < zero) {
                        byte |= 1 << (bitId + 16);
-                        dx[e] += static_cast<T>(threshold / 2);
+                        dx[e] += thalf;
                        retVal++;
                    }
@ -624,8 +633,9 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
            }
            return retVal;
-        };
+        //};
-        return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
+
        //return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
    }
 }
--- a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu
@ -57,7 +57,7 @@ static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) {
    multiplyKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("multiply failed", err);
+        throw sd::cuda_exception::build("multiply failed", err);
 }
 template <typename T>
@ -80,7 +80,7 @@ static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) {
    sumKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("sum failed", err);
+        throw sd::cuda_exception::build("sum failed", err);
 }
 template <typename T>
@ -103,7 +103,7 @@ static void subLauncher(void *vbuffer, uint64_t length, void *vresult) {
    subKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("sub failed", err);
+        throw sd::cuda_exception::build("sub failed", err);
 }
 template <typename T>
@ -126,7 +126,7 @@ static void divLauncher(void *vbuffer, uint64_t length, void *vresult) {
    divKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("div failed", err);
+        throw sd::cuda_exception::build("div failed", err);
 }
 static void multiplyHost(NDArray &input, NDArray &output) {
--- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
@ -42,18 +42,18 @@ endif()
 # -fsanitize=address
 # -fsanitize=leak
 if (APPLE)
-    set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2 -D__APPLE_OS__=true")
+    set(CMAKE_CXX_FLAGS  " -fPIC -D__APPLE_OS__=true")
 elseif(WIN32)
    if (SD_CPU)
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
    endif()
 	if (SD_CPU AND LINUX)
-		set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2")
+		set(CMAKE_CXX_FLAGS  " -fPIC")
 	endif()
 else()
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-    set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2")
+    set(CMAKE_CXX_FLAGS  " -fPIC")
    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
    else()
@ -82,12 +82,12 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    # using GCC
-    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS}")
+    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -fmax-errors=2")
 endif()
-if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
-    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
+        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
    endif()
 endif()
 IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
@ -25,6 +25,7 @@
 #include <ops/ops.h>
 #include <helpers/GradCheck.h>
 #include <array>
 #include <helpers/RandomLauncher.h>
 using namespace sd;
@ -39,6 +40,195 @@ public:
    }
 };
 // Runs encode_threshold on {1.5, 2.5, -3.5} with threshold 0.5 and verifies both the encoded
 // buffer and the residual that is left behind in the input array.
 TEST_F(DeclarableOpsTests19, test_threshold_encode_1) {
    auto x = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
    // 1056964608 == 0x3F000000, i.e. the IEEE-754 bit pattern of 0.5f (the threshold passed below).
    // NOTE(review): layout looks like a 4-int header followed by signed 1-based element indices - confirm against the encoder.
    auto exp_encoded = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
    // expected residual: every input value moved towards zero by the threshold
    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
    sd::ops::encode_threshold op;
    auto result = op.evaluate({&x}, {0.5});
    // output 0 is only referenced by the disabled assertion at the end; output 1 is the encoded buffer
    auto gradients = result.at(0);
    auto encoded = result.at(1);
    //encoded->printIndexedBuffer("ENC");
    ASSERT_EQ(exp_encoded, *encoded);
    // the residual is checked on the input array itself, not on output 0
    ASSERT_EQ(exp_gradients, x);
    // FIXME: we need to add a way to declare individual inplace outputs
    //ASSERT_EQ(exp_gradients, *gradients);
 }
 // For every count in [5, 35) sets that many leading elements of a 10000-element array to 2e-3,
 // encodes with threshold 1e-3, and checks the encoded length and the residual.
 TEST_F(DeclarableOpsTests19, test_threshold_encode_2) {
    for (int length = 5; length < 35; length++) {
        // NOTE(review): both arrays are presumably zero-initialized by the factory - confirm
        auto x = NDArrayFactory::create<double>('c', {10000});
        auto exp_gradients = NDArrayFactory::create<double>('c', {10000});
        for (int e = 0; e < length; e++) {
            x.p(e, 2e-3);
            // expected residual after one threshold has been subtracted
            exp_gradients.p(e, 1e-3);
        }
        sd::ops::encode_threshold op;
        auto result = op.evaluate({&x}, {1e-3});
        auto encoded = result.at(1);
        // one entry per modified element plus 4 additional ints
        ASSERT_EQ(length + 4, encoded->lengthOf());
        // the input array itself must now hold the residual
        ASSERT_EQ(exp_gradients, x);
    }
 }
 // Encodes six 1.0f values with threshold 1.0 and an integer argument of 3, and expects that only
 // three elements get encoded.
 TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_1) {
    auto x = NDArrayFactory::create<float>('c', {6});
    x = 1.0f;
    sd::ops::encode_threshold op;
    // NOTE(review): {3} is presumably the boundary (max number of encoded elements), per the test name - confirm
    auto result = op.evaluate({&x}, {1.0}, {3});
    // output 0 is fetched but not checked by this test
    auto gradients = result.at(0);
    auto encoded = result.at(1);
    // 3 encoded entries + 4 additional ints
    ASSERT_EQ(7, encoded->lengthOf());
    // the input still sums to 3, i.e. three of the six 1.0f values were left untouched
    ASSERT_EQ(3, x.sumNumber().e<int>(0));
 }
 // Same scenario as the 6-element boundary test, scaled up: 1000 values of 1.0f, threshold 1.0,
 // integer argument 100.
 TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_2) {
    auto x = NDArrayFactory::create<float>('c', {1000});
    x = 1.0f;
    sd::ops::encode_threshold op;
    // NOTE(review): {100} is presumably the boundary (max number of encoded elements), per the test name - confirm
    auto result = op.evaluate({&x}, {1.0}, {100});
    // output 0 is fetched but not checked by this test
    auto gradients = result.at(0);
    auto encoded = result.at(1);
    // 100 encoded entries + 4 additional ints
    ASSERT_EQ(104, encoded->lengthOf());
    // the input still sums to 900, i.e. 900 of the 1000 values were left untouched
    ASSERT_EQ(900, x.sumNumber().e<int>(0));
 }
 // Applies a threshold-encoded buffer to the residual {1.0, 2.0, -3.0} and expects
 // {1.5, 2.5, -3.5} as the result.
 TEST_F(DeclarableOpsTests19, test_threshold_decode_1) {
    auto x = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
    // 1056964608 == 0x3F000000, i.e. the IEEE-754 bit pattern of 0.5f
    auto y = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
    sd::ops::decode_threshold op;
    // x is passed both as the first input and as the output, so it is updated in place
    auto status = op.execute({&x, &y}, {&x});
    ASSERT_EQ(Status::OK(), status);
    ASSERT_EQ(exp_gradients, x);
 }
 // Round trip on a small array: bitmap-encodes {0, 0, 1e-3, -1e-3, 0, 0} with threshold 1e-3,
 // checks the residual and the counter, then decodes back into the same array.
 TEST_F(DeclarableOpsTests19, test_bitmap_encode_1) {
    auto initial = NDArrayFactory::create<float>('c', {6}, {0.0f, 0.0f, 1e-3f, -1e-3f, 0.0f, 0.0f});
    // expected residual after encoding
    // NOTE(review): like() presumably yields a zero-filled array of the same shape/type - confirm
    auto exp_0 = initial.like();
    // expected array after decoding: a copy of the original values
    auto exp_1 = initial.dup();
    // expected counter value: 2
    auto exp_c = NDArrayFactory::create<int>(2L);
    sd::ops::encode_bitmap enc;
    auto enc_result = enc.evaluate({&initial}, {1e-3f});
    ASSERT_EQ(Status::OK(), enc_result.status());
    //initial.printIndexedBuffer("initial");
    // the residual is checked on the input array itself
    ASSERT_EQ(exp_0, initial);
    // output 1 is the encoded buffer, output 2 is the counter
    auto encoded = enc_result.at(1);
    auto counter = enc_result.at(2);
    //encoded->printIndexedBuffer("encoded");
    ASSERT_EQ(exp_c, *counter);
    sd::ops::decode_bitmap dec;
    // initial is passed both as the first input and as the output, so it is updated in place
    auto status = dec.execute({&initial, encoded}, {&initial});
    ASSERT_EQ(Status::OK(), status);
    //initial.printIndexedBuffer();
    ASSERT_EQ(exp_1, initial);
 }
 // Large round trip: bitmap-encodes 256000 values of 1.0f with threshold 0.5, expects a residual
 // of 0.5 everywhere, then decodes and expects the original values back.
 TEST_F(DeclarableOpsTests19, test_bitmap_encode_decode) {
    auto initial = NDArrayFactory::create<float>('c', {256000});
    initial = 1.0f;
    // expected array after decoding: a copy of the original values
    auto exp = initial.dup();
    // expected residual after encoding: 0.5 everywhere
    auto neg = initial.like();
    neg = 0.5f;
    sd::ops::encode_bitmap enc;
    auto enc_result = enc.evaluate({&initial}, {0.5f});
    auto encoded = enc_result.at(1);
    // checking equality of all encoded bits
    // (diagnostic only: prints encoded words that differ from their predecessor, asserts nothing;
    // starts at index 5 and leaves out the last word)
    for (int e = 5; e < encoded->lengthOf() - 1; e++) {
        if (encoded->e<int>(e) != encoded->e<int>(e - 1))
            nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
    }
    // encoding must have modified the input array, leaving exactly the expected residual
    ASSERT_NE(exp, initial);
    ASSERT_EQ(neg, initial);
    sd::ops::decode_bitmap dec;
    // initial is passed both as the first input and as the output, so it is updated in place
    auto status = dec.execute({&initial, encoded}, {&initial});
    ASSERT_EQ(Status::OK(), status);
    // checking equality of all decoded bits
    // (diagnostic only: prints every element that is not 1.0f before the final assertion)
    for (int e = 0; e < initial.lengthOf(); e++) {
        auto f = initial.e<float>(e);
        if (f != 1.0f)
            nd4j_printf("initial[%i] = %f\n", e, f);
    }
    ASSERT_EQ(exp, initial);
 }
 // Large round trip: threshold-encodes 256000 values of 1.0f with threshold 0.5, expects a residual
 // of 0.5 everywhere, then decodes and expects the original values back.
 TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) {
    auto initial = NDArrayFactory::create<float>('c', {256000});
    initial = 1.0f;
    // expected array after decoding: a copy of the original values
    auto exp = initial.dup();
    // expected residual after encoding: 0.5 everywhere
    auto neg = initial.like();
    neg = 0.5f;
    sd::ops::encode_threshold enc;
    auto enc_result = enc.evaluate({&initial}, {0.5f});
    auto encoded = enc_result.at(1);
    // every element is expected to be encoded: one entry per element plus 4 additional ints
    ASSERT_EQ(256000 + 4, encoded->lengthOf());
    ASSERT_NE(exp, initial);
    // report the first element whose residual is not 0.5f and abort the test by throwing
    for (int e = 0; e < initial.lengthOf(); e++) {
        auto f = initial.e<float>(e);
        if (f != 0.5f) {
            nd4j_printf("initial[%i] = %f\n", e, f);
            throw std::runtime_error("");
        }
    }
    ASSERT_EQ(neg, initial);
    // checking equality of all encoded bits
    //for (int e = 5; e < encoded->lengthOf() - 1; e++) {
        //if (encoded->e<int>(e) != encoded->e<int>(e - 1) + 1)
            //nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
    //}
    sd::ops::decode_threshold dec;
    // initial is passed both as the first input and as the output, so it is updated in place
    auto status = dec.execute({&initial, encoded}, {&initial});
    ASSERT_EQ(Status::OK(), status);
    // checking equality of all decoded bits
    // (diagnostic only: prints every element that is not 1.0f before the final assertion)
    for (int e = 0; e < initial.lengthOf(); e++) {
        auto f = initial.e<float>(e);
        if (f != 1.0f)
            nd4j_printf("initial[%i] = %f\n", e, f);
    }
    ASSERT_EQ(exp, initial);
 }
 TEST_F(DeclarableOpsTests19, test_matmul_ccc) {
    auto x = NDArrayFactory::create<float>('c', {10, 10});
    auto y = NDArrayFactory::create<float>('c', {10, 10});
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
@ -73,6 +73,10 @@ public class ImportClassMapping {
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastRSubOp.class,
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp.class,
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastTo.class,
            org.nd4j.linalg.api.ops.compression.EncodeBitmap.class,
            org.nd4j.linalg.api.ops.compression.DecodeBitmap.class,
            org.nd4j.linalg.api.ops.compression.EncodeThreshold.class,
            org.nd4j.linalg.api.ops.compression.DecodeThreshold.class,
            org.nd4j.linalg.api.ops.impl.shape.Create.class,
            org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastEqualTo.class,
            org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastGreaterThan.class,
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java
@ -0,0 +1,55 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.api.ops.compression;
 import lombok.NonNull;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.DynamicCustomOp;
 import org.nd4j.linalg.factory.Nd4j;
 import java.util.Arrays;
 import java.util.List;
 /**
 * Bitmap decoding op wrapper. Used in gradients sharing.
 *
 * Wraps the custom op named "decode_bitmap". The updates array is registered both as an input and
 * as the output, and the op is flagged as in-place.
 *
 * @author raver119@gmail.com
 */
 public class DecodeBitmap extends DynamicCustomOp {
    /**
     * No-arg constructor; registers no input or output arrays.
     */
    public DecodeBitmap() {
        //
    }
    /**
     * Creates a decoding op that operates on {@code updates} in place.
     *
     * @param encoded bitmap-encoded buffer; registered as the second op input
     * @param updates array to be modified; registered as the first op input and as the only output
     */
    public DecodeBitmap(@NonNull INDArray encoded, @NonNull INDArray updates) {
        // note the order: the op takes (updates, encoded), the reverse of the constructor parameters
        addInputArgument(updates, encoded);
        addOutputArgument(updates);
        // this op ALWAYS modifies updates array
        setInPlace(true);
    }
    /**
     * @return the native op name, "decode_bitmap"
     */
    @Override
    public String opName() {
        return "decode_bitmap";
    }
    /**
     * Reports the data type of the first registered input array, followed by INT32.
     * The {@code dataTypes} argument is not consulted.
     *
     * NOTE(review): the constructor registers a single output (updates) but two types are returned
     * here - confirm whether the trailing INT32 entry is intended.
     *
     * @param dataTypes input data types (unused)
     * @return list holding the first input's data type and INT32
     */
    @Override
    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
    }
 }
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java
@ -0,0 +1,54 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.api.ops.compression;
 import lombok.NonNull;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.DynamicCustomOp;
 import java.util.Arrays;
 import java.util.List;
 /**
 * Sparse threshold decoding op wrapper. Used in gradients sharing.
 *
 * Wraps the custom op named "decode_threshold". The updates array is registered both as an input
 * and as the output, and the op is flagged as in-place.
 *
 * @author raver119@gmail.com
 */
 public class DecodeThreshold extends DynamicCustomOp {
    /**
     * No-arg constructor; registers no input or output arrays.
     */
    public DecodeThreshold() {
        //
    }
    /**
     * Creates a decoding op that operates on {@code updates} in place.
     *
     * @param encoded threshold-encoded buffer; registered as the second op input
     * @param updates array to be modified; registered as the first op input and as the only output
     */
    public DecodeThreshold(@NonNull INDArray encoded, @NonNull INDArray updates) {
        // note the order: the op takes (updates, encoded), the reverse of the constructor parameters
        addInputArgument(updates, encoded);
        addOutputArgument(updates);
        // this op ALWAYS modifies updates array
        setInPlace(true);
    }
    /**
     * @return the native op name, "decode_threshold"
     */
    @Override
    public String opName() {
        return "decode_threshold";
    }
    /**
     * Reports the data type of the first registered input array, followed by INT32.
     * The {@code dataTypes} argument is not consulted.
     *
     * NOTE(review): the constructor registers a single output (updates) but two types are returned
     * here - confirm whether the trailing INT32 entry is intended.
     *
     * @param dataTypes input data types (unused)
     * @return list holding the first input's data type and INT32
     */
    @Override
    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
    }
 }
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java
@ -0,0 +1,64 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.api.ops.compression;
 import lombok.NonNull;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.DynamicCustomOp;
 import org.nd4j.linalg.factory.Nd4j;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 /**
 * Bitmap encoding op wrapper. Used in gradients sharing.
 *
 * Wraps the custom op named "encode_bitmap". The updates array is registered as the input and as
 * the first of three outputs (updates, encoded buffer, counter), and the op is flagged as in-place.
 *
 * @author raver119@gmail.com
 */
 public class EncodeBitmap extends DynamicCustomOp {
    // encoding threshold; passed to the op as its floating point argument
    protected float threshold = 1e-3f;
    /**
     * No-arg constructor; registers no arrays or arguments and leaves the threshold at 1e-3.
     */
    public EncodeBitmap() {
        //
    }
    /**
     * Convenience constructor that allocates the encoded buffer and the counter itself:
     * an INT32 array of {@code updates.length() / 16 + 5} elements and an INT32 scalar set to 0.
     *
     * @param updates   array to encode; modified in place
     * @param threshold encoding threshold
     */
    public EncodeBitmap(@NonNull INDArray updates, float threshold) {
        this(updates, Nd4j.create(DataType.INT32, updates.length() / 16 + 5), Nd4j.scalar(DataType.INT32, 0), threshold);
    }
    /**
     * Creates an encoding op with caller-provided output arrays.
     *
     * @param updates   array to encode; registered as the only input and as the first output
     * @param encoded   target for the encoded data; registered as the second output
     * @param counter   registered as the third output
     * @param threshold encoding threshold; registered as the op's floating point argument
     */
    public EncodeBitmap(@NonNull INDArray updates, @NonNull INDArray encoded, @NonNull INDArray counter, float threshold) {
        addInputArgument(updates);
        addOutputArgument(updates, encoded, counter);
        addTArgument(threshold);
        this.threshold = threshold;
        // this op ALWAYS modifies updates array
        setInPlace(true);
    }
    /**
     * @return the native op name, "encode_bitmap"
     */
    @Override
    public String opName() {
        return "encode_bitmap";
    }
    /**
     * Reports one type per output: the data type of the first registered input array, then INT32
     * twice. The {@code dataTypes} argument is not consulted.
     *
     * @param dataTypes input data types (unused)
     * @return list holding the first input's data type, INT32 and INT32
     */
    @Override
    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32, DataType.INT32);
    }
 }
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java
@ -0,0 +1,63 @@
 /*******************************************************************************
 * Copyright (c) 2020 Konduit K.K.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.api.ops.compression;
 import lombok.NonNull;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.DynamicCustomOp;
 import org.nd4j.linalg.factory.Nd4j;
 import java.util.Arrays;
 import java.util.List;
 /**
 * Sparse threshold encoding op wrapper. Used in gradients sharing.
 *
 * Wraps the custom op named "encode_threshold". Only the input array and the op arguments are
 * registered here; no output arrays are supplied by this class.
 *
 * @author raver119@gmail.com
 */
 public class EncodeThreshold extends DynamicCustomOp {
    // encoding threshold; passed to the op as its floating point argument
    protected float threshold = 1e-3f;
    // passed to the op as its integer argument; Integer.MAX_VALUE unless a boundary is given
    protected int boundary = Integer.MAX_VALUE;
    /**
     * No-arg constructor; registers no arrays or arguments and leaves the fields at their defaults.
     */
    public EncodeThreshold() {
        //
    }
    /**
     * Creates an encoding op with the boundary set to {@link Integer#MAX_VALUE}.
     *
     * @param updates   array to encode
     * @param threshold encoding threshold
     */
    public EncodeThreshold(@NonNull INDArray updates, float threshold) {
        this(updates, threshold, Integer.MAX_VALUE);
    }
    /**
     * Creates an encoding op with an explicit boundary.
     *
     * @param updates   array to encode; registered as the only input
     * @param threshold encoding threshold; registered as the op's floating point argument
     * @param boundary  registered as the op's integer argument; must not be null
     */
    public EncodeThreshold(@NonNull INDArray updates, float threshold, @NonNull Integer boundary) {
        addInputArgument(updates);
        addTArgument(threshold);
        addIArgument(boundary.intValue());
        this.threshold = threshold;
        this.boundary = boundary;
    }
    /**
     * @return the native op name, "encode_threshold"
     */
    @Override
    public String opName() {
        return "encode_threshold";
    }
    /**
     * Reports the data type of the first registered input array, followed by INT32.
     * The {@code dataTypes} argument is not consulted.
     *
     * @param dataTypes input data types (unused)
     * @return list holding the first input's data type and INT32
     */
    @Override
    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
    }
 }
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java
@ -30,6 +30,10 @@ import org.nd4j.linalg.api.ndarray.INDArrayStatistics;
 import org.nd4j.linalg.api.ops.*;
 import org.nd4j.linalg.api.ops.aggregates.Aggregate;
 import org.nd4j.linalg.api.ops.aggregates.Batch;
 import org.nd4j.linalg.api.ops.compression.DecodeBitmap;
 import org.nd4j.linalg.api.ops.compression.DecodeThreshold;
 import org.nd4j.linalg.api.ops.compression.EncodeBitmap;
 import org.nd4j.linalg.api.ops.compression.EncodeThreshold;
 import org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate;
 import org.nd4j.linalg.api.ops.impl.summarystats.Variance;
 import org.nd4j.linalg.api.rng.Random;
@ -685,38 +689,41 @@ public abstract class DefaultOpExecutioner implements OpExecutioner {
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        return thresholdEncode(input, threshold, Integer.MAX_VALUE);
    }
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        val result = Nd4j.exec(new EncodeThreshold(input, (float) threshold, boundary))[1];
        return result.getInt(0) > 0 ? result : null;
    }
    @Override
    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        Nd4j.exec(new DecodeThreshold(encoded, target));
        return target;
    }
    @Override
    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        val results = Nd4j.exec(new EncodeBitmap(indArray, target, Nd4j.scalar(0), (float) threshold));
        // return number of elements that were compressed
        return results[2].getInt(0);
    }
    @Override
    public INDArray bitmapEncode(INDArray indArray, double threshold) {
-        DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(indArray.length() / 16 + 5);
+        // allocate the INT32 target (length / 16 + 5 elements) and delegate to the 3-argument overload
+        val array = Nd4j.create(DataType.INT32, indArray.length() / 16 + 5);
-
+        bitmapEncode(indArray, array, threshold);
-        INDArray ret = Nd4j.createArrayFromShapeBuffer(buffer, indArray.shapeInfoDataBuffer());
+        return array;
-        bitmapEncode(indArray, ret, threshold);
-        return ret;
    }
    @Override
    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        Nd4j.exec(new DecodeBitmap(encoded, target));
        return target;
    }
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java
@ -1004,20 +1004,6 @@ public interface NativeOps {
                                @Cast("Nd4jLong *") LongPointer tadShapeInfo,
                                @Cast("Nd4jLong *") LongPointer tadOffsets);
    long encodeBitmap(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
    void decodeBitmap(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
    void encodeThresholdP1(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
    void encodeThresholdP2Int(PointerPointer extraPointers, IntPointer dx, long N, IntPointer dz);
    void encodeThresholdP3(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, IntPointer offsets, long N, IntPointer dz);
    void decodeThreshold(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
    void sort(PointerPointer extraPointers,
                              Pointer x, @Cast("Nd4jLong *") LongPointer xShapeInfo,
                              Pointer dx, @Cast("Nd4jLong *") LongPointer dxShapeInfo,
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java
@ -1,100 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.jcublas.compression;
 import org.bytedeco.javacpp.IntPointer;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.buffer.DataTypeEx;
 import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.compression.CompressedDataBuffer;
 import org.nd4j.linalg.compression.CompressionDescriptor;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.indexing.conditions.Conditions;
 /**
 * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
 * It encodes data as delta between zero and abs threshold.
 *
 * Unlike CudaThreshold codec, CudaFlexibleThreshold tries to target specified sparsity/density updates ratio via topN approach
 *
 * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
 *
 * @author raver119@gmail.com
 */
 public class CudaFlexibleThreshold extends CudaThreshold {
    /**
     * Creates the compressor and overrides the inherited threshold with 0.1.
     */
    public CudaFlexibleThreshold() {
        super();
        this.threshold = 0.1f;
    }
    /**
     * This method returns compression descriptor. It should be unique for any compressor implementation
     *
     * @return the descriptor string "FTHRESHOLD"
     */
    @Override
    public String getDescriptor() {
        return "FTHRESHOLD";
    }
    /**
     * This method allows you to configure desired sparsity/density ratio for updates. Pass it as float/double value
     *
     * Default value: 0.1
     * @param vars configuration values; forwarded unchanged to the superclass implementation
     */
    @Override
    public void configure(Object... vars) {
        super.configure(vars);
    }
    /**
     * Compresses the given buffer into a new {@link CompressedDataBuffer}.
     *
     * The cut-off is derived from the data: elements are selected when their absolute value is at
     * least {@code max - max * threshold}, where {@code max} is the largest absolute value in the buffer.
     *
     * @param buffer buffer to compress
     * @return compressed buffer backed by an int pointer of {@code cntAbs + 3} elements
     */
    @Override
    public DataBuffer compress(DataBuffer buffer) {
        // view the buffer as a {1, length} array so that array-level reductions can be used
        INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new long[]{1, buffer.length()}, buffer.dataType()));
        double max = temp.amaxNumber().doubleValue();
        // NOTE(review): presumably the number of elements matching the condition - confirm scan() semantics
        int cntAbs = temp.scan(Conditions.absGreaterThanOrEqual(max - (max * threshold))).intValue();
        // original size in bytes
        long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
        int compressedLength = cntAbs + 3;
        // first 3 elements contain header
        IntPointer pointer = new IntPointer(compressedLength);
        pointer.put(0, cntAbs);
        pointer.put(1, (int) buffer.length());
        pointer.put(2, Float.floatToIntBits(threshold)); // please note, this value will be overwritten anyway
        CompressionDescriptor descriptor = new CompressionDescriptor();
        descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
        descriptor.setOriginalLength(originalLength);
        descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
        descriptor.setNumberOfElements(buffer.length());
        descriptor.setCompressionAlgorithm(getDescriptor());
        descriptor.setCompressionType(getCompressionType());
        CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
        // NOTE(review): presumably the native conversion that fills the entries after the header - confirm
        Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.FTHRESHOLD, pointer, buffer.length());
        // the source buffer is tagged as located on the host after the conversion
        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
        return cbuff;
    }
 }
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java
@ -1,271 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 package org.nd4j.linalg.jcublas.compression;
 import lombok.Getter;
 import lombok.Setter;
 import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.apache.commons.math3.util.FastMath;
 import org.bytedeco.javacpp.*;
 import org.nd4j.compression.impl.AbstractCompressor;
 import org.nd4j.jita.allocator.impl.AtomicAllocator;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.buffer.DataTypeEx;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.compression.CompressedDataBuffer;
 import org.nd4j.linalg.compression.CompressionType;
 import org.nd4j.linalg.exception.ND4JIllegalStateException;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.jcublas.context.CudaContext;
 import org.nd4j.nativeblas.NativeOpsHolder;
 import java.util.ArrayList;
 import java.util.List;
 /**
 * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
 * It encodes data as delta between zero and abs threshold.
 *
 * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
 *
 * @author raver119@gmail.com
 */
@Slf4j
 public class CudaThreshold extends AbstractCompressor {
    /** Threshold used for delta extraction. {@link #configure(Object...)} always stores the absolute value. */
    @Getter @Setter protected float threshold = 1e-3f;

    /**
     * This method returns compression descriptor. It should be unique for any compressor implementation
     *
     * @return the constant descriptor string {@code "THRESHOLD"}
     */
    @Override
    public String getDescriptor() {
        return "THRESHOLD";
    }

    /**
     * This method allows you to configure threshold for delta extraction. Pass it as float/double value.
     * Only the first argument is inspected; its absolute value becomes the new threshold.
     *
     * Default value: 1e-3
     *
     * @param vars configuration arguments; {@code vars[0]} must be a {@link Number}
     * @throws ND4JIllegalStateException if no argument was passed or the first argument is not a {@link Number}
     */
    @Override
    public void configure(Object... vars) {
        // an empty or null varargs array used to escape as an index/null-pointer error; report it the same way
        // as any other non-numeric argument instead
        if (vars == null || vars.length == 0 || !(vars[0] instanceof Number))
            throw new ND4JIllegalStateException("Threshold value should be Number");

        threshold = FastMath.abs(((Number) vars[0]).floatValue());
        log.info("Setting threshold to [{}]", threshold);
    }

    /**
     * Compresses the data buffer of the given array and wraps the result into a new array that reuses the
     * shape information of the source and is marked as compressed.
     *
     * @param array array to compress
     * @return compressed array, or {@code null} if {@link #compress(DataBuffer)} returned {@code null}
     */
    @Override
    public INDArray compress(INDArray array) {
        // commit the executioner before touching the underlying buffer (presumably flushes pending ops)
        Nd4j.getExecutioner().commit();

        DataBuffer buffer = compress(array.data());
        if (buffer == null)
            return null;

        INDArray dup = Nd4j.createArrayFromShapeBuffer(buffer, array.shapeInfoDataBuffer());
        dup.markAsCompressed(true);
        return dup;
    }

    /**
     * @return always {@link CompressionType#LOSSLESS}
     */
    @Override
    public CompressionType getCompressionType() {
        return CompressionType.LOSSLESS;
    }

    /**
     * Allocates a buffer of the requested type sized from the encoded header (element 1 = original length).
     *
     * NOTE(review): the native decode call is commented out in this revision, so the returned buffer is only
     * allocated and ticked as device-written; nothing visible here fills it with decoded values.
     *
     * @param buffer encoded buffer; must have {@link DataType#INT} data type
     * @param type   data type of the buffer to create
     * @return newly created buffer with the original length stored in the header
     * @throws UnsupportedOperationException if the encoded buffer is not of INT data type
     */
    @Override
    public DataBuffer decompress(DataBuffer buffer, DataType type) {
        if (buffer.dataType() != DataType.INT)
            throw new UnsupportedOperationException();

        // header: [0] = number of encoded elements, [1] = length of the original buffer
        long compressedLength = buffer.getInt(0);
        long originalLength = buffer.getInt(1);

        DataBuffer result = Nd4j.createBuffer(type, originalLength, false);

        val context = AtomicAllocator.getInstance().getDeviceContext();
        PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());

        // disabled native call, kept for reference:
        //NativeOpsHolder.getInstance().getDeviceNativeOps().decodeThresholdFloat(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, (FloatPointer) AtomicAllocator.getInstance().getPointer(result));

        AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();

        return result;
    }

    /**
     * Builds the threshold-encoded INT buffer for the given data buffer.
     *
     * The encoded buffer has a 3-element header: [0] number of matches, [1] original length,
     * [2] raw int bits of the float threshold.
     *
     * NOTE(review): the P1 (per-block counting) and P3 (final encoding) native calls are commented out in this
     * revision; only the P2 step is actually invoked. With P1 disabled, the match count is read from a buffer
     * that nothing visible here has written to.
     *
     * @param buffer source buffer
     * @return encoded INT buffer
     */
    @Override
    public DataBuffer compress(DataBuffer buffer) {
        int numThreads = 1024;
        // ceil(length / numThreads)
        int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));

        val context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext();
        // all temporary buffers are attached to the current workspace when there is one
        val workspace = Nd4j.getMemoryManager().getCurrentWorkspace();

        DataBuffer blocksBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, workspace);
        PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());

        // disabled native call, kept for reference:
        //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), threshold);

        AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();

        // element 0 of the blocks buffer carries the total number of matches
        int numMatches = blocksBuffer.getInt(0);

        DataBuffer encodedBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(3+numMatches, false) : Nd4j.getDataBufferFactory().createInt(3+numMatches, false, workspace);
        encodedBuffer.put(0, numMatches);
        encodedBuffer.put(1, (int) buffer.length());
        encodedBuffer.put(2, Float.floatToIntBits(threshold));
        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickHostWrite();

        int prefixThreads = 512;
        int numElts = numBlocks;
        int level = 0;
        // holds references to the per-level buffers whose raw addresses are handed to native code
        List<DataBuffer> buffers = new ArrayList<>();

        // here we just calculate number of sumBlock arrays
        // NOTE(review): the condition tests numBlocks rather than numPrefixBlocks, unlike the allocation loop
        // below, so `level` may be larger than the number of buffers actually allocated. Left as-is.
        do {
            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
            if (numBlocks > 1) {
                level++;
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        long[] pointers = new long[level];
        level = 0;
        numElts = numBlocks;

        //  allocating temp buffers for prefix sum
        DataBuffer tempX = workspace == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, workspace);

        do {
            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
            if (numPrefixBlocks > 1) {
                DataBuffer bf = workspace == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, workspace);
                buffers.add(bf);
                pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        // the address table is copied into tempX (8 bytes per entry) and passed to native code via extras[2]
        AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
        extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));

        DataBuffer offsetsBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, workspace);
        NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
        AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();

        // disabled native call, kept for reference:
        //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3Float(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));

        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
        AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();

        // results are unused; presumably these calls only keep both objects reachable until here - TODO confirm
        extras.address();
        tempX.address();

        return encodedBuffer;
    }

    /**
     * Pointer-based compression is not supported by this compressor.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    protected CompressedDataBuffer compressPointer(DataTypeEx srcType, Pointer srcPointer, int length, int elementSize) {
        throw new UnsupportedOperationException();
    }
 }
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java
@ -31,6 +31,7 @@ import org.nd4j.jita.allocator.tad.DeviceTADManager;
 import org.nd4j.jita.conf.CudaEnvironment;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.environment.Nd4jEnvironment;
 import org.nd4j.linalg.api.memory.pointers.PagedPointer;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -1674,224 +1675,6 @@ public class CudaExecutioner extends DefaultOpExecutioner {
        ctx.syncSpecialStream();
    }
    /**
     * Threshold-encodes {@code input} on the device in three native passes (P1, P2, P3).
     *
     * The resulting INT buffer starts with a 4-element header: [0] number of encoded elements,
     * [1] length of the source buffer, [2] raw int bits of the float threshold, [3] the
     * {@code ThresholdCompression.FLEXIBLE_ENCODING} format id.
     *
     * @param input     array to encode
     * @param threshold threshold passed to the native encoder (narrowed to float)
     * @param boundary  optional upper limit for the number of encoded elements; {@code null} means no limit
     * @return array wrapping the encoded buffer (with the shape info of {@code input}), or {@code null} when
     *         fewer than 2 matching elements were reported
     * @throws RuntimeException if any of the native passes reports an error
     */
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
        DataBuffer buffer = input.data();

        int numThreads = 1024;
        // ceil(length / numThreads)
        int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));

        val context = AtomicAllocator.getInstance().getDeviceContext();
        // all temporary buffers are attached to the current workspace when there is one
        val workspace = Nd4j.getMemoryManager().getCurrentWorkspace();
        val deviceOps = NativeOpsHolder.getInstance().getDeviceNativeOps();

        DataBuffer blocksBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, workspace);

        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));

        val extras = extraz.get().put(1, context.getOldStream());

        ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();

        // pass 1: fills blocksBuffer; element 0 is read back below as the total number of matches
        deviceOps.encodeThresholdP1(extras,
                AtomicAllocator.getInstance().getPointer(buffer),
                (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()),
                buffer.length(),
                (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer),
                (float) threshold);

        // same error handling as the decode/bitmap methods of this class
        if (deviceOps.lastErrorCode() != 0)
            throw new RuntimeException(deviceOps.lastErrorMessage());

        AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();

        int numMatches = blocksBuffer.getInt(0);

        // special case here, nothing to update
        if (numMatches < 2)
            return null;

        if (boundary != null && numMatches > boundary)  {
            numMatches = boundary;
            blocksBuffer.put(0, numMatches);
        }

        DataBuffer encodedBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(4+numMatches, false) : Nd4j.getDataBufferFactory().createInt(4+numMatches, false, workspace);

        encodedBuffer.put(0, numMatches);
        encodedBuffer.put(1, (int) buffer.length());
        encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
        encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);

        ((BaseCudaDataBuffer) encodedBuffer).getOpaqueDataBuffer().syncToSpecial();

        int prefixThreads = 512;
        int numElts = numBlocks;
        int level = 0;
        // holds references to the per-level buffers whose raw addresses are handed to native code
        List<DataBuffer> buffers = new ArrayList<>();

        // here we just calculate number of sumBlock arrays
        // NOTE(review): the condition tests numBlocks rather than numPrefixBlocks, unlike the allocation loop
        // below, so `level` may be larger than the number of buffers actually allocated. Left as-is.
        do {
            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
            if (numBlocks > 1) {
                level++;
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        long[] pointers = new long[level];

        level = 0;
        numElts = numBlocks;

        //  allocating temp buffers for prefix sum
        DataBuffer tempX = workspace == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, workspace);

        do {
            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
            if (numPrefixBlocks > 1) {
                DataBuffer bf = workspace == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, workspace);

                buffers.add(bf);

                pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        // the address table is copied into tempX (8 bytes per entry) and passed to native code via extras[2]
        AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);

        extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));

        DataBuffer offsetsBuffer = workspace == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, workspace);

        // pass 2: blocksBuffer -> offsetsBuffer
        deviceOps.encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );

        if (deviceOps.lastErrorCode() != 0)
            throw new RuntimeException(deviceOps.lastErrorMessage());

        AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();

        // pass 3: writes the encoded payload into encodedBuffer using the offsets from pass 2
        deviceOps.encodeThresholdP3(extras, AtomicAllocator.getInstance().getPointer(buffer), (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));

        if (deviceOps.lastErrorCode() != 0)
            throw new RuntimeException(deviceOps.lastErrorMessage());

        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
        // the source buffer is also marked as written on device (presumably the kernel updates it in place)
        AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();

        return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
    }
    /**
     * Threshold-encodes {@code input} without limiting the number of encoded elements.
     *
     * @param input     array to encode
     * @param threshold threshold passed to the encoder
     * @return whatever the three-argument overload returns for a {@code null} boundary
     */
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold) {
        // a null boundary disables the cap on the number of encoded elements
        final Integer noBoundary = null;
        return thresholdEncode(input, threshold, noBoundary);
    }
    /**
     * Decodes a threshold-encoded INT array into {@code target} on the device.
     *
     * @param encoded encoded array; its buffer must be of INT data type, with the number of encoded elements
     *                at index 0 and the original length at index 1
     * @param target  array receiving the decoded values; its length must equal the stored original length
     * @return {@code target}
     * @throws UnsupportedOperationException if the encoded buffer is not of INT data type
     * @throws ND4JIllegalStateException     if the stored original length does not match the target length
     * @throws RuntimeException              if the native call reports an error
     */
    @Override
    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
        DataBuffer buffer = encoded.data();

        // same exception type as before, but now with a message matching the CPU backend
        if (buffer.dataType() != DataType.INT)
            throw new UnsupportedOperationException("thresholdEncoded array should have dataType of INT");

        long compressedLength = buffer.getInt(0);
        long originalLength = buffer.getInt(1);

        if (target.length() != originalLength)
            throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");

        DataBuffer result = target.data();

        val context = AtomicAllocator.getInstance().getDeviceContext();

        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));

        PointerPointer extras = extraz.get().put(1, context.getOldStream());

        nativeOps.decodeThreshold(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, AtomicAllocator.getInstance().getPointer(result), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));

        if (nativeOps.lastErrorCode() != 0)
            throw new RuntimeException(nativeOps.lastErrorMessage());

        // the target buffer was written on the device side
        AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();

        return target;
    }
    /**
     * Bitmap-encodes {@code indArray} into the pre-allocated INT array {@code target} on the device.
     *
     * The first four elements of the target buffer are written as a header before the native call:
     * [0] and [1] the source length, [2] raw int bits of the float threshold, [3] the
     * {@code ThresholdCompression.BITMAP_ENCODING} format id.
     *
     * @param indArray  array to encode
     * @param target    INT array whose buffer length must be {@code indArray.length() / 16 + 5}
     * @param threshold threshold passed to the native encoder (narrowed to float)
     * @return value returned by the native {@code encodeBitmap} call
     * @throws ND4JIllegalStateException if the target has the wrong length or data type
     * @throws RuntimeException          if the native call reports an error
     */
    @Override
    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
        final long length = indArray.length();
        final long expectedLength = length / 16 + 5;

        final DataBuffer buffer = target.data();

        if (buffer.length() != expectedLength)
            throw new ND4JIllegalStateException("Length of target array should be " + expectedLength);

        if (buffer.dataType() != DataType.INT)
            throw new ND4JIllegalStateException("Target array should have INT dataType");

        // header
        final int headerLength = (int) length;
        buffer.put(0, headerLength);
        buffer.put(1, headerLength);
        buffer.put(2, Float.floatToIntBits((float) threshold));
        // format id
        buffer.put(3, ThresholdCompression.BITMAP_ENCODING);

        val allocator = AtomicAllocator.getInstance();
        val context = allocator.getDeviceContext();

        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));

        PointerPointer extras = extraz.get().put(
                allocator.getHostPointer(indArray),
                context.getOldStream(),
                context.getBufferScalar(),
                context.getBufferReduction()
        );

        val source = allocator.getPointer(indArray, context);
        val destination = (IntPointer) allocator.getPointer(buffer, context);

        // push the header written above to the device-side copy of the buffer
        ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();

        val sourceShapeInfo = (LongPointer) allocator.getHostPointer(indArray.shapeInfoDataBuffer());
        final long encoded = nativeOps.encodeBitmap(extras, source, sourceShapeInfo, length, destination, (float) threshold);

        if (nativeOps.lastErrorCode() != 0)
            throw new RuntimeException(nativeOps.lastErrorMessage());

        allocator.getAllocationPoint(buffer).tickDeviceWrite();

        return encoded;
    }
    /**
     * Decodes a bitmap-encoded array into {@code target} on the device.
     *
     * @param encoded bitmap-encoded array
     * @param target  array receiving the decoded values
     * @return {@code target}
     * @throws RuntimeException if the native call reports an error
     */
    @Override
    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
        val allocator = AtomicAllocator.getInstance();
        val context = allocator.getDeviceContext();

        if (extraz.get() == null)
            extraz.set(new PointerPointer(32));

        PointerPointer extras = extraz.get().put(
                allocator.getHostPointer(target),
                context.getOldStream(),
                context.getBufferScalar(),
                context.getBufferReduction());

        val encodedPointer = allocator.getPointer(encoded.data(), context);
        final long targetLength = target.length();
        val targetPointer = allocator.getPointer(target, context);
        val targetShapeInfo = (LongPointer) allocator.getHostPointer(target.shapeInfoDataBuffer());

        nativeOps.decodeBitmap(extras, encodedPointer, targetLength, targetPointer, targetShapeInfo);

        if (nativeOps.lastErrorCode() != 0)
            throw new RuntimeException(nativeOps.lastErrorMessage());

        return target;
    }
    @Override
    public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
        if(customOps == null) {
@ -1974,6 +1757,11 @@ public class CudaExecutioner extends DefaultOpExecutioner {
        val inputArgs = opContext != null ? opContext.getInputArrays() : op.inputArguments();
        int cnt= 0;
        for (val in: inputArgs) {
            // TODO: once we implement Context-based shape function call this method should be removed
            val loc = Nd4j.getAffinityManager().getActiveLocation(in);
            if (loc != AffinityManager.Location.DEVICE && loc != AffinityManager.Location.EVERYWHERE)
                Nd4j.getAffinityManager().ensureLocation(in, AffinityManager.Location.DEVICE);
            // NOT A TYPO: shape functions work on host side only
            if (!in.isEmpty()) {
                inputBuffers.put(cnt, in.data().addressPointer());
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
@ -2804,30 +2804,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
                        @Cast("Nd4jLong*") long[] tadShapeInfo,
                        @Cast("Nd4jLong*") long[] tadOffsets);
 // Native bindings for the bitmap/threshold encoding entry points.
 // Each function is declared three times; the overloads differ only in how shape info and int data are
 // passed (LongPointer/IntPointer, LongBuffer/IntBuffer, or long[]/int[]).

 // encodeBitmap: takes source data dx with shape info, element count N, INT destination dz and a float threshold; returns a long.
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
 // decodeBitmap: takes encoded data dx, element count N, and destination dz with its shape info.
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
 // encodeThresholdP1: first threshold-encoding pass; takes source data dx with shape info, element count N, INT output dz and a float threshold.
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
 // encodeThresholdP2Int: second pass; INT input dx of N elements, INT output dz.
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
 // encodeThresholdP3: third pass; takes source data dx with shape info, INT offsets, element count N and INT output dz.
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
 // decodeThreshold: takes encoded data dx, a count N, and destination dz with its shape info.
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
 public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
        Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
        Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -10712,6 +10688,7 @@ public static final int PREALLOC_SIZE = 33554432;
 // #include <ops/declarable/headers/boolean.h>
 // #include <ops/declarable/headers/broadcastable.h>
 // #include <ops/declarable/headers/convo.h>
 // #include <ops/declarable/headers/compression.h>
 // #include <ops/declarable/headers/list.h>
 // #include <ops/declarable/headers/recurrent.h>
 // #include <ops/declarable/headers/transforms.h>
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor
@ -13,5 +13,3 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 ################################################################################
 org.nd4j.linalg.jcublas.compression.CudaThreshold
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java
@ -1376,142 +1376,6 @@ public class NativeOpExecutioner extends DefaultOpExecutioner {
        }
    }
    /**
     * Threshold-encodes {@code input} without limiting the number of encoded elements.
     *
     * @param input     array to encode
     * @param threshold threshold passed to the encoder
     * @return whatever the three-argument overload returns for a {@code null} boundary
     */
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold) {
        // a null boundary disables the cap on the number of encoded elements
        final Integer noBoundary = null;
        return thresholdEncode(input, threshold, noBoundary);
    }
    /**
     * Threshold-encodes {@code input} on the host.
     *
     * The number of matching elements is estimated natively first; the encoded INT buffer is then allocated
     * with a 4-element header: [0] number of encoded elements, [1] length of the source buffer,
     * [2] raw int bits of the float threshold, [3] the {@code ThresholdCompression.FLEXIBLE_ENCODING} format id.
     *
     * @param input     array to encode
     * @param threshold threshold passed to the native code (narrowed to float)
     * @param boundary  optional upper limit for the number of encoded elements; {@code null} means no limit
     * @return array wrapping the encoded buffer (with the shape info of {@code input}), or {@code null} when
     *         fewer than 2 matching elements were estimated
     * @throws RuntimeException if the native estimation reports an error
     */
    @Override
    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
        int cntAbs = loop.estimateThreshold(null,
                input.data().addressPointer(),
                (LongPointer) input.shapeInfoDataBuffer().addressPointer(),
                (int) input.length(),
                (float) threshold);

        if (loop.lastErrorCode() != 0)
            throw new RuntimeException(loop.lastErrorMessage());

        // nothing worth encoding
        if (cntAbs < 2)
            return null;

        if (boundary != null)
            cntAbs = Math.min(cntAbs, boundary);

        DataBuffer buffer = input.data();

        // first 4 elements contain header
        DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+cntAbs, false) : Nd4j.getDataBufferFactory().createInt(4+cntAbs, false, Nd4j.getMemoryManager().getCurrentWorkspace());

        encodedBuffer.put(0, cntAbs);
        encodedBuffer.put(1, (int) buffer.length());
        encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
        // format id
        encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);

        // the previously built CompressionDescriptor was never attached to anything and has been dropped

        Nd4j.getNDArrayFactory().convertDataEx(AbstractCompressor.getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, encodedBuffer.addressPointer(), buffer.length());

        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);

        return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
    }
    /**
     * Decodes a threshold-encoded INT array into {@code target} via the native type conversion routine.
     *
     * @param encoded encoded array; its buffer must be of INT data type and carry the original length at index 1
     * @param target  array receiving the decoded values; its length must equal the stored original length
     * @return {@code target}
     * @throws ND4JIllegalStateException if the encoded buffer is not INT, or the stored original length does
     *                                   not match the target length
     * @throws RuntimeException          if the native call reports an error
     */
    @Override
    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
        DataBuffer buffer = encoded.data();

        if (buffer.dataType() != DataType.INT)
            throw new ND4JIllegalStateException("thresholdEncoded array should have dataType of INT");

        // Only the original length from the header is needed on the Java side. The unused locals for the
        // element count and threshold were removed; the latter also read the stored float bits as a plain int.
        long originalLength = buffer.getInt(1);

        if (target.length() != originalLength)
            throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");

        DataTypeEx typeDst = AbstractCompressor.getBufferTypeEx(target.data());

        loop.convertTypes(null, DataTypeEx.THRESHOLD.ordinal(), buffer.addressPointer(), target.length(), typeDst.ordinal(), target.data().addressPointer());

        if (loop.lastErrorCode() != 0)
            throw new RuntimeException(loop.lastErrorMessage());

        return target;
    }
    /**
     * Bitmap-encodes {@code indArray} into the pre-allocated INT array {@code target} on the host.
     *
     * The first four elements of the target buffer are written as a header before the native call:
     * [0] and [1] the source length, [2] raw int bits of the float threshold, [3] the
     * {@code ThresholdCompression.BITMAP_ENCODING} format id.
     *
     * @param indArray  array to encode
     * @param target    INT array whose buffer length must be {@code indArray.length() / 16 + 5}
     * @param threshold threshold passed to the native encoder (narrowed to float)
     * @return value returned by the native {@code encodeBitmap} call
     * @throws ND4JIllegalStateException if the target has the wrong length or data type
     * @throws RuntimeException          if the native call reports an error
     */
    @Override
    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
        final long length = indArray.length();
        final long expectedLength = length / 16 + 5;

        final DataBuffer buffer = target.data();

        if (buffer.length() != expectedLength)
            throw new ND4JIllegalStateException("Length of target array should be " + expectedLength);

        if (buffer.dataType() != DataType.INT)
            throw new ND4JIllegalStateException("Target array should have INT dataType");

        // header
        final int headerLength = (int) length;
        buffer.put(0, headerLength);
        buffer.put(1, headerLength);
        buffer.put(2, Float.floatToIntBits((float) threshold));
        // format id
        buffer.put(3, ThresholdCompression.BITMAP_ENCODING);

        final LongPointer sourceShapeInfo = (LongPointer) indArray.shapeInfoDataBuffer().addressPointer();
        final IntPointer destination = (IntPointer) buffer.addressPointer();

        final long affected = loop.encodeBitmap(null,
                indArray.data().addressPointer(),
                sourceShapeInfo,
                length,
                destination,
                (float) threshold);

        if (loop.lastErrorCode() != 0)
            throw new RuntimeException(loop.lastErrorMessage());

        return affected;
    }
    /**
     * Decodes a bitmap-encoded array into {@code target} on the host.
     *
     * @param encoded bitmap-encoded array
     * @param target  array receiving the decoded values
     * @return {@code target}
     * @throws RuntimeException if the native call reports an error
     */
    @Override
    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
        final DataBuffer source = encoded.data();
        final long targetLength = target.length();
        final DataBuffer destination = target.data();
        final LongPointer targetShapeInfo = (LongPointer) target.shapeInfoDataBuffer().addressPointer();

        loop.decodeBitmap(null, source.addressPointer(), targetLength, destination.addressPointer(), targetShapeInfo);

        if (loop.lastErrorCode() != 0)
            throw new RuntimeException(loop.lastErrorMessage());

        return target;
    }
    @Override
    public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
        if (customOps == null) {
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
@ -2808,30 +2808,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
                        @Cast("Nd4jLong*") long[] tadShapeInfo,
                        @Cast("Nd4jLong*") long[] tadOffsets);
 // Native bitmap-encode entry point; declared `native`, so the implementation lives outside this file.
 // The three overloads differ only in how xShapeInfo / dz are carried:
 // LongPointer + IntPointer, LongBuffer + IntBuffer, and long[] + int[].
 // Returns a long that is cast to the native Nd4jLong type.
 // NOTE(review): presumably the returned value is the number of encoded elements - confirm against the native header.
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
 public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
 // Native bitmap-decode entry point; declared `native`, so the implementation lives outside this file.
 // The three overloads differ only in the carrier type of zShapeInfo: LongPointer, LongBuffer, or long[].
 // NOTE(review): presumably dx is the encoded input, dz the decoded output and N the output length - confirm.
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
 public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
 // Native threshold-encode entry point "P1"; declared `native`, so the implementation lives outside this file.
 // The three overloads differ only in how xShapeInfo / dz are carried:
 // LongPointer + IntPointer, LongBuffer + IntBuffer, and long[] + int[].
 // NOTE(review): the P1 / P2Int / P3 suffixes suggest successive phases of one encoding pipeline - confirm.
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
 public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
 // Native threshold-encode entry point "P2Int"; declared `native`, so the implementation lives outside this file.
 // Both dx and dz are int-typed here; the overloads carry them as IntPointer, IntBuffer, or int[].
 // NOTE(review): the role of this step within threshold encoding is not visible from the signature - confirm.
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
 public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
 // Native threshold-encode entry point "P3"; declared `native`, so the implementation lives outside this file.
 // Takes an additional int-typed `offsets` argument next to the input, its shape info and the int output dz.
 // The overloads carry xShapeInfo / offsets / dz as LongPointer + IntPointer, LongBuffer + IntBuffer, or long[] + int[].
 // NOTE(review): presumably `offsets` is produced by an earlier encoding step - confirm.
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
 public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
 // Native threshold-decode entry point; declared `native`, so the implementation lives outside this file.
 // The three overloads differ only in the carrier type of zShapeInfo: LongPointer, LongBuffer, or long[].
 // NOTE(review): presumably dx is the encoded input, dz the decoded output and N the output length - confirm.
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
 public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
 public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
        Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
        Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -12463,6 +12439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
 // #include <ops/declarable/headers/boolean.h>
 // #include <ops/declarable/headers/broadcastable.h>
 // #include <ops/declarable/headers/convo.h>
 // #include <ops/declarable/headers/compression.h>
 // #include <ops/declarable/headers/list.h>
 // #include <ops/declarable/headers/recurrent.h>
 // #include <ops/declarable/headers/transforms.h>
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
@ -8315,6 +8315,85 @@ public class Nd4jTestsC extends BaseNd4jTest {
        assertArrayEquals(new long[]{bS, oH, oW, oC}, ret[0].shape());
    }
    /**
     * Multiplies a 3x5 INT8 matrix of ones by a 5x3 INT8 matrix of ones and
     * expects a 3x3 INT8 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_8() {
        val type = DataType.INT8;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 INT16 matrix of ones by a 5x3 INT16 matrix of ones and
     * expects a 3x3 INT16 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_7() {
        val type = DataType.INT16;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 INT32 matrix of ones by a 5x3 INT32 matrix of ones and
     * expects a 3x3 INT32 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_1() {
        val type = DataType.INT32;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 INT64 matrix of ones by a 5x3 INT64 matrix of ones and
     * expects a 3x3 INT64 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_2() {
        val type = DataType.INT64;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 UINT8 matrix of ones by a 5x3 UINT8 matrix of ones and
     * expects a 3x3 UINT8 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_6() {
        val type = DataType.UINT8;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 UINT16 matrix of ones by a 5x3 UINT16 matrix of ones and
     * expects a 3x3 UINT16 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_5() {
        val type = DataType.UINT16;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 UINT32 matrix of ones by a 5x3 UINT32 matrix of ones and
     * expects a 3x3 UINT32 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_3() {
        val type = DataType.UINT32;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    /**
     * Multiplies a 3x5 UINT64 matrix of ones by a 5x3 UINT64 matrix of ones and
     * expects a 3x3 UINT64 matrix in which every entry equals 5.
     */
    @Test
    public void testMatmulMethod_4() {
        val type = DataType.UINT64;
        val left = Nd4j.create(type, 3, 5).assign(1);
        val right = Nd4j.create(type, 5, 3).assign(1);
        val expected = Nd4j.create(type, 3, 3).assign(5);

        assertEquals(expected, left.mmul(right));
    }
    @Override
    public char ordering() {
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java
@ -17,6 +17,7 @@
 package org.nd4j.linalg.compression;
 import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@ -44,7 +45,6 @@ import static org.junit.Assert.*;
 /**
 * @author raver119@gmail.com
 */
@Ignore
@Slf4j
@RunWith(Parameterized.class)
 public class CompressionTests extends BaseNd4jTest {
@ -140,40 +140,6 @@ public class CompressionTests extends BaseNd4jTest {
    }
    /**
     * Threshold-encodes a 1x16384 array that holds 1.0 at every 20th position (96 entries)
     * with threshold 0.9, then checks that the source array keeps 0.1 at those positions
     * and that decoding the result yields 0.9 at the same positions.
     */
    @Test
    public void testThresholdCompressionZ() {
        // Offsets 0, 20, ..., 1900: the 96 positions that carry a non-zero value.
        final int limit = 96 * 20;

        INDArray initial = Nd4j.create(1, 16384);
        for (int offset = 0; offset < limit; offset += 20) {
            initial.putScalar(offset, 1.0f);
        }

        INDArray exp = Nd4j.create(1, 16384);
        for (int offset = 0; offset < limit; offset += 20) {
            exp.putScalar(offset, 0.1f);
        }

        INDArray exp_d = Nd4j.create(1, 16384);
        for (int offset = 0; offset < limit; offset += 20) {
            exp_d.putScalar(offset, 0.9f);
        }

        // The compressor is only configured here; encoding itself goes through the executioner.
        Nd4j.getCompressor().getCompressor("THRESHOLD").configure(0.9);

        INDArray encoded = Nd4j.getExecutioner().thresholdEncode(initial, 0.9);
        assertEquals(exp, initial);
        log.info("Compressed length: {}", encoded.data().length());

        INDArray restored = Nd4j.create(1, initial.length());
        Nd4j.getExecutioner().thresholdDecode(encoded, restored);
        log.info("Decompressed length: {}", restored.length());

        assertEquals(exp_d, restored);
    }
    @Ignore
    @Test
    public void testThresholdCompression0() {
@ -296,6 +262,23 @@ public class CompressionTests extends BaseNd4jTest {
    /**
     * Threshold-encodes ten ones with threshold 1.0 and a third argument of 3, then expects
     * an encoded buffer of length 7, a modified source array, and a remaining source sum of 7.
     */
    @Test
    public void testThresholdCompression5() {
        INDArray initial = Nd4j.ones(10);
        INDArray untouched = initial.dup();

        Nd4j.getExecutioner().commit();

        INDArray encoded = Nd4j.getExecutioner().thresholdEncode(initial, 1.0f, 3);

        long encodedLength = encoded.data().length();
        assertEquals(7, encodedLength);

        // Encoding is expected to alter the source array in place.
        assertNotEquals(untouched, initial);

        double remainder = initial.sumNumber().doubleValue();
        assertEquals(7, remainder, 0.01);
    }
    @Test
    public void testThresholdCompression5_1() {
        INDArray initial = Nd4j.ones(1000);
        INDArray exp_0 = initial.dup();
@ -435,8 +418,8 @@ public class CompressionTests extends BaseNd4jTest {
        INDArray exp_0 = Nd4j.create(new float[] {0.0f, -1e-4f, 0.0f, 0.0f, 0.0f, 0.0f});
        INDArray exp_1 = Nd4j.create(new float[] {0.0f, -5e-4f, 1e-3f, -1e-3f, 0.0f, 0.0f});
-        DataBuffer ib = Nd4j.getDataBufferFactory().createInt(5);
+
-        INDArray enc = Nd4j.createArrayFromShapeBuffer(ib, initial.shapeInfoDataBuffer());
+        INDArray enc = Nd4j.create(DataType.INT32, initial.length() / 16 + 5);
        long elements = Nd4j.getExecutioner().bitmapEncode(initial, enc, 1e-3);
        log.info("Encoded: {}", Arrays.toString(enc.data().asInt()));
@ -471,7 +454,7 @@ public class CompressionTests extends BaseNd4jTest {
    @Test
    public void testBitmapEncoding5() {
        Nd4j.getRandom().setSeed(119);
-        INDArray initial = Nd4j.rand(new int[]{1, 10000}, -1, -0.5, Nd4j.getRandom());
+        INDArray initial = Nd4j.rand(new int[]{10000}, -1, -0.5, Nd4j.getRandom());
        INDArray exp_0 = initial.dup().addi(1e-1);
        INDArray exp_1 = initial.dup();
@ -486,7 +469,7 @@ public class CompressionTests extends BaseNd4jTest {
    @Test
    public void testBitmapEncoding6() {
        Nd4j.getRandom().setSeed(119);
-        INDArray initial = Nd4j.rand(new int[]{1, 100000}, -1, 1, Nd4j.getRandom());
+        INDArray initial = Nd4j.rand(new int[]{10000}, -1, 1, Nd4j.getRandom());
        INDArray exp_1 = initial.dup();
        INDArray enc = Nd4j.getExecutioner().bitmapEncode(initial, 1e-3);
@ -494,6 +477,11 @@ public class CompressionTests extends BaseNd4jTest {
        Nd4j.getExecutioner().bitmapDecode(enc, initial);
        val f0 = exp_1.toFloatVector();
        val f1 = initial.toFloatVector();
        assertArrayEquals(f0, f1, 1e-5f);
        assertEquals(exp_1, initial);
    }