compression ops (#436)

* Added declarations for decode/encode_bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added implementation for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added helpers for encode/decode bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored encodingBitmap helper. Signed-off-by: shugeo <sgazeos@gmail.com> * threshold encode/decode skeleton * helper skeleton * minor import fix * encoder shape fn & op impl * thresholdEncode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thresholdDecode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Only cosmetical changes. Signed-off-by: shugeo <sgazeos@gmail.com> * placeholder Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added cuda implementation for bitmap decode helper. Signed-off-by: shugeo <sgazeos@gmail.com> * cuda thresholdEstimate Signed-off-by: raver119@gmail.com <raver119@gmail.com> * cuda thresholdDecode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - nano cmakelist update (get rid of Clion section) - fixed forgotten throw in AtomicTests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thesholdEncode cuda impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added tests for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed tests for encode/decode bitmaps. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored decode/encode helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed crashes with bitmap decode/encode helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * bitmap encode/decode CPU Signed-off-by: raver119@gmail.com <raver119@gmail.com> * bitmap encode/decode CUDA Signed-off-by: raver119@gmail.com <raver119@gmail.com> * C API removed for threshold/bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeBitmap/DecodeBitmap Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * few more tests for threshold encoding Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor test tweak Signed-off-by: raver119@gmail.com <raver119@gmail.com> * two special tests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeBitmap CPU fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * parallel_long/parallel_double proper spans fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeThreshold CUDA fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * nano fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * grid tweaks Signed-off-by: raver119@gmail.com <raver119@gmail.com> * RTX adaptation for thresholdEncode Signed-off-by: raver119 <raver119@gmail.com> * don't allow threshold encoding for length < 2 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * get rid of NDArrayCompressor in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor update of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor tweak of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - matmul allows integer data types use - EncodingHandler boundary default value - few tests for integer matmul Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor fix of CUDA bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * re-enable CUDA deallocator Signed-off-by: raver119@gmail.com <raver119@gmail.com> * threshold encoder fix for systems without omp Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - encode_threshold now requires non-negative boundary - minor tweak in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * restore parallelism in decode_bitmap Signed-off-by: raver119@gmail.com <raver119@gmail.com> * fall back to omp for encode_bitmap cpu Signed-off-by: raver119@gmail.com <raver119@gmail.com> * single time casts Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - additional test for encode_threshold - sync buffers to device before calling for shape function Signed-off-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: shugeo <sgazeos@gmail.com>
2020-05-08 20:59:39 +03:00 · 2020-05-08 20:59:39 +03:00 · 0613485654
commit 0613485654
parent f1232f8221
47 changed files with 1617 additions and 1387 deletions
--- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java
+++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/optimize/solver/accumulation/EncodedGradientsAccumulatorTest.java
@ -23,9 +23,13 @@ import org.deeplearning4j.optimize.solvers.accumulation.EncodedGradientsAccumula
 import org.deeplearning4j.optimize.solvers.accumulation.EncodingHandler;
 import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.FixedThresholdAlgorithm;
 import org.junit.Test;
+import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.util.PrintAffinity;
 import org.nd4j.linalg.factory.Nd4j;
+import org.nd4j.nativeblas.OpaqueDataBuffer;

+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 /**
@ -93,12 +97,13 @@ public class EncodedGradientsAccumulatorTest extends BaseDL4JTest {
        }


-        EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, null, false);
+        EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, Integer.MAX_VALUE, false);
        for (int e = 10; e < numParams / 5; e++) {

-            INDArray encoded = handler.encodeUpdates(0, 0, getGradients(numParams, e, 2e-3));
+            val gradients = getGradients(numParams, e, 2e-3);
+            val encoded = handler.encodeUpdates(0, 0, gradients);

-            //  log.info("enc len: {}", encoded.data().length());
+            assertNotNull("Failed with e == " + e, encoded);

            int encFormat = encoded.data().getInt(3);

--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodedGradientsAccumulator.java
@ -69,7 +69,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
    protected ThreadLocal<Integer> index = new ThreadLocal<>();
    protected long initialMemory = 100 * 1024 * 1024L;
    protected int queueSize = 5;
-    protected Double boundary = 1.0;
+    protected Integer boundary = Integer.MAX_VALUE;
    protected boolean encodingDebugMode;

    protected IndexedTail externalSource;
@ -101,11 +101,11 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
    }

    public EncodedGradientsAccumulator(int parties, ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, boolean encodingDebugMode) {
-        this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, 1.0, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, 1.0, encodingDebugMode);
+        this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, Integer.MAX_VALUE, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, Integer.MAX_VALUE, encodingDebugMode);
    }

    public EncodedGradientsAccumulator(int parties, @NonNull MessageHandler handler, long initialMemory,
-                    int queueSize, Double boundary, boolean encodingDebugMode) {
+                    int queueSize, Integer boundary, boolean encodingDebugMode) {
        this.parties = parties;
        this.handler = handler;
        this.initialMemory = initialMemory;
@ -551,7 +551,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
        protected long initialMemory = DEFAULT_INITIAL_MEMORY;
        protected int queueSize = 5;
        protected MessageHandler handler;
-        protected Double boundary = null;
+        protected int boundary = Integer.MAX_VALUE;
        protected boolean encodingDebugMode;

        /**
@ -598,15 +598,12 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
        /**
         * This method enables optional limit for max number of updates per message
         *
-         * Default value: 1.0 (no limit)
+         * Default value: Integer.MAX_VALUE (no limit)
         * @param boundary positive value in range 0..1
         * @return
         */
-        public Builder updatesBoundary(double boundary) {
-            if (boundary >= 1.0)
-                return this;
-
-            if (boundary <= 0.0)
+        public Builder updatesBoundary(int boundary) {
+            if (boundary <= 0)
                throw new DL4JInvalidConfigException("Boundary should have positive value");

            this.boundary = boundary;
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/accumulation/EncodingHandler.java
@ -16,6 +16,7 @@

 package org.deeplearning4j.optimize.solvers.accumulation;

+import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.shade.guava.util.concurrent.AtomicDouble;
 import lombok.NonNull;
 import lombok.extern.slf4j.Slf4j;
@ -24,7 +25,6 @@ import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgori
 import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgorithmReducer;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.ndarray.INDArray;
-import org.nd4j.linalg.compression.NDArrayCompressor;
 import org.nd4j.linalg.exception.ND4JIllegalStateException;
 import org.nd4j.linalg.factory.Nd4j;
 import org.nd4j.linalg.ops.transforms.Transforms;
@ -54,9 +54,8 @@ public class EncodingHandler implements MessageHandler {
    protected ThresholdAlgorithm initialThresholdAlgorithm;
    protected ResidualPostProcessor initialResidualPostProcessor;

-    protected Double boundary;
+    protected Integer boundary;
    protected boolean encodingDebugMode;
-    protected NDArrayCompressor compressor;
    protected AtomicInteger atomicBoundary = new AtomicInteger(-1);

    protected ThreadLocal<ThresholdAlgorithm> thresholdAlgorithm = new ThreadLocal<>();
@ -73,20 +72,16 @@ public class EncodingHandler implements MessageHandler {
    protected final AtomicLong lastThresholdLogTime = new AtomicLong();

    public EncodingHandler(final ThresholdAlgorithm thresholdAlgorithm, final ResidualPostProcessor residualPostProcessor,
-                           Double boundary, boolean encodingDebugMode){
+                           Integer boundary, boolean encodingDebugMode){
        this.initialThresholdAlgorithm = thresholdAlgorithm;
        this.initialResidualPostProcessor = residualPostProcessor;
-        this.boundary = boundary;
+        this.boundary = boundary == null ? Integer.MAX_VALUE : boundary;
        this.encodingDebugMode = encodingDebugMode;
    }

    @Override
    public void initialize(@NonNull GradientsAccumulator accumulator) {
        this.accumulator = accumulator;
-
-        compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
-        if (compressor == null)
-            throw new ND4JIllegalStateException("Can't find Threshold compressor implementation!");
    }

    public INDArray encodeUpdates(int iteration, int epoch, INDArray updates) {
@ -135,14 +130,13 @@ public class EncodingHandler implements MessageHandler {
        iterations.get().incrementAndGet();

        if (boundary != null && atomicBoundary.get() < 0)
-            atomicBoundary.compareAndSet(-1, (int) (updates.length() * boundary));
+            atomicBoundary.compareAndSet(-1, (int) (updates.length() / 16) );

        INDArray encoded;

        if (!bitmapMode.get().get()) {
            //Sparse updates
-            encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(),
-                    boundary == null ? null : atomicBoundary.get());
+            encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), boundary == null ? null : atomicBoundary.get());

            // updates were TOO sparse, nothing to share here
            if (encoded == null) {
@ -157,17 +151,14 @@ public class EncodingHandler implements MessageHandler {
            }


-            double encLen = encoded.data().getInt(0);
+            double encLen = encoded.length();

            // if updates are too dense - we fallback to bitmap encoding
            if (encLen >= (updates.length() / 16)) {
                log.debug("Switching back to bitmapEncoding: iteration {}, epoch {}, threshold {}, encoded length {}", iteration, epoch, currThreshold, encLen);
                bitmapMode.get().set(true);

-                DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
-                encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
-
-                Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
+                encoded = Nd4j.getExecutioner().bitmapEncode(updates, currentThreshold.get().get());

                applyPostProcessor(iteration, epoch, currThreshold, updates);
                lastSparsityRatio.set(null);
@ -186,8 +177,7 @@ public class EncodingHandler implements MessageHandler {
            }
        } else {
            //Dense bitmap updates
-            DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
-            encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
+            encoded = Nd4j.create(DataType.INT32, updates.length() / 16 + 5);

            long values = Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());

--- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
+++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
@ -910,7 +910,7 @@ public class ParallelWrapper implements AutoCloseable {
                    Preconditions.checkState(thresholdAlgorithm != null, "Cannot use SHARED_GRADIENTS training mode without setting a threshold algorithm");
                    this.trainerContext = new SymmetricTrainerContext();
                    if (this.accumulator == null) {
-                        log.info("Creating new GradientsAccumulator instance with threshold of [5e-4");
+                        log.info("Creating new GradientsAccumulator instance with default threshold of [5e-4]");
                        this.accumulator = new EncodedGradientsAccumulator(workers, thresholdAlgorithm, residualPostProcessor,  false);
                    }
                }
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v1/WiredEncodingHandler.java
@ -45,7 +45,7 @@ public class WiredEncodingHandler extends EncodingHandler {
     * @param thresholdAlgorithm threshold algorithm to use
     * @param boundary
     */
-    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
+    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
        super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
    }

--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark-parameterserver/src/main/java/org/deeplearning4j/spark/parameterserver/networking/v2/WiredEncodingHandler.java
@ -44,7 +44,7 @@ public class WiredEncodingHandler extends EncodingHandler {
     *
     * @param thresholdAlgorithm The threshold algorithm to use
     */
-    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
+    public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
        super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
    }

--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@ -49,12 +49,12 @@ elseif(WIN32)
        set(CMAKE_CXX_FLAGS_RELEASE  "-D_RELEASE=true")
        set(CMAKE_CXX_FLAGS_DEBUG  "  /FS /EHsc")
    else()
-        set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
-        set(CMAKE_CXX_FLAGS_DEBUG  " -g -O2 -fPIC -fmax-errors=2")
+        set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -D_RELEASE=true")
+        set(CMAKE_CXX_FLAGS_DEBUG  " -g -O2 -fPIC")
    endif()
 else()
-    set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
-    set(CMAKE_CXX_FLAGS_DEBUG  " -g -O0 -fPIC -fmax-errors=2")
+    set(CMAKE_CXX_FLAGS_RELEASE  "-O3 -fPIC -D_RELEASE=true")
+    set(CMAKE_CXX_FLAGS_DEBUG  " -g -O0 -fPIC")

    if (SD_CPU)
        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
@ -221,21 +221,16 @@ include_directories(${FLATBUFFERS_PATH}/include)
 configure_file(include/config.h.in include/config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)

-if (NOT DEFINED ENV{CLION_IDE})
-    message("NOT CLION")
-    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
-    add_subdirectory(blas)
-    if(SD_BUILD_TESTS)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+add_subdirectory(blas)
+if(SD_BUILD_TESTS)
    # tests are always compiled with all ops included
    set(SD_ALL_OPS true)
    set(SD_BUILD_MINIFIER true)
    add_subdirectory(tests_cpu)
-    endif()
-endif ()
+endif()

-if ($ENV{CLION_IDE})
-    add_subdirectory(tests_cpu)
-endif ()

 if (MSVC_DEV)
    set(SD_BUILD_MINIFIER false)
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@ -120,8 +120,13 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
    set( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    # using GCC
-    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
+    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -fmax-errors=2 ")
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,$ORIGIN/")
+
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
+        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
+    endif()
 endif()


@ -361,11 +366,6 @@ elseif(SD_CPU)
        endif()
    endif()

-    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
-        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
-    endif()
-
    install(TARGETS ${SD_LIBRARY_NAME} DESTINATION  .)
    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
 endif()
--- a/libnd4j/include/execution/impl/Threads.cpp
+++ b/libnd4j/include/execution/impl/Threads.cpp
@ -571,7 +571,7 @@ namespace samediff {

        // create temporary array
        int64_t intermediatery[256];
-        auto span = delta / numThreads;
+        auto span = (numElements / numThreads) - (numElements % numThreads);

        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
@ -615,7 +615,7 @@ namespace samediff {

        // create temporary array
        double intermediatery[256];
-        auto span = delta / numThreads;
+        auto span = (numElements / numThreads) - (numElements % numThreads);

        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
--- a/libnd4j/include/legacy/NativeOps.h
+++ b/libnd4j/include/legacy/NativeOps.h
@ -1432,18 +1432,6 @@ ND4J_EXPORT void tear(Nd4jPointer *extraPointers,
                        Nd4jLong *tadShapeInfo,
                        Nd4jLong *tadOffsets);

-ND4J_EXPORT Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
-ND4J_EXPORT void decodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
-
-
-ND4J_EXPORT void encodeThresholdP1(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
-ND4J_EXPORT void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz);
-ND4J_EXPORT void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, int *offsets, Nd4jLong N, int *dz);
-
-
-ND4J_EXPORT void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
-
-
 ND4J_EXPORT void sort(Nd4jPointer *extraPointers,
        void *x, Nd4jLong *xShapeInfo,
        void *dx, Nd4jLong *dxShapeInfo,
--- a/libnd4j/include/legacy/cpu/NativeOps.cpp
+++ b/libnd4j/include/legacy/cpu/NativeOps.cpp
@ -1436,28 +1436,6 @@ void enableP2P(bool enable) {
    // no-op
 }

-
-
-void encodeThresholdP1(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
-    // TODO: to be implemented
-}
-
-
-void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *hX, Nd4jLong N, int *dz) {
-    // TODO: to be implemented
-}
-
-
-void encodeThresholdP3(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
-    // offsets won't be used here
-
-    // TODO: to be implemented
-}
-
-void decodeThreshold(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo){
-    // TODO: to be implemented
-}
-
 bool isP2PAvailable() {
    // always TRUE for cpu backend
    return true;
@ -1467,10 +1445,6 @@ void checkP2P() {
    // no-op
 }

-void decodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo) {
-    NativeOpExecutioner::decodeBitmap(hX, N, dz, hZShapeInfo);
-}
-
 template<typename T>
 void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZShapeInfo, int N, int *shuffleMap, Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) {

@ -1859,12 +1833,6 @@ void sortCooIndices(Nd4jPointer *extraPointers,
    }
 }

-Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
-    return NativeOpExecutioner::encodeBitmap(hX, hXShapeInfo, N, dz, threshold);
-}
-
-
-
 Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
    auto hZ = new Nd4jLong[2];errno = 0;
 try {
--- a/libnd4j/include/legacy/cuda/NativeOps.cu
+++ b/libnd4j/include/legacy/cuda/NativeOps.cu
@ -2197,76 +2197,6 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement
    sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
 }

-
-void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
-    try {
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extras[1]);
-
-        int blockSize = 1024;
-        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
-
-        dim3 launchDims(numBlocks, blockSize, 1024);
-        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
-        BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES);
-
-        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed");
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-    }
-}
-
-
-
-void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz) {
-    try {
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
-        //encoderKernelP2Float<<<numBlocks, blockSize , 1024 * sizeof(float), *stream>>>(dx, N, dz);
-        prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0);
-        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-    }
-}
-
-void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
-    try {
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
-
-        int blockSize = 1024;
-        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
-
-        dim3 launchDims(numBlocks, blockSize, 4096);
-        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
-        BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES);
-
-        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-    }
-}
-
-void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo){
-    try {
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
-
-        // we probably want to have smaller blocks here, memory writes are misaligned anyway
-        int blockSize = 128;
-        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
-
-        dim3 launchDims(numBlocks, blockSize, 1024);
-        auto zType = sd::ArrayOptions::dataType(zShapeInfo);
-        BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
-
-        sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed");
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////
 void execReduce3All(Nd4jPointer *extraPointers,
 									int opNum,
@ -2603,55 +2533,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, Nd4jLong *indices, void *values,
 	throw std::runtime_error("sortCooIndices:: Not implemented yet");
 }

-
-Nd4jLong encodeBitmap(Nd4jPointer *extraPointers,
-								void *dx, Nd4jLong *hXShapeInfo,
-								Nd4jLong N,
-								int *dz,
-								float threshold) {
-    try {
-
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
-        int *resultPointer = reinterpret_cast<int *>(extraPointers[2]);
-        int *reductionPointer = reinterpret_cast<int *>(extraPointers[3]);
-
-        dim3 launchDims(512, 512, 32768);
-        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
-        BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
-                              (launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold),
-                              LIBND4J_TYPES);
-
-        sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
-
-        Nd4jLong dZ = (Nd4jLong) resultPointer[0];
-        resultPointer[0] = 0;
-
-        return dZ;
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-        return 0;
-    }
-}
-
-
-void decodeBitmap(Nd4jPointer *extraPointers,
-							void *dx,
-							Nd4jLong N,
-							void *dz, Nd4jLong *zShapeInfo) {
-    try {
-        cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
-        dim3 launchDims(512, 512, 16384);
-        auto xType = sd::ArrayOptions::dataType(zShapeInfo);
-        BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
-
-        sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
-    } catch (std::exception &e) {
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
-        sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
-    }
-}
-
 Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
 	return nullptr;
 }
--- a/libnd4j/include/legacy/impl/Environment.cpp
+++ b/libnd4j/include/legacy/impl/Environment.cpp
@ -207,7 +207,7 @@ namespace sd {
    }

    void Environment::setMaxSpecialyMemory(uint64_t maxBytes) {
-        _maxTotalSpecialMemory;
+        _maxTotalSpecialMemory = maxBytes;
    }

    void Environment::setMaxDeviceMemory(uint64_t maxBytes) {
--- a/libnd4j/include/loops/cuda/type_conversions.cu
+++ b/libnd4j/include/loops/cuda/type_conversions.cu
@ -217,10 +217,27 @@ namespace sd {
    }

 //////////////////////////////////////////////////////////////////////////
+/*
+ * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
+ */
 template<typename T>
 __global__ static void execEncoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
+        auto x = reinterpret_cast<T *> (dx);
+        auto z = reinterpret_cast<int *> (dz);

-    encoderKernelP1<T>(dx, N, dz, threshold);
+        //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
+        Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+        int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
+        int bp=__syncthreads_count(pass);
+
+        if (threadIdx.x == 0) {
+            // saving out per-block passes
+            z[blockIdx.x+1] = bp;
+
+            // saving out sum
+            atomicAdd(&z[0], bp);
+        }
 }

 //////////////////////////////////////////////////////////////////////////
@ -230,13 +247,74 @@ __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, voi
    execEncoderKernelP1<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz, threshold);
        sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES);

 //////////////////////////////////////////////////////////////////////////
+/*
+ * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
+ *
+ * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
+ */
 template<typename T>
 __global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
+        auto x = reinterpret_cast<T *> (dx);
+        auto z = reinterpret_cast<int *> (dz);

-    encoderKernelP3<T>(dx, offsets, N, dz);
+        auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+        extern __shared__ int warpTotals[];
+
+        // fetch block offset only once
+        __shared__ float threshold;
+        __shared__ FloatBits fb;
+        __shared__ int bo;
+        __shared__ int limit;
+        if (threadIdx.x == 0) {
+            limit = z[0];
+            fb.i_ = z[2];
+            threshold = fb.f_;
+            bo = offsets[blockIdx.x];
+        }
+        __syncthreads();
+
+        // out-of-limit threads do not play here
+        auto value = tid < N ? x[tid] : (T) 0.f;
+
+        // out-of-limit threads just declare they have no changes
+        auto pred = tid >= N ? 0 : sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
+        auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS
+        auto t_i = threadIdx.x % warpSize; // thread index within a warp
+        unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required)
+
+        int b = __ballot_sync(t_m, pred); // balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
+        auto t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
+
+        if (t_i == warpSize - 1)
+            warpTotals[w_i] = t_u + pred;
+
+        __syncthreads();
+
+
+        int w_i_u = 0;
+        for (int j = 0; j <= 5; j++) {
+            unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
+            w_i_u += (__popc(b_j) << j);
+        }
+
+        // we just ignore all results coming from non-0 threads
+        if (w_i == 0 && t_i < blockDim.x / warpSize)
+            warpTotals[t_i] = w_i_u;
+
+        __syncthreads();
+
+
+        // pred is always false if we're out-of-limits
+        if (pred) {
+            int idx = t_u + warpTotals[w_i] + bo + 4;
+            if (idx < limit + 4) {
+                z[idx] = value > static_cast<T>(0.0f) ? tid + 1 : -(tid + 1);
+                x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
+            }
+        }
 }

 //////////////////////////////////////////////////////////////////////////
@ -245,13 +323,38 @@ __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, voi
    execEncoderKernelP3<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, offsets, N, dz);
        sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES);

 //////////////////////////////////////////////////////////////////////////
+/*
+ *   This kernel handles decode from sparse threshold array, to dense array
+ *
+ *   PLEASE NOTE: Z is expected to be memset to 0
+*/
 template<typename T>
 __global__ static void execDecoderKernel(void *dx, Nd4jLong N, void *dz) {
+        auto x = reinterpret_cast<int *> (dx);
+        auto z = reinterpret_cast<T *> (dz);

-    decoderKernel<T>(dx, N, dz);
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        __shared__ float threshold;
+        __shared__ int limit;
+
+        __shared__ FloatBits fb;
+        if (threadIdx.x == 0) {
+            limit = x[0];
+            fb.i_ = x[2];
+            threshold = fb.f_;
+        }
+        __syncthreads();
+
+        for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
+            int el = x[e+4];
+            int ael = sd::math::nd4j_abs<int>(el) - 1;
+
+            // TODO: investigate, if += would work better here, as in "decoded accumulation"
+            z[ael] += el > 0 ? threshold : -threshold;
+        }
 }

 //////////////////////////////////////////////////////////////////////////
@ -261,14 +364,78 @@ __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void
    execDecoderKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz);
    sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), FLOAT_TYPES);


 //////////////////////////////////////////////////////////////////////////
 template<typename T>
 __global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
+        auto dx = reinterpret_cast<T *>(vdx);
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;

-    cudaEncodeBitmapKernel<T>(vdx, N, dz, scalar, reductionBuffer, threshold);
+        T off(0.0f);
+        __shared__ int counter;
+        __shared__ int *shmem;
+        __shared__ T *vals;
+        if (threadIdx.x == 0){
+            extern __shared__ char mem[];
+            shmem = reinterpret_cast<int*>(mem);
+            vals = reinterpret_cast<T *>(shmem + blockDim.x);
+            counter = 0;
+        }
+        __syncthreads();
+
+        Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
+        Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
+
+        for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
+            // all threads in block reading stuff
+            T val = i < N ? dx[i] : off;
+            T abs = sd::math::nd4j_abs<T>(val);
+
+            int byteId = i / 16 + 4;
+            int bitId = i % 16;
+
+            shmem[threadIdx.x] = 0;
+            vals[threadIdx.x] = val;
+
+            if (abs >= static_cast<T>(threshold) && i < N) {
+                shmem[threadIdx.x] = 1 << (bitId);
+                atomicAdd(&counter, 1);
+                if (val < static_cast<T>(0.0f)) {
+                    shmem[threadIdx.x] |= 1 << (bitId + 16);
+                    vals[threadIdx.x] += static_cast<T>(threshold);
+                } else {
+                    vals[threadIdx.x] -= static_cast<T>(threshold);
+                }
+            } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
+                atomicAdd(&counter, 1);
+                shmem[threadIdx.x] = 1 << (bitId + 16);
+
+                vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
+            }
+            __syncthreads();
+
+            if (threadIdx.x % 16 == 0 && i < N) {
+                int byte = 0;
+                for (int e = 0; e < 16; e++) {
+                    if (i + e >= N)
+                        continue;
+
+                    byte |= shmem[threadIdx.x + e];
+                }
+                dz[byteId] = byte;
+            }
+            __syncthreads();
+
+            if (i < N)
+                dx[i] = vals[threadIdx.x];
+        }
+        __syncthreads();
+
+        if (threadIdx.x == 0) {
+            atomicAdd(scalar, counter);
+        }
 }

 //////////////////////////////////////////////////////////////////////////
@ -278,14 +445,62 @@ __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
    execCudaEncodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, N, dz, scalar, reductionBuffer, threshold);
    sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES);


 //////////////////////////////////////////////////////////////////////////
 template<typename T>
 __global__ static void execCudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
+        auto dz = static_cast<T*>(vdz);

-     cudaDecodeBitmapKernel<T>(dx, N, vdz);
+        int tid = blockIdx.x * blockDim.x + threadIdx.x;
+        __shared__ T *shmem;
+        __shared__ FloatBits fb;
+        __shared__ float threshold;
+        __shared__ int *x;
+        if (threadIdx.x == 0){
+            extern __shared__ char mem[];
+            shmem = reinterpret_cast<T*>(mem);
+            x = reinterpret_cast<int *>(dx);
+            fb.i_ = x[2];
+            threshold = fb.f_;
+        }
+        __syncthreads();
+
+        int lim = N / 16 + 5;
+        for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
+            int byteId = i / 16 + 4;
+//        printf("I: [%i]; byteId: [%i]\n", i, byteId);
+
+            shmem[threadIdx.x] = dz[i];
+            __syncthreads();
+
+            if (threadIdx.x % 16 == 0) {
+                int byte = x[byteId];
+
+                for (int e = 0; e < 16; e++) {
+                    if (i + e >= N)
+                        continue;
+
+                    int bitId = (i + e) % 16;
+
+                    bool hasBit = (byte & 1 << (bitId) ) != 0;
+                    bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
+
+                    if (hasBit) {
+                        if (hasSign)
+                            shmem[threadIdx.x + bitId] -= threshold;
+                        else
+                            shmem[threadIdx.x + bitId] += threshold;
+                    } else if (hasSign) {
+                        shmem[threadIdx.x + bitId] -= threshold / 2;
+                    }
+                }
+            }
+            __syncthreads();
+
+            dz[i] = shmem[threadIdx.x];
+        }
 }

 //////////////////////////////////////////////////////////////////////////
@ -295,7 +510,7 @@ __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
    execCudaDecodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, vdz);
    sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed");
 }
-BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES);


    template <bool storeSum, bool isNP2>
--- a/libnd4j/include/loops/impl/type_conversions.cpp
+++ b/libnd4j/include/loops/impl/type_conversions.cpp
@ -106,8 +106,14 @@ namespace sd {
        auto l = static_cast<int>(N);
        z[1] = l;

+#ifdef _OPENMP
        int threads = OmpLaunchHelper::betterThreads(N);
-        int span = OmpLaunchHelper::betterSpan(N, threads);
+        auto span = OmpLaunchHelper::betterSpan(N, threads);
+#else
+        int threads = 1;
+        auto span = N;
+#endif
+

        T tt = static_cast<T>(threshold);
        T mtt = -tt;
@ -209,21 +215,23 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
        samediff::Threads::parallel_for(func,  0, N);
    };

+    template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
+    template void TypeCast::convertFromThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);

+    template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
+    template void TypeCast::convertToThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);

+    template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertFromQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);

+    template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
    template void TypeCast::convertToQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
-    template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);

 #ifndef __CLION_IDE__
    BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES)
--- a/libnd4j/include/loops/type_conversions.h
+++ b/libnd4j/include/loops/type_conversions.h
@ -110,29 +110,6 @@ namespace sd {
    }

 #ifdef __CUDACC__
-    /*
- * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
- */
-    template<typename T>
-    __device__ inline void encoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
-        auto x = reinterpret_cast<T *> (dx);
-        auto z = reinterpret_cast<int *> (dz);
-
-        //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
-        Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-        int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
-        int bp=__syncthreads_count(pass);
-
-        if (threadIdx.x == 0) {
-            // saving out per-block passes
-            z[blockIdx.x+1] = bp;
-
-            // saving out sum
-            atomicAdd(&z[0], bp);
-        }
-    }
-
    __device__ __inline__ int pow2i (int e){
        return 1<<e;
    }
@ -140,274 +117,21 @@ namespace sd {
    template<typename T>
    __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold);

-/*
- * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
- *
- * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
- */
-    template<typename T>
-    __device__ inline void encoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
-        T *x = reinterpret_cast<T *> (dx);
-        int *z = reinterpret_cast<int *> (dz);
-
-        Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
-        extern __shared__ int warpTotals[];
-
-        // fetch block offset only once
-        __shared__ float threshold;
-        __shared__ FloatBits fb;
-        __shared__ int bo;
-        __shared__ int limit;
-        if (threadIdx.x == 0) {
-            limit = z[0];
-            fb.i_ = z[2];
-            threshold = fb.f_;
-            bo = offsets[blockIdx.x];
-        }
-        __syncthreads();
-
-        if (tid < N) {
-            T value = x[tid];
-            int pred = sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
-            int w_i = threadIdx.x/warpSize; //warp index
-            int w_l = tid % warpSize;//thread index within a warp
-            unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required)
-
-            int b   = __ballot_sync(t_m, pred); //balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
-            int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
-
-            if(w_l==warpSize-1){
-                warpTotals[w_i]=t_u+pred;
-            }
-//            __syncthreads(); // Eliminated due RTX20xx specific
-
-            if(w_i==0 && w_l<blockDim.x/warpSize){
-                int w_i_u=0;
-                for(int j=0;j<=5;j++){
-                    unsigned int b_j =__ballot_sync(t_m, warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
-                    w_i_u += (__popc(b_j) << j);
-                    //printf("indice %i t_m=%i,j=%i,b_j=%i,w_i_u=%i\n",w_l,t_m,j,b_j,w_i_u);
-                }
-                warpTotals[w_l]=w_i_u;
-            }
-
-//            __syncthreads();  // Eliminated due RTX20xx specific
-
-            if(pred){
-                int idx = t_u + warpTotals[w_i] + bo + 4;
-                if (idx < limit + 4) {
-                    z[idx]= value > static_cast<T>(0.0f) ? tid+1 : -(tid + 1);
-                    x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
-                }
-            }
-        }
-    }

    template<typename T>
    __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz);


-    /*
-*   This kernel handles decode from sparse threshold array, to dense array
- *
- *   PLEASE NOTE: Z is expected to be memset to 0
-*/
-    template<typename T>
-    __device__ inline void decoderKernel(void *dx, Nd4jLong N, void *dz) {
-        auto x = reinterpret_cast<int *> (dx);
-        auto z = reinterpret_cast<T *> (dz);
-
-        int tid = blockIdx.x * blockDim.x + threadIdx.x;
-        __shared__ float threshold;
-        __shared__ int limit;
-
-        __shared__ FloatBits fb;
-        if (threadIdx.x == 0) {
-            limit = x[0];
-            fb.i_ = x[2];
-            threshold = fb.f_;
-        }
-        __syncthreads();
-
-        for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
-            int el = x[e+4];
-            int ael = sd::math::nd4j_abs<int>(el) - 1;
-
-            // TODO: investigate, if += would work better here, as in "decoded accumulation"
-            z[ael] += el > 0 ? threshold : -threshold;
-        }
-    }
-
    template<typename T>
    __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz);

-
-//////////////////////////////////////////////////////////////////////////    
-    template<typename T>
-    __device__ inline void cudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
-
-        auto dx = reinterpret_cast<T *>(vdx);
-        int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-        T off(0.0f);
-        __shared__ int counter;
-        __shared__ int *shmem;
-        __shared__ T *vals;
-        if (threadIdx.x == 0){
-            extern __shared__ char mem[];
-            shmem = reinterpret_cast<int*>(mem);
-            vals = reinterpret_cast<T *>(shmem + blockDim.x);
-            counter = 0;
-        }
-        __syncthreads();
-
-        Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
-        Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
-
-        for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
-            // all threads in block reading stuff
-            T val = i < N ? dx[i] : off;
-            T abs = sd::math::nd4j_abs<T>(val);
-
-            int byteId = i / 16 + 4;
-            int bitId = i % 16;
-
-            shmem[threadIdx.x] = 0;
-            vals[threadIdx.x] = val;
-
-            if (abs >= static_cast<T>(threshold) && i < N) {
-                shmem[threadIdx.x] = 1 << (bitId);
-                atomicAdd(&counter, 1);
-                if (val < static_cast<T>(0.0f)) {
-                    shmem[threadIdx.x] |= 1 << (bitId + 16);
-                    vals[threadIdx.x] += static_cast<T>(threshold);
-                } else {
-                    vals[threadIdx.x] -= static_cast<T>(threshold);
-                }
-            } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
-                atomicAdd(&counter, 1);
-                shmem[threadIdx.x] = 1 << (bitId + 16);
-
-                vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
-            }
-            __syncthreads();
-
-            if (threadIdx.x % 16 == 0 && i < N) {
-                int byte = 0;
-                for (int e = 0; e < 16; e++) {
-                    if (i + e >= N)
-                        continue;
-
-                    byte |= shmem[threadIdx.x + e];
-                }
-                dz[byteId] = byte;
-            }
-            __syncthreads();
-
-            if (i < N)
-                dx[i] = vals[threadIdx.x];
-        }
-        __syncthreads();
-
-        if (threadIdx.x == 0) {
-            atomicAdd(scalar, counter);
-        }
-    }
-
    template<typename T>
    __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);

-//////////////////////////////////////////////////////////////////////////
-    template<typename T>
-    __device__ inline void cudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
-
-        auto dz = static_cast<T*>(vdz);
-        
-        int tid = blockIdx.x * blockDim.x + threadIdx.x;
-        __shared__ T *shmem;
-        __shared__ FloatBits fb;
-        __shared__ float threshold;
-        __shared__ int *x;
-        if (threadIdx.x == 0){
-            extern __shared__ char mem[];
-            shmem = reinterpret_cast<T*>(mem);
-            x = reinterpret_cast<int *>(dx);
-            fb.i_ = x[2];
-            threshold = fb.f_;
-        }
-        __syncthreads();
-
-        int lim = N / 16 + 5;
-        for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
-            int byteId = i / 16 + 4;
-//        printf("I: [%i]; byteId: [%i]\n", i, byteId);
-
-            shmem[threadIdx.x] = dz[i];
-            __syncthreads();
-
-            if (threadIdx.x % 16 == 0) {
-                int byte = x[byteId];
-
-                for (int e = 0; e < 16; e++) {
-                    if (i + e >= N)
-                        continue;
-
-                    int bitId = (i + e) % 16;
-
-                    bool hasBit = (byte & 1 << (bitId) ) != 0;
-                    bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
-
-                    if (hasBit) {
-                        if (hasSign)
-                            shmem[threadIdx.x + bitId] -= threshold;
-                        else
-                            shmem[threadIdx.x + bitId] += threshold;
-                    } else if (hasSign) {
-                        shmem[threadIdx.x + bitId] -= threshold / 2;
-                    }
-                }
-            }
-            __syncthreads();
-
-            dz[i] = shmem[threadIdx.x];
-        }
-    }

    template<typename T>
    __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz);

-    // __global__ void cudaEncodeBitmapFloat(float *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
-
-    // __global__ void cudaEncodeBitmapDouble(double *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
-
-    // __global__ void cudaEncodeBitmapHalf(float16 *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
-
-    // __global__ void cudaDecodeBitmapFloat(void *dx, Nd4jLong N, float *dz);
-
-    // __global__ void cudaDecodeBitmapDouble(void *dx, Nd4jLong N, double *dz);
-
-    // __global__ void cudaDecodeBitmapHalf(void *dx, Nd4jLong N, float16 *dz);
-
-    // __global__ void encoderKernelP1Float(void *dx, Nd4jLong N, void *dz, float threshold);
-
-    // __global__ void encoderKernelP1Double(void *dx, Nd4jLong N, void *dz, float threshold);
-
-    // __global__ void encoderKernelP1Half(void *dx, Nd4jLong N, void *dz, float threshold);
-
-    // __global__ void encoderKernelP2Float(int *dx, Nd4jLong N, int *dz);
-
-    // __global__ void encoderKernelP3Float(void *dx, int *offsets, Nd4jLong N, void *dz);
-
-    // __global__ void encoderKernelP3Double(void *dx, int *offsets, Nd4jLong N, void *dz);
-
-    // __global__ void encoderKernelP3Half(void *dx, int *offsets, Nd4jLong N, void *dz);
-
-    // __global__ void decoderKernelFloat(void *dx, Nd4jLong N, void *dz);
-
-    // __global__ void decoderKernelDouble(void *dx, Nd4jLong N, void *dz);
-
-    // __global__ void decoderKernelHalf(void *dx, Nd4jLong N, void *dz);
-
    __global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex);

    template <bool storeSum, bool isNP2>
--- a/libnd4j/include/ops/declarable/CustomOperations.h
+++ b/libnd4j/include/ops/declarable/CustomOperations.h
@ -25,6 +25,7 @@
 #include <ops/declarable/headers/boolean.h>
 #include <ops/declarable/headers/broadcastable.h>
 #include <ops/declarable/headers/convo.h>
+#include <ops/declarable/headers/compression.h>
 #include <ops/declarable/headers/list.h>
 #include <ops/declarable/headers/recurrent.h>
 #include <ops/declarable/headers/transforms.h>
--- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp
+++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp
@ -138,9 +138,9 @@ DECLARE_SHAPE_FN(matmul) {
 //////////////////////////////////////////////////////////////////////
 DECLARE_TYPES(matmul) {
    getOpDescriptor()
-            ->setAllowedInputTypes(0, {ALL_FLOATS})
-            ->setAllowedInputTypes(1, {ALL_FLOATS})
-            ->setAllowedOutputTypes(0, {ALL_FLOATS});
+            ->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS})
+            ->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS})
+            ->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS});
 }

 //////////////////////////////////////////////////////////////////////
--- a/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp
+++ b/libnd4j/include/ops/declarable/generic/compression/bitmap.cpp
@ -0,0 +1,92 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author George A. Shulinok <sgazeos@gmail.com>
+//
+
+#include <system/op_boilerplate.h>
+#include <ops/declarable/CustomOperations.h>
+#include <ops/declarable/helpers/compression.h>
+
+#if NOT_EXCLUDED(OP_decode_bitmap)
+namespace sd {
+    namespace ops {
+        CUSTOM_OP_IMPL(decode_bitmap, 2, 1, true, 0, 0) {
+            auto encoded = INPUT_VARIABLE(1);
+            auto updates = OUTPUT_VARIABLE(0);
+
+            helpers::decodeBitmap(block.launchContext(), encoded, updates);
+            return Status::OK();
+        }
+
+        DECLARE_SHAPE_FN(decode_bitmap) {
+            auto weights = INPUT_VARIABLE(0);
+
+            return SHAPELIST(weights->shapeInfo());
+        }
+
+        DECLARE_TYPES(decode_bitmap) {
+            getOpDescriptor()
+                    ->setAllowedInputTypes(0, {ALL_FLOATS})
+                    ->setAllowedInputTypes(1, DataType::INT32)
+                    ->setAllowedOutputTypes({ALL_FLOATS});
+        }
+    }
+}
+#endif
+
+#if NOT_EXCLUDED(OP_encode_bitmap)
+namespace sd {
+    namespace ops {
+        CUSTOM_OP_IMPL(encode_bitmap, 1, 3, true, 1, 0) {
+            auto input = INPUT_VARIABLE(0);
+            auto encoded = OUTPUT_NULLIFIED(1);
+            auto counter = OUTPUT_NULLIFIED(2);
+
+            float threshold = T_ARG(0);
+
+            encoded->p(0, (int) input->lengthOf());
+            encoded->p(1, (int) input->lengthOf());
+            encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
+            encoded->p(3, 1); // flag for BITMAP_ENCODING
+
+            auto result = helpers::encodeBitmap(block.launchContext(), input, encoded, threshold);
+            counter->p(0, result);
+            counter->syncToDevice();
+
+            return Status::OK();
+        }
+
+        DECLARE_SHAPE_FN(encode_bitmap) {
+            auto input = inputShape->at(0);
+
+            auto outputLength = shape::length(input) / 16 + 5;
+            auto encodedShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outputLength, DataType::INT32);
+            auto encodedCounter = ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT32);
+            return SHAPELIST(input, encodedShape, encodedCounter);
+        }
+
+        DECLARE_TYPES(encode_bitmap) {
+            getOpDescriptor()
+                    ->setAllowedInputTypes(sd::DataType::ANY)
+                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
+                    ->setAllowedInputTypes(1, DataType::INT32)
+                    ->setAllowedInputTypes(2, DataType::INT32);
+        }
+    }
+}
+#endif
--- a/libnd4j/include/ops/declarable/generic/compression/threshold.cpp
+++ b/libnd4j/include/ops/declarable/generic/compression/threshold.cpp
@ -0,0 +1,104 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#include <system/op_boilerplate.h>
+#include <ops/declarable/CustomOperations.h>
+#include <ops/declarable/helpers/threshold.h>
+
+namespace sd {
+    namespace ops {
+        CUSTOM_OP_IMPL(encode_threshold, 1, 2, true, 1, 0) {
+            auto x = INPUT_VARIABLE(0);
+            auto updated = OUTPUT_VARIABLE(0);
+            auto encoded = OUTPUT_NULLIFIED(1);
+
+            float threshold = T_ARG(0);
+
+            REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
+            REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "encode_threshold: array for encoded updates can't have less than 4 elements");
+//            REQUIRE_TRUE(x->platformBuffer() == updated->platformBuffer(), 0, "encode_threshold: gradients array must be the same at input and output");
+
+            // filling header bytes
+            encoded->p(0, encoded->lengthOf() - 4);
+            encoded->p(1, (int) x->lengthOf());
+            encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
+            encoded->p(3, 0); // flag for FLEXIBLE_ENCODING
+
+            // if there's no updates to process - just skip execution
+            if (encoded->lengthOf() == 4)
+                return Status::OK();
+
+            helpers::thresholdEncode(*x, *encoded, threshold);
+
+            return Status::OK();
+        }
+
+        DECLARE_SHAPE_FN(encode_threshold) {
+            auto x = INPUT_VARIABLE(0);
+            // we have limit option here
+            int boundary = block.numI() > 0 ? I_ARG(0) : DataTypeUtils::max<int>();
+            float threshold = T_ARG(0);
+
+            REQUIRE_TRUE(boundary >= 0, 0, "encode_threshold: boundary must be positive");
+            REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
+
+            // we must calculate number of elements that >= threshold
+            auto elements = sd::math::nd4j_min<int>(helpers::thresholdEstimate(*x, threshold), boundary);
+            if (elements < 2)
+                elements = 0;
+
+            // result array must have 4 additional int elements for header
+            return SHAPELIST(x->shapeInfo(), sd::ConstantShapeHelper::getInstance()->vectorShapeInfo(elements + 4, DataType::INT32));
+        }
+
+        DECLARE_TYPES(encode_threshold) {
+            getOpDescriptor()
+                    ->setAllowedInputTypes(0, {ALL_FLOATS})
+                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
+                    ->setAllowedOutputTypes(1, DataType::INT32);
+        }
+
+        CUSTOM_OP_IMPL(decode_threshold, 2, 1, true, 0, 0) {
+            auto weights = INPUT_VARIABLE(0);
+            auto encoded = INPUT_VARIABLE(1);
+            auto updates = OUTPUT_VARIABLE(0);
+
+            REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "decode_threshold: encoded array can't have length < 4");
+            REQUIRE_TRUE(updates->lengthOf() == encoded->e<int>(1), 0, "decode_threshold: updates array must have length equal to [%i]", encoded->e<int>(1));
+            REQUIRE_TRUE(encoded->e<int>(3) == 0, 0, "decode_threshold: encoded array doesn't look like threshold-encoded");
+
+            helpers::thresholdDecode(*encoded, *updates);
+
+            return Status::OK();
+        }
+
+        DECLARE_SHAPE_FN(decode_threshold) {
+            auto weights = inputShape->at(0);
+            return SHAPELIST(weights);
+        }
+
+        DECLARE_TYPES(decode_threshold) {
+            getOpDescriptor()
+                    ->setAllowedInputTypes(0, {ALL_FLOATS})
+                    ->setAllowedInputTypes(1, DataType::INT32)
+                    ->setAllowedOutputTypes(0,{ALL_FLOATS});
+        }
+    }
+}
--- a/libnd4j/include/ops/declarable/headers/compression.h
+++ b/libnd4j/include/ops/declarable/headers/compression.h
@ -0,0 +1,62 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+//  @author sgazeos@gmail.com
+//
+#ifndef SD_HEADERS_COMPRESSION_H
+#define SD_HEADERS_COMPRESSION_H
+
+#include <ops/declarable/headers/common.h>
+
+namespace sd {
+    namespace ops {
+        
+        /**
+         * encode_bitmap - reinterpret 3D float tensor into uint8_t vector with length N.
+         *
+         * Input:
+         *      0 - 3D float tensor with shape {height, width, channels}
+         *
+         * Output:
+         *      0 - 1D uint8_t tensor with shape {N}
+         */
+        #if NOT_EXCLUDED(OP_encode_bitmap)
+        DECLARE_CUSTOM_OP(encode_bitmap, 1, 3, true, 1, 0);
+        #endif
+
+        /**
+         *  decode_bitmap - reinterpret uint8_t linear tensor as data to float tensor with shape
+         *
+         *  Input:
+         *      0 - uint8_t vector with length N ( shape {N})
+         *
+         *  Output:
+         *      0 - 3D tensor with shape {height, width, channels}
+         *
+         */
+        #if NOT_EXCLUDED(OP_decode_bitmap)
+        DECLARE_CUSTOM_OP(decode_bitmap, 2, 1, true, 0, 0);
+        #endif
+
+
+        DECLARE_CUSTOM_OP(encode_threshold, 2, 1, true, 1, 0);
+        DECLARE_CUSTOM_OP(decode_threshold, 2, 1, true, 0, 0);
+    }
+}
+
+#endif // SD_HEADERS_COMPRESSION_H
--- a/libnd4j/include/ops/declarable/helpers/compression.h
+++ b/libnd4j/include/ops/declarable/helpers/compression.h
@ -0,0 +1,34 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+//  @author sgazeos@gmail.com
+//
+#ifndef __COMPRESSION_H_HELPERS__
+#define __COMPRESSION_H_HELPERS__
+#include <system/op_boilerplate.h>
+#include <array/NDArray.h>
+
+namespace sd {
+namespace ops {
+namespace helpers {
+
+    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output);
+    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold);
+}
+}
+}
+#endif
--- a/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/compression.cpp
@ -0,0 +1,37 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+//  @author sgazeos@gmail.com
+//
+#include <ops/declarable/helpers/compression.h>
+#include <execution/Threads.h>
+
+namespace sd {
+namespace ops {
+namespace helpers {
+
+    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
+        NativeOpExecutioner::decodeBitmap(input->buffer(), output->lengthOf(), output->buffer(), output->shapeInfo());
+    }
+
+
+    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
+        return NativeOpExecutioner::encodeBitmap(input->buffer(), input->shapeInfo(), input->lengthOf(), output->bufferAsT<int>(), threshold);
+    }
+}
+}
+}
--- a/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/compression/threshold.cpp
@ -0,0 +1,62 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#include <ops/declarable/helpers/threshold.h>
+#include <execution/Threads.h>
+#include <helpers/threshold.h>
+
+namespace sd {
+    namespace ops {
+        namespace helpers {
+            template <typename T>
+            static int32_t thresholdEstimate_(const NDArray &updates, const float threshold) {
+                auto N = updates.lengthOf();
+                const auto buffer = updates.bufferAsT<T>();
+
+                auto func = PRAGMA_REDUCE_LONG {
+                    int64_t cnt = 0;
+                    for (auto e = start; e < stop; e++) {
+                        auto v = sd::math::nd4j_abs<T>(buffer[e]);
+                        if (v >= threshold)
+                            cnt++;
+                    }
+
+                    return cnt;
+                };
+
+                return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N);
+            }
+
+            int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
+                BUILD_SINGLE_SELECTOR(updates.dataType(), return thresholdEstimate_, (updates, threshold), FLOAT_TYPES);
+
+                return 0;
+            }
+
+            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
+                BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertToThreshold, (nullptr, updates.buffer(), updates.lengthOf(), encoded.buffer()), FLOAT_TYPES);
+            }
+
+            void thresholdDecode(const NDArray &encoded, NDArray &updates) {
+                BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertFromThreshold, (nullptr, encoded.getBuffer(), updates.lengthOf(), updates.buffer()), FLOAT_TYPES);
+            }
+        }
+    }
+}
--- a/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/compression.cu
@ -0,0 +1,66 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+//  @author sgazeos@gmail.com
+//
+#include <ops/declarable/helpers/compression.h>
+#include <loops/type_conversions.h>
+#include <helpers/DebugHelper.h>
+
+namespace sd {
+namespace ops {
+namespace helpers {
+    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
+        auto stream = context->getCudaStream();
+        NDArray::prepareSpecialUse({output}, {input});
+
+        dim3 launchDims(512, 512, 16384);
+        auto xType = output->dataType();
+        BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, input->specialBuffer(), output->lengthOf(), output->specialBuffer()), FLOAT_TYPES);
+
+        sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
+
+        NDArray::registerSpecialUse({output}, {input});
+    }
+
+    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
+        auto stream = LaunchContext::defaultContext()->getCudaStream();
+        int *resultPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getScalarPointer());
+        int *reductionPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getReductionPointer());
+
+        // nullify result pointer before use
+        resultPointer[0] = 0;
+
+        NDArray::prepareSpecialUse({},{output, input});
+
+        dim3 launchDims(512, 512, 32768);
+        auto xType = input->dataType();
+        BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
+                              (launchDims, stream, input->specialBuffer(), input->lengthOf(), reinterpret_cast<int*>(output->specialBuffer()), resultPointer, reductionPointer, threshold),
+                              FLOAT_TYPES);
+
+        sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
+
+        Nd4jLong dZ = (Nd4jLong) resultPointer[0];
+        resultPointer[0] = 0;
+
+        NDArray::registerSpecialUse({output, input}, {});
+        return dZ;
+    }
+}
+}
+}
--- a/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/compression/threshold.cu
@ -0,0 +1,231 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#include <ops/declarable/helpers/threshold.h>
+#include <loops/type_conversions.h>
+#include <helpers/PointersManager.h>
+#include <vector>
+
+namespace sd {
+    namespace ops {
+        namespace helpers {
+            void prescanArrayRecursive(int** g_scanBlockSums, int *dZ, int *dX, int numElements, int level) {
+                auto stream = LaunchContext::defaultContext()->getCudaStream();
+
+
+                int blockSize = 512; // max size of the thread blocks
+                int numBlocks = sd::math::nd4j_max<int>(1, static_cast<int>(ceil(static_cast<float>(numElements) / (2.f * blockSize))));
+                int numThreads;
+
+                if (numBlocks > 1)
+                    numThreads = blockSize;
+                else if (sd::isPowerOfTwo(numElements))
+                    numThreads = numElements / 2;
+                else
+                    numThreads = sd::floorPow2(numElements);
+
+                int numEltsPerBlock = numThreads * 2;
+
+                // if this is a non-power-of-2 array, the last block will be non-full
+                // compute the smallest power of 2 able to compute its scan.
+                int numEltsLastBlock =
+                        numElements - (numBlocks-1) * numEltsPerBlock;
+                int numThreadsLastBlock = sd::math::nd4j_max<int>(1, numEltsLastBlock / 2);
+                int np2LastBlock = 0;
+                int sharedMemLastBlock = 0;
+
+                if (numEltsLastBlock != numEltsPerBlock) {
+                    np2LastBlock = 1;
+
+                    if(!isPowerOfTwo(numEltsLastBlock))
+                        numThreadsLastBlock = floorPow2(numEltsLastBlock);
+
+                    unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
+                    sharedMemLastBlock = sizeof(int) * (2 * numThreadsLastBlock + extraSpace);
+                }
+
+                // padding space is used to avoid shared memory bank conflicts
+                int extraSpace = numEltsPerBlock / NUM_BANKS;
+                int sharedMemSize = sizeof(int) * (numEltsPerBlock + extraSpace);
+
+                // setup execution parameters
+                // if NP2, we process the last block separately
+                dim3 grid(sd::math::nd4j_max<int>(1, numBlocks - np2LastBlock), 1, 1);
+                dim3 threads(numThreads, 1, 1);
+                dim3 gridOnes(1, 1, 1);
+                dim3 threadsOnes(numThreadsLastBlock, 1, 1);
+
+                if (sharedMemSize < 2048)
+                    sharedMemSize = 2048;
+
+                if (sharedMemLastBlock < 2048)
+                    sharedMemLastBlock = 2048;
+
+                // execute the scan
+                if (numBlocks > 1) {
+                    sd::prescanLauncher<true, false>(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0);
+                    if (np2LastBlock) {
+                        sd::prescanLauncher<true, true>(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
+                    }
+
+                    // After scanning all the sub-blocks, we are mostly done.  But now we
+                    // need to take all of the last values of the sub-blocks and scan those.
+                    // This will give us a new value that must be sdded to each block to
+                    // get the final results.
+                    // recursive (CPU) call
+                    prescanArrayRecursive(g_scanBlockSums, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1);
+
+                    sd::uniformAdd<<<grid, threads, 1024, *stream>>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0);
+
+                    if (np2LastBlock) {
+                        sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
+                    }
+                } else if (isPowerOfTwo(numElements)) {
+                    sd::prescanLauncher<false, false>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0);
+                } else {
+                    sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
+                }
+
+                sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
+            }
+
+            static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
+                auto stream = LaunchContext::defaultContext()->getCudaStream();
+
+                prescanArrayRecursive(reinterpret_cast<int**>(prs), dz, dx + 1, (int) N, 0);
+                sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
+            }
+
+            static void encodeThresholdP3_(void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
+                auto stream = LaunchContext::defaultContext()->getCudaStream();
+
+                int blockSize = 512;
+                int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
+
+                dim3 launchDims(numBlocks, blockSize, 8192);
+                auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
+                BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), FLOAT_TYPES);
+
+                sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
+            }
+
+
+            static NDArray thresholdEstimate_(const NDArray &updates, const float threshold) {
+                const int numThreads = 512;
+                const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);
+
+                auto tmp = NDArrayFactory::create<int>('c', {numBlocks + 1});
+
+                dim3 launchDims(numBlocks, numThreads, 1024);
+                auto xType = updates.dataType();
+
+                NDArray::prepareSpecialUse({&tmp}, {&updates});
+                BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, LaunchContext::defaultContext()->getCudaStream(), updates.getSpecialBuffer(), updates.lengthOf(), tmp.specialBuffer(), threshold), FLOAT_TYPES);
+                NDArray::registerSpecialUse({&tmp}, {&updates});
+
+                return std::move(tmp);
+            }
+
+            int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
+                return thresholdEstimate_(updates, threshold).e<int>(0);
+            }
+
+            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
+                // we need these blocks in order to know, how many "updates" will be processed by each GPU block
+                auto blocks = thresholdEstimate_(updates, threshold);
+
+                const int numThreads = 512;
+                const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);
+
+                const int prefixThreads = 512;
+                int numElts = numBlocks;
+                int level = 0;
+
+                // here we just calculate number of sumBlock arrays
+                do {
+                    int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
+                    if (numBlocks > 1) {
+                        level++;
+                    }
+                    numElts = numPrefixBlocks;
+                } while (numElts > 1);
+
+
+
+                std::vector<NDArray> tempArrays(level);
+                std::vector<Nd4jPointer> pointers(level);
+
+                level = 0;
+                numElts = numBlocks;
+
+                do {
+                    int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
+                    if (numPrefixBlocks > 1) {
+                        tempArrays[level] = std::move(NDArrayFactory::create<int>('c', {numPrefixBlocks}));
+                        pointers[level] = tempArrays[level++].specialBuffer();
+                    }
+                    numElts = numPrefixBlocks;
+                } while (numElts > 1);
+
+                PointersManager pm(LaunchContext::defaultContext(), "thresholdEncode");
+                auto dptr = pm.replicatePointer(pointers.data(), pointers.size() * 8);
+                auto offsets = NDArrayFactory::create<int>('c', {numBlocks});
+
+                // we want to check, if we're hiting external limit on number of encoded elements
+                auto numMatches = blocks.e<int>(0);
+                if (numMatches > encoded.lengthOf() - 4) {
+                    blocks.p(0, encoded.lengthOf() - 4);
+                    blocks.syncToDevice();
+                }
+
+                NDArray::prepareSpecialUse({}, {&encoded, &updates});
+
+                // filling offsets
+                encodeThresholdP2Int_(reinterpret_cast<void **>(dptr),
+                                      reinterpret_cast<int*>(blocks.getSpecialBuffer()),
+                                      numBlocks,
+                                      reinterpret_cast<int*>(offsets.getSpecialBuffer()));
+
+                NDArray::registerSpecialUse({&blocks, &offsets}, {});
+                pm.synchronize();
+
+
+                encodeThresholdP3_(updates.specialBuffer(),
+                                   updates.shapeInfo(),
+                                   reinterpret_cast<int*>(offsets.getSpecialBuffer()),
+                                   updates.lengthOf(),
+                                   reinterpret_cast<int*>(encoded.specialBuffer()));
+
+                pm.synchronize();
+
+                NDArray::registerSpecialUse({&encoded, &updates}, {});
+            }
+
+            void thresholdDecode(const NDArray &encoded, NDArray &updates) {
+                dim3 launchDims(128, 512, 512);
+                auto xType = updates.dataType();
+
+                NDArray::prepareSpecialUse({&updates}, {&encoded});
+                BUILD_SINGLE_SELECTOR(xType, decoderKernelGeneric, (launchDims, LaunchContext::defaultContext()->getCudaStream(), encoded.getSpecialBuffer(), updates.lengthOf(), updates.specialBuffer()), FLOAT_TYPES);
+                NDArray::registerSpecialUse({&updates}, {&encoded});
+            }
+        }
+    }
+}
--- a/libnd4j/include/ops/declarable/helpers/threshold.h
+++ b/libnd4j/include/ops/declarable/helpers/threshold.h
@ -0,0 +1,37 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com
+//
+
+#ifndef SD_THRESHOLD_H
+#define SD_THRESHOLD_H
+
+#include <ops/declarable/helpers/helpers.h>
+
+namespace sd {
+    namespace ops {
+        namespace helpers {
+            int32_t thresholdEstimate(const NDArray &updates, float threshold);
+
+            void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold);
+            void thresholdDecode(const NDArray &encoded, NDArray &updates);
+        }
+    }
+}
+
+#endif //SD_THRESHOLD_H
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@ -557,21 +557,26 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
        fb.i_ = x[2];
        float threshold = fb.f_;

+        auto pPos = -1;

        auto func = PRAGMA_THREADS_FOR {
            for (auto e = start; e < stop; e++) {
+                const auto v = x[e];
                for (int bitId = 0; bitId < 16; bitId++) {
-                    bool hasBit = (x[e] & 1 << (bitId)) != 0;
-                    bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
+                    bool hasBit = (v & 1 << (bitId)) != 0;
+                    bool hasSign = (v & 1 << (bitId + 16)) != 0;
+                    auto cPos = (e - 4) * 16 + bitId;

                    if (hasBit) {
                        if (hasSign)
-                            dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold);
+                            dz[cPos] -= static_cast<T>(threshold);
                        else
-                            dz[(e - 4) * 16 + bitId] += static_cast<T>(threshold);
+                            dz[cPos] += static_cast<T>(threshold);
                    } else if (hasSign) {
-                        dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold / 2);
+                        dz[cPos] -= static_cast<T>(threshold / 2);
                    }
+
+                    pPos = cPos;
                }
            }
        };
@ -582,17 +587,21 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
    template<typename T>
    Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
        auto dx = reinterpret_cast<T *>(vx);
+        const T two(2.0f);
+        const T zero(0.0f);
+        const T t(threshold);
+        const T thalf = t / two;

-//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
-        auto func = PRAGMA_REDUCE_LONG {
+        //auto func = PRAGMA_REDUCE_LONG {
            Nd4jLong retVal = 0L;

-            for (auto x = start; x < stop; x += increment) {
+            PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:retVal)
+            for (auto x = 0; x < N; x += 16) {
                int byte = 0;
                int byteId = x / 16 + 4;

                for (int f = 0; f < 16; f++) {
-                    Nd4jLong e = x + f;
+                    auto e = x + f;

                    if (e >= N)
                        continue;
@ -602,19 +611,19 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)

                    int bitId = e % 16;

-                    if (abs >= (T) threshold) {
+                    if (abs >= t) {
                        byte |= 1 << (bitId);
                        retVal++;

-                        if (val < (T) 0.0f) {
+                        if (val < zero) {
                            byte |= 1 << (bitId + 16);
-                            dx[e] += static_cast<T>(threshold);
+                            dx[e] += t;
                        } else {
-                            dx[e] -= static_cast<T>(threshold);
+                            dx[e] -= t;
                        }
-                    } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) {
+                    } else if (abs >= thalf && val < zero) {
                        byte |= 1 << (bitId + 16);
-                        dx[e] += static_cast<T>(threshold / 2);
+                        dx[e] += thalf;

                        retVal++;
                    }
@ -624,8 +633,9 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
            }

            return retVal;
-        };
-        return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
+        //};
+
+        //return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
    }
 }

--- a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu
+++ b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu
@ -57,7 +57,7 @@ static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) {
    multiplyKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("multiply failed", err);
+        throw sd::cuda_exception::build("multiply failed", err);
 }

 template <typename T>
@ -80,7 +80,7 @@ static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) {
    sumKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("sum failed", err);
+        throw sd::cuda_exception::build("sum failed", err);
 }

 template <typename T>
@ -103,7 +103,7 @@ static void subLauncher(void *vbuffer, uint64_t length, void *vresult) {
    subKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("sub failed", err);
+        throw sd::cuda_exception::build("sub failed", err);
 }

 template <typename T>
@ -126,7 +126,7 @@ static void divLauncher(void *vbuffer, uint64_t length, void *vresult) {
    divKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
    auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
    if (err != 0)
-        sd::cuda_exception::build("div failed", err);
+        throw sd::cuda_exception::build("div failed", err);
 }

 static void multiplyHost(NDArray &input, NDArray &output) {
--- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
@ -42,18 +42,18 @@ endif()
 # -fsanitize=address
 # -fsanitize=leak
 if (APPLE)
-    set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2 -D__APPLE_OS__=true")
+    set(CMAKE_CXX_FLAGS  " -fPIC -D__APPLE_OS__=true")
 elseif(WIN32)
    if (SD_CPU)
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
    endif()

 	if (SD_CPU AND LINUX)
-		set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2")
+		set(CMAKE_CXX_FLAGS  " -fPIC")
 	endif()
 else()
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-    set(CMAKE_CXX_FLAGS  " -fPIC  -fmax-errors=2")
+    set(CMAKE_CXX_FLAGS  " -fPIC")
    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
    else()
@ -82,12 +82,12 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")

 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    # using GCC
-    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS}")
-endif()
+    SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -fmax-errors=2")

-if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
        SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
+    endif()
 endif()

 IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
@ -25,6 +25,7 @@
 #include <ops/ops.h>
 #include <helpers/GradCheck.h>
 #include <array>
+#include <helpers/RandomLauncher.h>


 using namespace sd;
@ -39,6 +40,195 @@ public:
    }
 };

+TEST_F(DeclarableOpsTests19, test_threshold_encode_1) {
+    auto x = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
+    auto exp_encoded = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
+    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
+
+    sd::ops::encode_threshold op;
+    auto result = op.evaluate({&x}, {0.5});
+
+    auto gradients = result.at(0);
+    auto encoded = result.at(1);
+
+    //encoded->printIndexedBuffer("ENC");
+
+    ASSERT_EQ(exp_encoded, *encoded);
+    ASSERT_EQ(exp_gradients, x);
+
+    // FIXME: we need to add a way to declare individual inplace outputs
+    //ASSERT_EQ(exp_gradients, *gradients);
+}
+
+TEST_F(DeclarableOpsTests19, test_threshold_encode_2) {
+    for (int length = 5; length < 35; length++) {
+        auto x = NDArrayFactory::create<double>('c', {10000});
+        auto exp_gradients = NDArrayFactory::create<double>('c', {10000});
+
+        for (int e = 0; e < length; e++) {
+            x.p(e, 2e-3);
+            exp_gradients.p(e, 1e-3);
+        }
+
+        sd::ops::encode_threshold op;
+        auto result = op.evaluate({&x}, {1e-3});
+
+        auto encoded = result.at(1);
+
+        ASSERT_EQ(length + 4, encoded->lengthOf());
+        ASSERT_EQ(exp_gradients, x);
+    }
+}
+
+TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_1) {
+    auto x = NDArrayFactory::create<float>('c', {6});
+    x = 1.0f;
+
+    sd::ops::encode_threshold op;
+    auto result = op.evaluate({&x}, {1.0}, {3});
+
+    auto gradients = result.at(0);
+    auto encoded = result.at(1);
+
+    ASSERT_EQ(7, encoded->lengthOf());
+    ASSERT_EQ(3, x.sumNumber().e<int>(0));
+}
+
+TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_2) {
+    auto x = NDArrayFactory::create<float>('c', {1000});
+    x = 1.0f;
+
+    sd::ops::encode_threshold op;
+    auto result = op.evaluate({&x}, {1.0}, {100});
+
+    auto gradients = result.at(0);
+    auto encoded = result.at(1);
+
+    ASSERT_EQ(104, encoded->lengthOf());
+
+    ASSERT_EQ(900, x.sumNumber().e<int>(0));
+}
+
+TEST_F(DeclarableOpsTests19, test_threshold_decode_1) {
+    auto x = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
+    auto y = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
+    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
+
+    sd::ops::decode_threshold op;
+    auto status = op.execute({&x, &y}, {&x});
+    ASSERT_EQ(Status::OK(), status);
+    ASSERT_EQ(exp_gradients, x);
+}
+
+TEST_F(DeclarableOpsTests19, test_bitmap_encode_1) {
+    auto initial = NDArrayFactory::create<float>('c', {6}, {0.0f, 0.0f, 1e-3f, -1e-3f, 0.0f, 0.0f});
+    auto exp_0 = initial.like();
+    auto exp_1 = initial.dup();
+    auto exp_c = NDArrayFactory::create<int>(2L);
+
+    sd::ops::encode_bitmap enc;
+    auto enc_result = enc.evaluate({&initial}, {1e-3f});
+    ASSERT_EQ(Status::OK(), enc_result.status());
+
+    //initial.printIndexedBuffer("initial");
+    ASSERT_EQ(exp_0, initial);
+
+    auto encoded = enc_result.at(1);
+    auto counter = enc_result.at(2);
+
+    //encoded->printIndexedBuffer("encoded");
+
+    ASSERT_EQ(exp_c, *counter);
+
+    sd::ops::decode_bitmap dec;
+    auto status = dec.execute({&initial, encoded}, {&initial});
+    ASSERT_EQ(Status::OK(), status);
+
+
+    //initial.printIndexedBuffer();
+
+    ASSERT_EQ(exp_1, initial);
+}
+
+TEST_F(DeclarableOpsTests19, test_bitmap_encode_decode) {
+    auto initial = NDArrayFactory::create<float>('c', {256000});
+    initial = 1.0f;
+    auto exp = initial.dup();
+    auto neg = initial.like();
+    neg = 0.5f;
+
+    sd::ops::encode_bitmap enc;
+    auto enc_result = enc.evaluate({&initial}, {0.5f});
+    auto encoded = enc_result.at(1);
+
+    // checking equality of all encoded bits
+    for (int e = 5; e < encoded->lengthOf() - 1; e++) {
+        if (encoded->e<int>(e) != encoded->e<int>(e - 1))
+            nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
+    }
+
+    ASSERT_NE(exp, initial);
+    ASSERT_EQ(neg, initial);
+
+    sd::ops::decode_bitmap dec;
+    auto status = dec.execute({&initial, encoded}, {&initial});
+    ASSERT_EQ(Status::OK(), status);
+
+    // checking equality of all dedoded bits
+    for (int e = 0; e < initial.lengthOf(); e++) {
+        auto f = initial.e<float>(e);
+        if (f != 1.0f)
+            nd4j_printf("initial[%i] = %f\n", e, f);
+    }
+
+
+    ASSERT_EQ(exp, initial);
+}
+
+TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) {
+    auto initial = NDArrayFactory::create<float>('c', {256000});
+    initial = 1.0f;
+    auto exp = initial.dup();
+    auto neg = initial.like();
+    neg = 0.5f;
+
+    sd::ops::encode_threshold enc;
+    auto enc_result = enc.evaluate({&initial}, {0.5f});
+    auto encoded = enc_result.at(1);
+
+    ASSERT_EQ(256000 + 4, encoded->lengthOf());
+    ASSERT_NE(exp, initial);
+
+    for (int e = 0; e < initial.lengthOf(); e++) {
+        auto f = initial.e<float>(e);
+        if (f != 0.5f) {
+            nd4j_printf("initial[%i] = %f\n", e, f);
+            throw std::runtime_error("");
+        }
+    }
+    ASSERT_EQ(neg, initial);
+
+    // checking equality of all encoded bits
+    //for (int e = 5; e < encoded->lengthOf() - 1; e++) {
+        //if (encoded->e<int>(e) != encoded->e<int>(e - 1) + 1)
+            //nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
+    //}
+
+    sd::ops::decode_threshold dec;
+    auto status = dec.execute({&initial, encoded}, {&initial});
+    ASSERT_EQ(Status::OK(), status);
+
+    // checking equality of all dedoded bits
+    for (int e = 0; e < initial.lengthOf(); e++) {
+        auto f = initial.e<float>(e);
+        if (f != 1.0f)
+            nd4j_printf("initial[%i] = %f\n", e, f);
+    }
+
+    ASSERT_EQ(exp, initial);
+}
+
+
 TEST_F(DeclarableOpsTests19, test_matmul_ccc) {
    auto x = NDArrayFactory::create<float>('c', {10, 10});
    auto y = NDArrayFactory::create<float>('c', {10, 10});
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
@ -73,6 +73,10 @@ public class ImportClassMapping {
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastRSubOp.class,
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp.class,
            org.nd4j.linalg.api.ops.impl.broadcast.BroadcastTo.class,
+            org.nd4j.linalg.api.ops.compression.EncodeBitmap.class,
+            org.nd4j.linalg.api.ops.compression.DecodeBitmap.class,
+            org.nd4j.linalg.api.ops.compression.EncodeThreshold.class,
+            org.nd4j.linalg.api.ops.compression.DecodeThreshold.class,
            org.nd4j.linalg.api.ops.impl.shape.Create.class,
            org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastEqualTo.class,
            org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastGreaterThan.class,
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeBitmap.java
@ -0,0 +1,55 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.nd4j.linalg.api.ops.compression;
+
+import lombok.NonNull;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Bitmap decoding op wrapper. Used in gradients sharing.
+ * @author raver119@gmail.com
+ */
+public class DecodeBitmap extends DynamicCustomOp {
+
+    public DecodeBitmap() {
+        //
+    }
+
+    public DecodeBitmap(@NonNull INDArray encoded, @NonNull INDArray updates) {
+        addInputArgument(updates, encoded);
+        addOutputArgument(updates);
+
+        // this op ALWAYS modifies updates array
+        setInPlace(true);
+    }
+
+    @Override
+    public String opName() {
+        return "decode_bitmap";
+    }
+
+    @Override
+    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
+        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
+    }
+}
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/DecodeThreshold.java
@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.nd4j.linalg.api.ops.compression;
+
+import lombok.NonNull;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Sparse threshold decoding op wrapper. Used in gradients sharing.
+ * @author raver119@gmail.com
+ */
+public class DecodeThreshold extends DynamicCustomOp {
+
+    public DecodeThreshold() {
+        //
+    }
+
+    public DecodeThreshold(@NonNull INDArray encoded, @NonNull INDArray updates) {
+        addInputArgument(updates, encoded);
+        addOutputArgument(updates);
+
+        // this op ALWAYS modifies updates array
+        setInPlace(true);
+    }
+
+    @Override
+    public String opName() {
+        return "decode_threshold";
+    }
+
+    @Override
+    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
+        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
+    }
+}
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeBitmap.java
@ -0,0 +1,64 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.nd4j.linalg.api.ops.compression;
+
+import lombok.NonNull;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Bitmap encoding op wrapper. Used in gradients sharing.
+ * @author raver119@gmail.com
+ */
+public class EncodeBitmap extends DynamicCustomOp {
+    protected float threshold = 1e-3f;
+
+    public EncodeBitmap() {
+        //
+    }
+
+    public EncodeBitmap(@NonNull INDArray updates, float threshold) {
+        this(updates, Nd4j.create(DataType.INT32, updates.length() / 16 + 5), Nd4j.scalar(DataType.INT32, 0), threshold);
+    }
+
+    public EncodeBitmap(@NonNull INDArray updates, @NonNull INDArray encoded, @NonNull INDArray counter, float threshold) {
+        addInputArgument(updates);
+        addOutputArgument(updates, encoded, counter);
+        addTArgument(threshold);
+
+        this.threshold = threshold;
+
+        // this op ALWAYS modifies updates array
+        setInPlace(true);
+    }
+
+    @Override
+    public String opName() {
+        return "encode_bitmap";
+    }
+
+    @Override
+    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
+        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32, DataType.INT32);
+    }
+}
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/compression/EncodeThreshold.java
@ -0,0 +1,63 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+package org.nd4j.linalg.api.ops.compression;
+
+import lombok.NonNull;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.DynamicCustomOp;
+import org.nd4j.linalg.factory.Nd4j;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Sparse threshold encoding op wrapper. Used in gradients sharing.
+ * @author raver119@gmail.com
+ */
+public class EncodeThreshold extends DynamicCustomOp {
+    protected float threshold = 1e-3f;
+    protected int boundary = Integer.MAX_VALUE;
+
+    public EncodeThreshold() {
+        //
+    }
+
+    public EncodeThreshold(@NonNull INDArray updates, float threshold) {
+        this(updates, threshold, Integer.MAX_VALUE);
+    }
+
+    public EncodeThreshold(@NonNull INDArray updates, float threshold, @NonNull Integer boundary) {
+        addInputArgument(updates);
+
+        addTArgument(threshold);
+        addIArgument(boundary.intValue());
+
+        this.threshold = threshold;
+        this.boundary = boundary;
+    }
+
+    @Override
+    public String opName() {
+        return "encode_threshold";
+    }
+
+    @Override
+    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
+        return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
+    }
+}
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/executioner/DefaultOpExecutioner.java
@ -30,6 +30,10 @@ import org.nd4j.linalg.api.ndarray.INDArrayStatistics;
 import org.nd4j.linalg.api.ops.*;
 import org.nd4j.linalg.api.ops.aggregates.Aggregate;
 import org.nd4j.linalg.api.ops.aggregates.Batch;
+import org.nd4j.linalg.api.ops.compression.DecodeBitmap;
+import org.nd4j.linalg.api.ops.compression.DecodeThreshold;
+import org.nd4j.linalg.api.ops.compression.EncodeBitmap;
+import org.nd4j.linalg.api.ops.compression.EncodeThreshold;
 import org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate;
 import org.nd4j.linalg.api.ops.impl.summarystats.Variance;
 import org.nd4j.linalg.api.rng.Random;
@ -685,38 +689,41 @@ public abstract class DefaultOpExecutioner implements OpExecutioner {

    @Override
    public INDArray thresholdEncode(INDArray input, double threshold) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        return thresholdEncode(input, threshold, Integer.MAX_VALUE);
    }

    @Override
    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        val result = Nd4j.exec(new EncodeThreshold(input, (float) threshold, boundary))[1];
+
+        return result.getInt(0) > 0 ? result : null;
    }

    @Override
    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        Nd4j.exec(new DecodeThreshold(encoded, target));
+        return target;
    }

    @Override
    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        val results = Nd4j.exec(new EncodeBitmap(indArray, target, Nd4j.scalar(0), (float) threshold));
+
+        // return number of elements taht were compressed
+        return results[2].getInt(0);
    }

    @Override
    public INDArray bitmapEncode(INDArray indArray, double threshold) {
-        DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(indArray.length() / 16 + 5);
-
-        INDArray ret = Nd4j.createArrayFromShapeBuffer(buffer, indArray.shapeInfoDataBuffer());
-
-        bitmapEncode(indArray, ret, threshold);
-
-        return ret;
+        val array = Nd4j.create(DataType.INT32, indArray.length() / 16 + 5);
+        bitmapEncode(indArray, array, threshold);
+        return array;
    }

    @Override
    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
-        throw new UnsupportedOperationException("Not yet implemented");
+        Nd4j.exec(new DecodeBitmap(encoded, target));
+        return target;
    }


--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java
@ -1004,20 +1004,6 @@ public interface NativeOps {
                                @Cast("Nd4jLong *") LongPointer tadShapeInfo,
                                @Cast("Nd4jLong *") LongPointer tadOffsets);

-
-    long encodeBitmap(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
-
-    void decodeBitmap(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
-
-
-    void encodeThresholdP1(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
-
-    void encodeThresholdP2Int(PointerPointer extraPointers, IntPointer dx, long N, IntPointer dz);
-
-    void encodeThresholdP3(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, IntPointer offsets, long N, IntPointer dz);
-
-    void decodeThreshold(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
-
    void sort(PointerPointer extraPointers,
                              Pointer x, @Cast("Nd4jLong *") LongPointer xShapeInfo,
                              Pointer dx, @Cast("Nd4jLong *") LongPointer dxShapeInfo,
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaFlexibleThreshold.java
@ -1,100 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-package org.nd4j.linalg.jcublas.compression;
-
-import org.bytedeco.javacpp.IntPointer;
-import org.nd4j.linalg.api.buffer.DataBuffer;
-import org.nd4j.linalg.api.buffer.DataTypeEx;
-import org.nd4j.linalg.api.concurrency.AffinityManager;
-import org.nd4j.linalg.api.ndarray.INDArray;
-import org.nd4j.linalg.compression.CompressedDataBuffer;
-import org.nd4j.linalg.compression.CompressionDescriptor;
-import org.nd4j.linalg.factory.Nd4j;
-import org.nd4j.linalg.indexing.conditions.Conditions;
-
-/**
- * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
- * It encodes data as delta between zero and abs threshold.
- *
- * Unlike CudaThreshold codec, CudaFlexibleThreshold tries to target specified sparsity/density updates ratio via topN approach
- *
- * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
- *
- * @author raver119@gmail.com
- */
-public class CudaFlexibleThreshold extends CudaThreshold {
-
-    public CudaFlexibleThreshold() {
-        super();
-        this.threshold = 0.1f;
-    }
-
-    /**
-     * This method returns compression descriptor. It should be unique for any compressor implementation
-     *
-     * @return
-     */
-    @Override
-    public String getDescriptor() {
-        return "FTHRESHOLD";
-    }
-
-    /**
-     * This method allows you to configure desired sparsity/density ratio for updates. Pass it as float/double value
-     *
-     * Default value: 0.1
-     * @param vars
-     */
-    @Override
-    public void configure(Object... vars) {
-        super.configure(vars);
-    }
-
-
-    @Override
-    public DataBuffer compress(DataBuffer buffer) {
-        INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new long[]{1, buffer.length()}, buffer.dataType()));
-        double max = temp.amaxNumber().doubleValue();
-
-        int cntAbs = temp.scan(Conditions.absGreaterThanOrEqual(max - (max * threshold))).intValue();
-
-        long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
-        int compressedLength = cntAbs + 3;
-        // first 3 elements contain header
-        IntPointer pointer = new IntPointer(compressedLength);
-        pointer.put(0, cntAbs);
-        pointer.put(1, (int) buffer.length());
-        pointer.put(2, Float.floatToIntBits(threshold)); // please note, this value will be ovewritten anyway
-
-        CompressionDescriptor descriptor = new CompressionDescriptor();
-        descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
-        descriptor.setOriginalLength(originalLength);
-        descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
-        descriptor.setNumberOfElements(buffer.length());
-
-        descriptor.setCompressionAlgorithm(getDescriptor());
-        descriptor.setCompressionType(getCompressionType());
-
-        CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
-
-        Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.FTHRESHOLD, pointer, buffer.length());
-
-        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
-
-        return cbuff;
-    }
-}
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/compression/CudaThreshold.java
@ -1,271 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-package org.nd4j.linalg.jcublas.compression;
-
-import lombok.Getter;
-import lombok.Setter;
-import lombok.extern.slf4j.Slf4j;
-import lombok.val;
-import org.apache.commons.math3.util.FastMath;
-import org.bytedeco.javacpp.*;
-import org.nd4j.compression.impl.AbstractCompressor;
-import org.nd4j.jita.allocator.impl.AtomicAllocator;
-import org.nd4j.linalg.api.buffer.DataBuffer;
-import org.nd4j.linalg.api.buffer.DataType;
-import org.nd4j.linalg.api.buffer.DataTypeEx;
-import org.nd4j.linalg.api.ndarray.INDArray;
-import org.nd4j.linalg.compression.CompressedDataBuffer;
-import org.nd4j.linalg.compression.CompressionType;
-import org.nd4j.linalg.exception.ND4JIllegalStateException;
-import org.nd4j.linalg.factory.Nd4j;
-import org.nd4j.linalg.jcublas.context.CudaContext;
-import org.nd4j.nativeblas.NativeOpsHolder;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
- * It encodes data as delta between zero and abs threshold.
- *
- * PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
- *
- * @author raver119@gmail.com
- */
-@Slf4j
-public class CudaThreshold extends AbstractCompressor {
-    @Getter @Setter protected float threshold = 1e-3f;
-
-    /**
-     * This method returns compression descriptor. It should be unique for any compressor implementation
-     *
-     * @return
-     */
-    @Override
-    public String getDescriptor() {
-        return "THRESHOLD";
-    }
-
-    /**
-     * This method allows you to configure threshold for delta extraction. Pass it as float/double value
-     *
-     * Default value: 1e-3
-     * @param vars
-     */
-    @Override
-    public void configure(Object... vars) {
-        if (vars[0] instanceof Number) {
-            Number t = (Number) vars[0];
-            threshold = FastMath.abs(t.floatValue());
-            log.info("Setting threshold to [{}]", threshold);
-        } else {
-            throw new ND4JIllegalStateException("Threshold value should be Number");
-        }
-    }
-
-    @Override
-    public INDArray compress(INDArray array) {
-        //logger.info("Threshold [{}] compression", threshold);
-
-        Nd4j.getExecutioner().commit();
-        //Nd4j.getAffinityManager().ensureLocation(array, AffinityManager.Location.HOST);
-
-        DataBuffer buffer = compress(array.data());
-        if (buffer == null)
-            return null;
-
-        INDArray dup = Nd4j.createArrayFromShapeBuffer(buffer, array.shapeInfoDataBuffer());
-        dup.markAsCompressed(true);
-
-        return dup;
-    }
-
-    @Override
-    public CompressionType getCompressionType() {
-        return CompressionType.LOSSLESS;
-    }
-
-    @Override
-    public DataBuffer decompress(DataBuffer buffer, DataType type) {
-        if (buffer.dataType() != DataType.INT)
-            throw new UnsupportedOperationException();
-
-        long compressedLength = buffer.getInt(0);
-        long originalLength = buffer.getInt(1);
-
-        DataBuffer result = Nd4j.createBuffer(type, originalLength, false);
-
-        val context = AtomicAllocator.getInstance().getDeviceContext();
-
-        PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
-
-        //log.info("DEC Source length: {}", buffer.length());
-        //log.info("DEC Source: {}", Arrays.toString(buffer.asInt()));
-
-        //NativeOpsHolder.getInstance().getDeviceNativeOps().decodeThresholdFloat(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, (FloatPointer) AtomicAllocator.getInstance().getPointer(result));
-        AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
-
-        //DataBuffer result = Nd4j.getNDArrayFactory().convertDataEx(DataTypeEx.THRESHOLD, buffer, getGlobalTypeEx());
-
-        return result;
-    }
-
-    @Override
-    public DataBuffer compress(DataBuffer buffer) {
-
-        int numThreads = 1024;
-        int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
-
-        val context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext();
-
-        DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
-        PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
-
-
-        //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), threshold);
-        AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
-
-
-        int numMatches = blocksBuffer.getInt(0);
-
-        //log.info("Totals: {}", numMatches);
-/*
-
-        log.info("Number of blocks for compression: {}", numBlocks);
-        log.info("BlocksCounts: {}", Arrays.toString(blocksBuffer.asInt()));
-*/
-        DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(3+numMatches, false) : Nd4j.getDataBufferFactory().createInt(3+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-        encodedBuffer.put(0, numMatches);
-        encodedBuffer.put(1, (int) buffer.length());
-        encodedBuffer.put(2, Float.floatToIntBits(threshold));
-        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickHostWrite();
-
-        // FIXME: make it parallel via some kernel, because it can be pretty big array here, i.e. for 150m original array, offsets can
-        /*
-        int prevSum = 0;
-        for (int e = 0; e < numBlocks; e++) {
-            int prevVal = offsetsBuffer.getInt(e + 1);
-            offsetsBuffer.put(e + 1, prevSum);
-            prevSum += prevVal;
-        }
-        */
-
-        int prefixThreads = 512;
-        int numElts = numBlocks;
-        int level = 0;
-        List<DataBuffer> buffers = new ArrayList<>();
-
-        // here we just calculate number of sumBlock arrays
-        do {
-            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
-            if (numBlocks > 1) {
-                level++;
-            }
-            numElts = numPrefixBlocks;
-        } while (numElts > 1);
-
-        long[] pointers = new long[level];
-
-        level = 0;
-        numElts = numBlocks;
-
-        //  allocating temp buffers for prefux sum
-        DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        do {
-            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
-            if (numPrefixBlocks > 1) {
-                DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-                buffers.add(bf);
-
-                pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
-            }
-            numElts = numPrefixBlocks;
-        } while (numElts > 1);
-
-
-        AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
-
-        extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
-
-        DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
-        AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
-
-        //log.info("Offsets: {}", Arrays.toString(offsetsBuffer.asInt()));
-        //log.info("Target: {}", Arrays.toString(encodedBuffer.asInt()));
-
-
-
-        //NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3Float(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
-        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
-        AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
-
-        //log.info("Encoded: {}", Arrays.toString(encodedBuffer.asInt()));
-
-        extras.address();
-        tempX.address();
-
-        return encodedBuffer;
-
-        /*
-        INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new int[]{1, (int) buffer.length()}));
-        MatchCondition condition = new MatchCondition(temp, Conditions.absGreaterThanOrEqual(threshold));
-        int cntAbs = Nd4j.getExecutioner().exec(condition, Integer.MAX_VALUE).getInt(0);
-
-
-        //log.info("density ratio: {}", String.format("%.2f", cntAbs * 100.0f / buffer.length()));
-
-        if (cntAbs == 0)
-            return null;
-
-        long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
-        int compressedLength = cntAbs + 3;
-        // first 3 elements contain header
-        IntPointer pointer = new IntPointer(compressedLength);
-        pointer.put(0, cntAbs);
-        pointer.put(1, (int) buffer.length());
-        pointer.put(2, Float.floatToIntBits(threshold));
-
-        CompressionDescriptor descriptor = new CompressionDescriptor();
-        descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
-        descriptor.setOriginalLength(originalLength);
-        descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
-        descriptor.setNumberOfElements(buffer.length());
-
-        descriptor.setCompressionAlgorithm(getDescriptor());
-        descriptor.setCompressionType(getCompressionType());
-
-
-
-        CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
-
-        Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, pointer, buffer.length());
-
-        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
-
-        return cbuff;
-        */
-    }
-
-    @Override
-    protected CompressedDataBuffer compressPointer(DataTypeEx srcType, Pointer srcPointer, int length, int elementSize) {
-        throw new UnsupportedOperationException();
-    }
-}
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java
@ -31,6 +31,7 @@ import org.nd4j.jita.allocator.tad.DeviceTADManager;
 import org.nd4j.jita.conf.CudaEnvironment;
 import org.nd4j.linalg.api.buffer.DataBuffer;
 import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.concurrency.AffinityManager;
 import org.nd4j.linalg.api.environment.Nd4jEnvironment;
 import org.nd4j.linalg.api.memory.pointers.PagedPointer;
 import org.nd4j.linalg.api.ndarray.INDArray;
@ -1674,224 +1675,6 @@ public class CudaExecutioner extends DefaultOpExecutioner {
        ctx.syncSpecialStream();
    }

-    @Override
-    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
-        DataBuffer buffer = input.data();
-
-        int numThreads = 1024;
-        int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
-
-        val context = AtomicAllocator.getInstance().getDeviceContext();
-
-        DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        if (extraz.get() == null)
-            extraz.set(new PointerPointer(32));
-
-        val extras = extraz.get().put(1, context.getOldStream());
-
-        ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();
-
-
-        NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras,
-                AtomicAllocator.getInstance().getPointer(buffer),
-                (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()),
-                buffer.length(),
-                (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer),
-                (float) threshold);
-
-        AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
-
-
-        int numMatches = blocksBuffer.getInt(0);
-
-        // special case here, nothing to update
-        if (numMatches < 2)
-            return null;
-
-        if (boundary != null && numMatches > boundary)  {
-            numMatches = boundary;
-            blocksBuffer.put(0, numMatches);
-        }
-
-        DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+numMatches, false) : Nd4j.getDataBufferFactory().createInt(4+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        encodedBuffer.put(0, numMatches);
-        encodedBuffer.put(1, (int) buffer.length());
-        encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
-
-        encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
-
-        ((BaseCudaDataBuffer) encodedBuffer).getOpaqueDataBuffer().syncToSpecial();
-
-
-        int prefixThreads = 512;
-        int numElts = numBlocks;
-        int level = 0;
-        List<DataBuffer> buffers = new ArrayList<>();
-
-        // here we just calculate number of sumBlock arrays
-        do {
-            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
-            if (numBlocks > 1) {
-                level++;
-            }
-            numElts = numPrefixBlocks;
-        } while (numElts > 1);
-
-        long[] pointers = new long[level];
-
-        level = 0;
-        numElts = numBlocks;
-
-        //  allocating temp buffers for prefux sum
-        DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        do {
-            int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
-            if (numPrefixBlocks > 1) {
-                DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-                buffers.add(bf);
-
-                pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
-            }
-            numElts = numPrefixBlocks;
-        } while (numElts > 1);
-
-
-        AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
-
-        extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
-
-        DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
-        AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
-
-
-        NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3(extras, AtomicAllocator.getInstance().getPointer(buffer), (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
-
-        AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
-        AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
-
-        return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
-    }
-
-
-    @Override
-    public INDArray thresholdEncode(INDArray input, double threshold) {
-        return thresholdEncode(input, threshold, null);
-    }
-
-    @Override
-    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
-        DataBuffer buffer = encoded.data();
-
-        if (buffer.dataType() != DataType.INT)
-            throw new UnsupportedOperationException();
-
-        long compressedLength = buffer.getInt(0);
-        long originalLength = buffer.getInt(1);
-
-        if (target.length() != originalLength)
-            throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");
-
-        DataBuffer result = target.data();
-
-        val context = AtomicAllocator.getInstance().getDeviceContext();
-
-        if (extraz.get() == null)
-            extraz.set(new PointerPointer(32));
-
-        PointerPointer extras = extraz.get().put(1, context.getOldStream());
-
-        nativeOps.decodeThreshold(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, AtomicAllocator.getInstance().getPointer(result), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));
-
-        if (nativeOps.lastErrorCode() != 0)
-            throw new RuntimeException(nativeOps.lastErrorMessage());
-
-        AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
-
-        return target;
-    }
-
-
-    @Override
-    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
-        long length = indArray.length();
-        long tLen = target.data().length();
-
-        if (tLen != (length / 16 + 5))
-            throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));
-
-        if (target.data().dataType() != DataType.INT)
-            throw new ND4JIllegalStateException("Target array should have INT dataType");
-
-        DataBuffer buffer = target.data();
-        buffer.put(0, (int) length);
-        buffer.put(1, (int) length);
-        buffer.put(2, Float.floatToIntBits((float) threshold));
-
-        // format id
-        buffer.put(3, ThresholdCompression.BITMAP_ENCODING);
-
-        val context = AtomicAllocator.getInstance().getDeviceContext();
-
-        if (extraz.get() == null)
-            extraz.set(new PointerPointer(32));
-
-
-        PointerPointer extras = extraz.get().put(
-                AtomicAllocator.getInstance().getHostPointer(indArray),
-                context.getOldStream(),
-                context.getBufferScalar(),
-                context.getBufferReduction()
-        );
-
-
-        val src = AtomicAllocator.getInstance().getPointer(indArray, context);
-        val dst = (IntPointer) AtomicAllocator.getInstance().getPointer(buffer, context);
-        ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();
-
-        long val = nativeOps.encodeBitmap(extras,
-                    src, (LongPointer) AtomicAllocator.getInstance().getHostPointer(indArray.shapeInfoDataBuffer()),
-                    length,
-                    dst,
-                    (float) threshold);
-
-        if (nativeOps.lastErrorCode() != 0)
-            throw new RuntimeException(nativeOps.lastErrorMessage());
-
-        AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
-
-        return val;
-    }
-
-    @Override
-    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
-
-        val context = AtomicAllocator.getInstance().getDeviceContext();
-
-        if (extraz.get() == null)
-            extraz.set(new PointerPointer(32));
-
-
-        PointerPointer extras = extraz.get().put(
-                AtomicAllocator.getInstance().getHostPointer(target),
-                context.getOldStream(),
-                context.getBufferScalar(),
-                context.getBufferReduction());
-
-        nativeOps.decodeBitmap(extras, AtomicAllocator.getInstance().getPointer(encoded.data(), context), target.length(), AtomicAllocator.getInstance().getPointer(target, context), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));
-
-        if (nativeOps.lastErrorCode() != 0)
-            throw new RuntimeException(nativeOps.lastErrorMessage());
-
-        return target;
-    }
-
-
    @Override
    public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
        if(customOps == null) {
@ -1974,6 +1757,11 @@ public class CudaExecutioner extends DefaultOpExecutioner {
        val inputArgs = opContext != null ? opContext.getInputArrays() : op.inputArguments();
        int cnt= 0;
        for (val in: inputArgs) {
+            // TODO: once we implement Context-based shape function call this method should be removed
+            val loc = Nd4j.getAffinityManager().getActiveLocation(in);
+            if (loc != AffinityManager.Location.DEVICE && loc != AffinityManager.Location.EVERYWHERE)
+                Nd4j.getAffinityManager().ensureLocation(in, AffinityManager.Location.DEVICE);
+
            // NOT A TYPO: shape functions work on host side only
            if (!in.isEmpty()) {
                inputBuffers.put(cnt, in.data().addressPointer());
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
@ -2804,30 +2804,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
                        @Cast("Nd4jLong*") long[] tadShapeInfo,
                        @Cast("Nd4jLong*") long[] tadOffsets);

-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
-
-
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
-
-
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
-
-
 public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
        Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
        Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -10712,6 +10688,7 @@ public static final int PREALLOC_SIZE = 33554432;
 // #include <ops/declarable/headers/boolean.h>
 // #include <ops/declarable/headers/broadcastable.h>
 // #include <ops/declarable/headers/convo.h>
+// #include <ops/declarable/headers/compression.h>
 // #include <ops/declarable/headers/list.h>
 // #include <ops/declarable/headers/recurrent.h>
 // #include <ops/declarable/headers/transforms.h>
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/resources/META-INF/services/org.nd4j.linalg.compression.NDArrayCompressor
@ -13,5 +13,3 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 ################################################################################
-
-org.nd4j.linalg.jcublas.compression.CudaThreshold
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/NativeOpExecutioner.java
@ -1376,142 +1376,6 @@ public class NativeOpExecutioner extends DefaultOpExecutioner {
        }
    }

-    @Override
-    public INDArray thresholdEncode(INDArray input, double threshold) {
-        return thresholdEncode(input, threshold, null);
-    }
-
-    @Override
-    public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
-
-        //val condition = new MatchCondition(input, Conditions.absGreaterThanOrEqual(threshold));
-        //long t1 = System.currentTimeMillis();
-        int cntAbs = loop.estimateThreshold(null,
-                input.data().addressPointer(),
-                (LongPointer) input.shapeInfoDataBuffer().addressPointer(),
-                (int) input.length(),
-                (float) threshold);
-        //long t2 = System.currentTimeMillis();
-
-        if (loop.lastErrorCode() != 0)
-            throw new RuntimeException(loop.lastErrorMessage());
-
-        if (cntAbs < 2)
-            return null;
-
-        if (boundary != null)
-            cntAbs = Math.min(cntAbs, boundary);
-
-        //log.info("S: {}; T: {}", cntAbs, t2 - t1);
-
-        DataBuffer buffer = input.data();
-
-        long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
-        int compressedLength = cntAbs + 4;
-        // first 3 elements contain header
-
-        DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+cntAbs, false) : Nd4j.getDataBufferFactory().createInt(4+cntAbs, false, Nd4j.getMemoryManager().getCurrentWorkspace());
-
-        encodedBuffer.put(0, cntAbs);
-        encodedBuffer.put(1, (int) buffer.length());
-        encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
-
-        // format id
-        encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
-
-        CompressionDescriptor descriptor = new CompressionDescriptor();
-        descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
-        descriptor.setOriginalLength(originalLength);
-        descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
-        descriptor.setNumberOfElements(buffer.length());
-
-        descriptor.setCompressionAlgorithm("THRESHOLD");
-        descriptor.setCompressionType(CompressionType.LOSSLESS);
-
-        //CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
-
-        Nd4j.getNDArrayFactory().convertDataEx(AbstractCompressor.getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, encodedBuffer.addressPointer(), buffer.length());
-
-        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
-
-        return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
-    }
-
-    @Override
-    public INDArray thresholdDecode(INDArray encoded, INDArray target) {
-        DataBuffer buffer = encoded.data();
-
-        if (buffer.dataType() != DataType.INT)
-            throw new ND4JIllegalStateException("thresholdEncoded array should have dataType of INT");
-
-        long compressedLength = buffer.getInt(0);
-        long originalLength = buffer.getInt(1);
-        float threshold = buffer.getInt(2);
-
-        if (target.length() != originalLength)
-            throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");
-
-        DataTypeEx typeDst = AbstractCompressor.getBufferTypeEx(target.data());
-
-        loop.convertTypes(null, DataTypeEx.THRESHOLD.ordinal(), buffer.addressPointer(), target.length(), typeDst.ordinal(), target.data().addressPointer());
-
-        if (loop.lastErrorCode() != 0)
-            throw new RuntimeException(loop.lastErrorMessage());
-
-        return target;
-    }
-
-
-    @Override
-    public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
-        long length = indArray.length();
-        long tLen = target.data().length();
-
-        if (tLen != (length / 16 + 5))
-            throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));
-
-        if (target.data().dataType() != DataType.INT)
-            throw new ND4JIllegalStateException("Target array should have INT dataType");
-
-        DataBuffer buffer = target.data();
-
-        buffer.put(0, (int) length);
-        buffer.put(1, (int) length);
-        buffer.put(2, Float.floatToIntBits((float) threshold));
-
-        // format id
-        buffer.put(3, ThresholdCompression.BITMAP_ENCODING);
-
-        long affected = loop.encodeBitmap(null,
-                indArray.data().addressPointer(),
-                (LongPointer) indArray.shapeInfoDataBuffer().addressPointer(),
-                length,
-                (IntPointer) buffer.addressPointer(),
-                (float) threshold);
-
-        if (loop.lastErrorCode() != 0)
-            throw new RuntimeException(loop.lastErrorMessage());
-
-        return affected;
-    }
-
-    @Override
-    public INDArray bitmapDecode(INDArray encoded, INDArray target) {
-
-        loop.decodeBitmap(null,
-                encoded.data().addressPointer(),
-                target.length(),
-                target.data().addressPointer(),
-                (LongPointer) target.shapeInfoDataBuffer().addressPointer()
-        );
-
-        if (loop.lastErrorCode() != 0)
-            throw new RuntimeException(loop.lastErrorMessage());
-
-        return target;
-    }
-
-
    @Override
    public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
        if (customOps == null) {
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java
@ -2808,30 +2808,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
                        @Cast("Nd4jLong*") long[] tadShapeInfo,
                        @Cast("Nd4jLong*") long[] tadOffsets);

-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
-public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
-public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
-
-
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
-public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
-public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
-public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
-
-
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
-public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
-
-
 public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
        Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
        Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -12463,6 +12439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
 // #include <ops/declarable/headers/boolean.h>
 // #include <ops/declarable/headers/broadcastable.h>
 // #include <ops/declarable/headers/convo.h>
+// #include <ops/declarable/headers/compression.h>
 // #include <ops/declarable/headers/list.h>
 // #include <ops/declarable/headers/recurrent.h>
 // #include <ops/declarable/headers/transforms.h>
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/Nd4jTestsC.java
@ -8315,6 +8315,85 @@ public class Nd4jTestsC extends BaseNd4jTest {
        assertArrayEquals(new long[]{bS, oH, oW, oC}, ret[0].shape());
    }

+    @Test
+    public void testMatmulMethod_8() {
+        val x = Nd4j.create(DataType.INT8, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.INT8, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.INT8, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_7() {
+        val x = Nd4j.create(DataType.INT16, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.INT16, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.INT16, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_1() {
+        val x = Nd4j.create(DataType.INT32, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.INT32, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.INT32, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_2() {
+        val x = Nd4j.create(DataType.INT64, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.INT64, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.INT64, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_6() {
+        val x = Nd4j.create(DataType.UINT8, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.UINT8, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.UINT8, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_5() {
+        val x = Nd4j.create(DataType.UINT16, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.UINT16, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.UINT16, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_3() {
+        val x = Nd4j.create(DataType.UINT32, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.UINT32, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.UINT32, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }
+
+    @Test
+    public void testMatmulMethod_4() {
+        val x = Nd4j.create(DataType.UINT64, 3, 5).assign(1);
+        val y = Nd4j.create(DataType.UINT64, 5, 3).assign(1);
+        val e = Nd4j.create(DataType.UINT64, 3, 3).assign(5);
+
+        val z = x.mmul(y);
+        assertEquals(e, z);
+    }

    @Override
    public char ordering() {
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/compression/CompressionTests.java
@ -17,6 +17,7 @@
 package org.nd4j.linalg.compression;

 import lombok.extern.slf4j.Slf4j;
+import lombok.val;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@ -44,7 +45,6 @@ import static org.junit.Assert.*;
 /**
 * @author raver119@gmail.com
 */
-@Ignore
@Slf4j
@RunWith(Parameterized.class)
 public class CompressionTests extends BaseNd4jTest {
@ -140,40 +140,6 @@ public class CompressionTests extends BaseNd4jTest {
    }


-    @Test
-    public void testThresholdCompressionZ() {
-        INDArray initial = Nd4j.create(1, 16384);
-        for (int i = 0; i < 96; i++)
-            initial.putScalar(i * 20, 1.0f);
-
-
-        INDArray exp = Nd4j.create(1, 16384);
-        for (int i = 0; i < 96; i++)
-            exp.putScalar(i * 20, 0.1f);
-
-        INDArray exp_d = Nd4j.create(1, 16384);
-        for (int i = 0; i < 96; i++)
-            exp_d.putScalar(i * 20, 0.9f);
-
-        NDArrayCompressor compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
-        compressor.configure(0.9);
-
-        INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 0.9);
-
-        assertEquals(exp, initial);
-
-        log.info("Compressed length: {}", compressed.data().length());
-        //        log.info("Compressed: {}", Arrays.toString(compressed.data().asInt()));
-
-        INDArray decompressed = Nd4j.create(1, initial.length());
-        Nd4j.getExecutioner().thresholdDecode(compressed, decompressed);
-
-        log.info("Decompressed length: {}", decompressed.length());
-
-        assertEquals(exp_d, decompressed);
-    }
-
-
    @Ignore
    @Test
    public void testThresholdCompression0() {
@ -296,6 +262,23 @@ public class CompressionTests extends BaseNd4jTest {

    @Test
    public void testThresholdCompression5() {
+        INDArray initial = Nd4j.ones(10);
+        INDArray exp_0 = initial.dup();
+
+        Nd4j.getExecutioner().commit();
+
+        //Nd4j.getCompressor().getCompressor("THRESHOLD").configure(1e-3);
+        INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 1.0f, 3);
+
+        assertEquals(7, compressed.data().length());
+
+        assertNotEquals(exp_0, initial);
+
+        assertEquals(7, initial.sumNumber().doubleValue(), 0.01);
+    }
+
+    @Test
+    public void testThresholdCompression5_1() {
        INDArray initial = Nd4j.ones(1000);
        INDArray exp_0 = initial.dup();

@ -435,8 +418,8 @@ public class CompressionTests extends BaseNd4jTest {
        INDArray exp_0 = Nd4j.create(new float[] {0.0f, -1e-4f, 0.0f, 0.0f, 0.0f, 0.0f});
        INDArray exp_1 = Nd4j.create(new float[] {0.0f, -5e-4f, 1e-3f, -1e-3f, 0.0f, 0.0f});

-        DataBuffer ib = Nd4j.getDataBufferFactory().createInt(5);
-        INDArray enc = Nd4j.createArrayFromShapeBuffer(ib, initial.shapeInfoDataBuffer());
+
+        INDArray enc = Nd4j.create(DataType.INT32, initial.length() / 16 + 5);

        long elements = Nd4j.getExecutioner().bitmapEncode(initial, enc, 1e-3);
        log.info("Encoded: {}", Arrays.toString(enc.data().asInt()));
@ -471,7 +454,7 @@ public class CompressionTests extends BaseNd4jTest {
    @Test
    public void testBitmapEncoding5() {
        Nd4j.getRandom().setSeed(119);
-        INDArray initial = Nd4j.rand(new int[]{1, 10000}, -1, -0.5, Nd4j.getRandom());
+        INDArray initial = Nd4j.rand(new int[]{10000}, -1, -0.5, Nd4j.getRandom());
        INDArray exp_0 = initial.dup().addi(1e-1);
        INDArray exp_1 = initial.dup();

@ -486,7 +469,7 @@ public class CompressionTests extends BaseNd4jTest {
    @Test
    public void testBitmapEncoding6() {
        Nd4j.getRandom().setSeed(119);
-        INDArray initial = Nd4j.rand(new int[]{1, 100000}, -1, 1, Nd4j.getRandom());
+        INDArray initial = Nd4j.rand(new int[]{10000}, -1, 1, Nd4j.getRandom());
        INDArray exp_1 = initial.dup();

        INDArray enc = Nd4j.getExecutioner().bitmapEncode(initial, 1e-3);
@ -494,6 +477,11 @@ public class CompressionTests extends BaseNd4jTest {

        Nd4j.getExecutioner().bitmapDecode(enc, initial);

+        val f0 = exp_1.toFloatVector();
+        val f1 = initial.toFloatVector();
+
+        assertArrayEquals(f0, f1, 1e-5f);
+
        assertEquals(exp_1, initial);
    }