compression ops (#436)
* Added declarations for decode/encode_bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added implementation for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Added helpers for encode/decode bitmap ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored encodingBitmap helper. Signed-off-by: shugeo <sgazeos@gmail.com> * threshold encode/decode skeleton * helper skeleton * minor import fix * encoder shape fn & op impl * thresholdEncode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thresholdDecode cpu impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Only cosmetical changes. Signed-off-by: shugeo <sgazeos@gmail.com> * placeholder Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added cuda implementation for bitmap decode helper. Signed-off-by: shugeo <sgazeos@gmail.com> * cuda thresholdEstimate Signed-off-by: raver119@gmail.com <raver119@gmail.com> * cuda thresholdDecode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - nano cmakelist update (get rid of Clion section) - fixed forgotten throw in AtomicTests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * thesholdEncode cuda impl Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Added tests for bitmap encoding/decoding ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed tests for encode/decode bitmaps. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored decode/encode helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed crashes with bitmap decode/encode helpers. 
Signed-off-by: shugeo <sgazeos@gmail.com> * bitmap encode/decode CPU Signed-off-by: raver119@gmail.com <raver119@gmail.com> * bitmap encode/decode CUDA Signed-off-by: raver119@gmail.com <raver119@gmail.com> * C API removed for threshold/bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeBitmap/DecodeBitmap Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * EncodeThreshold/DecodeThreshold Java side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * few more tests for threshold encoding Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor test tweak Signed-off-by: raver119@gmail.com <raver119@gmail.com> * two special tests Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeBitmap CPU fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * parallel_long/parallel_double proper spans fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * encodeThreshold CUDA fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * nano fix Signed-off-by: raver119@gmail.com <raver119@gmail.com> * grid tweaks Signed-off-by: raver119@gmail.com <raver119@gmail.com> * RTX adaptation for thresholdEncode Signed-off-by: raver119 <raver119@gmail.com> * don't allow threshold encoding for length < 2 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * get rid of NDArrayCompressor in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor update of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more minor tweak of EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - matmul allows integer data types use - EncodingHandler boundary default value - few tests for integer matmul Signed-off-by: raver119@gmail.com <raver119@gmail.com> * minor fix of CUDA bitmap encode Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary 
changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * boundary changed to integer everywhere Signed-off-by: raver119@gmail.com <raver119@gmail.com> * re-enable CUDA deallocator Signed-off-by: raver119@gmail.com <raver119@gmail.com> * threshold encoder fix for systems without omp Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - encode_threshold now requires non-negative boundary - minor tweak in EncodingHandler Signed-off-by: raver119@gmail.com <raver119@gmail.com> * restore parallelism in decode_bitmap Signed-off-by: raver119@gmail.com <raver119@gmail.com> * fall back to omp for encode_bitmap cpu Signed-off-by: raver119@gmail.com <raver119@gmail.com> * single time casts Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - additional test for encode_threshold - sync buffers to device before calling for shape function Signed-off-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: shugeo <sgazeos@gmail.com>master
parent
f1232f8221
commit
0613485654
|
@ -23,9 +23,13 @@ import org.deeplearning4j.optimize.solvers.accumulation.EncodedGradientsAccumula
|
|||
import org.deeplearning4j.optimize.solvers.accumulation.EncodingHandler;
|
||||
import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.FixedThresholdAlgorithm;
|
||||
import org.junit.Test;
|
||||
import org.nd4j.linalg.api.concurrency.AffinityManager;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.util.PrintAffinity;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.nativeblas.OpaqueDataBuffer;
|
||||
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
/**
|
||||
|
@ -93,12 +97,13 @@ public class EncodedGradientsAccumulatorTest extends BaseDL4JTest {
|
|||
}
|
||||
|
||||
|
||||
EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, null, false);
|
||||
EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, Integer.MAX_VALUE, false);
|
||||
for (int e = 10; e < numParams / 5; e++) {
|
||||
|
||||
INDArray encoded = handler.encodeUpdates(0, 0, getGradients(numParams, e, 2e-3));
|
||||
val gradients = getGradients(numParams, e, 2e-3);
|
||||
val encoded = handler.encodeUpdates(0, 0, gradients);
|
||||
|
||||
// log.info("enc len: {}", encoded.data().length());
|
||||
assertNotNull("Failed with e == " + e, encoded);
|
||||
|
||||
int encFormat = encoded.data().getInt(3);
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
|
|||
protected ThreadLocal<Integer> index = new ThreadLocal<>();
|
||||
protected long initialMemory = 100 * 1024 * 1024L;
|
||||
protected int queueSize = 5;
|
||||
protected Double boundary = 1.0;
|
||||
protected Integer boundary = Integer.MAX_VALUE;
|
||||
protected boolean encodingDebugMode;
|
||||
|
||||
protected IndexedTail externalSource;
|
||||
|
@ -101,11 +101,11 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
|
|||
}
|
||||
|
||||
public EncodedGradientsAccumulator(int parties, ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, boolean encodingDebugMode) {
|
||||
this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, 1.0, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, 1.0, encodingDebugMode);
|
||||
this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, Integer.MAX_VALUE, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, Integer.MAX_VALUE, encodingDebugMode);
|
||||
}
|
||||
|
||||
public EncodedGradientsAccumulator(int parties, @NonNull MessageHandler handler, long initialMemory,
|
||||
int queueSize, Double boundary, boolean encodingDebugMode) {
|
||||
int queueSize, Integer boundary, boolean encodingDebugMode) {
|
||||
this.parties = parties;
|
||||
this.handler = handler;
|
||||
this.initialMemory = initialMemory;
|
||||
|
@ -551,7 +551,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
|
|||
protected long initialMemory = DEFAULT_INITIAL_MEMORY;
|
||||
protected int queueSize = 5;
|
||||
protected MessageHandler handler;
|
||||
protected Double boundary = null;
|
||||
protected int boundary = Integer.MAX_VALUE;
|
||||
protected boolean encodingDebugMode;
|
||||
|
||||
/**
|
||||
|
@ -598,15 +598,12 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
|
|||
/**
|
||||
* This method enables optional limit for max number of updates per message
|
||||
*
|
||||
* Default value: 1.0 (no limit)
|
||||
* Default value: Integer.MAX_VALUE (no limit)
|
||||
* @param boundary positive value in range 0..1
|
||||
* @return
|
||||
*/
|
||||
public Builder updatesBoundary(double boundary) {
|
||||
if (boundary >= 1.0)
|
||||
return this;
|
||||
|
||||
if (boundary <= 0.0)
|
||||
public Builder updatesBoundary(int boundary) {
|
||||
if (boundary <= 0)
|
||||
throw new DL4JInvalidConfigException("Boundary should have positive value");
|
||||
|
||||
this.boundary = boundary;
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
package org.deeplearning4j.optimize.solvers.accumulation;
|
||||
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.shade.guava.util.concurrent.AtomicDouble;
|
||||
import lombok.NonNull;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
@ -24,7 +25,6 @@ import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgori
|
|||
import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgorithmReducer;
|
||||
import org.nd4j.linalg.api.buffer.DataBuffer;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.compression.NDArrayCompressor;
|
||||
import org.nd4j.linalg.exception.ND4JIllegalStateException;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.linalg.ops.transforms.Transforms;
|
||||
|
@ -54,9 +54,8 @@ public class EncodingHandler implements MessageHandler {
|
|||
protected ThresholdAlgorithm initialThresholdAlgorithm;
|
||||
protected ResidualPostProcessor initialResidualPostProcessor;
|
||||
|
||||
protected Double boundary;
|
||||
protected Integer boundary;
|
||||
protected boolean encodingDebugMode;
|
||||
protected NDArrayCompressor compressor;
|
||||
protected AtomicInteger atomicBoundary = new AtomicInteger(-1);
|
||||
|
||||
protected ThreadLocal<ThresholdAlgorithm> thresholdAlgorithm = new ThreadLocal<>();
|
||||
|
@ -73,20 +72,16 @@ public class EncodingHandler implements MessageHandler {
|
|||
protected final AtomicLong lastThresholdLogTime = new AtomicLong();
|
||||
|
||||
public EncodingHandler(final ThresholdAlgorithm thresholdAlgorithm, final ResidualPostProcessor residualPostProcessor,
|
||||
Double boundary, boolean encodingDebugMode){
|
||||
Integer boundary, boolean encodingDebugMode){
|
||||
this.initialThresholdAlgorithm = thresholdAlgorithm;
|
||||
this.initialResidualPostProcessor = residualPostProcessor;
|
||||
this.boundary = boundary;
|
||||
this.boundary = boundary == null ? Integer.MAX_VALUE : boundary;
|
||||
this.encodingDebugMode = encodingDebugMode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(@NonNull GradientsAccumulator accumulator) {
|
||||
this.accumulator = accumulator;
|
||||
|
||||
compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
|
||||
if (compressor == null)
|
||||
throw new ND4JIllegalStateException("Can't find Threshold compressor implementation!");
|
||||
}
|
||||
|
||||
public INDArray encodeUpdates(int iteration, int epoch, INDArray updates) {
|
||||
|
@ -135,14 +130,13 @@ public class EncodingHandler implements MessageHandler {
|
|||
iterations.get().incrementAndGet();
|
||||
|
||||
if (boundary != null && atomicBoundary.get() < 0)
|
||||
atomicBoundary.compareAndSet(-1, (int) (updates.length() * boundary));
|
||||
atomicBoundary.compareAndSet(-1, (int) (updates.length() / 16) );
|
||||
|
||||
INDArray encoded;
|
||||
|
||||
if (!bitmapMode.get().get()) {
|
||||
//Sparse updates
|
||||
encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(),
|
||||
boundary == null ? null : atomicBoundary.get());
|
||||
encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), boundary == null ? null : atomicBoundary.get());
|
||||
|
||||
// updates were TOO sparse, nothing to share here
|
||||
if (encoded == null) {
|
||||
|
@ -157,17 +151,14 @@ public class EncodingHandler implements MessageHandler {
|
|||
}
|
||||
|
||||
|
||||
double encLen = encoded.data().getInt(0);
|
||||
double encLen = encoded.length();
|
||||
|
||||
// if updates are too dense - we fallback to bitmap encoding
|
||||
if (encLen >= (updates.length() / 16)) {
|
||||
log.debug("Switching back to bitmapEncoding: iteration {}, epoch {}, threshold {}, encoded length {}", iteration, epoch, currThreshold, encLen);
|
||||
bitmapMode.get().set(true);
|
||||
|
||||
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
|
||||
encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
|
||||
|
||||
Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
|
||||
encoded = Nd4j.getExecutioner().bitmapEncode(updates, currentThreshold.get().get());
|
||||
|
||||
applyPostProcessor(iteration, epoch, currThreshold, updates);
|
||||
lastSparsityRatio.set(null);
|
||||
|
@ -186,8 +177,7 @@ public class EncodingHandler implements MessageHandler {
|
|||
}
|
||||
} else {
|
||||
//Dense bitmap updates
|
||||
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
|
||||
encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
|
||||
encoded = Nd4j.create(DataType.INT32, updates.length() / 16 + 5);
|
||||
|
||||
long values = Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
|
||||
|
||||
|
|
|
@ -910,7 +910,7 @@ public class ParallelWrapper implements AutoCloseable {
|
|||
Preconditions.checkState(thresholdAlgorithm != null, "Cannot use SHARED_GRADIENTS training mode without setting a threshold algorithm");
|
||||
this.trainerContext = new SymmetricTrainerContext();
|
||||
if (this.accumulator == null) {
|
||||
log.info("Creating new GradientsAccumulator instance with threshold of [5e-4");
|
||||
log.info("Creating new GradientsAccumulator instance with default threshold of [5e-4]");
|
||||
this.accumulator = new EncodedGradientsAccumulator(workers, thresholdAlgorithm, residualPostProcessor, false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ public class WiredEncodingHandler extends EncodingHandler {
|
|||
* @param thresholdAlgorithm threshold algorithm to use
|
||||
* @param boundary
|
||||
*/
|
||||
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
|
||||
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
|
||||
super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
|
||||
}
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ public class WiredEncodingHandler extends EncodingHandler {
|
|||
*
|
||||
* @param thresholdAlgorithm The threshold algorithm to use
|
||||
*/
|
||||
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
|
||||
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
|
||||
super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,12 @@ elseif(WIN32)
|
|||
set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC")
|
||||
|
||||
if (SD_CPU)
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
|
||||
|
@ -221,21 +221,16 @@ include_directories(${FLATBUFFERS_PATH}/include)
|
|||
configure_file(include/config.h.in include/config.h)
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
|
||||
|
||||
if (NOT DEFINED ENV{CLION_IDE})
|
||||
message("NOT CLION")
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
|
||||
add_subdirectory(blas)
|
||||
if(SD_BUILD_TESTS)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
|
||||
add_subdirectory(blas)
|
||||
if(SD_BUILD_TESTS)
|
||||
# tests are always compiled with all ops included
|
||||
set(SD_ALL_OPS true)
|
||||
set(SD_BUILD_MINIFIER true)
|
||||
add_subdirectory(tests_cpu)
|
||||
endif()
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
if ($ENV{CLION_IDE})
|
||||
add_subdirectory(tests_cpu)
|
||||
endif ()
|
||||
|
||||
if (MSVC_DEV)
|
||||
set(SD_BUILD_MINIFIER false)
|
||||
|
|
|
@ -120,8 +120,13 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
|||
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
# using GCC
|
||||
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
|
||||
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -fmax-errors=2 ")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,$ORIGIN/")
|
||||
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
|
||||
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
|
@ -361,11 +366,6 @@ elseif(SD_CPU)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
|
||||
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
|
||||
endif()
|
||||
|
||||
install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
|
||||
endif()
|
||||
|
|
|
@ -571,7 +571,7 @@ namespace samediff {
|
|||
|
||||
// create temporary array
|
||||
int64_t intermediatery[256];
|
||||
auto span = delta / numThreads;
|
||||
auto span = (numElements / numThreads) - (numElements % numThreads);
|
||||
|
||||
// execute threads in parallel
|
||||
for (uint32_t e = 0; e < numThreads; e++) {
|
||||
|
@ -615,7 +615,7 @@ namespace samediff {
|
|||
|
||||
// create temporary array
|
||||
double intermediatery[256];
|
||||
auto span = delta / numThreads;
|
||||
auto span = (numElements / numThreads) - (numElements % numThreads);
|
||||
|
||||
// execute threads in parallel
|
||||
for (uint32_t e = 0; e < numThreads; e++) {
|
||||
|
|
|
@ -1432,18 +1432,6 @@ ND4J_EXPORT void tear(Nd4jPointer *extraPointers,
|
|||
Nd4jLong *tadShapeInfo,
|
||||
Nd4jLong *tadOffsets);
|
||||
|
||||
ND4J_EXPORT Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
|
||||
ND4J_EXPORT void decodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
|
||||
|
||||
|
||||
ND4J_EXPORT void encodeThresholdP1(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
|
||||
ND4J_EXPORT void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz);
|
||||
ND4J_EXPORT void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, int *offsets, Nd4jLong N, int *dz);
|
||||
|
||||
|
||||
ND4J_EXPORT void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
|
||||
|
||||
|
||||
ND4J_EXPORT void sort(Nd4jPointer *extraPointers,
|
||||
void *x, Nd4jLong *xShapeInfo,
|
||||
void *dx, Nd4jLong *dxShapeInfo,
|
||||
|
|
|
@ -1436,28 +1436,6 @@ void enableP2P(bool enable) {
|
|||
// no-op
|
||||
}
|
||||
|
||||
|
||||
|
||||
void encodeThresholdP1(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
|
||||
// TODO: to be implemented
|
||||
}
|
||||
|
||||
|
||||
void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *hX, Nd4jLong N, int *dz) {
|
||||
// TODO: to be implemented
|
||||
}
|
||||
|
||||
|
||||
void encodeThresholdP3(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
|
||||
// offsets won't be used here
|
||||
|
||||
// TODO: to be implemented
|
||||
}
|
||||
|
||||
void decodeThreshold(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo){
|
||||
// TODO: to be implemented
|
||||
}
|
||||
|
||||
bool isP2PAvailable() {
|
||||
// always TRUE for cpu backend
|
||||
return true;
|
||||
|
@ -1467,10 +1445,6 @@ void checkP2P() {
|
|||
// no-op
|
||||
}
|
||||
|
||||
void decodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo) {
|
||||
NativeOpExecutioner::decodeBitmap(hX, N, dz, hZShapeInfo);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZShapeInfo, int N, int *shuffleMap, Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) {
|
||||
|
||||
|
@ -1859,12 +1833,6 @@ void sortCooIndices(Nd4jPointer *extraPointers,
|
|||
}
|
||||
}
|
||||
|
||||
Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
|
||||
return NativeOpExecutioner::encodeBitmap(hX, hXShapeInfo, N, dz, threshold);
|
||||
}
|
||||
|
||||
|
||||
|
||||
Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
|
||||
auto hZ = new Nd4jLong[2];errno = 0;
|
||||
try {
|
||||
|
|
|
@ -2197,76 +2197,6 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement
|
|||
sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
|
||||
}
|
||||
|
||||
|
||||
void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
|
||||
try {
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extras[1]);
|
||||
|
||||
int blockSize = 1024;
|
||||
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
|
||||
|
||||
dim3 launchDims(numBlocks, blockSize, 1024);
|
||||
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
|
||||
BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES);
|
||||
|
||||
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed");
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz) {
|
||||
try {
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
|
||||
//encoderKernelP2Float<<<numBlocks, blockSize , 1024 * sizeof(float), *stream>>>(dx, N, dz);
|
||||
prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0);
|
||||
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
|
||||
try {
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
|
||||
|
||||
int blockSize = 1024;
|
||||
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
|
||||
|
||||
dim3 launchDims(numBlocks, blockSize, 4096);
|
||||
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
|
||||
BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES);
|
||||
|
||||
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo){
|
||||
try {
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
|
||||
|
||||
// we probably want to have smaller blocks here, memory writes are misaligned anyway
|
||||
int blockSize = 128;
|
||||
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
|
||||
|
||||
dim3 launchDims(numBlocks, blockSize, 1024);
|
||||
auto zType = sd::ArrayOptions::dataType(zShapeInfo);
|
||||
BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
|
||||
|
||||
sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed");
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void execReduce3All(Nd4jPointer *extraPointers,
|
||||
int opNum,
|
||||
|
@ -2603,55 +2533,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, Nd4jLong *indices, void *values,
|
|||
throw std::runtime_error("sortCooIndices:: Not implemented yet");
|
||||
}
|
||||
|
||||
|
||||
Nd4jLong encodeBitmap(Nd4jPointer *extraPointers,
|
||||
void *dx, Nd4jLong *hXShapeInfo,
|
||||
Nd4jLong N,
|
||||
int *dz,
|
||||
float threshold) {
|
||||
try {
|
||||
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
|
||||
int *resultPointer = reinterpret_cast<int *>(extraPointers[2]);
|
||||
int *reductionPointer = reinterpret_cast<int *>(extraPointers[3]);
|
||||
|
||||
dim3 launchDims(512, 512, 32768);
|
||||
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
|
||||
BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
|
||||
(launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold),
|
||||
LIBND4J_TYPES);
|
||||
|
||||
sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
|
||||
|
||||
Nd4jLong dZ = (Nd4jLong) resultPointer[0];
|
||||
resultPointer[0] = 0;
|
||||
|
||||
return dZ;
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void decodeBitmap(Nd4jPointer *extraPointers,
|
||||
void *dx,
|
||||
Nd4jLong N,
|
||||
void *dz, Nd4jLong *zShapeInfo) {
|
||||
try {
|
||||
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
|
||||
dim3 launchDims(512, 512, 16384);
|
||||
auto xType = sd::ArrayOptions::dataType(zShapeInfo);
|
||||
BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
|
||||
|
||||
sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
|
||||
} catch (std::exception &e) {
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
|
||||
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -207,7 +207,7 @@ namespace sd {
|
|||
}
|
||||
|
||||
void Environment::setMaxSpecialyMemory(uint64_t maxBytes) {
|
||||
_maxTotalSpecialMemory;
|
||||
_maxTotalSpecialMemory = maxBytes;
|
||||
}
|
||||
|
||||
void Environment::setMaxDeviceMemory(uint64_t maxBytes) {
|
||||
|
|
|
@ -217,10 +217,27 @@ namespace sd {
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
|
||||
*/
|
||||
template<typename T>
|
||||
__global__ static void execEncoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
|
||||
auto x = reinterpret_cast<T *> (dx);
|
||||
auto z = reinterpret_cast<int *> (dz);
|
||||
|
||||
encoderKernelP1<T>(dx, N, dz, threshold);
|
||||
//basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
|
||||
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
|
||||
int bp=__syncthreads_count(pass);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
// saving out per-block passes
|
||||
z[blockIdx.x+1] = bp;
|
||||
|
||||
// saving out sum
|
||||
atomicAdd(&z[0], bp);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -230,13 +247,74 @@ __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, voi
|
|||
execEncoderKernelP1<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz, threshold);
|
||||
sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed");
|
||||
}
|
||||
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES);
|
||||
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/*
 * Phase 3 of threshold encoding: stream compaction. Each thread that owns an
 * over-threshold element writes its (1-based, sign-carrying) index into the
 * encoded buffer z, at a slot computed from warp-level prefix sums plus the
 * per-block offset, and subtracts the threshold from the gradient in place.
 *
 * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
 *
 * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
 *
 * NOTE: a stray delegating call to the (removed) device helper encoderKernelP3<T>
 * was dropped here — it would have applied the whole encoding pass twice.
 */
template<typename T>
__global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
    auto x = reinterpret_cast<T *> (dx);
    auto z = reinterpret_cast<int *> (dz);

    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    extern __shared__ int warpTotals[];

    // fetch encoded header (limit, threshold bits) and block offset only once
    __shared__ float threshold;
    __shared__ FloatBits fb;
    __shared__ int bo;
    __shared__ int limit;
    if (threadIdx.x == 0) {
        limit = z[0];
        fb.i_ = z[2];
        threshold = fb.f_;
        bo = offsets[blockIdx.x];
    }
    __syncthreads();

    // out-of-limit threads do not play here
    auto value = tid < N ? x[tid] : (T) 0.f;

    // out-of-limit threads just declare they have no changes
    auto pred = tid >= N ? 0 : sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
    auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS
    auto t_i = threadIdx.x % warpSize; // thread index within a warp
    unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required)

    int b = __ballot_sync(t_m, pred); // ballot: bits set for threads (up to this lane) whose pred is true
    auto t_u = __popc(b); // popc counts the set bits, i.e. number of predicated-true lanes BEFORE MY INDEX

    // last lane of each warp publishes the warp's total count
    if (t_i == warpSize - 1)
        warpTotals[w_i] = t_u + pred;

    __syncthreads();

    // bitwise exclusive scan of per-warp totals (totals fit in 6 bits: warp size <= 32 threads each)
    int w_i_u = 0;
    for (int j = 0; j <= 5; j++) {
        unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
        w_i_u += (__popc(b_j) << j);
    }

    // we just ignore all results coming from non-0 threads
    if (w_i == 0 && t_i < blockDim.x / warpSize)
        warpTotals[t_i] = w_i_u;

    __syncthreads();

    // pred is always false if we're out-of-limits
    if (pred) {
        // +4 skips the int header [limit, length, threshold-bits, flag]
        int idx = t_u + warpTotals[w_i] + bo + 4;
        if (idx < limit + 4) {
            // store 1-based index; sign encodes the sign of the original value
            z[idx] = value > static_cast<T>(0.0f) ? tid + 1 : -(tid + 1);
            x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
        }
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -245,13 +323,38 @@ __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, voi
|
|||
execEncoderKernelP3<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, offsets, N, dz);
|
||||
sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed");
|
||||
}
|
||||
// Explicit instantiations for the P3 launcher; FLOAT_TYPES only — a second
// LIBND4J_TYPES instantiation would duplicate the float-type symbols.
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/*
 * This kernel handles decode from sparse threshold array, to dense array
 *
 * PLEASE NOTE: Z is expected to be memset to 0
 *
 * NOTE: a stray delegating call to the (removed) device helper decoderKernel<T>
 * was dropped here — combined with the inline loop below it would have
 * accumulated every decoded update twice.
 */
template<typename T>
__global__ static void execDecoderKernel(void *dx, Nd4jLong N, void *dz) {
    auto x = reinterpret_cast<int *> (dx);
    auto z = reinterpret_cast<T *> (dz);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // encoded header (element count + raw threshold bits) is read once per block
    __shared__ float threshold;
    __shared__ int limit;

    __shared__ FloatBits fb;
    if (threadIdx.x == 0) {
        limit = x[0];
        fb.i_ = x[2];
        threshold = fb.f_;
    }
    __syncthreads();

    for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
        // encoded entries start after the 4-int header; values are 1-based indices,
        // the sign of the entry carries the sign of the update
        int el = x[e+4];
        int ael = sd::math::nd4j_abs<int>(el) - 1;

        // TODO: investigate, if += would work better here, as in "decoded accumulation"
        z[ael] += el > 0 ? threshold : -threshold;
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -261,14 +364,78 @@ __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void
|
|||
execDecoderKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz);
|
||||
sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed");
|
||||
}
|
||||
// Explicit instantiations for the decoder launcher; FLOAT_TYPES only — a second
// LIBND4J_TYPES instantiation would duplicate the float-type symbols.
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), FLOAT_TYPES);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/*
 * Bitmap encoding: packs 16 elements into one int of dz. For each element, the
 * low 16 bits flag |x| >= threshold (value clamped toward zero by threshold),
 * the high 16 bits flag the "negative half-step" case (threshold/2 <= |x| < threshold,
 * x < 0). The per-block count of encoded elements is accumulated into *scalar.
 *
 * NOTE: a stray delegating call to the (removed) device helper
 * cudaEncodeBitmapKernel<T> was dropped here — it would have encoded twice.
 * NOTE(review): the reductionBuffer parameter is not used in this kernel body.
 */
template<typename T>
__global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
    auto dx = reinterpret_cast<T *>(vdx);
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    T off(0.0f);
    __shared__ int counter;
    __shared__ int *shmem;
    __shared__ T *vals;
    if (threadIdx.x == 0){
        extern __shared__ char mem[];
        shmem = reinterpret_cast<int*>(mem);
        vals = reinterpret_cast<T *>(shmem + blockDim.x);
        counter = 0;
    }
    __syncthreads();

    // round the loop bound up to a multiple of the grid size, so every thread
    // runs the same number of iterations and the __syncthreads() below line up
    Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
    Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);

    for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
        // all threads in block reading stuff
        T val = i < N ? dx[i] : off;
        T abs = sd::math::nd4j_abs<T>(val);

        // +4 skips the int header; 16 elements share one packed int
        int byteId = i / 16 + 4;
        int bitId = i % 16;

        shmem[threadIdx.x] = 0;
        vals[threadIdx.x] = val;

        if (abs >= static_cast<T>(threshold) && i < N) {
            shmem[threadIdx.x] = 1 << (bitId);
            atomicAdd(&counter, 1);
            if (val < static_cast<T>(0.0f)) {
                // sign bit lives in the upper half-word
                shmem[threadIdx.x] |= 1 << (bitId + 16);
                vals[threadIdx.x] += static_cast<T>(threshold);
            } else {
                vals[threadIdx.x] -= static_cast<T>(threshold);
            }
        } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
            // negative half-step: only the sign bit is set
            atomicAdd(&counter, 1);
            shmem[threadIdx.x] = 1 << (bitId + 16);

            vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
        }
        __syncthreads();

        // one lane per group of 16 merges the group's bit flags into the output int
        if (threadIdx.x % 16 == 0 && i < N) {
            int byte = 0;
            for (int e = 0; e < 16; e++) {
                if (i + e >= N)
                    continue;

                byte |= shmem[threadIdx.x + e];
            }
            dz[byteId] = byte;
        }
        __syncthreads();

        // write back the clamped residual gradient
        if (i < N)
            dx[i] = vals[threadIdx.x];
    }
    __syncthreads();

    if (threadIdx.x == 0) {
        atomicAdd(scalar, counter);
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -278,14 +445,62 @@ __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
|
|||
execCudaEncodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, N, dz, scalar, reductionBuffer, threshold);
|
||||
sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed");
|
||||
}
|
||||
// Explicit instantiations for the bitmap-encode launcher; FLOAT_TYPES only — a
// second LIBND4J_TYPES instantiation would duplicate the float-type symbols.
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/*
 * Bitmap decoding: inverse of execCudaEncodeBitmapKernel. Reads the packed int
 * for each group of 16 elements and re-applies +/- threshold (or -threshold/2
 * for the negative half-step case) onto dz in place.
 *
 * NOTE: a stray delegating call to the (removed) device helper
 * cudaDecodeBitmapKernel<T> was dropped here — it would have decoded twice.
 * Also removed: an unused local (`lim`) and a dead debug printf.
 */
template<typename T>
__global__ static void execCudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
    auto dz = static_cast<T*>(vdz);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ T *shmem;
    __shared__ FloatBits fb;
    __shared__ float threshold;
    __shared__ int *x;
    if (threadIdx.x == 0){
        extern __shared__ char mem[];
        shmem = reinterpret_cast<T*>(mem);
        x = reinterpret_cast<int *>(dx);
        fb.i_ = x[2];
        threshold = fb.f_;
    }
    __syncthreads();

    for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
        // +4 skips the int header; 16 elements share one packed int
        int byteId = i / 16 + 4;

        shmem[threadIdx.x] = dz[i];
        __syncthreads();

        // one lane per group of 16 applies all bit flags of the group
        if (threadIdx.x % 16 == 0) {
            int byte = x[byteId];

            for (int e = 0; e < 16; e++) {
                if (i + e >= N)
                    continue;

                int bitId = (i + e) % 16;

                bool hasBit = (byte & 1 << (bitId) ) != 0;
                bool hasSign = (byte & 1 << (bitId + 16) ) != 0;

                if (hasBit) {
                    if (hasSign)
                        shmem[threadIdx.x + bitId] -= threshold;
                    else
                        shmem[threadIdx.x + bitId] += threshold;
                } else if (hasSign) {
                    shmem[threadIdx.x + bitId] -= threshold / 2;
                }
            }
        }
        __syncthreads();

        dz[i] = shmem[threadIdx.x];
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -295,7 +510,7 @@ __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
|
|||
execCudaDecodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, vdz);
|
||||
sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed");
|
||||
}
|
||||
// Explicit instantiations for the bitmap-decode launcher; FLOAT_TYPES only — a
// second LIBND4J_TYPES instantiation would duplicate the float-type symbols.
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES);
|
||||
|
||||
|
||||
template <bool storeSum, bool isNP2>
|
||||
|
|
|
@ -106,8 +106,14 @@ namespace sd {
|
|||
auto l = static_cast<int>(N);
|
||||
z[1] = l;
|
||||
|
||||
#ifdef _OPENMP
|
||||
int threads = OmpLaunchHelper::betterThreads(N);
|
||||
int span = OmpLaunchHelper::betterSpan(N, threads);
|
||||
auto span = OmpLaunchHelper::betterSpan(N, threads);
|
||||
#else
|
||||
int threads = 1;
|
||||
auto span = N;
|
||||
#endif
|
||||
|
||||
|
||||
T tt = static_cast<T>(threshold);
|
||||
T mtt = -tt;
|
||||
|
@ -209,21 +215,23 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
|
|||
samediff::Threads::parallel_for(func, 0, N);
|
||||
};
|
||||
|
||||
template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
#ifndef __CLION_IDE__
|
||||
BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES)
|
||||
|
|
|
@ -110,29 +110,6 @@ namespace sd {
|
|||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
/*
 * Phase 1 of threshold encoding: counts eligible (|x| >= threshold) elements.
 * Writes a per-block count into z[blockIdx.x + 1] and atomically accumulates
 * the grand total into z[0].
 *
 * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
 */
template<typename T>
__device__ inline void encoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
    auto x = reinterpret_cast<T *> (dx);
    auto z = reinterpret_cast<int *> (dz);

    //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
    Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;

    int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
    // __syncthreads_count: barrier that also sums `pass` over the whole block
    int bp=__syncthreads_count(pass);

    if (threadIdx.x == 0) {
        // saving out per-block passes
        z[blockIdx.x+1] = bp;

        // saving out sum
        atomicAdd(&z[0], bp);
    }
}
|
||||
|
||||
// Returns 2^e (for 0 <= e < 31) via bit shift.
__device__ __inline__ int pow2i (int e){
    return 1<<e;
}
|
||||
|
@ -140,274 +117,21 @@ namespace sd {
|
|||
template<typename T>
|
||||
__host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold);
|
||||
|
||||
/*
 * Phase 3 of threshold encoding (device-helper variant): stream compaction.
 * Each thread owning an over-threshold element writes its 1-based,
 * sign-carrying index into z (after the 4-int header) at a slot derived from
 * warp-level prefix sums plus the per-block offset, then subtracts the
 * threshold from the gradient in place.
 *
 * PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
 *
 * Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
 */
template<typename T>
__device__ inline void encoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
    T *x = reinterpret_cast<T *> (dx);
    int *z = reinterpret_cast<int *> (dz);

    Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
    extern __shared__ int warpTotals[];

    // fetch block offset only once
    __shared__ float threshold;
    __shared__ FloatBits fb;
    __shared__ int bo;
    __shared__ int limit;
    if (threadIdx.x == 0) {
        limit = z[0];
        fb.i_ = z[2];
        threshold = fb.f_;
        bo = offsets[blockIdx.x];
    }
    __syncthreads();

    if (tid < N) {
        T value = x[tid];
        int pred = sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
        int w_i = threadIdx.x/warpSize; //warp index
        int w_l = tid % warpSize;//thread index within a warp
        unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required)

        int b = __ballot_sync(t_m, pred); //ballot: bits set for threads (up to this lane) whose pred is true
        int t_u = __popc(b); // popc counts the set bits: number predicated true BEFORE MY INDEX

        // last lane of each warp publishes the warp's total
        if(w_l==warpSize-1){
            warpTotals[w_i]=t_u+pred;
        }
//        __syncthreads(); // Eliminated due RTX20xx specific
        // NOTE(review): the eliminated barriers mean warp 0 reads warpTotals[]
        // written by other warps without synchronization — flagged as an
        // RTX20xx-specific adaptation; confirm safety on other architectures.

        // warp 0 performs a bitwise exclusive scan over per-warp totals
        if(w_i==0 && w_l<blockDim.x/warpSize){
            int w_i_u=0;
            for(int j=0;j<=5;j++){
                unsigned int b_j =__ballot_sync(t_m, warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
                w_i_u += (__popc(b_j) << j);
            }
            warpTotals[w_l]=w_i_u;
        }

//        __syncthreads(); // Eliminated due RTX20xx specific

        if(pred){
            // +4 skips the int header [limit, length, threshold-bits, flag]
            int idx = t_u + warpTotals[w_i] + bo + 4;
            if (idx < limit + 4) {
                // store 1-based index; sign encodes the sign of the original value
                z[idx]= value > static_cast<T>(0.0f) ? tid+1 : -(tid + 1);
                x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
            }
        }
    }
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz);
|
||||
|
||||
|
||||
/*
 * This kernel handles decode from sparse threshold array, to dense array
 *
 * PLEASE NOTE: Z is expected to be memset to 0
 *
 * Header layout read here: x[0] = number of encoded elements, x[2] = raw float
 * bits of the threshold; encoded entries start at x[4].
 */
template<typename T>
__device__ inline void decoderKernel(void *dx, Nd4jLong N, void *dz) {
    auto x = reinterpret_cast<int *> (dx);
    auto z = reinterpret_cast<T *> (dz);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float threshold;
    __shared__ int limit;

    __shared__ FloatBits fb;
    if (threadIdx.x == 0) {
        limit = x[0];
        fb.i_ = x[2];
        threshold = fb.f_;
    }
    __syncthreads();

    // grid-sized stride over the encoded entries
    for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
        // entries are 1-based indices; the sign carries the sign of the update
        int el = x[e+4];
        int ael = sd::math::nd4j_abs<int>(el) - 1;

        // TODO: investigate, if += would work better here, as in "decoded accumulation"
        z[ael] += el > 0 ? threshold : -threshold;
    }
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Bitmap encoding (device-helper variant): packs 16 elements into one int of dz.
// Low 16 bits flag |x| >= threshold; high 16 bits flag the sign / the
// "negative half-step" case (threshold/2 <= |x| < threshold, x < 0). Residual
// gradients are written back into vdx; the per-block count of encoded elements
// is atomically accumulated into *scalar.
// NOTE(review): reductionBuffer is not used in this function body.
template<typename T>
__device__ inline void cudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {

    auto dx = reinterpret_cast<T *>(vdx);
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    T off(0.0f);
    __shared__ int counter;
    __shared__ int *shmem;
    __shared__ T *vals;
    if (threadIdx.x == 0){
        extern __shared__ char mem[];
        shmem = reinterpret_cast<int*>(mem);
        vals = reinterpret_cast<T *>(shmem + blockDim.x);
        counter = 0;
    }
    __syncthreads();

    // round the loop bound up to a multiple of the grid size, so every thread
    // runs the same number of iterations and the __syncthreads() below line up
    Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
    Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);

    for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
        // all threads in block reading stuff
        T val = i < N ? dx[i] : off;
        T abs = sd::math::nd4j_abs<T>(val);

        // +4 skips the int header; 16 elements share one packed int
        int byteId = i / 16 + 4;
        int bitId = i % 16;

        shmem[threadIdx.x] = 0;
        vals[threadIdx.x] = val;

        if (abs >= static_cast<T>(threshold) && i < N) {
            shmem[threadIdx.x] = 1 << (bitId);
            atomicAdd(&counter, 1);
            if (val < static_cast<T>(0.0f)) {
                // sign bit lives in the upper half-word
                shmem[threadIdx.x] |= 1 << (bitId + 16);
                vals[threadIdx.x] += static_cast<T>(threshold);
            } else {
                vals[threadIdx.x] -= static_cast<T>(threshold);
            }
        } else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
            // negative half-step: only the sign bit is set
            atomicAdd(&counter, 1);
            shmem[threadIdx.x] = 1 << (bitId + 16);

            vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
        }
        __syncthreads();

        // one lane per group of 16 merges the group's bit flags into the output int
        if (threadIdx.x % 16 == 0 && i < N) {
            int byte = 0;
            for (int e = 0; e < 16; e++) {
                if (i + e >= N)
                    continue;

                byte |= shmem[threadIdx.x + e];
            }
            dz[byteId] = byte;
        }
        __syncthreads();

        // write back the clamped residual gradient
        if (i < N)
            dx[i] = vals[threadIdx.x];
    }
    __syncthreads();

    if (threadIdx.x == 0) {
        atomicAdd(scalar, counter);
    }
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Bitmap decoding (device-helper variant): inverse of cudaEncodeBitmapKernel.
// Reads the packed int for each group of 16 elements and re-applies
// +/- threshold (or -threshold/2 for the negative half-step case) onto vdz in place.
// Header layout: x[2] holds the raw float bits of the threshold; packed data
// starts at x[4].
template<typename T>
__device__ inline void cudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {

    auto dz = static_cast<T*>(vdz);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ T *shmem;
    __shared__ FloatBits fb;
    __shared__ float threshold;
    __shared__ int *x;
    if (threadIdx.x == 0){
        extern __shared__ char mem[];
        shmem = reinterpret_cast<T*>(mem);
        x = reinterpret_cast<int *>(dx);
        fb.i_ = x[2];
        threshold = fb.f_;
    }
    __syncthreads();

    // NOTE(review): `lim` is computed but never used below.
    int lim = N / 16 + 5;
    for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
        // +4 skips the int header; 16 elements share one packed int
        int byteId = i / 16 + 4;

        shmem[threadIdx.x] = dz[i];
        __syncthreads();

        // one lane per group of 16 applies all bit flags of the group
        if (threadIdx.x % 16 == 0) {
            int byte = x[byteId];

            for (int e = 0; e < 16; e++) {
                if (i + e >= N)
                    continue;

                int bitId = (i + e) % 16;

                bool hasBit = (byte & 1 << (bitId) ) != 0;
                bool hasSign = (byte & 1 << (bitId + 16) ) != 0;

                if (hasBit) {
                    if (hasSign)
                        shmem[threadIdx.x + bitId] -= threshold;
                    else
                        shmem[threadIdx.x + bitId] += threshold;
                } else if (hasSign) {
                    shmem[threadIdx.x + bitId] -= threshold / 2;
                }
            }
        }
        __syncthreads();

        dz[i] = shmem[threadIdx.x];
    }
}
|
||||
|
||||
template<typename T>
|
||||
__host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz);
|
||||
|
||||
// __global__ void cudaEncodeBitmapFloat(float *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
|
||||
|
||||
// __global__ void cudaEncodeBitmapDouble(double *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
|
||||
|
||||
// __global__ void cudaEncodeBitmapHalf(float16 *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
|
||||
|
||||
// __global__ void cudaDecodeBitmapFloat(void *dx, Nd4jLong N, float *dz);
|
||||
|
||||
// __global__ void cudaDecodeBitmapDouble(void *dx, Nd4jLong N, double *dz);
|
||||
|
||||
// __global__ void cudaDecodeBitmapHalf(void *dx, Nd4jLong N, float16 *dz);
|
||||
|
||||
// __global__ void encoderKernelP1Float(void *dx, Nd4jLong N, void *dz, float threshold);
|
||||
|
||||
// __global__ void encoderKernelP1Double(void *dx, Nd4jLong N, void *dz, float threshold);
|
||||
|
||||
// __global__ void encoderKernelP1Half(void *dx, Nd4jLong N, void *dz, float threshold);
|
||||
|
||||
// __global__ void encoderKernelP2Float(int *dx, Nd4jLong N, int *dz);
|
||||
|
||||
// __global__ void encoderKernelP3Float(void *dx, int *offsets, Nd4jLong N, void *dz);
|
||||
|
||||
// __global__ void encoderKernelP3Double(void *dx, int *offsets, Nd4jLong N, void *dz);
|
||||
|
||||
// __global__ void encoderKernelP3Half(void *dx, int *offsets, Nd4jLong N, void *dz);
|
||||
|
||||
// __global__ void decoderKernelFloat(void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
// __global__ void decoderKernelDouble(void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
// __global__ void decoderKernelHalf(void *dx, Nd4jLong N, void *dz);
|
||||
|
||||
__global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex);
|
||||
|
||||
template <bool storeSum, bool isNP2>
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <ops/declarable/headers/boolean.h>
|
||||
#include <ops/declarable/headers/broadcastable.h>
|
||||
#include <ops/declarable/headers/convo.h>
|
||||
#include <ops/declarable/headers/compression.h>
|
||||
#include <ops/declarable/headers/list.h>
|
||||
#include <ops/declarable/headers/recurrent.h>
|
||||
#include <ops/declarable/headers/transforms.h>
|
||||
|
|
|
@ -138,9 +138,9 @@ DECLARE_SHAPE_FN(matmul) {
|
|||
//////////////////////////////////////////////////////////////////////
|
||||
// Type constraints for matmul: integer inputs/outputs are allowed in addition
// to floats (part of the integer-matmul support in this change set).
// The diff residue contained both the old chain (terminated with ';') and the
// new chain (starting with '->'), which is a syntax error; only the final
// version is kept here.
DECLARE_TYPES(matmul) {
    getOpDescriptor()
            ->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS})
            ->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS})
            ->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS});
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author George A. Shulinok <sgazeos@gmail.com>
|
||||
//
|
||||
|
||||
#include <system/op_boilerplate.h>
|
||||
#include <ops/declarable/CustomOperations.h>
|
||||
#include <ops/declarable/helpers/compression.h>
|
||||
|
||||
#if NOT_EXCLUDED(OP_decode_bitmap)
namespace sd {
    namespace ops {
        // decode_bitmap: applies a bitmap-encoded update vector (input 1) onto
        // the target float array (input 0 / output 0) via the platform helper.
        // Inputs: 0 - target array; 1 - INT32 encoded vector.
        CUSTOM_OP_IMPL(decode_bitmap, 2, 1, true, 0, 0) {
            auto encoded = INPUT_VARIABLE(1);
            auto updates = OUTPUT_VARIABLE(0);

            helpers::decodeBitmap(block.launchContext(), encoded, updates);
            return Status::OK();
        }

        // Output shape equals the shape of the target array (input 0).
        DECLARE_SHAPE_FN(decode_bitmap) {
            auto weights = INPUT_VARIABLE(0);

            return SHAPELIST(weights->shapeInfo());
        }

        DECLARE_TYPES(decode_bitmap) {
            getOpDescriptor()
                    ->setAllowedInputTypes(0, {ALL_FLOATS})
                    ->setAllowedInputTypes(1, DataType::INT32)
                    ->setAllowedOutputTypes({ALL_FLOATS});
        }
    }
}
#endif
|
||||
|
||||
#if NOT_EXCLUDED(OP_encode_bitmap)
namespace sd {
    namespace ops {
        // encode_bitmap: bitmap compression of a gradient/update array.
        // Input:  0 - float array (updated in place / output 0).
        // T args: 0 - threshold.
        // Outputs: 0 - residual array; 1 - INT32 encoded vector (len/16 + 5);
        //          2 - INT32 scalar with the number of encoded elements.
        CUSTOM_OP_IMPL(encode_bitmap, 1, 3, true, 1, 0) {
            auto input = INPUT_VARIABLE(0);
            auto encoded = OUTPUT_NULLIFIED(1);
            auto counter = OUTPUT_NULLIFIED(2);

            float threshold = T_ARG(0);

            // header: [0]/[1] = original length, [2] = raw float bits of
            // threshold, [3] = encoding-type flag
            encoded->p(0, (int) input->lengthOf());
            encoded->p(1, (int) input->lengthOf());
            encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
            encoded->p(3, 1); // flag for BITMAP_ENCODING

            auto result = helpers::encodeBitmap(block.launchContext(), input, encoded, threshold);
            counter->p(0, result);
            counter->syncToDevice();

            return Status::OK();
        }

        DECLARE_SHAPE_FN(encode_bitmap) {
            auto input = inputShape->at(0);

            // 16 elements per packed int, plus 4 header ints (+1 rounding slack)
            auto outputLength = shape::length(input) / 16 + 5;
            auto encodedShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outputLength, DataType::INT32);
            auto encodedCounter = ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT32);
            return SHAPELIST(input, encodedShape, encodedCounter);
        }

        DECLARE_TYPES(encode_bitmap) {
            // FIX: the op takes a single input; the INT32 constraints at
            // indices 1 and 2 belong to the outputs (encoded vector, counter),
            // not to non-existent inputs.
            getOpDescriptor()
                    ->setAllowedInputTypes(sd::DataType::ANY)
                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(1, DataType::INT32)
                    ->setAllowedOutputTypes(2, DataType::INT32);
        }
    }
}
#endif
|
|
@ -0,0 +1,104 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author raver119@gmail.com
|
||||
//
|
||||
|
||||
#include <system/op_boilerplate.h>
|
||||
#include <ops/declarable/CustomOperations.h>
|
||||
#include <ops/declarable/helpers/threshold.h>
|
||||
|
||||
namespace sd {
    namespace ops {
        // encode_threshold: sparse (index-based) threshold compression of a
        // gradient array.
        // Input:  0 - float gradients (also output 0, updated in place).
        // T args: 0 - threshold.
        // Outputs: 0 - residual gradients; 1 - INT32 encoded vector
        //          (4-int header + one entry per encoded element).
        CUSTOM_OP_IMPL(encode_threshold, 1, 2, true, 1, 0) {
            auto x = INPUT_VARIABLE(0);
            auto updated = OUTPUT_VARIABLE(0);
            auto encoded = OUTPUT_NULLIFIED(1);

            float threshold = T_ARG(0);

            REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
            REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "encode_threshold: array for encoded updates can't have less than 4 elements");
//            REQUIRE_TRUE(x->platformBuffer() == updated->platformBuffer(), 0, "encode_threshold: gradients array must be the same at input and output");

            // filling header bytes: [0] = capacity (encoded length minus header),
            // [1] = gradients length, [2] = raw float bits of threshold
            // (type-punned via reinterpret_cast), [3] = encoding-type flag
            encoded->p(0, encoded->lengthOf() - 4);
            encoded->p(1, (int) x->lengthOf());
            encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
            encoded->p(3, 0); // flag for FLEXIBLE_ENCODING

            // if there's no updates to process - just skip execution
            if (encoded->lengthOf() == 4)
                return Status::OK();

            helpers::thresholdEncode(*x, *encoded, threshold);

            return Status::OK();
        }

        // Shape of output 1 depends on the data: number of elements with
        // |x| >= threshold, optionally capped by I_ARG(0) (the boundary).
        DECLARE_SHAPE_FN(encode_threshold) {
            auto x = INPUT_VARIABLE(0);
            // we have limit option here
            int boundary = block.numI() > 0 ? I_ARG(0) : DataTypeUtils::max<int>();
            float threshold = T_ARG(0);

            REQUIRE_TRUE(boundary >= 0, 0, "encode_threshold: boundary must be positive");
            REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");

            // we must calculate number of elements that >= threshold
            auto elements = sd::math::nd4j_min<int>(helpers::thresholdEstimate(*x, threshold), boundary);
            // threshold encoding is not allowed for fewer than 2 elements
            if (elements < 2)
                elements = 0;

            // result array must have 4 additional int elements for header
            return SHAPELIST(x->shapeInfo(), sd::ConstantShapeHelper::getInstance()->vectorShapeInfo(elements + 4, DataType::INT32));
        }

        DECLARE_TYPES(encode_threshold) {
            getOpDescriptor()
                    ->setAllowedInputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(0, {ALL_FLOATS})
                    ->setAllowedOutputTypes(1, DataType::INT32);
        }

        // decode_threshold: applies a threshold-encoded update vector
        // (input 1) onto the target float array. Header is validated first:
        // [1] must match the target length, [3] must be the FLEXIBLE_ENCODING flag.
        CUSTOM_OP_IMPL(decode_threshold, 2, 1, true, 0, 0) {
            auto weights = INPUT_VARIABLE(0);
            auto encoded = INPUT_VARIABLE(1);
            auto updates = OUTPUT_VARIABLE(0);

            REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "decode_threshold: encoded array can't have length < 4");
            REQUIRE_TRUE(updates->lengthOf() == encoded->e<int>(1), 0, "decode_threshold: updates array must have length equal to [%i]", encoded->e<int>(1));
            REQUIRE_TRUE(encoded->e<int>(3) == 0, 0, "decode_threshold: encoded array doesn't look like threshold-encoded");

            helpers::thresholdDecode(*encoded, *updates);

            return Status::OK();
        }

        // Output shape equals the shape of the target array (input 0).
        DECLARE_SHAPE_FN(decode_threshold) {
            auto weights = inputShape->at(0);
            return SHAPELIST(weights);
        }

        DECLARE_TYPES(decode_threshold) {
            getOpDescriptor()
                    ->setAllowedInputTypes(0, {ALL_FLOATS})
                    ->setAllowedInputTypes(1, DataType::INT32)
                    ->setAllowedOutputTypes(0,{ALL_FLOATS});
        }
    }
}
|
|
@ -0,0 +1,62 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author sgazeos@gmail.com
|
||||
//
|
||||
#ifndef SD_HEADERS_COMPRESSION_H
|
||||
#define SD_HEADERS_COMPRESSION_H
|
||||
|
||||
#include <ops/declarable/headers/common.h>
|
||||
|
||||
namespace sd {
|
||||
namespace ops {
|
||||
|
||||
/**
|
||||
* encode_bitmap - reinterpret 3D float tensor into uint8_t vector with length N.
|
||||
*
|
||||
* Input:
|
||||
* 0 - 3D float tensor with shape {height, width, channels}
|
||||
*
|
||||
* Output:
|
||||
* 0 - 1D uint8_t tensor with shape {N}
|
||||
*/
|
||||
#if NOT_EXCLUDED(OP_encode_bitmap)
|
||||
DECLARE_CUSTOM_OP(encode_bitmap, 1, 3, true, 1, 0);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* decode_bitmap - reinterpret uint8_t linear tensor as data to float tensor with shape
|
||||
*
|
||||
* Input:
|
||||
* 0 - uint8_t vector with length N ( shape {N})
|
||||
*
|
||||
* Output:
|
||||
* 0 - 3D tensor with shape {height, width, channels}
|
||||
*
|
||||
*/
|
||||
#if NOT_EXCLUDED(OP_decode_bitmap)
|
||||
DECLARE_CUSTOM_OP(decode_bitmap, 2, 1, true, 0, 0);
|
||||
#endif
|
||||
|
||||
|
||||
DECLARE_CUSTOM_OP(encode_threshold, 2, 1, true, 1, 0);
|
||||
DECLARE_CUSTOM_OP(decode_threshold, 2, 1, true, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // SD_HEADERS_COMPRESSION_H
|
|
@ -0,0 +1,34 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author sgazeos@gmail.com
|
||||
//
|
||||
#ifndef __COMPRESSION_H_HELPERS__
#define __COMPRESSION_H_HELPERS__
#include <system/op_boilerplate.h>
#include <array/NDArray.h>

namespace sd {
namespace ops {
namespace helpers {

    // Applies a bitmap-encoded delta (input) onto output, writing directly into output's buffer.
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output);

    // Bitmap-encodes input into output with the given threshold; input is modified in-place
    // (encoded values are reduced). Returns the number of encoded elements.
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold);
}
}
}
#endif
|
|
@ -0,0 +1,37 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author sgazeos@gmail.com
|
||||
//
|
||||
#include <ops/declarable/helpers/compression.h>
|
||||
#include <execution/Threads.h>
|
||||
|
||||
namespace sd {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
    // CPU bitmap decode: delegates to the legacy NativeOpExecutioner kernel, reading the
    // encoded ints from input's host buffer and writing decoded floats into output.
    // The context argument is unused on CPU (kept for signature parity with CUDA).
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
        NativeOpExecutioner::decodeBitmap(input->buffer(), output->lengthOf(), output->buffer(), output->shapeInfo());
    }
|
||||
|
||||
|
||||
    // CPU bitmap encode: delegates to the legacy NativeOpExecutioner kernel.
    // Returns the number of elements that were encoded; input is modified in-place.
    // The context argument is unused on CPU (kept for signature parity with CUDA).
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
        return NativeOpExecutioner::encodeBitmap(input->buffer(), input->shapeInfo(), input->lengthOf(), output->bufferAsT<int>(), threshold);
    }
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author raver119@gmail.com
|
||||
//
|
||||
|
||||
#include <ops/declarable/helpers/threshold.h>
|
||||
#include <execution/Threads.h>
|
||||
#include <helpers/threshold.h>
|
||||
|
||||
namespace sd {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
    // Counts elements of updates with |value| >= threshold, as a parallel reduction
    // over the flat host buffer.
    template <typename T>
    static int32_t thresholdEstimate_(const NDArray &updates, const float threshold) {
        auto N = updates.lengthOf();
        const auto buffer = updates.bufferAsT<T>();

        // per-span partial count of elements crossing the threshold
        auto func = PRAGMA_REDUCE_LONG {
            int64_t cnt = 0;
            for (auto e = start; e < stop; e++) {
                auto v = sd::math::nd4j_abs<T>(buffer[e]);
                if (v >= threshold)
                    cnt++;
            }

            return cnt;
        };

        // partials are combined by summation; the total is narrowed to int32_t by the
        // return type (callers elsewhere enforce length <= MAX_INT)
        return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N);
    }
|
||||
|
||||
    // Type-dispatching entry point for the threshold estimate.
    int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
        BUILD_SINGLE_SELECTOR(updates.dataType(), return thresholdEstimate_, (updates, threshold), FLOAT_TYPES);

        // fallback for the case when the selector macro does not match any float type
        return 0;
    }
|
||||
|
||||
    // CPU threshold encode: delegates to the legacy TypeCast converter.
    // NOTE(review): the threshold argument is not forwarded here — presumably
    // convertToThreshold reads it from the pre-filled encoded header; confirm.
    void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
        BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertToThreshold, (nullptr, updates.buffer(), updates.lengthOf(), encoded.buffer()), FLOAT_TYPES);
    }
|
||||
|
||||
    // CPU threshold decode: delegates to the legacy TypeCast converter, applying the
    // encoded delta onto updates' host buffer.
    void thresholdDecode(const NDArray &encoded, NDArray &updates) {
        BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertFromThreshold, (nullptr, encoded.getBuffer(), updates.lengthOf(), updates.buffer()), FLOAT_TYPES);
    }
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author sgazeos@gmail.com
|
||||
//
|
||||
#include <ops/declarable/helpers/compression.h>
|
||||
#include <loops/type_conversions.h>
|
||||
#include <helpers/DebugHelper.h>
|
||||
|
||||
namespace sd {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
    // CUDA bitmap decode: launches the generic decode kernel over the device buffers.
    // Decoded values are written into output's special (device) buffer in-place.
    void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
        auto stream = context->getCudaStream();
        NDArray::prepareSpecialUse({output}, {input});

        // fixed launch geometry; shared memory sized for the decode kernel
        dim3 launchDims(512, 512, 16384);
        auto xType = output->dataType();
        BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, input->specialBuffer(), output->lengthOf(), output->specialBuffer()), FLOAT_TYPES);

        sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");

        NDArray::registerSpecialUse({output}, {input});
    }
|
||||
|
||||
    // CUDA bitmap encode: launches the generic encode kernel and returns the number of
    // encoded elements, read back through the pinned scalar pointer.
    // NOTE(review): the context parameter is ignored in favor of defaultContext() — confirm intent.
    Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
        auto stream = LaunchContext::defaultContext()->getCudaStream();
        int *resultPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getScalarPointer());
        int *reductionPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getReductionPointer());

        // nullify result pointer before use
        resultPointer[0] = 0;

        // NOTE(review): here output/input are listed as read, while registerSpecialUse below
        // lists them as written — the asymmetry looks accidental; verify against NDArray docs.
        NDArray::prepareSpecialUse({},{output, input});

        dim3 launchDims(512, 512, 32768);
        auto xType = input->dataType();
        BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
                              (launchDims, stream, input->specialBuffer(), input->lengthOf(), reinterpret_cast<int*>(output->specialBuffer()), resultPointer, reductionPointer, threshold),
                              FLOAT_TYPES);

        sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");

        // fetch the match count, then reset the shared scalar slot for the next caller
        Nd4jLong dZ = (Nd4jLong) resultPointer[0];
        resultPointer[0] = 0;

        NDArray::registerSpecialUse({output, input}, {});
        return dZ;
    }
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,231 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author raver119@gmail.com
|
||||
//
|
||||
|
||||
#include <ops/declarable/helpers/threshold.h>
|
||||
#include <loops/type_conversions.h>
|
||||
#include <helpers/PointersManager.h>
|
||||
#include <vector>
|
||||
|
||||
namespace sd {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
    // Recursive work-efficient parallel prefix-sum (exclusive scan) of dX into dZ on the GPU,
    // following the classic GPU Gems 3 "scan" scheme: scan each block, recursively scan the
    // per-block sums (g_scanBlockSums[level]), then uniformly add them back.
    // g_scanBlockSums holds one pre-allocated device array per recursion level.
    void prescanArrayRecursive(int** g_scanBlockSums, int *dZ, int *dX, int numElements, int level) {
        auto stream = LaunchContext::defaultContext()->getCudaStream();

        int blockSize = 512; // max size of the thread blocks
        int numBlocks = sd::math::nd4j_max<int>(1, static_cast<int>(ceil(static_cast<float>(numElements) / (2.f * blockSize))));
        int numThreads;

        // each thread scans two elements, hence the /2 and pow2 rounding below
        if (numBlocks > 1)
            numThreads = blockSize;
        else if (sd::isPowerOfTwo(numElements))
            numThreads = numElements / 2;
        else
            numThreads = sd::floorPow2(numElements);

        int numEltsPerBlock = numThreads * 2;

        // if this is a non-power-of-2 array, the last block will be non-full
        // compute the smallest power of 2 able to compute its scan.
        int numEltsLastBlock =
                numElements - (numBlocks-1) * numEltsPerBlock;
        int numThreadsLastBlock = sd::math::nd4j_max<int>(1, numEltsLastBlock / 2);
        int np2LastBlock = 0;
        int sharedMemLastBlock = 0;

        if (numEltsLastBlock != numEltsPerBlock) {
            np2LastBlock = 1;

            if(!isPowerOfTwo(numEltsLastBlock))
                numThreadsLastBlock = floorPow2(numEltsLastBlock);

            unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
            sharedMemLastBlock = sizeof(int) * (2 * numThreadsLastBlock + extraSpace);
        }

        // padding space is used to avoid shared memory bank conflicts
        int extraSpace = numEltsPerBlock / NUM_BANKS;
        int sharedMemSize = sizeof(int) * (numEltsPerBlock + extraSpace);

        // setup execution parameters
        // if NP2, we process the last block separately
        dim3 grid(sd::math::nd4j_max<int>(1, numBlocks - np2LastBlock), 1, 1);
        dim3 threads(numThreads, 1, 1);
        dim3 gridOnes(1, 1, 1);
        dim3 threadsOnes(numThreadsLastBlock, 1, 1);

        // enforce a minimum dynamic shared memory allocation for the kernels
        if (sharedMemSize < 2048)
            sharedMemSize = 2048;

        if (sharedMemLastBlock < 2048)
            sharedMemLastBlock = 2048;

        // execute the scan
        if (numBlocks > 1) {
            sd::prescanLauncher<true, false>(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0);
            if (np2LastBlock) {
                sd::prescanLauncher<true, true>(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
            }

            // After scanning all the sub-blocks, we are mostly done. But now we
            // need to take all of the last values of the sub-blocks and scan those.
            // This will give us a new value that must be added to each block to
            // get the final results.
            // recursive (CPU) call
            prescanArrayRecursive(g_scanBlockSums, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1);

            sd::uniformAdd<<<grid, threads, 1024, *stream>>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0);

            if (np2LastBlock) {
                sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
            }
        } else if (isPowerOfTwo(numElements)) {
            // single full block, power-of-two length: no block-sums array needed
            sd::prescanLauncher<false, false>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0);
        } else {
            sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
        }

        sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
    }
|
||||
|
||||
    // Phase 2 of threshold encoding: prefix-scans the per-block match counts into dz.
    // dx + 1 skips element 0 of the counts array (presumably the total count — confirm
    // against the P1 kernel) so only the per-block partials are scanned.
    static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
        auto stream = LaunchContext::defaultContext()->getCudaStream();

        prescanArrayRecursive(reinterpret_cast<int**>(prs), dz, dx + 1, (int) N, 0);
        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
    }
|
||||
|
||||
    // Phase 3 of threshold encoding: given the scanned per-block offsets, each block writes
    // its encoded indices into the proper slice of the output vector dz.
    static void encodeThresholdP3_(void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
        auto stream = LaunchContext::defaultContext()->getCudaStream();

        int blockSize = 512;
        // ceil(N / blockSize) blocks so every element is covered
        int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);

        dim3 launchDims(numBlocks, blockSize, 8192);
        auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
        BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), FLOAT_TYPES);

        sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
    }
|
||||
|
||||
|
||||
static NDArray thresholdEstimate_(const NDArray &updates, const float threshold) {
|
||||
const int numThreads = 512;
|
||||
const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);
|
||||
|
||||
auto tmp = NDArrayFactory::create<int>('c', {numBlocks + 1});
|
||||
|
||||
dim3 launchDims(numBlocks, numThreads, 1024);
|
||||
auto xType = updates.dataType();
|
||||
|
||||
NDArray::prepareSpecialUse({&tmp}, {&updates});
|
||||
BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, LaunchContext::defaultContext()->getCudaStream(), updates.getSpecialBuffer(), updates.lengthOf(), tmp.specialBuffer(), threshold), FLOAT_TYPES);
|
||||
NDArray::registerSpecialUse({&tmp}, {&updates});
|
||||
|
||||
return std::move(tmp);
|
||||
}
|
||||
|
||||
    // Public estimate entry point: element 0 of the P1 result holds the total match count.
    int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
        return thresholdEstimate_(updates, threshold).e<int>(0);
    }
|
||||
|
||||
    // CUDA threshold encode, three phases:
    //   P1 (thresholdEstimate_) - per-block match counts;
    //   P2 (encodeThresholdP2Int_) - prefix-scan of counts into per-block write offsets;
    //   P3 (encodeThresholdP3_) - each block writes its encoded indices at its offset.
    // The scan needs one temporary device array per recursion level; the two do/while
    // loops below first size that set, then allocate it.
    void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
        // we need these blocks in order to know, how many "updates" will be processed by each GPU block
        auto blocks = thresholdEstimate_(updates, threshold);

        const int numThreads = 512;
        const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);

        const int prefixThreads = 512;
        int numElts = numBlocks;
        int level = 0;

        // here we just calculate number of sumBlock arrays
        // NOTE(review): this loop tests `numBlocks > 1` (constant inside the loop), while the
        // allocation loop below tests `numPrefixBlocks > 1` — so `level` can overcount by one,
        // leaving a trailing unused entry in tempArrays/pointers. Looks benign but confirm.
        do {
            int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
            if (numBlocks > 1) {
                level++;
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        std::vector<NDArray> tempArrays(level);
        std::vector<Nd4jPointer> pointers(level);

        level = 0;
        numElts = numBlocks;

        // allocate one int scratch array per scan level and collect their device pointers
        do {
            int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
            if (numPrefixBlocks > 1) {
                tempArrays[level] = std::move(NDArrayFactory::create<int>('c', {numPrefixBlocks}));
                pointers[level] = tempArrays[level++].specialBuffer();
            }
            numElts = numPrefixBlocks;
        } while (numElts > 1);

        // copy the host-side pointer table to the device
        // NOTE(review): `* 8` hard-codes sizeof(Nd4jPointer) on 64-bit — prefer sizeof(Nd4jPointer)
        PointersManager pm(LaunchContext::defaultContext(), "thresholdEncode");
        auto dptr = pm.replicatePointer(pointers.data(), pointers.size() * 8);
        auto offsets = NDArrayFactory::create<int>('c', {numBlocks});

        // we want to check, if we're hitting external limit on number of encoded elements
        // (encoded has room for lengthOf() - 4 indices after the 4-int header)
        auto numMatches = blocks.e<int>(0);
        if (numMatches > encoded.lengthOf() - 4) {
            blocks.p(0, encoded.lengthOf() - 4);
            blocks.syncToDevice();
        }

        NDArray::prepareSpecialUse({}, {&encoded, &updates});

        // filling offsets
        encodeThresholdP2Int_(reinterpret_cast<void **>(dptr),
                              reinterpret_cast<int*>(blocks.getSpecialBuffer()),
                              numBlocks,
                              reinterpret_cast<int*>(offsets.getSpecialBuffer()));

        NDArray::registerSpecialUse({&blocks, &offsets}, {});
        pm.synchronize();

        encodeThresholdP3_(updates.specialBuffer(),
                           updates.shapeInfo(),
                           reinterpret_cast<int*>(offsets.getSpecialBuffer()),
                           updates.lengthOf(),
                           reinterpret_cast<int*>(encoded.specialBuffer()));

        pm.synchronize();

        NDArray::registerSpecialUse({&encoded, &updates}, {});
    }
|
||||
|
||||
    // CUDA threshold decode: launches the generic decoder kernel, applying the encoded
    // delta onto updates' device buffer in-place.
    void thresholdDecode(const NDArray &encoded, NDArray &updates) {
        dim3 launchDims(128, 512, 512);
        auto xType = updates.dataType();

        NDArray::prepareSpecialUse({&updates}, {&encoded});
        BUILD_SINGLE_SELECTOR(xType, decoderKernelGeneric, (launchDims, LaunchContext::defaultContext()->getCudaStream(), encoded.getSpecialBuffer(), updates.lengthOf(), updates.specialBuffer()), FLOAT_TYPES);
        NDArray::registerSpecialUse({&updates}, {&encoded});
    }
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
//
|
||||
// @author raver119@gmail.com
|
||||
//
|
||||
|
||||
#ifndef SD_THRESHOLD_H
#define SD_THRESHOLD_H

#include <ops/declarable/helpers/helpers.h>

namespace sd {
namespace ops {
namespace helpers {
    // Counts elements of updates with |value| >= threshold.
    int32_t thresholdEstimate(const NDArray &updates, float threshold);

    // Threshold codec: encode writes a 4-int header + indices into encoded and reduces
    // updates in-place; decode applies the encoded delta back onto updates.
    void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold);
    void thresholdDecode(const NDArray &encoded, NDArray &updates);
}
}
}

#endif //SD_THRESHOLD_H
|
|
@ -557,21 +557,26 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
|
|||
fb.i_ = x[2];
|
||||
float threshold = fb.f_;
|
||||
|
||||
auto pPos = -1;
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto e = start; e < stop; e++) {
|
||||
const auto v = x[e];
|
||||
for (int bitId = 0; bitId < 16; bitId++) {
|
||||
bool hasBit = (x[e] & 1 << (bitId)) != 0;
|
||||
bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
|
||||
bool hasBit = (v & 1 << (bitId)) != 0;
|
||||
bool hasSign = (v & 1 << (bitId + 16)) != 0;
|
||||
auto cPos = (e - 4) * 16 + bitId;
|
||||
|
||||
if (hasBit) {
|
||||
if (hasSign)
|
||||
dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold);
|
||||
dz[cPos] -= static_cast<T>(threshold);
|
||||
else
|
||||
dz[(e - 4) * 16 + bitId] += static_cast<T>(threshold);
|
||||
dz[cPos] += static_cast<T>(threshold);
|
||||
} else if (hasSign) {
|
||||
dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold / 2);
|
||||
dz[cPos] -= static_cast<T>(threshold / 2);
|
||||
}
|
||||
|
||||
pPos = cPos;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -582,17 +587,21 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
|
|||
template<typename T>
|
||||
Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
|
||||
auto dx = reinterpret_cast<T *>(vx);
|
||||
const T two(2.0f);
|
||||
const T zero(0.0f);
|
||||
const T t(threshold);
|
||||
const T thalf = t / two;
|
||||
|
||||
//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
|
||||
auto func = PRAGMA_REDUCE_LONG {
|
||||
//auto func = PRAGMA_REDUCE_LONG {
|
||||
Nd4jLong retVal = 0L;
|
||||
|
||||
for (auto x = start; x < stop; x += increment) {
|
||||
PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:retVal)
|
||||
for (auto x = 0; x < N; x += 16) {
|
||||
int byte = 0;
|
||||
int byteId = x / 16 + 4;
|
||||
|
||||
for (int f = 0; f < 16; f++) {
|
||||
Nd4jLong e = x + f;
|
||||
auto e = x + f;
|
||||
|
||||
if (e >= N)
|
||||
continue;
|
||||
|
@ -602,19 +611,19 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
|
|||
|
||||
int bitId = e % 16;
|
||||
|
||||
if (abs >= (T) threshold) {
|
||||
if (abs >= t) {
|
||||
byte |= 1 << (bitId);
|
||||
retVal++;
|
||||
|
||||
if (val < (T) 0.0f) {
|
||||
if (val < zero) {
|
||||
byte |= 1 << (bitId + 16);
|
||||
dx[e] += static_cast<T>(threshold);
|
||||
dx[e] += t;
|
||||
} else {
|
||||
dx[e] -= static_cast<T>(threshold);
|
||||
dx[e] -= t;
|
||||
}
|
||||
} else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) {
|
||||
} else if (abs >= thalf && val < zero) {
|
||||
byte |= 1 << (bitId + 16);
|
||||
dx[e] += static_cast<T>(threshold / 2);
|
||||
dx[e] += thalf;
|
||||
|
||||
retVal++;
|
||||
}
|
||||
|
@ -624,8 +633,9 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
|
|||
}
|
||||
|
||||
return retVal;
|
||||
};
|
||||
return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
|
||||
//};
|
||||
|
||||
//return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) {
|
|||
multiplyKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
|
||||
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
|
||||
if (err != 0)
|
||||
sd::cuda_exception::build("multiply failed", err);
|
||||
throw sd::cuda_exception::build("multiply failed", err);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -80,7 +80,7 @@ static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) {
|
|||
sumKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
|
||||
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
|
||||
if (err != 0)
|
||||
sd::cuda_exception::build("sum failed", err);
|
||||
throw sd::cuda_exception::build("sum failed", err);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -103,7 +103,7 @@ static void subLauncher(void *vbuffer, uint64_t length, void *vresult) {
|
|||
subKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
|
||||
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
|
||||
if (err != 0)
|
||||
sd::cuda_exception::build("sub failed", err);
|
||||
throw sd::cuda_exception::build("sub failed", err);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -126,7 +126,7 @@ static void divLauncher(void *vbuffer, uint64_t length, void *vresult) {
|
|||
divKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
|
||||
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
|
||||
if (err != 0)
|
||||
sd::cuda_exception::build("div failed", err);
|
||||
throw sd::cuda_exception::build("div failed", err);
|
||||
}
|
||||
|
||||
static void multiplyHost(NDArray &input, NDArray &output) {
|
||||
|
|
|
@ -42,18 +42,18 @@ endif()
|
|||
# -fsanitize=address
|
||||
# -fsanitize=leak
|
||||
if (APPLE)
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -D__APPLE_OS__=true")
|
||||
elseif(WIN32)
|
||||
if (SD_CPU)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
|
||||
endif()
|
||||
|
||||
if (SD_CPU AND LINUX)
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
|
||||
set(CMAKE_CXX_FLAGS " -fPIC")
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
|
||||
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
|
||||
else()
|
||||
|
@ -82,12 +82,12 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
|||
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
# using GCC
|
||||
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||
endif()
|
||||
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=2")
|
||||
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
|
||||
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <ops/ops.h>
|
||||
#include <helpers/GradCheck.h>
|
||||
#include <array>
|
||||
#include <helpers/RandomLauncher.h>
|
||||
|
||||
|
||||
using namespace sd;
|
||||
|
@ -39,6 +40,195 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
// All three elements exceed threshold 0.5, so the encoding is the 4-int header
// {count=3, length=3, float-bits(0.5)=1056964608, codec-marker=0} followed by signed
// 1-based indices {1, 2, -3} (negative index => negative value). The gradients input
// is reduced in-place by +/- threshold.
TEST_F(DeclarableOpsTests19, test_threshold_encode_1) {
    auto x = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
    auto exp_encoded = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});

    sd::ops::encode_threshold op;
    auto result = op.evaluate({&x}, {0.5});

    auto gradients = result.at(0);
    auto encoded = result.at(1);

    //encoded->printIndexedBuffer("ENC");

    ASSERT_EQ(exp_encoded, *encoded);
    ASSERT_EQ(exp_gradients, x);

    // FIXME: we need to add a way to declare individual inplace outputs
    //ASSERT_EQ(exp_gradients, *gradients);
}
|
||||
|
||||
// Sweeps the number of above-threshold elements (5..34) in a 10000-element array:
// the encoded vector must always be [count + 4-int header] long, and each encoded
// element must be reduced by the threshold in-place.
TEST_F(DeclarableOpsTests19, test_threshold_encode_2) {
    for (int length = 5; length < 35; length++) {
        auto x = NDArrayFactory::create<double>('c', {10000});
        auto exp_gradients = NDArrayFactory::create<double>('c', {10000});

        for (int e = 0; e < length; e++) {
            x.p(e, 2e-3);
            exp_gradients.p(e, 1e-3);
        }

        sd::ops::encode_threshold op;
        auto result = op.evaluate({&x}, {1e-3});

        auto encoded = result.at(1);

        ASSERT_EQ(length + 4, encoded->lengthOf());
        ASSERT_EQ(exp_gradients, x);
    }
}
|
||||
|
||||
// With boundary I-arg 3, only 3 of the 6 qualifying ones may be encoded:
// encoded length is 3 + 4 (header), and the 3 encoded ones drop to 0, so the
// remaining sum of x is 3.
TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_1) {
    auto x = NDArrayFactory::create<float>('c', {6});
    x = 1.0f;

    sd::ops::encode_threshold op;
    auto result = op.evaluate({&x}, {1.0}, {3});

    auto gradients = result.at(0);
    auto encoded = result.at(1);

    ASSERT_EQ(7, encoded->lengthOf());
    ASSERT_EQ(3, x.sumNumber().e<int>(0));
}
|
||||
|
||||
// Same boundary behavior at larger scale: 1000 ones with boundary 100 yields an
// encoded vector of 100 + 4, and 900 un-encoded ones remain in x.
TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_2) {
    auto x = NDArrayFactory::create<float>('c', {1000});
    x = 1.0f;

    sd::ops::encode_threshold op;
    auto result = op.evaluate({&x}, {1.0}, {100});

    auto gradients = result.at(0);
    auto encoded = result.at(1);

    ASSERT_EQ(104, encoded->lengthOf());

    ASSERT_EQ(900, x.sumNumber().e<int>(0));
}
|
||||
|
||||
// Round-trip counterpart of test_threshold_encode_1: applying the encoded vector
// (threshold 0.5, indices {1, 2, -3}) onto the reduced gradients restores the
// original values in-place.
TEST_F(DeclarableOpsTests19, test_threshold_decode_1) {
    auto x = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
    auto y = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
    auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});

    sd::ops::decode_threshold op;
    auto status = op.execute({&x, &y}, {&x});
    ASSERT_EQ(Status::OK(), status);
    ASSERT_EQ(exp_gradients, x);
}
|
||||
|
||||
// Bitmap round-trip on a small array: encoding with threshold 1e-3 zeroes the two
// qualifying elements in-place and reports counter == 2; decoding restores the
// original values.
TEST_F(DeclarableOpsTests19, test_bitmap_encode_1) {
    auto initial = NDArrayFactory::create<float>('c', {6}, {0.0f, 0.0f, 1e-3f, -1e-3f, 0.0f, 0.0f});
    auto exp_0 = initial.like();    // all zeros: state after encoding
    auto exp_1 = initial.dup();     // original values: state after decoding
    auto exp_c = NDArrayFactory::create<int>(2L);

    sd::ops::encode_bitmap enc;
    auto enc_result = enc.evaluate({&initial}, {1e-3f});
    ASSERT_EQ(Status::OK(), enc_result.status());

    //initial.printIndexedBuffer("initial");
    ASSERT_EQ(exp_0, initial);

    auto encoded = enc_result.at(1);
    auto counter = enc_result.at(2);

    //encoded->printIndexedBuffer("encoded");

    ASSERT_EQ(exp_c, *counter);

    sd::ops::decode_bitmap dec;
    auto status = dec.execute({&initial, encoded}, {&initial});
    ASSERT_EQ(Status::OK(), status);


    //initial.printIndexedBuffer();

    ASSERT_EQ(exp_1, initial);
}
|
||||
|
||||
TEST_F(DeclarableOpsTests19, test_bitmap_encode_decode) {
    auto input = NDArrayFactory::create<float>('c', {256000});
    input = 1.0f;
    auto restored = input.dup();   // expected state after decode
    auto residual = input.like();  // expected state after encode: 1.0 - 0.5 everywhere
    residual = 0.5f;

    sd::ops::encode_bitmap enc;
    auto encResult = enc.evaluate({&input}, {0.5f});
    auto encoded = encResult.at(1);

    // with a uniform input all payload words should be identical; report outliers
    for (int e = 5; e < encoded->lengthOf() - 1; e++) {
        if (encoded->e<int>(e) != encoded->e<int>(e - 1))
            nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
    }

    ASSERT_NE(restored, input);
    ASSERT_EQ(residual, input);

    sd::ops::decode_bitmap dec;
    auto status = dec.execute({&input, encoded}, {&input});
    ASSERT_EQ(Status::OK(), status);

    // report any element that did not decode back to 1.0 before the final assert
    for (int e = 0; e < input.lengthOf(); e++) {
        auto f = input.e<float>(e);
        if (f != 1.0f)
            nd4j_printf("initial[%i] = %f\n", e, f);
    }

    ASSERT_EQ(restored, input);
}
|
||||
|
||||
TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) {
    auto input = NDArrayFactory::create<float>('c', {256000});
    input = 1.0f;
    auto restored = input.dup();   // expected state after decode
    auto residual = input.like();  // expected state after encode: 1.0 - 0.5 everywhere
    residual = 0.5f;

    sd::ops::encode_threshold enc;
    auto encResult = enc.evaluate({&input}, {0.5f});
    auto encoded = encResult.at(1);

    // every element passes the threshold: full payload plus 4-int header
    ASSERT_EQ(256000 + 4, encoded->lengthOf());
    ASSERT_NE(restored, input);

    // encoding subtracts the threshold in place; fail fast on the first bad element
    for (int e = 0; e < input.lengthOf(); e++) {
        auto f = input.e<float>(e);
        if (f != 0.5f) {
            nd4j_printf("initial[%i] = %f\n", e, f);
            throw std::runtime_error("");
        }
    }
    ASSERT_EQ(residual, input);

    sd::ops::decode_threshold dec;
    auto status = dec.execute({&input, encoded}, {&input});
    ASSERT_EQ(Status::OK(), status);

    // report any element that did not decode back to 1.0 before the final assert
    for (int e = 0; e < input.lengthOf(); e++) {
        auto f = input.e<float>(e);
        if (f != 1.0f)
            nd4j_printf("initial[%i] = %f\n", e, f);
    }

    ASSERT_EQ(restored, input);
}
|
||||
|
||||
|
||||
TEST_F(DeclarableOpsTests19, test_matmul_ccc) {
|
||||
auto x = NDArrayFactory::create<float>('c', {10, 10});
|
||||
auto y = NDArrayFactory::create<float>('c', {10, 10});
|
||||
|
|
|
@ -73,6 +73,10 @@ public class ImportClassMapping {
|
|||
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastRSubOp.class,
|
||||
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp.class,
|
||||
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastTo.class,
|
||||
org.nd4j.linalg.api.ops.compression.EncodeBitmap.class,
|
||||
org.nd4j.linalg.api.ops.compression.DecodeBitmap.class,
|
||||
org.nd4j.linalg.api.ops.compression.EncodeThreshold.class,
|
||||
org.nd4j.linalg.api.ops.compression.DecodeThreshold.class,
|
||||
org.nd4j.linalg.api.ops.impl.shape.Create.class,
|
||||
org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastEqualTo.class,
|
||||
org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastGreaterThan.class,
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.api.ops.compression;
|
||||
|
||||
import lombok.NonNull;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.DynamicCustomOp;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Bitmap decoding op wrapper. Used in gradients sharing.
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
public class DecodeBitmap extends DynamicCustomOp {
|
||||
|
||||
public DecodeBitmap() {
|
||||
//
|
||||
}
|
||||
|
||||
public DecodeBitmap(@NonNull INDArray encoded, @NonNull INDArray updates) {
|
||||
addInputArgument(updates, encoded);
|
||||
addOutputArgument(updates);
|
||||
|
||||
// this op ALWAYS modifies updates array
|
||||
setInPlace(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String opName() {
|
||||
return "decode_bitmap";
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
|
||||
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.api.ops.compression;
|
||||
|
||||
import lombok.NonNull;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.DynamicCustomOp;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Sparse threshold decoding op wrapper. Used in gradients sharing.
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
public class DecodeThreshold extends DynamicCustomOp {
|
||||
|
||||
public DecodeThreshold() {
|
||||
//
|
||||
}
|
||||
|
||||
public DecodeThreshold(@NonNull INDArray encoded, @NonNull INDArray updates) {
|
||||
addInputArgument(updates, encoded);
|
||||
addOutputArgument(updates);
|
||||
|
||||
// this op ALWAYS modifies updates array
|
||||
setInPlace(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String opName() {
|
||||
return "decode_threshold";
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
|
||||
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.api.ops.compression;
|
||||
|
||||
import lombok.NonNull;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.DynamicCustomOp;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Bitmap encoding op wrapper. Used in gradients sharing.
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
public class EncodeBitmap extends DynamicCustomOp {
|
||||
protected float threshold = 1e-3f;
|
||||
|
||||
public EncodeBitmap() {
|
||||
//
|
||||
}
|
||||
|
||||
public EncodeBitmap(@NonNull INDArray updates, float threshold) {
|
||||
this(updates, Nd4j.create(DataType.INT32, updates.length() / 16 + 5), Nd4j.scalar(DataType.INT32, 0), threshold);
|
||||
}
|
||||
|
||||
public EncodeBitmap(@NonNull INDArray updates, @NonNull INDArray encoded, @NonNull INDArray counter, float threshold) {
|
||||
addInputArgument(updates);
|
||||
addOutputArgument(updates, encoded, counter);
|
||||
addTArgument(threshold);
|
||||
|
||||
this.threshold = threshold;
|
||||
|
||||
// this op ALWAYS modifies updates array
|
||||
setInPlace(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String opName() {
|
||||
return "encode_bitmap";
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
|
||||
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32, DataType.INT32);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 Konduit K.K.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.api.ops.compression;
|
||||
|
||||
import lombok.NonNull;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.api.ops.DynamicCustomOp;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Sparse threshold encoding op wrapper. Used in gradients sharing.
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
public class EncodeThreshold extends DynamicCustomOp {
|
||||
protected float threshold = 1e-3f;
|
||||
protected int boundary = Integer.MAX_VALUE;
|
||||
|
||||
public EncodeThreshold() {
|
||||
//
|
||||
}
|
||||
|
||||
public EncodeThreshold(@NonNull INDArray updates, float threshold) {
|
||||
this(updates, threshold, Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
public EncodeThreshold(@NonNull INDArray updates, float threshold, @NonNull Integer boundary) {
|
||||
addInputArgument(updates);
|
||||
|
||||
addTArgument(threshold);
|
||||
addIArgument(boundary.intValue());
|
||||
|
||||
this.threshold = threshold;
|
||||
this.boundary = boundary;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String opName() {
|
||||
return "encode_threshold";
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
|
||||
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
|
||||
}
|
||||
}
|
|
@ -30,6 +30,10 @@ import org.nd4j.linalg.api.ndarray.INDArrayStatistics;
|
|||
import org.nd4j.linalg.api.ops.*;
|
||||
import org.nd4j.linalg.api.ops.aggregates.Aggregate;
|
||||
import org.nd4j.linalg.api.ops.aggregates.Batch;
|
||||
import org.nd4j.linalg.api.ops.compression.DecodeBitmap;
|
||||
import org.nd4j.linalg.api.ops.compression.DecodeThreshold;
|
||||
import org.nd4j.linalg.api.ops.compression.EncodeBitmap;
|
||||
import org.nd4j.linalg.api.ops.compression.EncodeThreshold;
|
||||
import org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate;
|
||||
import org.nd4j.linalg.api.ops.impl.summarystats.Variance;
|
||||
import org.nd4j.linalg.api.rng.Random;
|
||||
|
@ -685,38 +689,41 @@ public abstract class DefaultOpExecutioner implements OpExecutioner {
|
|||
|
||||
@Override
|
||||
public INDArray thresholdEncode(INDArray input, double threshold) {
|
||||
throw new UnsupportedOperationException("Not yet implemented");
|
||||
return thresholdEncode(input, threshold, Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
|
||||
throw new UnsupportedOperationException("Not yet implemented");
|
||||
val result = Nd4j.exec(new EncodeThreshold(input, (float) threshold, boundary))[1];
|
||||
|
||||
return result.getInt(0) > 0 ? result : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
|
||||
throw new UnsupportedOperationException("Not yet implemented");
|
||||
Nd4j.exec(new DecodeThreshold(encoded, target));
|
||||
return target;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
|
||||
throw new UnsupportedOperationException("Not yet implemented");
|
||||
val results = Nd4j.exec(new EncodeBitmap(indArray, target, Nd4j.scalar(0), (float) threshold));
|
||||
|
||||
// return number of elements taht were compressed
|
||||
return results[2].getInt(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray bitmapEncode(INDArray indArray, double threshold) {
|
||||
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(indArray.length() / 16 + 5);
|
||||
|
||||
INDArray ret = Nd4j.createArrayFromShapeBuffer(buffer, indArray.shapeInfoDataBuffer());
|
||||
|
||||
bitmapEncode(indArray, ret, threshold);
|
||||
|
||||
return ret;
|
||||
val array = Nd4j.create(DataType.INT32, indArray.length() / 16 + 5);
|
||||
bitmapEncode(indArray, array, threshold);
|
||||
return array;
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
|
||||
throw new UnsupportedOperationException("Not yet implemented");
|
||||
Nd4j.exec(new DecodeBitmap(encoded, target));
|
||||
return target;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1004,20 +1004,6 @@ public interface NativeOps {
|
|||
@Cast("Nd4jLong *") LongPointer tadShapeInfo,
|
||||
@Cast("Nd4jLong *") LongPointer tadOffsets);
|
||||
|
||||
|
||||
// Native codec entry points. NOTE(review): the surrounding diff removes these in favor of the
// encode_bitmap/decode_bitmap and encode_threshold/decode_threshold custom ops.

/** Bitmap-encodes N elements of dx into dz; returns an element count — confirm against native impl. */
long encodeBitmap(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);

/** Inverse of encodeBitmap: applies the encoded dx buffer onto dz. */
void decodeBitmap(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);


// Three-phase threshold encoding. Phase semantics inferred from CudaThreshold.compress usage
// (P1: per-block match counts, P2: prefix-sum offsets, P3: scatter) — TODO confirm.
void encodeThresholdP1(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);

void encodeThresholdP2Int(PointerPointer extraPointers, IntPointer dx, long N, IntPointer dz);

void encodeThresholdP3(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, IntPointer offsets, long N, IntPointer dz);

/** Inverse of the threshold encoder: applies the encoded dx buffer onto dz. */
void decodeThreshold(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
|
||||
|
||||
void sort(PointerPointer extraPointers,
|
||||
Pointer x, @Cast("Nd4jLong *") LongPointer xShapeInfo,
|
||||
Pointer dx, @Cast("Nd4jLong *") LongPointer dxShapeInfo,
|
||||
|
|
|
@ -1,100 +0,0 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.jcublas.compression;
|
||||
|
||||
import org.bytedeco.javacpp.IntPointer;
|
||||
import org.nd4j.linalg.api.buffer.DataBuffer;
|
||||
import org.nd4j.linalg.api.buffer.DataTypeEx;
|
||||
import org.nd4j.linalg.api.concurrency.AffinityManager;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.compression.CompressedDataBuffer;
|
||||
import org.nd4j.linalg.compression.CompressionDescriptor;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.linalg.indexing.conditions.Conditions;
|
||||
|
||||
/**
|
||||
* This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
|
||||
* It encodes data as delta between zero and abs threshold.
|
||||
*
|
||||
* Unlike CudaThreshold codec, CudaFlexibleThreshold tries to target specified sparsity/density updates ratio via topN approach
|
||||
*
|
||||
* PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
|
||||
*
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
public class CudaFlexibleThreshold extends CudaThreshold {

    public CudaFlexibleThreshold() {
        super();
        // here `threshold` is a target density ratio (default 10%), not a magnitude cutoff
        this.threshold = 0.1f;
    }

    /**
     * This method returns compression descriptor. It should be unique for any compressor implementation
     *
     * @return descriptor string identifying this codec
     */
    @Override
    public String getDescriptor() {
        return "FTHRESHOLD";
    }

    /**
     * This method allows you to configure desired sparsity/density ratio for updates. Pass it as float/double value
     *
     * Default value: 0.1
     * @param vars first element must be a Number; stored as abs(value) by the parent class
     */
    @Override
    public void configure(Object... vars) {
        // delegates entirely to CudaThreshold.configure(), which validates and stores vars[0]
        super.configure(vars);
    }


    /**
     * Compresses the buffer by keeping only values within `threshold` fraction of the
     * absolute maximum (topN-style cutoff), producing an int-encoded CompressedDataBuffer.
     */
    @Override
    public DataBuffer compress(DataBuffer buffer) {
        // view the raw buffer as a 1 x N row vector so reductions can be applied
        INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new long[]{1, buffer.length()}, buffer.dataType()));
        double max = temp.amaxNumber().doubleValue();

        // count elements whose magnitude reaches the dynamic cutoff max * (1 - threshold)
        int cntAbs = temp.scan(Conditions.absGreaterThanOrEqual(max - (max * threshold))).intValue();

        long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
        int compressedLength = cntAbs + 3;
        // first 3 elements contain header: match count, original length, threshold bits
        IntPointer pointer = new IntPointer(compressedLength);
        pointer.put(0, cntAbs);
        pointer.put(1, (int) buffer.length());
        pointer.put(2, Float.floatToIntBits(threshold)); // please note, this value will be overwritten anyway

        CompressionDescriptor descriptor = new CompressionDescriptor();
        descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
        descriptor.setOriginalLength(originalLength);
        descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
        descriptor.setNumberOfElements(buffer.length());

        descriptor.setCompressionAlgorithm(getDescriptor());
        descriptor.setCompressionType(getCompressionType());

        CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);

        // native conversion fills the payload (and rewrites the header threshold slot)
        Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.FTHRESHOLD, pointer, buffer.length());

        // source buffer was touched on host side; keep the affinity manager informed
        Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);

        return cbuff;
    }
}
|
|
@ -1,271 +0,0 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||
*
|
||||
* This program and the accompanying materials are made available under the
|
||||
* terms of the Apache License, Version 2.0 which is available at
|
||||
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
******************************************************************************/
|
||||
|
||||
package org.nd4j.linalg.jcublas.compression;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import lombok.val;
|
||||
import org.apache.commons.math3.util.FastMath;
|
||||
import org.bytedeco.javacpp.*;
|
||||
import org.nd4j.compression.impl.AbstractCompressor;
|
||||
import org.nd4j.jita.allocator.impl.AtomicAllocator;
|
||||
import org.nd4j.linalg.api.buffer.DataBuffer;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.buffer.DataTypeEx;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
import org.nd4j.linalg.compression.CompressedDataBuffer;
|
||||
import org.nd4j.linalg.compression.CompressionType;
|
||||
import org.nd4j.linalg.exception.ND4JIllegalStateException;
|
||||
import org.nd4j.linalg.factory.Nd4j;
|
||||
import org.nd4j.linalg.jcublas.context.CudaContext;
|
||||
import org.nd4j.nativeblas.NativeOpsHolder;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
|
||||
* It encodes data as delta between zero and abs threshold.
|
||||
*
|
||||
* PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
|
||||
*
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
@Slf4j
|
||||
public class CudaThreshold extends AbstractCompressor {
|
||||
@Getter @Setter protected float threshold = 1e-3f;
|
||||
|
||||
/**
|
||||
* This method returns compression descriptor. It should be unique for any compressor implementation
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public String getDescriptor() {
|
||||
return "THRESHOLD";
|
||||
}
|
||||
|
||||
/**
|
||||
* This method allows you to configure threshold for delta extraction. Pass it as float/double value
|
||||
*
|
||||
* Default value: 1e-3
|
||||
* @param vars
|
||||
*/
|
||||
@Override
|
||||
public void configure(Object... vars) {
|
||||
if (vars[0] instanceof Number) {
|
||||
Number t = (Number) vars[0];
|
||||
threshold = FastMath.abs(t.floatValue());
|
||||
log.info("Setting threshold to [{}]", threshold);
|
||||
} else {
|
||||
throw new ND4JIllegalStateException("Threshold value should be Number");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray compress(INDArray array) {
|
||||
//logger.info("Threshold [{}] compression", threshold);
|
||||
|
||||
Nd4j.getExecutioner().commit();
|
||||
//Nd4j.getAffinityManager().ensureLocation(array, AffinityManager.Location.HOST);
|
||||
|
||||
DataBuffer buffer = compress(array.data());
|
||||
if (buffer == null)
|
||||
return null;
|
||||
|
||||
INDArray dup = Nd4j.createArrayFromShapeBuffer(buffer, array.shapeInfoDataBuffer());
|
||||
dup.markAsCompressed(true);
|
||||
|
||||
return dup;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CompressionType getCompressionType() {
|
||||
return CompressionType.LOSSLESS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataBuffer decompress(DataBuffer buffer, DataType type) {
|
||||
if (buffer.dataType() != DataType.INT)
|
||||
throw new UnsupportedOperationException();
|
||||
|
||||
long compressedLength = buffer.getInt(0);
|
||||
long originalLength = buffer.getInt(1);
|
||||
|
||||
DataBuffer result = Nd4j.createBuffer(type, originalLength, false);
|
||||
|
||||
val context = AtomicAllocator.getInstance().getDeviceContext();
|
||||
|
||||
PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
|
||||
|
||||
//log.info("DEC Source length: {}", buffer.length());
|
||||
//log.info("DEC Source: {}", Arrays.toString(buffer.asInt()));
|
||||
|
||||
//NativeOpsHolder.getInstance().getDeviceNativeOps().decodeThresholdFloat(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, (FloatPointer) AtomicAllocator.getInstance().getPointer(result));
|
||||
AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
|
||||
|
||||
//DataBuffer result = Nd4j.getNDArrayFactory().convertDataEx(DataTypeEx.THRESHOLD, buffer, getGlobalTypeEx());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataBuffer compress(DataBuffer buffer) {
|
||||
|
||||
int numThreads = 1024;
|
||||
int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
|
||||
|
||||
val context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext();
|
||||
|
||||
DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
|
||||
|
||||
|
||||
//NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), threshold);
|
||||
AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
|
||||
|
||||
|
||||
int numMatches = blocksBuffer.getInt(0);
|
||||
|
||||
//log.info("Totals: {}", numMatches);
|
||||
/*
|
||||
|
||||
log.info("Number of blocks for compression: {}", numBlocks);
|
||||
log.info("BlocksCounts: {}", Arrays.toString(blocksBuffer.asInt()));
|
||||
*/
|
||||
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(3+numMatches, false) : Nd4j.getDataBufferFactory().createInt(3+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
encodedBuffer.put(0, numMatches);
|
||||
encodedBuffer.put(1, (int) buffer.length());
|
||||
encodedBuffer.put(2, Float.floatToIntBits(threshold));
|
||||
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickHostWrite();
|
||||
|
||||
// FIXME: make it parallel via some kernel, because it can be pretty big array here, i.e. for 150m original array, offsets can
|
||||
/*
|
||||
int prevSum = 0;
|
||||
for (int e = 0; e < numBlocks; e++) {
|
||||
int prevVal = offsetsBuffer.getInt(e + 1);
|
||||
offsetsBuffer.put(e + 1, prevSum);
|
||||
prevSum += prevVal;
|
||||
}
|
||||
*/
|
||||
|
||||
int prefixThreads = 512;
|
||||
int numElts = numBlocks;
|
||||
int level = 0;
|
||||
List<DataBuffer> buffers = new ArrayList<>();
|
||||
|
||||
// here we just calculate number of sumBlock arrays
|
||||
do {
|
||||
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
|
||||
if (numBlocks > 1) {
|
||||
level++;
|
||||
}
|
||||
numElts = numPrefixBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
long[] pointers = new long[level];
|
||||
|
||||
level = 0;
|
||||
numElts = numBlocks;
|
||||
|
||||
// allocating temp buffers for prefux sum
|
||||
DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
do {
|
||||
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
|
||||
if (numPrefixBlocks > 1) {
|
||||
DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
buffers.add(bf);
|
||||
|
||||
pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
|
||||
}
|
||||
numElts = numPrefixBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
|
||||
AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
|
||||
|
||||
extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
|
||||
|
||||
DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
|
||||
AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
|
||||
|
||||
//log.info("Offsets: {}", Arrays.toString(offsetsBuffer.asInt()));
|
||||
//log.info("Target: {}", Arrays.toString(encodedBuffer.asInt()));
|
||||
|
||||
|
||||
|
||||
//NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3Float(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
|
||||
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
|
||||
AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
|
||||
|
||||
//log.info("Encoded: {}", Arrays.toString(encodedBuffer.asInt()));
|
||||
|
||||
extras.address();
|
||||
tempX.address();
|
||||
|
||||
return encodedBuffer;
|
||||
|
||||
/*
|
||||
INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new int[]{1, (int) buffer.length()}));
|
||||
MatchCondition condition = new MatchCondition(temp, Conditions.absGreaterThanOrEqual(threshold));
|
||||
int cntAbs = Nd4j.getExecutioner().exec(condition, Integer.MAX_VALUE).getInt(0);
|
||||
|
||||
|
||||
//log.info("density ratio: {}", String.format("%.2f", cntAbs * 100.0f / buffer.length()));
|
||||
|
||||
if (cntAbs == 0)
|
||||
return null;
|
||||
|
||||
long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
|
||||
int compressedLength = cntAbs + 3;
|
||||
// first 3 elements contain header
|
||||
IntPointer pointer = new IntPointer(compressedLength);
|
||||
pointer.put(0, cntAbs);
|
||||
pointer.put(1, (int) buffer.length());
|
||||
pointer.put(2, Float.floatToIntBits(threshold));
|
||||
|
||||
CompressionDescriptor descriptor = new CompressionDescriptor();
|
||||
descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
|
||||
descriptor.setOriginalLength(originalLength);
|
||||
descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
|
||||
descriptor.setNumberOfElements(buffer.length());
|
||||
|
||||
descriptor.setCompressionAlgorithm(getDescriptor());
|
||||
descriptor.setCompressionType(getCompressionType());
|
||||
|
||||
|
||||
|
||||
CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
|
||||
|
||||
Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, pointer, buffer.length());
|
||||
|
||||
Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
|
||||
|
||||
return cbuff;
|
||||
*/
|
||||
}
|
||||
|
||||
/**
 * Raw pointer-level compression is not supported by this codec; threshold encoding is
 * presumably handled by the dedicated thresholdEncode path instead — confirm against callers.
 *
 * @throws UnsupportedOperationException always
 */
@Override
protected CompressedDataBuffer compressPointer(DataTypeEx srcType, Pointer srcPointer, int length, int elementSize) {
    throw new UnsupportedOperationException();
}
|
||||
}
|
|
@ -31,6 +31,7 @@ import org.nd4j.jita.allocator.tad.DeviceTADManager;
|
|||
import org.nd4j.jita.conf.CudaEnvironment;
|
||||
import org.nd4j.linalg.api.buffer.DataBuffer;
|
||||
import org.nd4j.linalg.api.buffer.DataType;
|
||||
import org.nd4j.linalg.api.concurrency.AffinityManager;
|
||||
import org.nd4j.linalg.api.environment.Nd4jEnvironment;
|
||||
import org.nd4j.linalg.api.memory.pointers.PagedPointer;
|
||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||
|
@ -1674,224 +1675,6 @@ public class CudaExecutioner extends DefaultOpExecutioner {
|
|||
ctx.syncSpecialStream();
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
|
||||
DataBuffer buffer = input.data();
|
||||
|
||||
int numThreads = 1024;
|
||||
int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
|
||||
|
||||
val context = AtomicAllocator.getInstance().getDeviceContext();
|
||||
|
||||
DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
if (extraz.get() == null)
|
||||
extraz.set(new PointerPointer(32));
|
||||
|
||||
val extras = extraz.get().put(1, context.getOldStream());
|
||||
|
||||
((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();
|
||||
|
||||
|
||||
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras,
|
||||
AtomicAllocator.getInstance().getPointer(buffer),
|
||||
(LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()),
|
||||
buffer.length(),
|
||||
(IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer),
|
||||
(float) threshold);
|
||||
|
||||
AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
|
||||
|
||||
|
||||
int numMatches = blocksBuffer.getInt(0);
|
||||
|
||||
// special case here, nothing to update
|
||||
if (numMatches < 2)
|
||||
return null;
|
||||
|
||||
if (boundary != null && numMatches > boundary) {
|
||||
numMatches = boundary;
|
||||
blocksBuffer.put(0, numMatches);
|
||||
}
|
||||
|
||||
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+numMatches, false) : Nd4j.getDataBufferFactory().createInt(4+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
encodedBuffer.put(0, numMatches);
|
||||
encodedBuffer.put(1, (int) buffer.length());
|
||||
encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
|
||||
|
||||
encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
|
||||
|
||||
((BaseCudaDataBuffer) encodedBuffer).getOpaqueDataBuffer().syncToSpecial();
|
||||
|
||||
|
||||
int prefixThreads = 512;
|
||||
int numElts = numBlocks;
|
||||
int level = 0;
|
||||
List<DataBuffer> buffers = new ArrayList<>();
|
||||
|
||||
// here we just calculate number of sumBlock arrays
|
||||
do {
|
||||
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
|
||||
if (numBlocks > 1) {
|
||||
level++;
|
||||
}
|
||||
numElts = numPrefixBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
long[] pointers = new long[level];
|
||||
|
||||
level = 0;
|
||||
numElts = numBlocks;
|
||||
|
||||
// allocating temp buffers for prefux sum
|
||||
DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
do {
|
||||
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
|
||||
if (numPrefixBlocks > 1) {
|
||||
DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
buffers.add(bf);
|
||||
|
||||
pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
|
||||
}
|
||||
numElts = numPrefixBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
|
||||
AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
|
||||
|
||||
extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
|
||||
|
||||
DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
|
||||
AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
|
||||
|
||||
|
||||
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3(extras, AtomicAllocator.getInstance().getPointer(buffer), (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
|
||||
|
||||
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
|
||||
AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
|
||||
|
||||
return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
|
||||
}
|
||||
|
||||
|
||||
/**
 * Threshold encoding without an element-count cap.
 *
 * @see #thresholdEncode(INDArray, double, Integer)
 */
@Override
public INDArray thresholdEncode(INDArray input, double threshold) {
    return thresholdEncode(input, threshold, null);
}
|
||||
|
||||
/**
 * Device-side decoding of a threshold-encoded array: applies the compressed update stored in
 * {@code encoded} onto {@code target} in place.
 *
 * @param encoded INT buffer produced by thresholdEncode; header: [0] element count,
 *                [1] original length, [2] threshold bits, [3] format id
 * @param target  array the decoded update is written into; must match the original length
 * @return target, for chaining
 * @throws UnsupportedOperationException when the encoded buffer is not INT
 * @throws ND4JIllegalStateException when target length differs from the stored original length
 */
@Override
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
    DataBuffer buffer = encoded.data();

    if (buffer.dataType() != DataType.INT)
        throw new UnsupportedOperationException();

    // header slots written by the encoder
    long compressedLength = buffer.getInt(0);
    long originalLength = buffer.getInt(1);

    if (target.length() != originalLength)
        throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");

    DataBuffer result = target.data();

    val context = AtomicAllocator.getInstance().getDeviceContext();

    if (extraz.get() == null)
        extraz.set(new PointerPointer(32));

    // slot 1 carries the CUDA stream for the native call
    PointerPointer extras = extraz.get().put(1, context.getOldStream());

    nativeOps.decodeThreshold(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, AtomicAllocator.getInstance().getPointer(result), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));

    if (nativeOps.lastErrorCode() != 0)
        throw new RuntimeException(nativeOps.lastErrorMessage());

    // the kernel wrote to the device copy; invalidate the host mirror
    AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();

    return target;
}
|
||||
|
||||
|
||||
/**
 * Device-side bitmap threshold encoding: writes a fixed-size bitmap representation of all
 * elements of {@code indArray} with absolute value above the threshold into {@code target}.
 * Target header: [0] original length, [1] original length, [2] float bits of the threshold,
 * [3] encoding format id; the bitmap payload follows.
 *
 * @param indArray  array to encode
 * @param target    pre-allocated INT array of exactly (indArray.length() / 16 + 5) elements
 * @param threshold encoding threshold
 * @return value reported by the native encoder — presumably the number of encoded
 *         elements; confirm against the native implementation
 * @throws ND4JIllegalStateException when target has the wrong length or is not INT
 */
@Override
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
    long length = indArray.length();
    long tLen = target.data().length();

    // fixed-size output: length/16 ints of bitmap plus 5 ints of header/slack
    if (tLen != (length / 16 + 5))
        throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));

    if (target.data().dataType() != DataType.INT)
        throw new ND4JIllegalStateException("Target array should have INT dataType");

    DataBuffer buffer = target.data();
    buffer.put(0, (int) length);
    buffer.put(1, (int) length);
    buffer.put(2, Float.floatToIntBits((float) threshold));

    // format id
    buffer.put(3, ThresholdCompression.BITMAP_ENCODING);

    val context = AtomicAllocator.getInstance().getDeviceContext();

    if (extraz.get() == null)
        extraz.set(new PointerPointer(32));

    // extras: host pointer to the source, stream, and device scratch buffers
    PointerPointer extras = extraz.get().put(
            AtomicAllocator.getInstance().getHostPointer(indArray),
            context.getOldStream(),
            context.getBufferScalar(),
            context.getBufferReduction()
    );

    val src = AtomicAllocator.getInstance().getPointer(indArray, context);
    val dst = (IntPointer) AtomicAllocator.getInstance().getPointer(buffer, context);
    // the header was written on the host; push it to the device before the kernel runs
    ((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();

    long val = nativeOps.encodeBitmap(extras,
            src, (LongPointer) AtomicAllocator.getInstance().getHostPointer(indArray.shapeInfoDataBuffer()),
            length,
            dst,
            (float) threshold);

    if (nativeOps.lastErrorCode() != 0)
        throw new RuntimeException(nativeOps.lastErrorMessage());

    // the kernel wrote to the device copy of target; invalidate the host mirror
    AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();

    return val;
}
|
||||
|
||||
@Override
|
||||
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
|
||||
|
||||
val context = AtomicAllocator.getInstance().getDeviceContext();
|
||||
|
||||
if (extraz.get() == null)
|
||||
extraz.set(new PointerPointer(32));
|
||||
|
||||
|
||||
PointerPointer extras = extraz.get().put(
|
||||
AtomicAllocator.getInstance().getHostPointer(target),
|
||||
context.getOldStream(),
|
||||
context.getBufferScalar(),
|
||||
context.getBufferReduction());
|
||||
|
||||
nativeOps.decodeBitmap(extras, AtomicAllocator.getInstance().getPointer(encoded.data(), context), target.length(), AtomicAllocator.getInstance().getPointer(target, context), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));
|
||||
|
||||
if (nativeOps.lastErrorCode() != 0)
|
||||
throw new RuntimeException(nativeOps.lastErrorMessage());
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
|
||||
if(customOps == null) {
|
||||
|
@ -1974,6 +1757,11 @@ public class CudaExecutioner extends DefaultOpExecutioner {
|
|||
val inputArgs = opContext != null ? opContext.getInputArrays() : op.inputArguments();
|
||||
int cnt= 0;
|
||||
for (val in: inputArgs) {
|
||||
// TODO: once we implement Context-based shape function call this method should be removed
|
||||
val loc = Nd4j.getAffinityManager().getActiveLocation(in);
|
||||
if (loc != AffinityManager.Location.DEVICE && loc != AffinityManager.Location.EVERYWHERE)
|
||||
Nd4j.getAffinityManager().ensureLocation(in, AffinityManager.Location.DEVICE);
|
||||
|
||||
// NOT A TYPO: shape functions work on host side only
|
||||
if (!in.isEmpty()) {
|
||||
inputBuffers.put(cnt, in.data().addressPointer());
|
||||
|
|
|
@ -2804,30 +2804,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
|
|||
@Cast("Nd4jLong*") long[] tadShapeInfo,
|
||||
@Cast("Nd4jLong*") long[] tadOffsets);
|
||||
|
||||
// --- Bitmap threshold compression (JavaCPP-generated bindings). Each native op is
// --- declared three times, differing only in argument flavor:
// --- raw Pointer / NIO Buffer / primitive array.
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);


// --- Flexible threshold encoding pipeline: P1 counts matching elements per block,
// --- P2 prefix-sums the per-block counts into write offsets, P3 writes the payload.
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);


// --- Flexible threshold decoding.
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
|
||||
|
||||
|
||||
public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
|
||||
Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
|
||||
Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
|
||||
|
@ -10712,6 +10688,7 @@ public static final int PREALLOC_SIZE = 33554432;
|
|||
// #include <ops/declarable/headers/boolean.h>
|
||||
// #include <ops/declarable/headers/broadcastable.h>
|
||||
// #include <ops/declarable/headers/convo.h>
|
||||
// #include <ops/declarable/headers/compression.h>
|
||||
// #include <ops/declarable/headers/list.h>
|
||||
// #include <ops/declarable/headers/recurrent.h>
|
||||
// #include <ops/declarable/headers/transforms.h>
|
||||
|
|
|
@ -13,5 +13,3 @@
|
|||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
################################################################################
|
||||
|
||||
org.nd4j.linalg.jcublas.compression.CudaThreshold
|
|
@ -1376,142 +1376,6 @@ public class NativeOpExecutioner extends DefaultOpExecutioner {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Threshold encoding without an element-count cap.
 *
 * @see #thresholdEncode(INDArray, double, Integer)
 */
@Override
public INDArray thresholdEncode(INDArray input, double threshold) {
    return thresholdEncode(input, threshold, null);
}
|
||||
|
||||
@Override
|
||||
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
|
||||
|
||||
//val condition = new MatchCondition(input, Conditions.absGreaterThanOrEqual(threshold));
|
||||
//long t1 = System.currentTimeMillis();
|
||||
int cntAbs = loop.estimateThreshold(null,
|
||||
input.data().addressPointer(),
|
||||
(LongPointer) input.shapeInfoDataBuffer().addressPointer(),
|
||||
(int) input.length(),
|
||||
(float) threshold);
|
||||
//long t2 = System.currentTimeMillis();
|
||||
|
||||
if (loop.lastErrorCode() != 0)
|
||||
throw new RuntimeException(loop.lastErrorMessage());
|
||||
|
||||
if (cntAbs < 2)
|
||||
return null;
|
||||
|
||||
if (boundary != null)
|
||||
cntAbs = Math.min(cntAbs, boundary);
|
||||
|
||||
//log.info("S: {}; T: {}", cntAbs, t2 - t1);
|
||||
|
||||
DataBuffer buffer = input.data();
|
||||
|
||||
long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
|
||||
int compressedLength = cntAbs + 4;
|
||||
// first 3 elements contain header
|
||||
|
||||
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+cntAbs, false) : Nd4j.getDataBufferFactory().createInt(4+cntAbs, false, Nd4j.getMemoryManager().getCurrentWorkspace());
|
||||
|
||||
encodedBuffer.put(0, cntAbs);
|
||||
encodedBuffer.put(1, (int) buffer.length());
|
||||
encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
|
||||
|
||||
// format id
|
||||
encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
|
||||
|
||||
CompressionDescriptor descriptor = new CompressionDescriptor();
|
||||
descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
|
||||
descriptor.setOriginalLength(originalLength);
|
||||
descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
|
||||
descriptor.setNumberOfElements(buffer.length());
|
||||
|
||||
descriptor.setCompressionAlgorithm("THRESHOLD");
|
||||
descriptor.setCompressionType(CompressionType.LOSSLESS);
|
||||
|
||||
//CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
|
||||
|
||||
Nd4j.getNDArrayFactory().convertDataEx(AbstractCompressor.getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, encodedBuffer.addressPointer(), buffer.length());
|
||||
|
||||
Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
|
||||
|
||||
return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
|
||||
DataBuffer buffer = encoded.data();
|
||||
|
||||
if (buffer.dataType() != DataType.INT)
|
||||
throw new ND4JIllegalStateException("thresholdEncoded array should have dataType of INT");
|
||||
|
||||
long compressedLength = buffer.getInt(0);
|
||||
long originalLength = buffer.getInt(1);
|
||||
float threshold = buffer.getInt(2);
|
||||
|
||||
if (target.length() != originalLength)
|
||||
throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");
|
||||
|
||||
DataTypeEx typeDst = AbstractCompressor.getBufferTypeEx(target.data());
|
||||
|
||||
loop.convertTypes(null, DataTypeEx.THRESHOLD.ordinal(), buffer.addressPointer(), target.length(), typeDst.ordinal(), target.data().addressPointer());
|
||||
|
||||
if (loop.lastErrorCode() != 0)
|
||||
throw new RuntimeException(loop.lastErrorMessage());
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
|
||||
/**
 * CPU bitmap threshold encoding: writes a fixed-size bitmap representation of all elements
 * of {@code indArray} with absolute value above the threshold into {@code target}.
 * Target header: [0] original length, [1] original length, [2] float bits of the threshold,
 * [3] encoding format id; the bitmap payload follows.
 *
 * @param indArray  array to encode
 * @param target    pre-allocated INT array of exactly (indArray.length() / 16 + 5) elements
 * @param threshold encoding threshold
 * @return value reported by the native encoder — presumably the number of encoded
 *         elements; confirm against the native implementation
 * @throws ND4JIllegalStateException when target has the wrong length or is not INT
 */
@Override
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
    long length = indArray.length();
    long tLen = target.data().length();

    // fixed-size output: length/16 ints of bitmap plus 5 ints of header/slack
    if (tLen != (length / 16 + 5))
        throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));

    if (target.data().dataType() != DataType.INT)
        throw new ND4JIllegalStateException("Target array should have INT dataType");

    DataBuffer buffer = target.data();

    buffer.put(0, (int) length);
    buffer.put(1, (int) length);
    buffer.put(2, Float.floatToIntBits((float) threshold));

    // format id
    buffer.put(3, ThresholdCompression.BITMAP_ENCODING);

    long affected = loop.encodeBitmap(null,
            indArray.data().addressPointer(),
            (LongPointer) indArray.shapeInfoDataBuffer().addressPointer(),
            length,
            (IntPointer) buffer.addressPointer(),
            (float) threshold);

    if (loop.lastErrorCode() != 0)
        throw new RuntimeException(loop.lastErrorMessage());

    return affected;
}
|
||||
|
||||
@Override
|
||||
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
|
||||
|
||||
loop.decodeBitmap(null,
|
||||
encoded.data().addressPointer(),
|
||||
target.length(),
|
||||
target.data().addressPointer(),
|
||||
(LongPointer) target.shapeInfoDataBuffer().addressPointer()
|
||||
);
|
||||
|
||||
if (loop.lastErrorCode() != 0)
|
||||
throw new RuntimeException(loop.lastErrorMessage());
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
|
||||
if (customOps == null) {
|
||||
|
|
|
@ -2808,30 +2808,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
|
|||
@Cast("Nd4jLong*") long[] tadShapeInfo,
|
||||
@Cast("Nd4jLong*") long[] tadOffsets);
|
||||
|
||||
// --- Bitmap threshold compression (JavaCPP-generated bindings). Each native op is
// --- declared three times, differing only in argument flavor:
// --- raw Pointer / NIO Buffer / primitive array.
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);


// --- Flexible threshold encoding pipeline: P1 counts matching elements per block,
// --- P2 prefix-sums the per-block counts into write offsets, P3 writes the payload.
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);


// --- Flexible threshold decoding.
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
|
||||
|
||||
|
||||
public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
|
||||
Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
|
||||
Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
|
||||
|
@ -12463,6 +12439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
|||
// #include <ops/declarable/headers/boolean.h>
|
||||
// #include <ops/declarable/headers/broadcastable.h>
|
||||
// #include <ops/declarable/headers/convo.h>
|
||||
// #include <ops/declarable/headers/compression.h>
|
||||
// #include <ops/declarable/headers/list.h>
|
||||
// #include <ops/declarable/headers/recurrent.h>
|
||||
// #include <ops/declarable/headers/transforms.h>
|
||||
|
|
|
@ -8315,6 +8315,85 @@ public class Nd4jTestsC extends BaseNd4jTest {
|
|||
assertArrayEquals(new long[]{bS, oH, oW, oC}, ret[0].shape());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_8() {
|
||||
val x = Nd4j.create(DataType.INT8, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.INT8, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.INT8, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_7() {
|
||||
val x = Nd4j.create(DataType.INT16, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.INT16, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.INT16, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_1() {
|
||||
val x = Nd4j.create(DataType.INT32, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.INT32, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.INT32, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_2() {
|
||||
val x = Nd4j.create(DataType.INT64, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.INT64, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.INT64, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_6() {
|
||||
val x = Nd4j.create(DataType.UINT8, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.UINT8, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.UINT8, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_5() {
|
||||
val x = Nd4j.create(DataType.UINT16, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.UINT16, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.UINT16, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_3() {
|
||||
val x = Nd4j.create(DataType.UINT32, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.UINT32, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.UINT32, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatmulMethod_4() {
|
||||
val x = Nd4j.create(DataType.UINT64, 3, 5).assign(1);
|
||||
val y = Nd4j.create(DataType.UINT64, 5, 3).assign(1);
|
||||
val e = Nd4j.create(DataType.UINT64, 3, 3).assign(5);
|
||||
|
||||
val z = x.mmul(y);
|
||||
assertEquals(e, z);
|
||||
}
|
||||
|
||||
@Override
|
||||
public char ordering() {
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.nd4j.linalg.compression;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import lombok.val;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
|
@ -44,7 +45,6 @@ import static org.junit.Assert.*;
|
|||
/**
|
||||
* @author raver119@gmail.com
|
||||
*/
|
||||
@Ignore
|
||||
@Slf4j
|
||||
@RunWith(Parameterized.class)
|
||||
public class CompressionTests extends BaseNd4jTest {
|
||||
|
@ -140,40 +140,6 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
}
|
||||
|
||||
|
||||
@Test
public void testThresholdCompressionZ() {
    // source: 96 spikes of magnitude 1.0, one every 20 elements, rest zeros
    INDArray initial = Nd4j.create(1, 16384);
    for (int i = 0; i < 96; i++)
        initial.putScalar(i * 20, 1.0f);


    // expected source after encoding with threshold 0.9: residual 0.1 stays per spike
    INDArray exp = Nd4j.create(1, 16384);
    for (int i = 0; i < 96; i++)
        exp.putScalar(i * 20, 0.1f);

    // expected decode target: the encoded 0.9 per spike
    INDArray exp_d = Nd4j.create(1, 16384);
    for (int i = 0; i < 96; i++)
        exp_d.putScalar(i * 20, 0.9f);

    // NOTE(review): the compressor is configured but never used below — encoding goes
    // through the executioner; presumably a leftover of the old NDArrayCompressor API
    NDArrayCompressor compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
    compressor.configure(0.9);

    INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 0.9);

    // encoding is destructive: only the residual remains in the source array
    assertEquals(exp, initial);

    log.info("Compressed length: {}", compressed.data().length());
    // log.info("Compressed: {}", Arrays.toString(compressed.data().asInt()));

    INDArray decompressed = Nd4j.create(1, initial.length());
    Nd4j.getExecutioner().thresholdDecode(compressed, decompressed);

    log.info("Decompressed length: {}", decompressed.length());

    assertEquals(exp_d, decompressed);
}
|
||||
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testThresholdCompression0() {
|
||||
|
@ -296,6 +262,23 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
|
||||
@Test
public void testThresholdCompression5() {
    INDArray initial = Nd4j.ones(10);
    INDArray exp_0 = initial.dup();

    Nd4j.getExecutioner().commit();

    //Nd4j.getCompressor().getCompressor("THRESHOLD").configure(1e-3);
    // boundary = 3 caps the number of encoded elements at 3
    INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 1.0f, 3);

    // 4-int header + 3 encoded elements
    assertEquals(7, compressed.data().length());

    // encoding is destructive: the 3 encoded ones were removed from the source
    assertNotEquals(exp_0, initial);

    assertEquals(7, initial.sumNumber().doubleValue(), 0.01);
}
|
||||
|
||||
@Test
|
||||
public void testThresholdCompression5_1() {
|
||||
INDArray initial = Nd4j.ones(1000);
|
||||
INDArray exp_0 = initial.dup();
|
||||
|
||||
|
@ -435,8 +418,8 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
INDArray exp_0 = Nd4j.create(new float[] {0.0f, -1e-4f, 0.0f, 0.0f, 0.0f, 0.0f});
|
||||
INDArray exp_1 = Nd4j.create(new float[] {0.0f, -5e-4f, 1e-3f, -1e-3f, 0.0f, 0.0f});
|
||||
|
||||
DataBuffer ib = Nd4j.getDataBufferFactory().createInt(5);
|
||||
INDArray enc = Nd4j.createArrayFromShapeBuffer(ib, initial.shapeInfoDataBuffer());
|
||||
|
||||
INDArray enc = Nd4j.create(DataType.INT32, initial.length() / 16 + 5);
|
||||
|
||||
long elements = Nd4j.getExecutioner().bitmapEncode(initial, enc, 1e-3);
|
||||
log.info("Encoded: {}", Arrays.toString(enc.data().asInt()));
|
||||
|
@ -471,7 +454,7 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
@Test
|
||||
public void testBitmapEncoding5() {
|
||||
Nd4j.getRandom().setSeed(119);
|
||||
INDArray initial = Nd4j.rand(new int[]{1, 10000}, -1, -0.5, Nd4j.getRandom());
|
||||
INDArray initial = Nd4j.rand(new int[]{10000}, -1, -0.5, Nd4j.getRandom());
|
||||
INDArray exp_0 = initial.dup().addi(1e-1);
|
||||
INDArray exp_1 = initial.dup();
|
||||
|
||||
|
@ -486,7 +469,7 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
@Test
|
||||
public void testBitmapEncoding6() {
|
||||
Nd4j.getRandom().setSeed(119);
|
||||
INDArray initial = Nd4j.rand(new int[]{1, 100000}, -1, 1, Nd4j.getRandom());
|
||||
INDArray initial = Nd4j.rand(new int[]{10000}, -1, 1, Nd4j.getRandom());
|
||||
INDArray exp_1 = initial.dup();
|
||||
|
||||
INDArray enc = Nd4j.getExecutioner().bitmapEncode(initial, 1e-3);
|
||||
|
@ -494,6 +477,11 @@ public class CompressionTests extends BaseNd4jTest {
|
|||
|
||||
Nd4j.getExecutioner().bitmapDecode(enc, initial);
|
||||
|
||||
val f0 = exp_1.toFloatVector();
|
||||
val f1 = initial.toFloatVector();
|
||||
|
||||
assertArrayEquals(f0, f1, 1e-5f);
|
||||
|
||||
assertEquals(exp_1, initial);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue