compression ops (#436)

* Added declarations for decode/encode_bitmap ops.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Added implementation for bitmap encoding/decoding ops.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Added helpers for encode/decode bitmap ops.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Refactored encodingBitmap helper.

Signed-off-by: shugeo <sgazeos@gmail.com>

* threshold encode/decode skeleton

* helper skeleton

* minor import fix

* encoder shape fn & op impl

* thresholdEncode cpu impl

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* thresholdDecode cpu impl

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* Only cosmetical changes.

Signed-off-by: shugeo <sgazeos@gmail.com>

* placeholder

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* Added cuda implementation for bitmap decode helper.

Signed-off-by: shugeo <sgazeos@gmail.com>

* cuda thresholdEstimate

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* cuda thresholdDecode

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* next step

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* - nano cmakelist update (get rid of Clion section)
- fixed forgotten throw in AtomicTests

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* thesholdEncode cuda impl

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* Added tests for bitmap encoding/decoding ops.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Fixed tests for encode/decode bitmaps.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Refactored decode/encode helpers.

Signed-off-by: shugeo <sgazeos@gmail.com>

* Fixed crashes with bitmap decode/encode helpers.

Signed-off-by: shugeo <sgazeos@gmail.com>

* bitmap encode/decode CPU

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* bitmap encode/decode CUDA

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* C API removed for threshold/bitmap encode

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* EncodeBitmap/DecodeBitmap Java side

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* EncodeThreshold/DecodeThreshold Java side

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* EncodeThreshold/DecodeThreshold Java side

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* few more tests for threshold encoding

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* minor test tweak

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* two special tests

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* encodeBitmap CPU fix

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* parallel_long/parallel_double proper spans fix

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* encodeThreshold CUDA fix

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* nano fix

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* grid tweaks

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* RTX adaptation for thresholdEncode

Signed-off-by: raver119 <raver119@gmail.com>

* don't allow threshold encoding for length < 2

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* get rid of NDArrayCompressor in EncodingHandler

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* one more minor update of EncodingHandler

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* one more minor tweak of EncodingHandler

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* - matmul allows integer data types use
- EncodingHandler boundary default value
- few tests for integer matmul

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* minor fix of CUDA bitmap encode

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* boundary changed to integer everywhere

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* boundary changed to integer everywhere

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* re-enable CUDA deallocator

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* threshold encoder fix for systems without omp

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* - encode_threshold now requires non-negative boundary
- minor tweak in EncodingHandler

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* restore parallelism in decode_bitmap

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* fall back to omp for encode_bitmap cpu

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* single time casts

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

* - additional test for encode_threshold
- sync buffers to device before calling for shape function

Signed-off-by: raver119@gmail.com <raver119@gmail.com>

Co-authored-by: shugeo <sgazeos@gmail.com>
master
raver119 2020-05-08 20:59:39 +03:00 committed by GitHub
parent f1232f8221
commit 0613485654
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
47 changed files with 1617 additions and 1387 deletions

View File

@ -23,9 +23,13 @@ import org.deeplearning4j.optimize.solvers.accumulation.EncodedGradientsAccumula
import org.deeplearning4j.optimize.solvers.accumulation.EncodingHandler;
import org.deeplearning4j.optimize.solvers.accumulation.encoding.threshold.FixedThresholdAlgorithm;
import org.junit.Test;
import org.nd4j.linalg.api.concurrency.AffinityManager;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.util.PrintAffinity;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.nativeblas.OpaqueDataBuffer;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
/**
@ -93,12 +97,13 @@ public class EncodedGradientsAccumulatorTest extends BaseDL4JTest {
}
EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, null, false);
EncodingHandler handler = new EncodingHandler(new FixedThresholdAlgorithm(1e-3), null, Integer.MAX_VALUE, false);
for (int e = 10; e < numParams / 5; e++) {
INDArray encoded = handler.encodeUpdates(0, 0, getGradients(numParams, e, 2e-3));
val gradients = getGradients(numParams, e, 2e-3);
val encoded = handler.encodeUpdates(0, 0, gradients);
// log.info("enc len: {}", encoded.data().length());
assertNotNull("Failed with e == " + e, encoded);
int encFormat = encoded.data().getInt(3);

View File

@ -69,7 +69,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
protected ThreadLocal<Integer> index = new ThreadLocal<>();
protected long initialMemory = 100 * 1024 * 1024L;
protected int queueSize = 5;
protected Double boundary = 1.0;
protected Integer boundary = Integer.MAX_VALUE;
protected boolean encodingDebugMode;
protected IndexedTail externalSource;
@ -101,11 +101,11 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
}
public EncodedGradientsAccumulator(int parties, ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, boolean encodingDebugMode) {
this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, 1.0, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, 1.0, encodingDebugMode);
this(parties, new EncodingHandler(thresholdAlgorithm, residualPostProcessor, Integer.MAX_VALUE, encodingDebugMode), DEFAULT_INITIAL_MEMORY, 10, Integer.MAX_VALUE, encodingDebugMode);
}
public EncodedGradientsAccumulator(int parties, @NonNull MessageHandler handler, long initialMemory,
int queueSize, Double boundary, boolean encodingDebugMode) {
int queueSize, Integer boundary, boolean encodingDebugMode) {
this.parties = parties;
this.handler = handler;
this.initialMemory = initialMemory;
@ -551,7 +551,7 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
protected long initialMemory = DEFAULT_INITIAL_MEMORY;
protected int queueSize = 5;
protected MessageHandler handler;
protected Double boundary = null;
protected int boundary = Integer.MAX_VALUE;
protected boolean encodingDebugMode;
/**
@ -598,15 +598,12 @@ public class EncodedGradientsAccumulator implements GradientsAccumulator, Regist
/**
* This method enables optional limit for max number of updates per message
*
* Default value: 1.0 (no limit)
* Default value: Integer.MAX_VALUE (no limit)
* @param boundary positive value in range 0..1
* @return
*/
public Builder updatesBoundary(double boundary) {
if (boundary >= 1.0)
return this;
if (boundary <= 0.0)
public Builder updatesBoundary(int boundary) {
if (boundary <= 0)
throw new DL4JInvalidConfigException("Boundary should have positive value");
this.boundary = boundary;

View File

@ -16,6 +16,7 @@
package org.deeplearning4j.optimize.solvers.accumulation;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.shade.guava.util.concurrent.AtomicDouble;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
@ -24,7 +25,6 @@ import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgori
import org.deeplearning4j.optimize.solvers.accumulation.encoding.ThresholdAlgorithmReducer;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.compression.NDArrayCompressor;
import org.nd4j.linalg.exception.ND4JIllegalStateException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;
@ -54,9 +54,8 @@ public class EncodingHandler implements MessageHandler {
protected ThresholdAlgorithm initialThresholdAlgorithm;
protected ResidualPostProcessor initialResidualPostProcessor;
protected Double boundary;
protected Integer boundary;
protected boolean encodingDebugMode;
protected NDArrayCompressor compressor;
protected AtomicInteger atomicBoundary = new AtomicInteger(-1);
protected ThreadLocal<ThresholdAlgorithm> thresholdAlgorithm = new ThreadLocal<>();
@ -73,20 +72,16 @@ public class EncodingHandler implements MessageHandler {
protected final AtomicLong lastThresholdLogTime = new AtomicLong();
public EncodingHandler(final ThresholdAlgorithm thresholdAlgorithm, final ResidualPostProcessor residualPostProcessor,
Double boundary, boolean encodingDebugMode){
Integer boundary, boolean encodingDebugMode){
this.initialThresholdAlgorithm = thresholdAlgorithm;
this.initialResidualPostProcessor = residualPostProcessor;
this.boundary = boundary;
this.boundary = boundary == null ? Integer.MAX_VALUE : boundary;
this.encodingDebugMode = encodingDebugMode;
}
@Override
public void initialize(@NonNull GradientsAccumulator accumulator) {
this.accumulator = accumulator;
compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
if (compressor == null)
throw new ND4JIllegalStateException("Can't find Threshold compressor implementation!");
}
public INDArray encodeUpdates(int iteration, int epoch, INDArray updates) {
@ -135,14 +130,13 @@ public class EncodingHandler implements MessageHandler {
iterations.get().incrementAndGet();
if (boundary != null && atomicBoundary.get() < 0)
atomicBoundary.compareAndSet(-1, (int) (updates.length() * boundary));
atomicBoundary.compareAndSet(-1, (int) (updates.length() / 16) );
INDArray encoded;
if (!bitmapMode.get().get()) {
//Sparse updates
encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(),
boundary == null ? null : atomicBoundary.get());
encoded = Nd4j.getExecutioner().thresholdEncode(updates, currentThreshold.get().get(), boundary == null ? null : atomicBoundary.get());
// updates were TOO sparse, nothing to share here
if (encoded == null) {
@ -157,17 +151,14 @@ public class EncodingHandler implements MessageHandler {
}
double encLen = encoded.data().getInt(0);
double encLen = encoded.length();
// if updates are too dense - we fallback to bitmap encoding
if (encLen >= (updates.length() / 16)) {
log.debug("Switching back to bitmapEncoding: iteration {}, epoch {}, threshold {}, encoded length {}", iteration, epoch, currThreshold, encLen);
bitmapMode.get().set(true);
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());
encoded = Nd4j.getExecutioner().bitmapEncode(updates, currentThreshold.get().get());
applyPostProcessor(iteration, epoch, currThreshold, updates);
lastSparsityRatio.set(null);
@ -186,8 +177,7 @@ public class EncodingHandler implements MessageHandler {
}
} else {
//Dense bitmap updates
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(updates.length() / 16 + 5);
encoded = Nd4j.createArrayFromShapeBuffer(buffer, updates.shapeInfoDataBuffer());
encoded = Nd4j.create(DataType.INT32, updates.length() / 16 + 5);
long values = Nd4j.getExecutioner().bitmapEncode(updates, encoded, currentThreshold.get().get());

View File

@ -910,7 +910,7 @@ public class ParallelWrapper implements AutoCloseable {
Preconditions.checkState(thresholdAlgorithm != null, "Cannot use SHARED_GRADIENTS training mode without setting a threshold algorithm");
this.trainerContext = new SymmetricTrainerContext();
if (this.accumulator == null) {
log.info("Creating new GradientsAccumulator instance with threshold of [5e-4");
log.info("Creating new GradientsAccumulator instance with default threshold of [5e-4]");
this.accumulator = new EncodedGradientsAccumulator(workers, thresholdAlgorithm, residualPostProcessor, false);
}
}

View File

@ -45,7 +45,7 @@ public class WiredEncodingHandler extends EncodingHandler {
* @param thresholdAlgorithm threshold algorithm to use
* @param boundary
*/
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
}

View File

@ -44,7 +44,7 @@ public class WiredEncodingHandler extends EncodingHandler {
*
* @param thresholdAlgorithm The threshold algorithm to use
*/
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Double boundary, boolean encodingDebugMode) {
public WiredEncodingHandler(ThresholdAlgorithm thresholdAlgorithm, ResidualPostProcessor residualPostProcessor, Integer boundary, boolean encodingDebugMode) {
super(thresholdAlgorithm, residualPostProcessor, boundary, encodingDebugMode);
}

View File

@ -49,12 +49,12 @@ elseif(WIN32)
set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc")
else()
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC -fmax-errors=2")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O2 -fPIC")
endif()
else()
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -D_RELEASE=true")
set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC")
if (SD_CPU)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
@ -221,21 +221,16 @@ include_directories(${FLATBUFFERS_PATH}/include)
configure_file(include/config.h.in include/config.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
if (NOT DEFINED ENV{CLION_IDE})
message("NOT CLION")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
add_subdirectory(blas)
if(SD_BUILD_TESTS)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
add_subdirectory(blas)
if(SD_BUILD_TESTS)
# tests are always compiled with all ops included
set(SD_ALL_OPS true)
set(SD_BUILD_MINIFIER true)
add_subdirectory(tests_cpu)
endif()
endif ()
endif()
if ($ENV{CLION_IDE})
add_subdirectory(tests_cpu)
endif ()
if (MSVC_DEV)
set(SD_BUILD_MINIFIER false)

View File

@ -120,8 +120,13 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# using GCC
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}")
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -fmax-errors=2 ")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath,$ORIGIN/")
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
endif()
endif()
@ -361,11 +366,6 @@ elseif(SD_CPU)
endif()
endif()
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT(APPLE) AND NOT(WIN32))
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
endif()
install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu)
endif()

View File

@ -571,7 +571,7 @@ namespace samediff {
// create temporary array
int64_t intermediatery[256];
auto span = delta / numThreads;
auto span = (numElements / numThreads) - (numElements % numThreads);
// execute threads in parallel
for (uint32_t e = 0; e < numThreads; e++) {
@ -615,7 +615,7 @@ namespace samediff {
// create temporary array
double intermediatery[256];
auto span = delta / numThreads;
auto span = (numElements / numThreads) - (numElements % numThreads);
// execute threads in parallel
for (uint32_t e = 0; e < numThreads; e++) {

View File

@ -1432,18 +1432,6 @@ ND4J_EXPORT void tear(Nd4jPointer *extraPointers,
Nd4jLong *tadShapeInfo,
Nd4jLong *tadOffsets);
ND4J_EXPORT Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
ND4J_EXPORT void decodeBitmap(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
ND4J_EXPORT void encodeThresholdP1(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold);
ND4J_EXPORT void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz);
ND4J_EXPORT void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *xShapeInfo, int *offsets, Nd4jLong N, int *dz);
ND4J_EXPORT void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo);
ND4J_EXPORT void sort(Nd4jPointer *extraPointers,
void *x, Nd4jLong *xShapeInfo,
void *dx, Nd4jLong *dxShapeInfo,

View File

@ -1436,28 +1436,6 @@ void enableP2P(bool enable) {
// no-op
}
void encodeThresholdP1(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
// TODO: to be implemented
}
void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *hX, Nd4jLong N, int *dz) {
// TODO: to be implemented
}
void encodeThresholdP3(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
// offsets won't be used here
// TODO: to be implemented
}
void decodeThreshold(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo){
// TODO: to be implemented
}
bool isP2PAvailable() {
// always TRUE for cpu backend
return true;
@ -1467,10 +1445,6 @@ void checkP2P() {
// no-op
}
void decodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong N, void *dz, Nd4jLong *hZShapeInfo) {
NativeOpExecutioner::decodeBitmap(hX, N, dz, hZShapeInfo);
}
template<typename T>
void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZShapeInfo, int N, int *shuffleMap, Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) {
@ -1859,12 +1833,6 @@ void sortCooIndices(Nd4jPointer *extraPointers,
}
}
Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, void *hX, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
return NativeOpExecutioner::encodeBitmap(hX, hXShapeInfo, N, dz, threshold);
}
Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
auto hZ = new Nd4jLong[2];errno = 0;
try {

View File

@ -2197,76 +2197,6 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement
sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
}
void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4jLong N, int *dz, float threshold) {
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extras[1]);
int blockSize = 1024;
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
dim3 launchDims(numBlocks, blockSize, 1024);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES);
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed");
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
}
}
void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int *dz) {
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
//encoderKernelP2Float<<<numBlocks, blockSize , 1024 * sizeof(float), *stream>>>(dx, N, dz);
prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0);
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
}
}
void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
int blockSize = 1024;
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
dim3 launchDims(numBlocks, blockSize, 4096);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES);
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
}
}
void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo){
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
// we probably want to have smaller blocks here, memory writes are misaligned anyway
int blockSize = 128;
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
dim3 launchDims(numBlocks, blockSize, 1024);
auto zType = sd::ArrayOptions::dataType(zShapeInfo);
BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed");
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
}
}
////////////////////////////////////////////////////////////////////////
void execReduce3All(Nd4jPointer *extraPointers,
int opNum,
@ -2603,55 +2533,6 @@ void sortCooIndices(Nd4jPointer *extraPointers, Nd4jLong *indices, void *values,
throw std::runtime_error("sortCooIndices:: Not implemented yet");
}
Nd4jLong encodeBitmap(Nd4jPointer *extraPointers,
void *dx, Nd4jLong *hXShapeInfo,
Nd4jLong N,
int *dz,
float threshold) {
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
int *resultPointer = reinterpret_cast<int *>(extraPointers[2]);
int *reductionPointer = reinterpret_cast<int *>(extraPointers[3]);
dim3 launchDims(512, 512, 32768);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
(launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold),
LIBND4J_TYPES);
sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
Nd4jLong dZ = (Nd4jLong) resultPointer[0];
resultPointer[0] = 0;
return dZ;
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
return 0;
}
}
void decodeBitmap(Nd4jPointer *extraPointers,
void *dx,
Nd4jLong N,
void *dz, Nd4jLong *zShapeInfo) {
try {
cudaStream_t *stream = reinterpret_cast<cudaStream_t *>(extraPointers[1]);
dim3 launchDims(512, 512, 16384);
auto xType = sd::ArrayOptions::dataType(zShapeInfo);
BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES);
sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
} catch (std::exception &e) {
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1);
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what());
}
}
Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, Nd4jLong length) {
return nullptr;
}

View File

@ -207,7 +207,7 @@ namespace sd {
}
void Environment::setMaxSpecialyMemory(uint64_t maxBytes) {
_maxTotalSpecialMemory;
_maxTotalSpecialMemory = maxBytes;
}
void Environment::setMaxDeviceMemory(uint64_t maxBytes) {

View File

@ -217,10 +217,27 @@ namespace sd {
}
//////////////////////////////////////////////////////////////////////////
/*
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
*/
template<typename T>
__global__ static void execEncoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
auto x = reinterpret_cast<T *> (dx);
auto z = reinterpret_cast<int *> (dz);
encoderKernelP1<T>(dx, N, dz, threshold);
//basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
int bp=__syncthreads_count(pass);
if (threadIdx.x == 0) {
// saving out per-block passes
z[blockIdx.x+1] = bp;
// saving out sum
atomicAdd(&z[0], bp);
}
}
//////////////////////////////////////////////////////////////////////////
@ -230,13 +247,74 @@ __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, voi
execEncoderKernelP1<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz, threshold);
sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), FLOAT_TYPES);
//////////////////////////////////////////////////////////////////////////
/*
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
*
* Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
*/
template<typename T>
__global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
auto x = reinterpret_cast<T *> (dx);
auto z = reinterpret_cast<int *> (dz);
encoderKernelP3<T>(dx, offsets, N, dz);
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ int warpTotals[];
// fetch block offset only once
__shared__ float threshold;
__shared__ FloatBits fb;
__shared__ int bo;
__shared__ int limit;
if (threadIdx.x == 0) {
limit = z[0];
fb.i_ = z[2];
threshold = fb.f_;
bo = offsets[blockIdx.x];
}
__syncthreads();
// out-of-limit threads do not play here
auto value = tid < N ? x[tid] : (T) 0.f;
// out-of-limit threads just declare they have no changes
auto pred = tid >= N ? 0 : sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
auto w_i = threadIdx.x / warpSize; // warp index (or, warp number) - index of the Warp within TOTAL_WARPS
auto t_i = threadIdx.x % warpSize; // thread index within a warp
unsigned int t_m = INT_MAX >> (warpSize - t_i - 1); //thread mask (ERROR IN THE PAPER minus one is required)
int b = __ballot_sync(t_m, pred); // balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
auto t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
if (t_i == warpSize - 1)
warpTotals[w_i] = t_u + pred;
__syncthreads();
int w_i_u = 0;
for (int j = 0; j <= 5; j++) {
unsigned int b_j = __ballot_sync(t_m, warpTotals[t_i] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets
w_i_u += (__popc(b_j) << j);
}
// we just ignore all results coming from non-0 threads
if (w_i == 0 && t_i < blockDim.x / warpSize)
warpTotals[t_i] = w_i_u;
__syncthreads();
// pred is always false if we're out-of-limits
if (pred) {
int idx = t_u + warpTotals[w_i] + bo + 4;
if (idx < limit + 4) {
z[idx] = value > static_cast<T>(0.0f) ? tid + 1 : -(tid + 1);
x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
}
}
}
//////////////////////////////////////////////////////////////////////////
@ -245,13 +323,38 @@ __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, voi
execEncoderKernelP3<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, offsets, N, dz);
sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), FLOAT_TYPES);
//////////////////////////////////////////////////////////////////////////
/*
* This kernel handles decode from sparse threshold array, to dense array
*
* PLEASE NOTE: Z is expected to be memset to 0
*/
template<typename T>
__global__ static void execDecoderKernel(void *dx, Nd4jLong N, void *dz) {
auto x = reinterpret_cast<int *> (dx);
auto z = reinterpret_cast<T *> (dz);
decoderKernel<T>(dx, N, dz);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float threshold;
__shared__ int limit;
__shared__ FloatBits fb;
if (threadIdx.x == 0) {
limit = x[0];
fb.i_ = x[2];
threshold = fb.f_;
}
__syncthreads();
for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
int el = x[e+4];
int ael = sd::math::nd4j_abs<int>(el) - 1;
// TODO: investigate, if += would work better here, as in "decoded accumulation"
z[ael] += el > 0 ? threshold : -threshold;
}
}
//////////////////////////////////////////////////////////////////////////
@ -261,14 +364,78 @@ __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void
execDecoderKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, dz);
sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), FLOAT_TYPES);
//////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ static void execCudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
auto dx = reinterpret_cast<T *>(vdx);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
cudaEncodeBitmapKernel<T>(vdx, N, dz, scalar, reductionBuffer, threshold);
T off(0.0f);
__shared__ int counter;
__shared__ int *shmem;
__shared__ T *vals;
if (threadIdx.x == 0){
extern __shared__ char mem[];
shmem = reinterpret_cast<int*>(mem);
vals = reinterpret_cast<T *>(shmem + blockDim.x);
counter = 0;
}
__syncthreads();
Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
// all threads in block reading stuff
T val = i < N ? dx[i] : off;
T abs = sd::math::nd4j_abs<T>(val);
int byteId = i / 16 + 4;
int bitId = i % 16;
shmem[threadIdx.x] = 0;
vals[threadIdx.x] = val;
if (abs >= static_cast<T>(threshold) && i < N) {
shmem[threadIdx.x] = 1 << (bitId);
atomicAdd(&counter, 1);
if (val < static_cast<T>(0.0f)) {
shmem[threadIdx.x] |= 1 << (bitId + 16);
vals[threadIdx.x] += static_cast<T>(threshold);
} else {
vals[threadIdx.x] -= static_cast<T>(threshold);
}
} else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
atomicAdd(&counter, 1);
shmem[threadIdx.x] = 1 << (bitId + 16);
vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
}
__syncthreads();
if (threadIdx.x % 16 == 0 && i < N) {
int byte = 0;
for (int e = 0; e < 16; e++) {
if (i + e >= N)
continue;
byte |= shmem[threadIdx.x + e];
}
dz[byteId] = byte;
}
__syncthreads();
if (i < N)
dx[i] = vals[threadIdx.x];
}
__syncthreads();
if (threadIdx.x == 0) {
atomicAdd(scalar, counter);
}
}
//////////////////////////////////////////////////////////////////////////
@ -278,14 +445,62 @@ __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
execCudaEncodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, N, dz, scalar, reductionBuffer, threshold);
sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), FLOAT_TYPES);
//////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ static void execCudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
auto dz = static_cast<T*>(vdz);
cudaDecodeBitmapKernel<T>(dx, N, vdz);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ T *shmem;
__shared__ FloatBits fb;
__shared__ float threshold;
__shared__ int *x;
if (threadIdx.x == 0){
extern __shared__ char mem[];
shmem = reinterpret_cast<T*>(mem);
x = reinterpret_cast<int *>(dx);
fb.i_ = x[2];
threshold = fb.f_;
}
__syncthreads();
int lim = N / 16 + 5;
for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
int byteId = i / 16 + 4;
// printf("I: [%i]; byteId: [%i]\n", i, byteId);
shmem[threadIdx.x] = dz[i];
__syncthreads();
if (threadIdx.x % 16 == 0) {
int byte = x[byteId];
for (int e = 0; e < 16; e++) {
if (i + e >= N)
continue;
int bitId = (i + e) % 16;
bool hasBit = (byte & 1 << (bitId) ) != 0;
bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
if (hasBit) {
if (hasSign)
shmem[threadIdx.x + bitId] -= threshold;
else
shmem[threadIdx.x + bitId] += threshold;
} else if (hasSign) {
shmem[threadIdx.x + bitId] -= threshold / 2;
}
}
}
__syncthreads();
dz[i] = shmem[threadIdx.x];
}
}
//////////////////////////////////////////////////////////////////////////
@ -295,7 +510,7 @@ __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, vo
execCudaDecodeBitmapKernel<T><<<launchDims.x, launchDims.y, launchDims.z, *stream>>>(dx, N, vdz);
sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed");
}
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), LIBND4J_TYPES);
BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), FLOAT_TYPES);
template <bool storeSum, bool isNP2>

View File

@ -106,8 +106,14 @@ namespace sd {
auto l = static_cast<int>(N);
z[1] = l;
#ifdef _OPENMP
int threads = OmpLaunchHelper::betterThreads(N);
int span = OmpLaunchHelper::betterSpan(N, threads);
auto span = OmpLaunchHelper::betterSpan(N, threads);
#else
int threads = 1;
auto span = N;
#endif
T tt = static_cast<T>(threshold);
T mtt = -tt;
@ -209,21 +215,23 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
samediff::Threads::parallel_for(func, 0, N);
};
template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToThreshold<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToThreshold<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToThreshold<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToThreshold<bfloat16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertFromQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToQuantized<float>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToQuantized<float16>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
template void TypeCast::convertToQuantized<double>(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz);
#ifndef __CLION_IDE__
BUILD_DOUBLE_TEMPLATE(template void TypeCast::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES)

View File

@ -110,29 +110,6 @@ namespace sd {
}
#ifdef __CUDACC__
/*
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
*/
template<typename T>
__device__ inline void encoderKernelP1(void *dx, Nd4jLong N, void *dz, float threshold) {
auto x = reinterpret_cast<T *> (dx);
auto z = reinterpret_cast<int *> (dz);
//basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
int pass = tid < N && sd::math::nd4j_abs<T>(x[tid]) >= static_cast<T>(threshold) ? 1 : 0;
int bp=__syncthreads_count(pass);
if (threadIdx.x == 0) {
// saving out per-block passes
z[blockIdx.x+1] = bp;
// saving out sum
atomicAdd(&z[0], bp);
}
}
__device__ __inline__ int pow2i (int e){
return 1<<e;
}
@ -140,274 +117,21 @@ namespace sd {
template<typename T>
__host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold);
/*
* PLEASE NOTE: This kernel doesn't allow loop for data. Basically: grid will be huge.
*
* Based on: https://github.com/knotman90/cuStreamComp <-- efficient CUDA stream compaction algorithm
*/
template<typename T>
__device__ inline void encoderKernelP3(void *dx, int *offsets, Nd4jLong N, void *dz) {
T *x = reinterpret_cast<T *> (dx);
int *z = reinterpret_cast<int *> (dz);
Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ int warpTotals[];
// fetch block offset only once
__shared__ float threshold;
__shared__ FloatBits fb;
__shared__ int bo;
__shared__ int limit;
if (threadIdx.x == 0) {
limit = z[0];
fb.i_ = z[2];
threshold = fb.f_;
bo = offsets[blockIdx.x];
}
__syncthreads();
if (tid < N) {
T value = x[tid];
int pred = sd::math::nd4j_abs<T>(value) >= static_cast<T>(threshold) ? 1 : 0;
int w_i = threadIdx.x/warpSize; //warp index
int w_l = tid % warpSize;//thread index within a warp
unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required)
int b = __ballot_sync(t_m, pred); //balres = number whose ith bit isone if the ith's thread pred is true masked up to the current index in warp
int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
if(w_l==warpSize-1){
warpTotals[w_i]=t_u+pred;
}
// __syncthreads(); // Eliminated due RTX20xx specific
if(w_i==0 && w_l<blockDim.x/warpSize){
int w_i_u=0;
for(int j=0;j<=5;j++){
unsigned int b_j =__ballot_sync(t_m, warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
w_i_u += (__popc(b_j) << j);
//printf("indice %i t_m=%i,j=%i,b_j=%i,w_i_u=%i\n",w_l,t_m,j,b_j,w_i_u);
}
warpTotals[w_l]=w_i_u;
}
// __syncthreads(); // Eliminated due RTX20xx specific
if(pred){
int idx = t_u + warpTotals[w_i] + bo + 4;
if (idx < limit + 4) {
z[idx]= value > static_cast<T>(0.0f) ? tid+1 : -(tid + 1);
x[tid] = value > static_cast<T>(0.0f) ? x[tid] - threshold : x[tid] + threshold;
}
}
}
}
template<typename T>
__host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz);
/*
* This kernel handles decode from sparse threshold array, to dense array
*
* PLEASE NOTE: Z is expected to be memset to 0
*/
template<typename T>
__device__ inline void decoderKernel(void *dx, Nd4jLong N, void *dz) {
auto x = reinterpret_cast<int *> (dx);
auto z = reinterpret_cast<T *> (dz);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float threshold;
__shared__ int limit;
__shared__ FloatBits fb;
if (threadIdx.x == 0) {
limit = x[0];
fb.i_ = x[2];
threshold = fb.f_;
}
__syncthreads();
for (int e = tid; e < limit; e += blockDim.x * gridDim.x) {
int el = x[e+4];
int ael = sd::math::nd4j_abs<int>(el) - 1;
// TODO: investigate, if += would work better here, as in "decoded accumulation"
z[ael] += el > 0 ? threshold : -threshold;
}
}
template<typename T>
__host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz);
//////////////////////////////////////////////////////////////////////////
template<typename T>
__device__ inline void cudaEncodeBitmapKernel(void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) {
auto dx = reinterpret_cast<T *>(vdx);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
T off(0.0f);
__shared__ int counter;
__shared__ int *shmem;
__shared__ T *vals;
if (threadIdx.x == 0){
extern __shared__ char mem[];
shmem = reinterpret_cast<int*>(mem);
vals = reinterpret_cast<T *>(shmem + blockDim.x);
counter = 0;
}
__syncthreads();
Nd4jLong loopRemainder = N % (blockDim.x * gridDim.x);
Nd4jLong loopLimit = N + (blockDim.x * gridDim.x - loopRemainder);
for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) {
// all threads in block reading stuff
T val = i < N ? dx[i] : off;
T abs = sd::math::nd4j_abs<T>(val);
int byteId = i / 16 + 4;
int bitId = i % 16;
shmem[threadIdx.x] = 0;
vals[threadIdx.x] = val;
if (abs >= static_cast<T>(threshold) && i < N) {
shmem[threadIdx.x] = 1 << (bitId);
atomicAdd(&counter, 1);
if (val < static_cast<T>(0.0f)) {
shmem[threadIdx.x] |= 1 << (bitId + 16);
vals[threadIdx.x] += static_cast<T>(threshold);
} else {
vals[threadIdx.x] -= static_cast<T>(threshold);
}
} else if (abs >= static_cast<T>(threshold) / static_cast<T>(2.0f) && val < static_cast<T>(0.0f) && i < N) {
atomicAdd(&counter, 1);
shmem[threadIdx.x] = 1 << (bitId + 16);
vals[threadIdx.x] += static_cast<T>(threshold) / static_cast<T>(2.0f);
}
__syncthreads();
if (threadIdx.x % 16 == 0 && i < N) {
int byte = 0;
for (int e = 0; e < 16; e++) {
if (i + e >= N)
continue;
byte |= shmem[threadIdx.x + e];
}
dz[byteId] = byte;
}
__syncthreads();
if (i < N)
dx[i] = vals[threadIdx.x];
}
__syncthreads();
if (threadIdx.x == 0) {
atomicAdd(scalar, counter);
}
}
template<typename T>
__host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
//////////////////////////////////////////////////////////////////////////
template<typename T>
__device__ inline void cudaDecodeBitmapKernel(void *dx, Nd4jLong N, void *vdz) {
auto dz = static_cast<T*>(vdz);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ T *shmem;
__shared__ FloatBits fb;
__shared__ float threshold;
__shared__ int *x;
if (threadIdx.x == 0){
extern __shared__ char mem[];
shmem = reinterpret_cast<T*>(mem);
x = reinterpret_cast<int *>(dx);
fb.i_ = x[2];
threshold = fb.f_;
}
__syncthreads();
int lim = N / 16 + 5;
for (int i = tid; i < N; i += blockDim.x * gridDim.x) {
int byteId = i / 16 + 4;
// printf("I: [%i]; byteId: [%i]\n", i, byteId);
shmem[threadIdx.x] = dz[i];
__syncthreads();
if (threadIdx.x % 16 == 0) {
int byte = x[byteId];
for (int e = 0; e < 16; e++) {
if (i + e >= N)
continue;
int bitId = (i + e) % 16;
bool hasBit = (byte & 1 << (bitId) ) != 0;
bool hasSign = (byte & 1 << (bitId + 16) ) != 0;
if (hasBit) {
if (hasSign)
shmem[threadIdx.x + bitId] -= threshold;
else
shmem[threadIdx.x + bitId] += threshold;
} else if (hasSign) {
shmem[threadIdx.x + bitId] -= threshold / 2;
}
}
}
__syncthreads();
dz[i] = shmem[threadIdx.x];
}
}
template<typename T>
__host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz);
// __global__ void cudaEncodeBitmapFloat(float *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
// __global__ void cudaEncodeBitmapDouble(double *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
// __global__ void cudaEncodeBitmapHalf(float16 *dx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold);
// __global__ void cudaDecodeBitmapFloat(void *dx, Nd4jLong N, float *dz);
// __global__ void cudaDecodeBitmapDouble(void *dx, Nd4jLong N, double *dz);
// __global__ void cudaDecodeBitmapHalf(void *dx, Nd4jLong N, float16 *dz);
// __global__ void encoderKernelP1Float(void *dx, Nd4jLong N, void *dz, float threshold);
// __global__ void encoderKernelP1Double(void *dx, Nd4jLong N, void *dz, float threshold);
// __global__ void encoderKernelP1Half(void *dx, Nd4jLong N, void *dz, float threshold);
// __global__ void encoderKernelP2Float(int *dx, Nd4jLong N, int *dz);
// __global__ void encoderKernelP3Float(void *dx, int *offsets, Nd4jLong N, void *dz);
// __global__ void encoderKernelP3Double(void *dx, int *offsets, Nd4jLong N, void *dz);
// __global__ void encoderKernelP3Half(void *dx, int *offsets, Nd4jLong N, void *dz);
// __global__ void decoderKernelFloat(void *dx, Nd4jLong N, void *dz);
// __global__ void decoderKernelDouble(void *dx, Nd4jLong N, void *dz);
// __global__ void decoderKernelHalf(void *dx, Nd4jLong N, void *dz);
__global__ void uniformAdd(int *g_data, int *uniforms, int n, int blockOffset, int baseIndex);
template <bool storeSum, bool isNP2>

View File

@ -25,6 +25,7 @@
#include <ops/declarable/headers/boolean.h>
#include <ops/declarable/headers/broadcastable.h>
#include <ops/declarable/headers/convo.h>
#include <ops/declarable/headers/compression.h>
#include <ops/declarable/headers/list.h>
#include <ops/declarable/headers/recurrent.h>
#include <ops/declarable/headers/transforms.h>

View File

@ -138,9 +138,9 @@ DECLARE_SHAPE_FN(matmul) {
//////////////////////////////////////////////////////////////////////
DECLARE_TYPES(matmul) {
getOpDescriptor()
->setAllowedInputTypes(0, {ALL_FLOATS})
->setAllowedInputTypes(1, {ALL_FLOATS})
->setAllowedOutputTypes(0, {ALL_FLOATS});
->setAllowedInputTypes(0, {ALL_FLOATS, ALL_INTS})
->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS})
->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS});
}
//////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,92 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author George A. Shulinok <sgazeos@gmail.com>
//
#include <system/op_boilerplate.h>
#include <ops/declarable/CustomOperations.h>
#include <ops/declarable/helpers/compression.h>
#if NOT_EXCLUDED(OP_decode_bitmap)
namespace sd {
namespace ops {
CUSTOM_OP_IMPL(decode_bitmap, 2, 1, true, 0, 0) {
auto encoded = INPUT_VARIABLE(1);
auto updates = OUTPUT_VARIABLE(0);
helpers::decodeBitmap(block.launchContext(), encoded, updates);
return Status::OK();
}
DECLARE_SHAPE_FN(decode_bitmap) {
auto weights = INPUT_VARIABLE(0);
return SHAPELIST(weights->shapeInfo());
}
DECLARE_TYPES(decode_bitmap) {
getOpDescriptor()
->setAllowedInputTypes(0, {ALL_FLOATS})
->setAllowedInputTypes(1, DataType::INT32)
->setAllowedOutputTypes({ALL_FLOATS});
}
}
}
#endif
#if NOT_EXCLUDED(OP_encode_bitmap)
namespace sd {
namespace ops {
CUSTOM_OP_IMPL(encode_bitmap, 1, 3, true, 1, 0) {
auto input = INPUT_VARIABLE(0);
auto encoded = OUTPUT_NULLIFIED(1);
auto counter = OUTPUT_NULLIFIED(2);
float threshold = T_ARG(0);
encoded->p(0, (int) input->lengthOf());
encoded->p(1, (int) input->lengthOf());
encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
encoded->p(3, 1); // flag for BITMAP_ENCODING
auto result = helpers::encodeBitmap(block.launchContext(), input, encoded, threshold);
counter->p(0, result);
counter->syncToDevice();
return Status::OK();
}
DECLARE_SHAPE_FN(encode_bitmap) {
auto input = inputShape->at(0);
auto outputLength = shape::length(input) / 16 + 5;
auto encodedShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outputLength, DataType::INT32);
auto encodedCounter = ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT32);
return SHAPELIST(input, encodedShape, encodedCounter);
}
DECLARE_TYPES(encode_bitmap) {
getOpDescriptor()
->setAllowedInputTypes(sd::DataType::ANY)
->setAllowedOutputTypes(0, {ALL_FLOATS})
->setAllowedInputTypes(1, DataType::INT32)
->setAllowedInputTypes(2, DataType::INT32);
}
}
}
#endif

View File

@ -0,0 +1,104 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <system/op_boilerplate.h>
#include <ops/declarable/CustomOperations.h>
#include <ops/declarable/helpers/threshold.h>
namespace sd {
namespace ops {
CUSTOM_OP_IMPL(encode_threshold, 1, 2, true, 1, 0) {
auto x = INPUT_VARIABLE(0);
auto updated = OUTPUT_VARIABLE(0);
auto encoded = OUTPUT_NULLIFIED(1);
float threshold = T_ARG(0);
REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "encode_threshold: array for encoded updates can't have less than 4 elements");
// REQUIRE_TRUE(x->platformBuffer() == updated->platformBuffer(), 0, "encode_threshold: gradients array must be the same at input and output");
// filling header bytes
encoded->p(0, encoded->lengthOf() - 4);
encoded->p(1, (int) x->lengthOf());
encoded->p(2, reinterpret_cast<int *>(&threshold)[0]);
encoded->p(3, 0); // flag for FLEXIBLE_ENCODING
// if there's no updates to process - just skip execution
if (encoded->lengthOf() == 4)
return Status::OK();
helpers::thresholdEncode(*x, *encoded, threshold);
return Status::OK();
}
DECLARE_SHAPE_FN(encode_threshold) {
auto x = INPUT_VARIABLE(0);
// we have limit option here
int boundary = block.numI() > 0 ? I_ARG(0) : DataTypeUtils::max<int>();
float threshold = T_ARG(0);
REQUIRE_TRUE(boundary >= 0, 0, "encode_threshold: boundary must be positive");
REQUIRE_TRUE(x->lengthOf() <= DataTypeUtils::max<int>(), 0, "encode_threshold: gradients array must have length <= MAX_INT");
// we must calculate number of elements that >= threshold
auto elements = sd::math::nd4j_min<int>(helpers::thresholdEstimate(*x, threshold), boundary);
if (elements < 2)
elements = 0;
// result array must have 4 additional int elements for header
return SHAPELIST(x->shapeInfo(), sd::ConstantShapeHelper::getInstance()->vectorShapeInfo(elements + 4, DataType::INT32));
}
DECLARE_TYPES(encode_threshold) {
getOpDescriptor()
->setAllowedInputTypes(0, {ALL_FLOATS})
->setAllowedOutputTypes(0, {ALL_FLOATS})
->setAllowedOutputTypes(1, DataType::INT32);
}
CUSTOM_OP_IMPL(decode_threshold, 2, 1, true, 0, 0) {
auto weights = INPUT_VARIABLE(0);
auto encoded = INPUT_VARIABLE(1);
auto updates = OUTPUT_VARIABLE(0);
REQUIRE_TRUE(encoded->lengthOf() >= 4, 0, "decode_threshold: encoded array can't have length < 4");
REQUIRE_TRUE(updates->lengthOf() == encoded->e<int>(1), 0, "decode_threshold: updates array must have length equal to [%i]", encoded->e<int>(1));
REQUIRE_TRUE(encoded->e<int>(3) == 0, 0, "decode_threshold: encoded array doesn't look like threshold-encoded");
helpers::thresholdDecode(*encoded, *updates);
return Status::OK();
}
DECLARE_SHAPE_FN(decode_threshold) {
auto weights = inputShape->at(0);
return SHAPELIST(weights);
}
DECLARE_TYPES(decode_threshold) {
getOpDescriptor()
->setAllowedInputTypes(0, {ALL_FLOATS})
->setAllowedInputTypes(1, DataType::INT32)
->setAllowedOutputTypes(0,{ALL_FLOATS});
}
}
}

View File

@ -0,0 +1,62 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author sgazeos@gmail.com
//
#ifndef SD_HEADERS_COMPRESSION_H
#define SD_HEADERS_COMPRESSION_H
#include <ops/declarable/headers/common.h>
namespace sd {
namespace ops {
/**
* encode_bitmap - reinterpret 3D float tensor into uint8_t vector with length N.
*
* Input:
* 0 - 3D float tensor with shape {height, width, channels}
*
* Output:
* 0 - 1D uint8_t tensor with shape {N}
*/
#if NOT_EXCLUDED(OP_encode_bitmap)
DECLARE_CUSTOM_OP(encode_bitmap, 1, 3, true, 1, 0);
#endif
/**
* decode_bitmap - reinterpret uint8_t linear tensor as data to float tensor with shape
*
* Input:
* 0 - uint8_t vector with length N ( shape {N})
*
* Output:
* 0 - 3D tensor with shape {height, width, channels}
*
*/
#if NOT_EXCLUDED(OP_decode_bitmap)
DECLARE_CUSTOM_OP(decode_bitmap, 2, 1, true, 0, 0);
#endif
DECLARE_CUSTOM_OP(encode_threshold, 2, 1, true, 1, 0);
DECLARE_CUSTOM_OP(decode_threshold, 2, 1, true, 0, 0);
}
}
#endif // SD_HEADERS_COMPRESSION_H

View File

@ -0,0 +1,34 @@
/*******************************************************************************
* Copyright (c) 2020 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author sgazeos@gmail.com
//
#ifndef __COMPRESSION_H_HELPERS__
#define __COMPRESSION_H_HELPERS__
#include <system/op_boilerplate.h>
#include <array/NDArray.h>
namespace sd {
namespace ops {
namespace helpers {
void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output);
Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold);
}
}
}
#endif

View File

@ -0,0 +1,37 @@
/*******************************************************************************
* Copyright (c) 2020 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author sgazeos@gmail.com
//
#include <ops/declarable/helpers/compression.h>
#include <execution/Threads.h>
namespace sd {
namespace ops {
namespace helpers {
void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
NativeOpExecutioner::decodeBitmap(input->buffer(), output->lengthOf(), output->buffer(), output->shapeInfo());
}
Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
return NativeOpExecutioner::encodeBitmap(input->buffer(), input->shapeInfo(), input->lengthOf(), output->bufferAsT<int>(), threshold);
}
}
}
}

View File

@ -0,0 +1,62 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <ops/declarable/helpers/threshold.h>
#include <execution/Threads.h>
#include <helpers/threshold.h>
namespace sd {
namespace ops {
namespace helpers {
template <typename T>
static int32_t thresholdEstimate_(const NDArray &updates, const float threshold) {
auto N = updates.lengthOf();
const auto buffer = updates.bufferAsT<T>();
auto func = PRAGMA_REDUCE_LONG {
int64_t cnt = 0;
for (auto e = start; e < stop; e++) {
auto v = sd::math::nd4j_abs<T>(buffer[e]);
if (v >= threshold)
cnt++;
}
return cnt;
};
return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N);
}
int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
BUILD_SINGLE_SELECTOR(updates.dataType(), return thresholdEstimate_, (updates, threshold), FLOAT_TYPES);
return 0;
}
void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertToThreshold, (nullptr, updates.buffer(), updates.lengthOf(), encoded.buffer()), FLOAT_TYPES);
}
void thresholdDecode(const NDArray &encoded, NDArray &updates) {
BUILD_SINGLE_SELECTOR(updates.dataType(), sd::TypeCast::convertFromThreshold, (nullptr, encoded.getBuffer(), updates.lengthOf(), updates.buffer()), FLOAT_TYPES);
}
}
}
}

View File

@ -0,0 +1,66 @@
/*******************************************************************************
* Copyright (c) 2020 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author sgazeos@gmail.com
//
#include <ops/declarable/helpers/compression.h>
#include <loops/type_conversions.h>
#include <helpers/DebugHelper.h>
namespace sd {
namespace ops {
namespace helpers {
void decodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output) {
auto stream = context->getCudaStream();
NDArray::prepareSpecialUse({output}, {input});
dim3 launchDims(512, 512, 16384);
auto xType = output->dataType();
BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, input->specialBuffer(), output->lengthOf(), output->specialBuffer()), FLOAT_TYPES);
sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed");
NDArray::registerSpecialUse({output}, {input});
}
Nd4jLong encodeBitmap(sd::LaunchContext* context, NDArray* input, NDArray* output, float threshold) {
auto stream = LaunchContext::defaultContext()->getCudaStream();
int *resultPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getScalarPointer());
int *reductionPointer = reinterpret_cast<int *>(LaunchContext::defaultContext()->getReductionPointer());
// nullify result pointer before use
resultPointer[0] = 0;
NDArray::prepareSpecialUse({},{output, input});
dim3 launchDims(512, 512, 32768);
auto xType = input->dataType();
BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric,
(launchDims, stream, input->specialBuffer(), input->lengthOf(), reinterpret_cast<int*>(output->specialBuffer()), resultPointer, reductionPointer, threshold),
FLOAT_TYPES);
sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed");
Nd4jLong dZ = (Nd4jLong) resultPointer[0];
resultPointer[0] = 0;
NDArray::registerSpecialUse({output, input}, {});
return dZ;
}
}
}
}

View File

@ -0,0 +1,231 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#include <ops/declarable/helpers/threshold.h>
#include <loops/type_conversions.h>
#include <helpers/PointersManager.h>
#include <vector>
namespace sd {
namespace ops {
namespace helpers {
void prescanArrayRecursive(int** g_scanBlockSums, int *dZ, int *dX, int numElements, int level) {
auto stream = LaunchContext::defaultContext()->getCudaStream();
int blockSize = 512; // max size of the thread blocks
int numBlocks = sd::math::nd4j_max<int>(1, static_cast<int>(ceil(static_cast<float>(numElements) / (2.f * blockSize))));
int numThreads;
if (numBlocks > 1)
numThreads = blockSize;
else if (sd::isPowerOfTwo(numElements))
numThreads = numElements / 2;
else
numThreads = sd::floorPow2(numElements);
int numEltsPerBlock = numThreads * 2;
// if this is a non-power-of-2 array, the last block will be non-full
// compute the smallest power of 2 able to compute its scan.
int numEltsLastBlock =
numElements - (numBlocks-1) * numEltsPerBlock;
int numThreadsLastBlock = sd::math::nd4j_max<int>(1, numEltsLastBlock / 2);
int np2LastBlock = 0;
int sharedMemLastBlock = 0;
if (numEltsLastBlock != numEltsPerBlock) {
np2LastBlock = 1;
if(!isPowerOfTwo(numEltsLastBlock))
numThreadsLastBlock = floorPow2(numEltsLastBlock);
unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
sharedMemLastBlock = sizeof(int) * (2 * numThreadsLastBlock + extraSpace);
}
// padding space is used to avoid shared memory bank conflicts
int extraSpace = numEltsPerBlock / NUM_BANKS;
int sharedMemSize = sizeof(int) * (numEltsPerBlock + extraSpace);
// setup execution parameters
// if NP2, we process the last block separately
dim3 grid(sd::math::nd4j_max<int>(1, numBlocks - np2LastBlock), 1, 1);
dim3 threads(numThreads, 1, 1);
dim3 gridOnes(1, 1, 1);
dim3 threadsOnes(numThreadsLastBlock, 1, 1);
if (sharedMemSize < 2048)
sharedMemSize = 2048;
if (sharedMemLastBlock < 2048)
sharedMemLastBlock = 2048;
// execute the scan
if (numBlocks > 1) {
sd::prescanLauncher<true, false>(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0);
if (np2LastBlock) {
sd::prescanLauncher<true, true>(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
}
// After scanning all the sub-blocks, we are mostly done. But now we
// need to take all of the last values of the sub-blocks and scan those.
// This will give us a new value that must be sdded to each block to
// get the final results.
// recursive (CPU) call
prescanArrayRecursive(g_scanBlockSums, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1);
sd::uniformAdd<<<grid, threads, 1024, *stream>>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0);
if (np2LastBlock) {
sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock);
}
} else if (isPowerOfTwo(numElements)) {
sd::prescanLauncher<false, false>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0);
} else {
sd::prescanLauncher<false, true>(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0);
}
sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed");
}
static void encodeThresholdP2Int_(void **prs, int *dx, Nd4jLong N, int *dz) {
auto stream = LaunchContext::defaultContext()->getCudaStream();
prescanArrayRecursive(reinterpret_cast<int**>(prs), dz, dx + 1, (int) N, 0);
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed");
}
static void encodeThresholdP3_(void *dx, Nd4jLong *hXShapeInfo, int *offsets, Nd4jLong N, int *dz){
auto stream = LaunchContext::defaultContext()->getCudaStream();
int blockSize = 512;
int numBlocks = N / blockSize + (N % blockSize ? 1 : 0);
dim3 launchDims(numBlocks, blockSize, 8192);
auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), FLOAT_TYPES);
sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed");
}
static NDArray thresholdEstimate_(const NDArray &updates, const float threshold) {
const int numThreads = 512;
const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);
auto tmp = NDArrayFactory::create<int>('c', {numBlocks + 1});
dim3 launchDims(numBlocks, numThreads, 1024);
auto xType = updates.dataType();
NDArray::prepareSpecialUse({&tmp}, {&updates});
BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, LaunchContext::defaultContext()->getCudaStream(), updates.getSpecialBuffer(), updates.lengthOf(), tmp.specialBuffer(), threshold), FLOAT_TYPES);
NDArray::registerSpecialUse({&tmp}, {&updates});
return std::move(tmp);
}
int32_t thresholdEstimate(const NDArray &updates, const float threshold) {
return thresholdEstimate_(updates, threshold).e<int>(0);
}
void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold) {
// we need these blocks in order to know, how many "updates" will be processed by each GPU block
auto blocks = thresholdEstimate_(updates, threshold);
const int numThreads = 512;
const int numBlocks = updates.lengthOf() / numThreads + (updates.lengthOf() % numThreads ? 1 : 0);
const int prefixThreads = 512;
int numElts = numBlocks;
int level = 0;
// here we just calculate number of sumBlock arrays
do {
int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
if (numBlocks > 1) {
level++;
}
numElts = numPrefixBlocks;
} while (numElts > 1);
std::vector<NDArray> tempArrays(level);
std::vector<Nd4jPointer> pointers(level);
level = 0;
numElts = numBlocks;
do {
int numPrefixBlocks = sd::math::nd4j_max<int>(1, sd::math::nd4j_ceil<float, int>((float) numElts / (2.0f * prefixThreads)));
if (numPrefixBlocks > 1) {
tempArrays[level] = std::move(NDArrayFactory::create<int>('c', {numPrefixBlocks}));
pointers[level] = tempArrays[level++].specialBuffer();
}
numElts = numPrefixBlocks;
} while (numElts > 1);
PointersManager pm(LaunchContext::defaultContext(), "thresholdEncode");
auto dptr = pm.replicatePointer(pointers.data(), pointers.size() * 8);
auto offsets = NDArrayFactory::create<int>('c', {numBlocks});
// we want to check, if we're hiting external limit on number of encoded elements
auto numMatches = blocks.e<int>(0);
if (numMatches > encoded.lengthOf() - 4) {
blocks.p(0, encoded.lengthOf() - 4);
blocks.syncToDevice();
}
NDArray::prepareSpecialUse({}, {&encoded, &updates});
// filling offsets
encodeThresholdP2Int_(reinterpret_cast<void **>(dptr),
reinterpret_cast<int*>(blocks.getSpecialBuffer()),
numBlocks,
reinterpret_cast<int*>(offsets.getSpecialBuffer()));
NDArray::registerSpecialUse({&blocks, &offsets}, {});
pm.synchronize();
encodeThresholdP3_(updates.specialBuffer(),
updates.shapeInfo(),
reinterpret_cast<int*>(offsets.getSpecialBuffer()),
updates.lengthOf(),
reinterpret_cast<int*>(encoded.specialBuffer()));
pm.synchronize();
NDArray::registerSpecialUse({&encoded, &updates}, {});
}
void thresholdDecode(const NDArray &encoded, NDArray &updates) {
dim3 launchDims(128, 512, 512);
auto xType = updates.dataType();
NDArray::prepareSpecialUse({&updates}, {&encoded});
BUILD_SINGLE_SELECTOR(xType, decoderKernelGeneric, (launchDims, LaunchContext::defaultContext()->getCudaStream(), encoded.getSpecialBuffer(), updates.lengthOf(), updates.specialBuffer()), FLOAT_TYPES);
NDArray::registerSpecialUse({&updates}, {&encoded});
}
}
}
}

View File

@ -0,0 +1,37 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author raver119@gmail.com
//
#ifndef SD_THRESHOLD_H
#define SD_THRESHOLD_H
#include <ops/declarable/helpers/helpers.h>
namespace sd {
namespace ops {
namespace helpers {
int32_t thresholdEstimate(const NDArray &updates, float threshold);
void thresholdEncode(NDArray &updates, NDArray &encoded, float threshold);
void thresholdDecode(const NDArray &encoded, NDArray &updates);
}
}
}
#endif //SD_THRESHOLD_H

View File

@ -557,21 +557,26 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
fb.i_ = x[2];
float threshold = fb.f_;
auto pPos = -1;
auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) {
const auto v = x[e];
for (int bitId = 0; bitId < 16; bitId++) {
bool hasBit = (x[e] & 1 << (bitId)) != 0;
bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
bool hasBit = (v & 1 << (bitId)) != 0;
bool hasSign = (v & 1 << (bitId + 16)) != 0;
auto cPos = (e - 4) * 16 + bitId;
if (hasBit) {
if (hasSign)
dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold);
dz[cPos] -= static_cast<T>(threshold);
else
dz[(e - 4) * 16 + bitId] += static_cast<T>(threshold);
dz[cPos] += static_cast<T>(threshold);
} else if (hasSign) {
dz[(e - 4) * 16 + bitId] -= static_cast<T>(threshold / 2);
dz[cPos] -= static_cast<T>(threshold / 2);
}
pPos = cPos;
}
}
};
@ -582,17 +587,21 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
template<typename T>
Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
auto dx = reinterpret_cast<T *>(vx);
const T two(2.0f);
const T zero(0.0f);
const T t(threshold);
const T thalf = t / two;
//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal))
auto func = PRAGMA_REDUCE_LONG {
//auto func = PRAGMA_REDUCE_LONG {
Nd4jLong retVal = 0L;
for (auto x = start; x < stop; x += increment) {
PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:retVal)
for (auto x = 0; x < N; x += 16) {
int byte = 0;
int byteId = x / 16 + 4;
for (int f = 0; f < 16; f++) {
Nd4jLong e = x + f;
auto e = x + f;
if (e >= N)
continue;
@ -602,19 +611,19 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
int bitId = e % 16;
if (abs >= (T) threshold) {
if (abs >= t) {
byte |= 1 << (bitId);
retVal++;
if (val < (T) 0.0f) {
if (val < zero) {
byte |= 1 << (bitId + 16);
dx[e] += static_cast<T>(threshold);
dx[e] += t;
} else {
dx[e] -= static_cast<T>(threshold);
dx[e] -= t;
}
} else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) {
} else if (abs >= thalf && val < zero) {
byte |= 1 << (bitId + 16);
dx[e] += static_cast<T>(threshold / 2);
dx[e] += thalf;
retVal++;
}
@ -624,8 +633,9 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
}
return retVal;
};
return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
//};
//return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
}
}

View File

@ -57,7 +57,7 @@ static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) {
multiplyKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
if (err != 0)
sd::cuda_exception::build("multiply failed", err);
throw sd::cuda_exception::build("multiply failed", err);
}
template <typename T>
@ -80,7 +80,7 @@ static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) {
sumKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
if (err != 0)
sd::cuda_exception::build("sum failed", err);
throw sd::cuda_exception::build("sum failed", err);
}
template <typename T>
@ -103,7 +103,7 @@ static void subLauncher(void *vbuffer, uint64_t length, void *vresult) {
subKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
if (err != 0)
sd::cuda_exception::build("sub failed", err);
throw sd::cuda_exception::build("sub failed", err);
}
template <typename T>
@ -126,7 +126,7 @@ static void divLauncher(void *vbuffer, uint64_t length, void *vresult) {
divKernel<T><<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult);
auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream());
if (err != 0)
sd::cuda_exception::build("div failed", err);
throw sd::cuda_exception::build("div failed", err);
}
static void multiplyHost(NDArray &input, NDArray &output) {

View File

@ -42,18 +42,18 @@ endif()
# -fsanitize=address
# -fsanitize=leak
if (APPLE)
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true")
set(CMAKE_CXX_FLAGS " -fPIC -D__APPLE_OS__=true")
elseif(WIN32)
if (SD_CPU)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3")
endif()
if (SD_CPU AND LINUX)
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
set(CMAKE_CXX_FLAGS " -fPIC")
endif()
else()
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2")
set(CMAKE_CXX_FLAGS " -fPIC")
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
else()
@ -82,12 +82,12 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# using GCC
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmax-errors=2")
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
if (CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux" AND NOT(MINGW))
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -Wl,-export-dynamic")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic")
endif()
endif()
IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux")

View File

@ -25,6 +25,7 @@
#include <ops/ops.h>
#include <helpers/GradCheck.h>
#include <array>
#include <helpers/RandomLauncher.h>
using namespace sd;
@ -39,6 +40,195 @@ public:
}
};
TEST_F(DeclarableOpsTests19, test_threshold_encode_1) {
auto x = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
auto exp_encoded = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
sd::ops::encode_threshold op;
auto result = op.evaluate({&x}, {0.5});
auto gradients = result.at(0);
auto encoded = result.at(1);
//encoded->printIndexedBuffer("ENC");
ASSERT_EQ(exp_encoded, *encoded);
ASSERT_EQ(exp_gradients, x);
// FIXME: we need to add a way to declare individual inplace outputs
//ASSERT_EQ(exp_gradients, *gradients);
}
TEST_F(DeclarableOpsTests19, test_threshold_encode_2) {
for (int length = 5; length < 35; length++) {
auto x = NDArrayFactory::create<double>('c', {10000});
auto exp_gradients = NDArrayFactory::create<double>('c', {10000});
for (int e = 0; e < length; e++) {
x.p(e, 2e-3);
exp_gradients.p(e, 1e-3);
}
sd::ops::encode_threshold op;
auto result = op.evaluate({&x}, {1e-3});
auto encoded = result.at(1);
ASSERT_EQ(length + 4, encoded->lengthOf());
ASSERT_EQ(exp_gradients, x);
}
}
TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_1) {
auto x = NDArrayFactory::create<float>('c', {6});
x = 1.0f;
sd::ops::encode_threshold op;
auto result = op.evaluate({&x}, {1.0}, {3});
auto gradients = result.at(0);
auto encoded = result.at(1);
ASSERT_EQ(7, encoded->lengthOf());
ASSERT_EQ(3, x.sumNumber().e<int>(0));
}
TEST_F(DeclarableOpsTests19, test_threshold_encode_boundary_2) {
auto x = NDArrayFactory::create<float>('c', {1000});
x = 1.0f;
sd::ops::encode_threshold op;
auto result = op.evaluate({&x}, {1.0}, {100});
auto gradients = result.at(0);
auto encoded = result.at(1);
ASSERT_EQ(104, encoded->lengthOf());
ASSERT_EQ(900, x.sumNumber().e<int>(0));
}
TEST_F(DeclarableOpsTests19, test_threshold_decode_1) {
auto x = NDArrayFactory::create<double>('c', {3}, {1.0, 2.0, -3.0});
auto y = NDArrayFactory::create<int>('c', {7}, {3, 3, 1056964608, 0, 1, 2, -3});
auto exp_gradients = NDArrayFactory::create<double>('c', {3}, {1.5, 2.5, -3.5});
sd::ops::decode_threshold op;
auto status = op.execute({&x, &y}, {&x});
ASSERT_EQ(Status::OK(), status);
ASSERT_EQ(exp_gradients, x);
}
TEST_F(DeclarableOpsTests19, test_bitmap_encode_1) {
auto initial = NDArrayFactory::create<float>('c', {6}, {0.0f, 0.0f, 1e-3f, -1e-3f, 0.0f, 0.0f});
auto exp_0 = initial.like();
auto exp_1 = initial.dup();
auto exp_c = NDArrayFactory::create<int>(2L);
sd::ops::encode_bitmap enc;
auto enc_result = enc.evaluate({&initial}, {1e-3f});
ASSERT_EQ(Status::OK(), enc_result.status());
//initial.printIndexedBuffer("initial");
ASSERT_EQ(exp_0, initial);
auto encoded = enc_result.at(1);
auto counter = enc_result.at(2);
//encoded->printIndexedBuffer("encoded");
ASSERT_EQ(exp_c, *counter);
sd::ops::decode_bitmap dec;
auto status = dec.execute({&initial, encoded}, {&initial});
ASSERT_EQ(Status::OK(), status);
//initial.printIndexedBuffer();
ASSERT_EQ(exp_1, initial);
}
TEST_F(DeclarableOpsTests19, test_bitmap_encode_decode) {
auto initial = NDArrayFactory::create<float>('c', {256000});
initial = 1.0f;
auto exp = initial.dup();
auto neg = initial.like();
neg = 0.5f;
sd::ops::encode_bitmap enc;
auto enc_result = enc.evaluate({&initial}, {0.5f});
auto encoded = enc_result.at(1);
// checking equality of all encoded bits
for (int e = 5; e < encoded->lengthOf() - 1; e++) {
if (encoded->e<int>(e) != encoded->e<int>(e - 1))
nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
}
ASSERT_NE(exp, initial);
ASSERT_EQ(neg, initial);
sd::ops::decode_bitmap dec;
auto status = dec.execute({&initial, encoded}, {&initial});
ASSERT_EQ(Status::OK(), status);
// checking equality of all dedoded bits
for (int e = 0; e < initial.lengthOf(); e++) {
auto f = initial.e<float>(e);
if (f != 1.0f)
nd4j_printf("initial[%i] = %f\n", e, f);
}
ASSERT_EQ(exp, initial);
}
TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) {
auto initial = NDArrayFactory::create<float>('c', {256000});
initial = 1.0f;
auto exp = initial.dup();
auto neg = initial.like();
neg = 0.5f;
sd::ops::encode_threshold enc;
auto enc_result = enc.evaluate({&initial}, {0.5f});
auto encoded = enc_result.at(1);
ASSERT_EQ(256000 + 4, encoded->lengthOf());
ASSERT_NE(exp, initial);
for (int e = 0; e < initial.lengthOf(); e++) {
auto f = initial.e<float>(e);
if (f != 0.5f) {
nd4j_printf("initial[%i] = %f\n", e, f);
throw std::runtime_error("");
}
}
ASSERT_EQ(neg, initial);
// checking equality of all encoded bits
//for (int e = 5; e < encoded->lengthOf() - 1; e++) {
//if (encoded->e<int>(e) != encoded->e<int>(e - 1) + 1)
//nd4j_printf("Non equal encoded values at E[%i]: %i;\n", e, encoded->e<int>(e));
//}
sd::ops::decode_threshold dec;
auto status = dec.execute({&initial, encoded}, {&initial});
ASSERT_EQ(Status::OK(), status);
// checking equality of all dedoded bits
for (int e = 0; e < initial.lengthOf(); e++) {
auto f = initial.e<float>(e);
if (f != 1.0f)
nd4j_printf("initial[%i] = %f\n", e, f);
}
ASSERT_EQ(exp, initial);
}
TEST_F(DeclarableOpsTests19, test_matmul_ccc) {
auto x = NDArrayFactory::create<float>('c', {10, 10});
auto y = NDArrayFactory::create<float>('c', {10, 10});

View File

@ -73,6 +73,10 @@ public class ImportClassMapping {
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastRSubOp.class,
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastSubOp.class,
org.nd4j.linalg.api.ops.impl.broadcast.BroadcastTo.class,
org.nd4j.linalg.api.ops.compression.EncodeBitmap.class,
org.nd4j.linalg.api.ops.compression.DecodeBitmap.class,
org.nd4j.linalg.api.ops.compression.EncodeThreshold.class,
org.nd4j.linalg.api.ops.compression.DecodeThreshold.class,
org.nd4j.linalg.api.ops.impl.shape.Create.class,
org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastEqualTo.class,
org.nd4j.linalg.api.ops.impl.broadcast.bool.BroadcastGreaterThan.class,

View File

@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.api.ops.compression;
import lombok.NonNull;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.Arrays;
import java.util.List;
/**
* Bitmap decoding op wrapper. Used in gradients sharing.
* @author raver119@gmail.com
*/
public class DecodeBitmap extends DynamicCustomOp {
public DecodeBitmap() {
//
}
public DecodeBitmap(@NonNull INDArray encoded, @NonNull INDArray updates) {
addInputArgument(updates, encoded);
addOutputArgument(updates);
// this op ALWAYS modifies updates array
setInPlace(true);
}
@Override
public String opName() {
return "decode_bitmap";
}
@Override
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
}
}

View File

@ -0,0 +1,54 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.api.ops.compression;
import lombok.NonNull;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import java.util.Arrays;
import java.util.List;
/**
* Sparse threshold decoding op wrapper. Used in gradients sharing.
* @author raver119@gmail.com
*/
public class DecodeThreshold extends DynamicCustomOp {
public DecodeThreshold() {
//
}
public DecodeThreshold(@NonNull INDArray encoded, @NonNull INDArray updates) {
addInputArgument(updates, encoded);
addOutputArgument(updates);
// this op ALWAYS modifies updates array
setInPlace(true);
}
@Override
public String opName() {
return "decode_threshold";
}
@Override
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
}
}

View File

@ -0,0 +1,64 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.api.ops.compression;
import lombok.NonNull;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Bitmap encoding op wrapper. Used in gradients sharing.
* @author raver119@gmail.com
*/
public class EncodeBitmap extends DynamicCustomOp {
protected float threshold = 1e-3f;
public EncodeBitmap() {
//
}
public EncodeBitmap(@NonNull INDArray updates, float threshold) {
this(updates, Nd4j.create(DataType.INT32, updates.length() / 16 + 5), Nd4j.scalar(DataType.INT32, 0), threshold);
}
public EncodeBitmap(@NonNull INDArray updates, @NonNull INDArray encoded, @NonNull INDArray counter, float threshold) {
addInputArgument(updates);
addOutputArgument(updates, encoded, counter);
addTArgument(threshold);
this.threshold = threshold;
// this op ALWAYS modifies updates array
setInPlace(true);
}
@Override
public String opName() {
return "encode_bitmap";
}
@Override
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32, DataType.INT32);
}
}

View File

@ -0,0 +1,63 @@
/*******************************************************************************
* Copyright (c) 2020 Konduit K.K.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.api.ops.compression;
import lombok.NonNull;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.DynamicCustomOp;
import org.nd4j.linalg.factory.Nd4j;
import java.util.Arrays;
import java.util.List;
/**
* Sparse threshold encoding op wrapper. Used in gradients sharing.
* @author raver119@gmail.com
*/
public class EncodeThreshold extends DynamicCustomOp {
protected float threshold = 1e-3f;
protected int boundary = Integer.MAX_VALUE;
public EncodeThreshold() {
//
}
public EncodeThreshold(@NonNull INDArray updates, float threshold) {
this(updates, threshold, Integer.MAX_VALUE);
}
public EncodeThreshold(@NonNull INDArray updates, float threshold, @NonNull Integer boundary) {
addInputArgument(updates);
addTArgument(threshold);
addIArgument(boundary.intValue());
this.threshold = threshold;
this.boundary = boundary;
}
@Override
public String opName() {
return "encode_threshold";
}
@Override
public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
return Arrays.asList(inputArguments.get(0).dataType(), DataType.INT32);
}
}

View File

@ -30,6 +30,10 @@ import org.nd4j.linalg.api.ndarray.INDArrayStatistics;
import org.nd4j.linalg.api.ops.*;
import org.nd4j.linalg.api.ops.aggregates.Aggregate;
import org.nd4j.linalg.api.ops.aggregates.Batch;
import org.nd4j.linalg.api.ops.compression.DecodeBitmap;
import org.nd4j.linalg.api.ops.compression.DecodeThreshold;
import org.nd4j.linalg.api.ops.compression.EncodeBitmap;
import org.nd4j.linalg.api.ops.compression.EncodeThreshold;
import org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate;
import org.nd4j.linalg.api.ops.impl.summarystats.Variance;
import org.nd4j.linalg.api.rng.Random;
@ -685,38 +689,41 @@ public abstract class DefaultOpExecutioner implements OpExecutioner {
@Override
public INDArray thresholdEncode(INDArray input, double threshold) {
throw new UnsupportedOperationException("Not yet implemented");
return thresholdEncode(input, threshold, Integer.MAX_VALUE);
}
@Override
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
throw new UnsupportedOperationException("Not yet implemented");
val result = Nd4j.exec(new EncodeThreshold(input, (float) threshold, boundary))[1];
return result.getInt(0) > 0 ? result : null;
}
@Override
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
throw new UnsupportedOperationException("Not yet implemented");
Nd4j.exec(new DecodeThreshold(encoded, target));
return target;
}
@Override
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
throw new UnsupportedOperationException("Not yet implemented");
val results = Nd4j.exec(new EncodeBitmap(indArray, target, Nd4j.scalar(0), (float) threshold));
// return number of elements taht were compressed
return results[2].getInt(0);
}
@Override
public INDArray bitmapEncode(INDArray indArray, double threshold) {
DataBuffer buffer = Nd4j.getDataBufferFactory().createInt(indArray.length() / 16 + 5);
INDArray ret = Nd4j.createArrayFromShapeBuffer(buffer, indArray.shapeInfoDataBuffer());
bitmapEncode(indArray, ret, threshold);
return ret;
val array = Nd4j.create(DataType.INT32, indArray.length() / 16 + 5);
bitmapEncode(indArray, array, threshold);
return array;
}
@Override
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
throw new UnsupportedOperationException("Not yet implemented");
Nd4j.exec(new DecodeBitmap(encoded, target));
return target;
}

View File

@ -1004,20 +1004,6 @@ public interface NativeOps {
@Cast("Nd4jLong *") LongPointer tadShapeInfo,
@Cast("Nd4jLong *") LongPointer tadOffsets);
long encodeBitmap(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
void decodeBitmap(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
void encodeThresholdP1(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, long N, IntPointer dz, float threshold);
void encodeThresholdP2Int(PointerPointer extraPointers, IntPointer dx, long N, IntPointer dz);
void encodeThresholdP3(PointerPointer extraPointers, Pointer dx, LongPointer xShapeInfo, IntPointer offsets, long N, IntPointer dz);
void decodeThreshold(PointerPointer extraPointers, Pointer dx, long N, Pointer dz, LongPointer zShapeInfo);
void sort(PointerPointer extraPointers,
Pointer x, @Cast("Nd4jLong *") LongPointer xShapeInfo,
Pointer dx, @Cast("Nd4jLong *") LongPointer dxShapeInfo,

View File

@ -1,100 +0,0 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.jcublas.compression;
import org.bytedeco.javacpp.IntPointer;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataTypeEx;
import org.nd4j.linalg.api.concurrency.AffinityManager;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.compression.CompressedDataBuffer;
import org.nd4j.linalg.compression.CompressionDescriptor;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.conditions.Conditions;
/**
* This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
* It encodes data as delta between zero and abs threshold.
*
* Unlike CudaThreshold codec, CudaFlexibleThreshold tries to target specified sparsity/density updates ratio via topN approach
*
* PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
*
* @author raver119@gmail.com
*/
public class CudaFlexibleThreshold extends CudaThreshold {
public CudaFlexibleThreshold() {
super();
this.threshold = 0.1f;
}
/**
* This method returns compression descriptor. It should be unique for any compressor implementation
*
* @return
*/
@Override
public String getDescriptor() {
return "FTHRESHOLD";
}
/**
* This method allows you to configure desired sparsity/density ratio for updates. Pass it as float/double value
*
* Default value: 0.1
* @param vars
*/
@Override
public void configure(Object... vars) {
super.configure(vars);
}
@Override
public DataBuffer compress(DataBuffer buffer) {
INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new long[]{1, buffer.length()}, buffer.dataType()));
double max = temp.amaxNumber().doubleValue();
int cntAbs = temp.scan(Conditions.absGreaterThanOrEqual(max - (max * threshold))).intValue();
long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
int compressedLength = cntAbs + 3;
// first 3 elements contain header
IntPointer pointer = new IntPointer(compressedLength);
pointer.put(0, cntAbs);
pointer.put(1, (int) buffer.length());
pointer.put(2, Float.floatToIntBits(threshold)); // please note, this value will be ovewritten anyway
CompressionDescriptor descriptor = new CompressionDescriptor();
descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
descriptor.setOriginalLength(originalLength);
descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
descriptor.setNumberOfElements(buffer.length());
descriptor.setCompressionAlgorithm(getDescriptor());
descriptor.setCompressionType(getCompressionType());
CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.FTHRESHOLD, pointer, buffer.length());
Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
return cbuff;
}
}

View File

@ -1,271 +0,0 @@
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
package org.nd4j.linalg.jcublas.compression;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.commons.math3.util.FastMath;
import org.bytedeco.javacpp.*;
import org.nd4j.compression.impl.AbstractCompressor;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.buffer.DataTypeEx;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.compression.CompressedDataBuffer;
import org.nd4j.linalg.compression.CompressionType;
import org.nd4j.linalg.exception.ND4JIllegalStateException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.context.CudaContext;
import org.nd4j.nativeblas.NativeOpsHolder;
import java.util.ArrayList;
import java.util.List;
/**
* This compression is very special case, and shouldn't be ever used outside of ParallelWrapper/ParameterServer implementation.
* It encodes data as delta between zero and abs threshold.
*
* PLEASE NOTE: DO NOT USE THIS COMPRESSOR UNLESS YOU'RE 100% SURE WHAT YOU DO!
*
* @author raver119@gmail.com
*/
@Slf4j
public class CudaThreshold extends AbstractCompressor {
@Getter @Setter protected float threshold = 1e-3f;
/**
* This method returns compression descriptor. It should be unique for any compressor implementation
*
* @return
*/
@Override
public String getDescriptor() {
return "THRESHOLD";
}
/**
* This method allows you to configure threshold for delta extraction. Pass it as float/double value
*
* Default value: 1e-3
* @param vars
*/
@Override
public void configure(Object... vars) {
if (vars[0] instanceof Number) {
Number t = (Number) vars[0];
threshold = FastMath.abs(t.floatValue());
log.info("Setting threshold to [{}]", threshold);
} else {
throw new ND4JIllegalStateException("Threshold value should be Number");
}
}
@Override
public INDArray compress(INDArray array) {
//logger.info("Threshold [{}] compression", threshold);
Nd4j.getExecutioner().commit();
//Nd4j.getAffinityManager().ensureLocation(array, AffinityManager.Location.HOST);
DataBuffer buffer = compress(array.data());
if (buffer == null)
return null;
INDArray dup = Nd4j.createArrayFromShapeBuffer(buffer, array.shapeInfoDataBuffer());
dup.markAsCompressed(true);
return dup;
}
@Override
public CompressionType getCompressionType() {
return CompressionType.LOSSLESS;
}
@Override
public DataBuffer decompress(DataBuffer buffer, DataType type) {
if (buffer.dataType() != DataType.INT)
throw new UnsupportedOperationException();
long compressedLength = buffer.getInt(0);
long originalLength = buffer.getInt(1);
DataBuffer result = Nd4j.createBuffer(type, originalLength, false);
val context = AtomicAllocator.getInstance().getDeviceContext();
PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
//log.info("DEC Source length: {}", buffer.length());
//log.info("DEC Source: {}", Arrays.toString(buffer.asInt()));
//NativeOpsHolder.getInstance().getDeviceNativeOps().decodeThresholdFloat(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, (FloatPointer) AtomicAllocator.getInstance().getPointer(result));
AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
//DataBuffer result = Nd4j.getNDArrayFactory().convertDataEx(DataTypeEx.THRESHOLD, buffer, getGlobalTypeEx());
return result;
}
@Override
public DataBuffer compress(DataBuffer buffer) {
int numThreads = 1024;
int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
val context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext();
DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
PointerPointer extras = new PointerPointer(32).put(1, context.getOldStream());
//NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), threshold);
AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
int numMatches = blocksBuffer.getInt(0);
//log.info("Totals: {}", numMatches);
/*
log.info("Number of blocks for compression: {}", numBlocks);
log.info("BlocksCounts: {}", Arrays.toString(blocksBuffer.asInt()));
*/
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(3+numMatches, false) : Nd4j.getDataBufferFactory().createInt(3+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
encodedBuffer.put(0, numMatches);
encodedBuffer.put(1, (int) buffer.length());
encodedBuffer.put(2, Float.floatToIntBits(threshold));
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickHostWrite();
// FIXME: make it parallel via some kernel, because it can be pretty big array here, i.e. for 150m original array, offsets can
/*
int prevSum = 0;
for (int e = 0; e < numBlocks; e++) {
int prevVal = offsetsBuffer.getInt(e + 1);
offsetsBuffer.put(e + 1, prevSum);
prevSum += prevVal;
}
*/
int prefixThreads = 512;
int numElts = numBlocks;
int level = 0;
List<DataBuffer> buffers = new ArrayList<>();
// here we just calculate number of sumBlock arrays
do {
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
if (numBlocks > 1) {
level++;
}
numElts = numPrefixBlocks;
} while (numElts > 1);
long[] pointers = new long[level];
level = 0;
numElts = numBlocks;
// allocating temp buffers for prefux sum
DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
do {
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
if (numPrefixBlocks > 1) {
DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
buffers.add(bf);
pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
}
numElts = numPrefixBlocks;
} while (numElts > 1);
AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
//log.info("Offsets: {}", Arrays.toString(offsetsBuffer.asInt()));
//log.info("Target: {}", Arrays.toString(encodedBuffer.asInt()));
//NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3Float(extras, (FloatPointer) AtomicAllocator.getInstance().getPointer(buffer), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
//log.info("Encoded: {}", Arrays.toString(encodedBuffer.asInt()));
extras.address();
tempX.address();
return encodedBuffer;
/*
INDArray temp = Nd4j.createArrayFromShapeBuffer(buffer, Nd4j.getShapeInfoProvider().createShapeInformation(new int[]{1, (int) buffer.length()}));
MatchCondition condition = new MatchCondition(temp, Conditions.absGreaterThanOrEqual(threshold));
int cntAbs = Nd4j.getExecutioner().exec(condition, Integer.MAX_VALUE).getInt(0);
//log.info("density ratio: {}", String.format("%.2f", cntAbs * 100.0f / buffer.length()));
if (cntAbs == 0)
return null;
long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
int compressedLength = cntAbs + 3;
// first 3 elements contain header
IntPointer pointer = new IntPointer(compressedLength);
pointer.put(0, cntAbs);
pointer.put(1, (int) buffer.length());
pointer.put(2, Float.floatToIntBits(threshold));
CompressionDescriptor descriptor = new CompressionDescriptor();
descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
descriptor.setOriginalLength(originalLength);
descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
descriptor.setNumberOfElements(buffer.length());
descriptor.setCompressionAlgorithm(getDescriptor());
descriptor.setCompressionType(getCompressionType());
CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
Nd4j.getNDArrayFactory().convertDataEx(getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, pointer, buffer.length());
Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
return cbuff;
*/
}
@Override
protected CompressedDataBuffer compressPointer(DataTypeEx srcType, Pointer srcPointer, int length, int elementSize) {
throw new UnsupportedOperationException();
}
}

View File

@ -31,6 +31,7 @@ import org.nd4j.jita.allocator.tad.DeviceTADManager;
import org.nd4j.jita.conf.CudaEnvironment;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.concurrency.AffinityManager;
import org.nd4j.linalg.api.environment.Nd4jEnvironment;
import org.nd4j.linalg.api.memory.pointers.PagedPointer;
import org.nd4j.linalg.api.ndarray.INDArray;
@ -1674,224 +1675,6 @@ public class CudaExecutioner extends DefaultOpExecutioner {
ctx.syncSpecialStream();
}
@Override
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
DataBuffer buffer = input.data();
int numThreads = 1024;
int numBlocks = (int) (buffer.length() / numThreads + (buffer.length() % numThreads == 0 ? 0 : 1));
val context = AtomicAllocator.getInstance().getDeviceContext();
DataBuffer blocksBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks+1, true) : Nd4j.getDataBufferFactory().createInt(numBlocks+1, true, Nd4j.getMemoryManager().getCurrentWorkspace());
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
val extras = extraz.get().put(1, context.getOldStream());
((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP1(extras,
AtomicAllocator.getInstance().getPointer(buffer),
(LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()),
buffer.length(),
(IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer),
(float) threshold);
AtomicAllocator.getInstance().getAllocationPoint(blocksBuffer).tickDeviceWrite();
int numMatches = blocksBuffer.getInt(0);
// special case here, nothing to update
if (numMatches < 2)
return null;
if (boundary != null && numMatches > boundary) {
numMatches = boundary;
blocksBuffer.put(0, numMatches);
}
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+numMatches, false) : Nd4j.getDataBufferFactory().createInt(4+numMatches, false, Nd4j.getMemoryManager().getCurrentWorkspace());
encodedBuffer.put(0, numMatches);
encodedBuffer.put(1, (int) buffer.length());
encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
((BaseCudaDataBuffer) encodedBuffer).getOpaqueDataBuffer().syncToSpecial();
int prefixThreads = 512;
int numElts = numBlocks;
int level = 0;
List<DataBuffer> buffers = new ArrayList<>();
// here we just calculate number of sumBlock arrays
do {
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
if (numBlocks > 1) {
level++;
}
numElts = numPrefixBlocks;
} while (numElts > 1);
long[] pointers = new long[level];
level = 0;
numElts = numBlocks;
// allocating temp buffers for prefux sum
DataBuffer tempX = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createDouble(pointers.length, false) : Nd4j.getDataBufferFactory().createDouble(pointers.length, false, Nd4j.getMemoryManager().getCurrentWorkspace());
do {
int numPrefixBlocks = Math.max(1, (int)Math.ceil((float)numElts / (2.0f * prefixThreads)));
if (numPrefixBlocks > 1) {
DataBuffer bf = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false) : Nd4j.getDataBufferFactory().createInt(numPrefixBlocks, false, Nd4j.getMemoryManager().getCurrentWorkspace());
buffers.add(bf);
pointers[level++] = AtomicAllocator.getInstance().getPointer(bf).address();
}
numElts = numPrefixBlocks;
} while (numElts > 1);
AtomicAllocator.getInstance().memcpyBlocking(tempX, new LongPointer(pointers), pointers.length * 8, 0);
extras.put(2, AtomicAllocator.getInstance().getPointer(tempX));
DataBuffer offsetsBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(numBlocks, true) : Nd4j.getDataBufferFactory().createInt(numBlocks, true, Nd4j.getMemoryManager().getCurrentWorkspace());
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP2Int(extras, (IntPointer) AtomicAllocator.getInstance().getPointer(blocksBuffer), numBlocks, (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer) );
AtomicAllocator.getInstance().getAllocationPoint(offsetsBuffer).tickDeviceWrite();
NativeOpsHolder.getInstance().getDeviceNativeOps().encodeThresholdP3(extras, AtomicAllocator.getInstance().getPointer(buffer), (LongPointer) AtomicAllocator.getInstance().getHostPointer(input.shapeInfoDataBuffer()), (IntPointer) AtomicAllocator.getInstance().getPointer(offsetsBuffer), buffer.length(), (IntPointer) AtomicAllocator.getInstance().getPointer(encodedBuffer));
AtomicAllocator.getInstance().getAllocationPoint(encodedBuffer).tickDeviceWrite();
AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
}
@Override
public INDArray thresholdEncode(INDArray input, double threshold) {
return thresholdEncode(input, threshold, null);
}
@Override
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
DataBuffer buffer = encoded.data();
if (buffer.dataType() != DataType.INT)
throw new UnsupportedOperationException();
long compressedLength = buffer.getInt(0);
long originalLength = buffer.getInt(1);
if (target.length() != originalLength)
throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");
DataBuffer result = target.data();
val context = AtomicAllocator.getInstance().getDeviceContext();
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer extras = extraz.get().put(1, context.getOldStream());
nativeOps.decodeThreshold(extras, AtomicAllocator.getInstance().getPointer(buffer), compressedLength, AtomicAllocator.getInstance().getPointer(result), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));
if (nativeOps.lastErrorCode() != 0)
throw new RuntimeException(nativeOps.lastErrorMessage());
AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
return target;
}
@Override
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
long length = indArray.length();
long tLen = target.data().length();
if (tLen != (length / 16 + 5))
throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));
if (target.data().dataType() != DataType.INT)
throw new ND4JIllegalStateException("Target array should have INT dataType");
DataBuffer buffer = target.data();
buffer.put(0, (int) length);
buffer.put(1, (int) length);
buffer.put(2, Float.floatToIntBits((float) threshold));
// format id
buffer.put(3, ThresholdCompression.BITMAP_ENCODING);
val context = AtomicAllocator.getInstance().getDeviceContext();
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer extras = extraz.get().put(
AtomicAllocator.getInstance().getHostPointer(indArray),
context.getOldStream(),
context.getBufferScalar(),
context.getBufferReduction()
);
val src = AtomicAllocator.getInstance().getPointer(indArray, context);
val dst = (IntPointer) AtomicAllocator.getInstance().getPointer(buffer, context);
((BaseCudaDataBuffer) buffer).getOpaqueDataBuffer().syncToSpecial();
long val = nativeOps.encodeBitmap(extras,
src, (LongPointer) AtomicAllocator.getInstance().getHostPointer(indArray.shapeInfoDataBuffer()),
length,
dst,
(float) threshold);
if (nativeOps.lastErrorCode() != 0)
throw new RuntimeException(nativeOps.lastErrorMessage());
AtomicAllocator.getInstance().getAllocationPoint(buffer).tickDeviceWrite();
return val;
}
@Override
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
val context = AtomicAllocator.getInstance().getDeviceContext();
if (extraz.get() == null)
extraz.set(new PointerPointer(32));
PointerPointer extras = extraz.get().put(
AtomicAllocator.getInstance().getHostPointer(target),
context.getOldStream(),
context.getBufferScalar(),
context.getBufferReduction());
nativeOps.decodeBitmap(extras, AtomicAllocator.getInstance().getPointer(encoded.data(), context), target.length(), AtomicAllocator.getInstance().getPointer(target, context), (LongPointer) AtomicAllocator.getInstance().getHostPointer(target.shapeInfoDataBuffer()));
if (nativeOps.lastErrorCode() != 0)
throw new RuntimeException(nativeOps.lastErrorMessage());
return target;
}
@Override
public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
if(customOps == null) {
@ -1974,6 +1757,11 @@ public class CudaExecutioner extends DefaultOpExecutioner {
val inputArgs = opContext != null ? opContext.getInputArrays() : op.inputArguments();
int cnt= 0;
for (val in: inputArgs) {
// TODO: once we implement Context-based shape function call this method should be removed
val loc = Nd4j.getAffinityManager().getActiveLocation(in);
if (loc != AffinityManager.Location.DEVICE && loc != AffinityManager.Location.EVERYWHERE)
Nd4j.getAffinityManager().ensureLocation(in, AffinityManager.Location.DEVICE);
// NOT A TYPO: shape functions work on host side only
if (!in.isEmpty()) {
inputBuffers.put(cnt, in.data().addressPointer());

View File

@ -2804,30 +2804,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
@Cast("Nd4jLong*") long[] tadShapeInfo,
@Cast("Nd4jLong*") long[] tadOffsets);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -10712,6 +10688,7 @@ public static final int PREALLOC_SIZE = 33554432;
// #include <ops/declarable/headers/boolean.h>
// #include <ops/declarable/headers/broadcastable.h>
// #include <ops/declarable/headers/convo.h>
// #include <ops/declarable/headers/compression.h>
// #include <ops/declarable/headers/list.h>
// #include <ops/declarable/headers/recurrent.h>
// #include <ops/declarable/headers/transforms.h>

View File

@ -13,5 +13,3 @@
#
# SPDX-License-Identifier: Apache-2.0
################################################################################
org.nd4j.linalg.jcublas.compression.CudaThreshold

View File

@ -1376,142 +1376,6 @@ public class NativeOpExecutioner extends DefaultOpExecutioner {
}
}
@Override
public INDArray thresholdEncode(INDArray input, double threshold) {
return thresholdEncode(input, threshold, null);
}
@Override
public INDArray thresholdEncode(INDArray input, double threshold, Integer boundary) {
//val condition = new MatchCondition(input, Conditions.absGreaterThanOrEqual(threshold));
//long t1 = System.currentTimeMillis();
int cntAbs = loop.estimateThreshold(null,
input.data().addressPointer(),
(LongPointer) input.shapeInfoDataBuffer().addressPointer(),
(int) input.length(),
(float) threshold);
//long t2 = System.currentTimeMillis();
if (loop.lastErrorCode() != 0)
throw new RuntimeException(loop.lastErrorMessage());
if (cntAbs < 2)
return null;
if (boundary != null)
cntAbs = Math.min(cntAbs, boundary);
//log.info("S: {}; T: {}", cntAbs, t2 - t1);
DataBuffer buffer = input.data();
long originalLength = buffer.length() * Nd4j.sizeOfDataType(buffer.dataType());
int compressedLength = cntAbs + 4;
// first 3 elements contain header
DataBuffer encodedBuffer = Nd4j.getMemoryManager().getCurrentWorkspace() == null ? Nd4j.getDataBufferFactory().createInt(4+cntAbs, false) : Nd4j.getDataBufferFactory().createInt(4+cntAbs, false, Nd4j.getMemoryManager().getCurrentWorkspace());
encodedBuffer.put(0, cntAbs);
encodedBuffer.put(1, (int) buffer.length());
encodedBuffer.put(2, Float.floatToIntBits((float) threshold));
// format id
encodedBuffer.put(3, ThresholdCompression.FLEXIBLE_ENCODING);
CompressionDescriptor descriptor = new CompressionDescriptor();
descriptor.setCompressedLength(compressedLength * 4); // sizeOf(INT)
descriptor.setOriginalLength(originalLength);
descriptor.setOriginalElementSize(Nd4j.sizeOfDataType(buffer.dataType()));
descriptor.setNumberOfElements(buffer.length());
descriptor.setCompressionAlgorithm("THRESHOLD");
descriptor.setCompressionType(CompressionType.LOSSLESS);
//CompressedDataBuffer cbuff = new CompressedDataBuffer(pointer, descriptor);
Nd4j.getNDArrayFactory().convertDataEx(AbstractCompressor.getBufferTypeEx(buffer), buffer.addressPointer(), DataTypeEx.THRESHOLD, encodedBuffer.addressPointer(), buffer.length());
Nd4j.getAffinityManager().tagLocation(buffer, AffinityManager.Location.HOST);
return Nd4j.createArrayFromShapeBuffer(encodedBuffer, input.shapeInfoDataBuffer());
}
@Override
public INDArray thresholdDecode(INDArray encoded, INDArray target) {
DataBuffer buffer = encoded.data();
if (buffer.dataType() != DataType.INT)
throw new ND4JIllegalStateException("thresholdEncoded array should have dataType of INT");
long compressedLength = buffer.getInt(0);
long originalLength = buffer.getInt(1);
float threshold = buffer.getInt(2);
if (target.length() != originalLength)
throw new ND4JIllegalStateException("originalLength ["+ originalLength+"] stored in encoded array doesn't match target length ["+ target.length()+"]");
DataTypeEx typeDst = AbstractCompressor.getBufferTypeEx(target.data());
loop.convertTypes(null, DataTypeEx.THRESHOLD.ordinal(), buffer.addressPointer(), target.length(), typeDst.ordinal(), target.data().addressPointer());
if (loop.lastErrorCode() != 0)
throw new RuntimeException(loop.lastErrorMessage());
return target;
}
@Override
public long bitmapEncode(INDArray indArray, INDArray target, double threshold) {
long length = indArray.length();
long tLen = target.data().length();
if (tLen != (length / 16 + 5))
throw new ND4JIllegalStateException("Length of target array should be " + (length / 16 + 5));
if (target.data().dataType() != DataType.INT)
throw new ND4JIllegalStateException("Target array should have INT dataType");
DataBuffer buffer = target.data();
buffer.put(0, (int) length);
buffer.put(1, (int) length);
buffer.put(2, Float.floatToIntBits((float) threshold));
// format id
buffer.put(3, ThresholdCompression.BITMAP_ENCODING);
long affected = loop.encodeBitmap(null,
indArray.data().addressPointer(),
(LongPointer) indArray.shapeInfoDataBuffer().addressPointer(),
length,
(IntPointer) buffer.addressPointer(),
(float) threshold);
if (loop.lastErrorCode() != 0)
throw new RuntimeException(loop.lastErrorMessage());
return affected;
}
@Override
public INDArray bitmapDecode(INDArray encoded, INDArray target) {
loop.decodeBitmap(null,
encoded.data().addressPointer(),
target.length(),
target.data().addressPointer(),
(LongPointer) target.shapeInfoDataBuffer().addressPointer()
);
if (loop.lastErrorCode() != 0)
throw new RuntimeException(loop.lastErrorMessage());
return target;
}
@Override
public synchronized Map<String, CustomOpDescriptor> getCustomOperations() {
if (customOps == null) {

View File

@ -2808,30 +2808,6 @@ public native void tear(@Cast("Nd4jPointer*") PointerPointer extraPointers,
@Cast("Nd4jLong*") long[] tadShapeInfo,
@Cast("Nd4jLong*") long[] tadOffsets);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native @Cast("Nd4jLong") long encodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeBitmap(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong") long N, IntPointer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong") long N, IntBuffer dz, float threshold);
public native void encodeThresholdP1(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong") long N, int[] dz, float threshold);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntPointer dx, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, IntBuffer dx, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP2Int(@Cast("Nd4jPointer*") PointerPointer extraPointers, int[] dx, @Cast("Nd4jLong") long N, int[] dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongPointer xShapeInfo, IntPointer offsets, @Cast("Nd4jLong") long N, IntPointer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") LongBuffer xShapeInfo, IntBuffer offsets, @Cast("Nd4jLong") long N, IntBuffer dz);
public native void encodeThresholdP3(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong*") long[] xShapeInfo, int[] offsets, @Cast("Nd4jLong") long N, int[] dz);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongPointer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") LongBuffer zShapeInfo);
public native void decodeThreshold(@Cast("Nd4jPointer*") PointerPointer extraPointers, Pointer dx, @Cast("Nd4jLong") long N, Pointer dz, @Cast("Nd4jLong*") long[] zShapeInfo);
public native void sort(@Cast("Nd4jPointer*") PointerPointer extraPointers,
Pointer x, @Cast("Nd4jLong*") LongPointer xShapeInfo,
Pointer dx, @Cast("Nd4jLong*") LongPointer dxShapeInfo,
@ -12463,6 +12439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
// #include <ops/declarable/headers/boolean.h>
// #include <ops/declarable/headers/broadcastable.h>
// #include <ops/declarable/headers/convo.h>
// #include <ops/declarable/headers/compression.h>
// #include <ops/declarable/headers/list.h>
// #include <ops/declarable/headers/recurrent.h>
// #include <ops/declarable/headers/transforms.h>

View File

@ -8315,6 +8315,85 @@ public class Nd4jTestsC extends BaseNd4jTest {
assertArrayEquals(new long[]{bS, oH, oW, oC}, ret[0].shape());
}
@Test
public void testMatmulMethod_8() {
val x = Nd4j.create(DataType.INT8, 3, 5).assign(1);
val y = Nd4j.create(DataType.INT8, 5, 3).assign(1);
val e = Nd4j.create(DataType.INT8, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_7() {
val x = Nd4j.create(DataType.INT16, 3, 5).assign(1);
val y = Nd4j.create(DataType.INT16, 5, 3).assign(1);
val e = Nd4j.create(DataType.INT16, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_1() {
val x = Nd4j.create(DataType.INT32, 3, 5).assign(1);
val y = Nd4j.create(DataType.INT32, 5, 3).assign(1);
val e = Nd4j.create(DataType.INT32, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_2() {
val x = Nd4j.create(DataType.INT64, 3, 5).assign(1);
val y = Nd4j.create(DataType.INT64, 5, 3).assign(1);
val e = Nd4j.create(DataType.INT64, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_6() {
val x = Nd4j.create(DataType.UINT8, 3, 5).assign(1);
val y = Nd4j.create(DataType.UINT8, 5, 3).assign(1);
val e = Nd4j.create(DataType.UINT8, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_5() {
val x = Nd4j.create(DataType.UINT16, 3, 5).assign(1);
val y = Nd4j.create(DataType.UINT16, 5, 3).assign(1);
val e = Nd4j.create(DataType.UINT16, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_3() {
val x = Nd4j.create(DataType.UINT32, 3, 5).assign(1);
val y = Nd4j.create(DataType.UINT32, 5, 3).assign(1);
val e = Nd4j.create(DataType.UINT32, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Test
public void testMatmulMethod_4() {
val x = Nd4j.create(DataType.UINT64, 3, 5).assign(1);
val y = Nd4j.create(DataType.UINT64, 5, 3).assign(1);
val e = Nd4j.create(DataType.UINT64, 3, 3).assign(5);
val z = x.mmul(y);
assertEquals(e, z);
}
@Override
public char ordering() {

View File

@ -17,6 +17,7 @@
package org.nd4j.linalg.compression;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -44,7 +45,6 @@ import static org.junit.Assert.*;
/**
* @author raver119@gmail.com
*/
@Ignore
@Slf4j
@RunWith(Parameterized.class)
public class CompressionTests extends BaseNd4jTest {
@ -140,40 +140,6 @@ public class CompressionTests extends BaseNd4jTest {
}
@Test
public void testThresholdCompressionZ() {
INDArray initial = Nd4j.create(1, 16384);
for (int i = 0; i < 96; i++)
initial.putScalar(i * 20, 1.0f);
INDArray exp = Nd4j.create(1, 16384);
for (int i = 0; i < 96; i++)
exp.putScalar(i * 20, 0.1f);
INDArray exp_d = Nd4j.create(1, 16384);
for (int i = 0; i < 96; i++)
exp_d.putScalar(i * 20, 0.9f);
NDArrayCompressor compressor = Nd4j.getCompressor().getCompressor("THRESHOLD");
compressor.configure(0.9);
INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 0.9);
assertEquals(exp, initial);
log.info("Compressed length: {}", compressed.data().length());
// log.info("Compressed: {}", Arrays.toString(compressed.data().asInt()));
INDArray decompressed = Nd4j.create(1, initial.length());
Nd4j.getExecutioner().thresholdDecode(compressed, decompressed);
log.info("Decompressed length: {}", decompressed.length());
assertEquals(exp_d, decompressed);
}
@Ignore
@Test
public void testThresholdCompression0() {
@ -296,6 +262,23 @@ public class CompressionTests extends BaseNd4jTest {
@Test
public void testThresholdCompression5() {
INDArray initial = Nd4j.ones(10);
INDArray exp_0 = initial.dup();
Nd4j.getExecutioner().commit();
//Nd4j.getCompressor().getCompressor("THRESHOLD").configure(1e-3);
INDArray compressed = Nd4j.getExecutioner().thresholdEncode(initial, 1.0f, 3);
assertEquals(7, compressed.data().length());
assertNotEquals(exp_0, initial);
assertEquals(7, initial.sumNumber().doubleValue(), 0.01);
}
@Test
public void testThresholdCompression5_1() {
INDArray initial = Nd4j.ones(1000);
INDArray exp_0 = initial.dup();
@ -435,8 +418,8 @@ public class CompressionTests extends BaseNd4jTest {
INDArray exp_0 = Nd4j.create(new float[] {0.0f, -1e-4f, 0.0f, 0.0f, 0.0f, 0.0f});
INDArray exp_1 = Nd4j.create(new float[] {0.0f, -5e-4f, 1e-3f, -1e-3f, 0.0f, 0.0f});
DataBuffer ib = Nd4j.getDataBufferFactory().createInt(5);
INDArray enc = Nd4j.createArrayFromShapeBuffer(ib, initial.shapeInfoDataBuffer());
INDArray enc = Nd4j.create(DataType.INT32, initial.length() / 16 + 5);
long elements = Nd4j.getExecutioner().bitmapEncode(initial, enc, 1e-3);
log.info("Encoded: {}", Arrays.toString(enc.data().asInt()));
@ -471,7 +454,7 @@ public class CompressionTests extends BaseNd4jTest {
@Test
public void testBitmapEncoding5() {
Nd4j.getRandom().setSeed(119);
INDArray initial = Nd4j.rand(new int[]{1, 10000}, -1, -0.5, Nd4j.getRandom());
INDArray initial = Nd4j.rand(new int[]{10000}, -1, -0.5, Nd4j.getRandom());
INDArray exp_0 = initial.dup().addi(1e-1);
INDArray exp_1 = initial.dup();
@ -486,7 +469,7 @@ public class CompressionTests extends BaseNd4jTest {
@Test
public void testBitmapEncoding6() {
Nd4j.getRandom().setSeed(119);
INDArray initial = Nd4j.rand(new int[]{1, 100000}, -1, 1, Nd4j.getRandom());
INDArray initial = Nd4j.rand(new int[]{10000}, -1, 1, Nd4j.getRandom());
INDArray exp_1 = initial.dup();
INDArray enc = Nd4j.getExecutioner().bitmapEncode(initial, 1e-3);
@ -494,6 +477,11 @@ public class CompressionTests extends BaseNd4jTest {
Nd4j.getExecutioner().bitmapDecode(enc, initial);
val f0 = exp_1.toFloatVector();
val f1 = initial.toFloatVector();
assertArrayEquals(f0, f1, 1e-5f);
assertEquals(exp_1, initial);
}