[WIP] CUDA Java side (#58)
* one crashing test Signed-off-by: raver119 <raver119@gmail.com> * stupid issue fixed Signed-off-by: raver119 <raver119@gmail.com> * one fix Signed-off-by: raver119 <raver119@gmail.com> * dont ensure location for empty arrays Signed-off-by: raver119 <raver119@gmail.com> * few more signatures fixed Signed-off-by: raver119 <raver119@gmail.com> * few tweaks for DataBuffer creation from java primitives Signed-off-by: raver119 <raver119@gmail.com> * get rid of legacy im2col/col2im intercept Signed-off-by: raver119 <raver119@gmail.com> * rsubi scalar array fix Signed-off-by: raver119 <raver119@gmail.com>master
parent
68b82f3856
commit
6ce458e949
|
@ -37,7 +37,6 @@ void DataBuffer::allocateSpecial() {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void DataBuffer::syncToPrimary(const LaunchContext* context, const bool forceSync) {
|
void DataBuffer::syncToPrimary(const LaunchContext* context, const bool forceSync) {
|
||||||
|
|
||||||
if(isPrimaryActual() && !forceSync)
|
if(isPrimaryActual() && !forceSync)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
|
@ -93,8 +93,12 @@ namespace helpers {
|
||||||
if (comp != nullptr)
|
if (comp != nullptr)
|
||||||
comp->syncToHost();
|
comp->syncToHost();
|
||||||
|
|
||||||
|
if (output != nullptr)
|
||||||
output->syncToHost();
|
output->syncToHost();
|
||||||
|
|
||||||
|
if (numResult != nullptr)
|
||||||
numResult->syncToHost();
|
numResult->syncToHost();
|
||||||
|
|
||||||
compScalar.syncToHost();
|
compScalar.syncToHost();
|
||||||
|
|
||||||
BUILD_SINGLE_SELECTOR(arg->dataType(), return processCondition_, (mode, arg, comp, output, numResult, compScalar), FLOAT_TYPES);
|
BUILD_SINGLE_SELECTOR(arg->dataType(), return processCondition_, (mode, arg, comp, output, numResult, compScalar), FLOAT_TYPES);
|
||||||
|
@ -104,8 +108,12 @@ namespace helpers {
|
||||||
if (comp != nullptr)
|
if (comp != nullptr)
|
||||||
comp->syncToDevice();
|
comp->syncToDevice();
|
||||||
|
|
||||||
|
if (output != nullptr)
|
||||||
output->syncToDevice();
|
output->syncToDevice();
|
||||||
|
|
||||||
|
if (numResult != nullptr)
|
||||||
numResult->syncToDevice();
|
numResult->syncToDevice();
|
||||||
|
|
||||||
compScalar.syncToDevice();
|
compScalar.syncToDevice();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (c) 2015-2018 Skymind, Inc.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Apache License, Version 2.0 which is available at
|
||||||
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
* License for the specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// @author raver119@gmail.com
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "testlayers.h"
|
||||||
|
#include <ops/declarable/CustomOperations.h>
|
||||||
|
#include <NDArray.h>
|
||||||
|
#include <ops/ops.h>
|
||||||
|
#include <GradCheck.h>
|
||||||
|
|
||||||
|
|
||||||
|
using namespace nd4j;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Test fixture shared by the CUDA declarable-op tests in this file.
 *
 * It holds no state. Constructing it only writes a single newline to stdout
 * and flushes the stream.
 */
class DeclarableOpsTestsCuda1 : public testing::Test {
public:
    DeclarableOpsTestsCuda1() {
        // emit one line break, then push it out immediately
        putchar('\n');
        fflush(stdout);
    }
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Runs the `choose` op against a scalar (0.0) on an array that is wrapped
 * through its special buffer only (the primary buffer pointer passed to the
 * NDArray constructor is nullptr), and checks the value reported by output #1.
 */
TEST_F(DeclarableOpsTestsCuda1, Test_CHOOSE_SCALAR_LARGE) {
    // The literal holds 150 values; the shape used below is {1,149}.
    double samples[150] = {
        0, 0.51, 0.68, 0.69, 0.86, 0.91, 0.96, 0.97, 0.97, 1.03,
        1.13, 1.16, 1.16, 1.17, 1.19, 1.25, 1.25, 1.26, 1.27, 1.28,
        1.29, 1.29, 1.29, 1.30, 1.31, 1.32, 1.33, 1.33, 1.35, 1.35,
        1.36, 1.37, 1.38, 1.40, 1.41, 1.42, 1.43, 1.44, 1.44, 1.45,
        1.45, 1.47, 1.47, 1.51, 1.51, 1.51, 1.52, 1.53, 1.56, 1.57,
        1.58, 1.59, 1.61, 1.62, 1.63, 1.63, 1.64, 1.64, 1.66, 1.66,
        1.67, 1.67, 1.70, 1.70, 1.70, 1.72, 1.72, 1.72, 1.72, 1.73,
        1.74, 1.74, 1.76, 1.76, 1.77, 1.77, 1.80, 1.80, 1.81, 1.82,
        1.83, 1.83, 1.84, 1.84, 1.84, 1.85, 1.85, 1.85, 1.86, 1.86,
        1.87, 1.88, 1.89, 1.89, 1.89, 1.89, 1.89, 1.91, 1.91, 1.91,
        1.92, 1.94, 1.95, 1.97, 1.98, 1.98, 1.98, 1.98, 1.98, 1.99,
        2, 2, 2.01, 2.01, 2.02, 2.03, 2.03, 2.03, 2.04, 2.04,
        2.05, 2.06, 2.07, 2.08, 2.08, 2.08, 2.08, 2.09, 2.09, 2.10,
        2.10, 2.11, 2.11, 2.11, 2.12, 2.12, 2.13, 2.13, 2.14, 2.14,
        2.14, 2.14, 2.15, 2.15, 2.16, 2.16, 2.16, 2.16, 2.16, 2.17
    };

    // Build a regular array first, then re-wrap it: no primary buffer,
    // only the precursor's special buffer and shape info.
    auto source = NDArrayFactory::create<double>(samples, 'c', {1, 149});
    NDArray x(nullptr, source.specialBuffer(), source.shapeInfo());

    // greater than test
    nd4j::ops::choose chooseOp;
    auto outputs = chooseOp.execute({&x}, {0.0}, {3});
    ASSERT_EQ(Status::OK(), outputs->status());

    // NOTE(review): output #1 is presumably the number of matching elements - confirm
    auto matched = outputs->at(1);
    ASSERT_EQ(148, matched->e<double>(0));

    delete outputs;
}
|
|
@ -4003,7 +4003,7 @@ public abstract class BaseNDArray implements INDArray, Iterable {
|
||||||
public INDArray rsubi(INDArray other, INDArray result) {
|
public INDArray rsubi(INDArray other, INDArray result) {
|
||||||
validateNumericalArray("rsubi", false);
|
validateNumericalArray("rsubi", false);
|
||||||
if (other.isScalar()) {
|
if (other.isScalar()) {
|
||||||
return this.addi(other.getDouble(0), result);
|
return this.rsubi(other.getDouble(0), result);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isScalar()) {
|
if (isScalar()) {
|
||||||
|
|
|
@ -379,7 +379,11 @@ public class CudaAffinityManager extends BasicAffinityManager {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void ensureLocation(INDArray array, Location location) {
|
public void ensureLocation(INDArray array, Location location) {
|
||||||
AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(array);
|
// no location to ensure for empty array
|
||||||
|
if (array.isEmpty())
|
||||||
|
return;
|
||||||
|
|
||||||
|
val point = AtomicAllocator.getInstance().getAllocationPoint(array);
|
||||||
switch (location) {
|
switch (location) {
|
||||||
case HOST: {
|
case HOST: {
|
||||||
AtomicAllocator.getInstance().synchronizeHostData(array);
|
AtomicAllocator.getInstance().synchronizeHostData(array);
|
||||||
|
@ -399,7 +403,10 @@ public class CudaAffinityManager extends BasicAffinityManager {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Location getActiveLocation(INDArray array) {
|
public Location getActiveLocation(INDArray array) {
|
||||||
AllocationPoint point = AtomicAllocator.getInstance().getAllocationPoint(array);
|
if (array.isEmpty())
|
||||||
|
return Location.EVERYWHERE;
|
||||||
|
|
||||||
|
val point = AtomicAllocator.getInstance().getAllocationPoint(array);
|
||||||
|
|
||||||
if (point.isActualOnDeviceSide() && point.isActualOnHostSide()) {
|
if (point.isActualOnDeviceSide() && point.isActualOnHostSide()) {
|
||||||
return Location.EVERYWHERE;
|
return Location.EVERYWHERE;
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.nd4j.jita.flow.impl;
|
||||||
|
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
import lombok.NonNull;
|
||||||
import lombok.val;
|
import lombok.val;
|
||||||
import org.bytedeco.javacpp.DoublePointer;
|
import org.bytedeco.javacpp.DoublePointer;
|
||||||
import org.nd4j.jita.allocator.Allocator;
|
import org.nd4j.jita.allocator.Allocator;
|
||||||
|
@ -95,7 +96,7 @@ public class SynchronousFlowController implements FlowController {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void synchronizeToDevice(AllocationPoint point) {
|
public void synchronizeToDevice(@NonNull AllocationPoint point) {
|
||||||
if (point.isConstant())
|
if (point.isConstant())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
|
@ -567,6 +567,14 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
|
||||||
return allocationPoint.getPointers().getHostPointer().address();
|
return allocationPoint.getPointers().getHostPointer().address();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Pointer pointer() {
|
||||||
|
// FIXME: very bad thing,
|
||||||
|
lazyAllocateHostPointer();
|
||||||
|
|
||||||
|
return super.pointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -672,7 +680,6 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void set(long[] data, long length, long srcOffset, long dstOffset) {
|
public void set(long[] data, long length, long srcOffset, long dstOffset) {
|
||||||
// TODO: make sure getPointer returns proper pointer
|
// TODO: make sure getPointer returns proper pointer
|
||||||
|
|
||||||
|
@ -698,11 +705,14 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case UBYTE: {
|
case UBYTE: {
|
||||||
|
data = ArrayUtil.cutBelowZero(data);
|
||||||
for (int e = 0; e < data.length; e++) {
|
for (int e = 0; e < data.length; e++) {
|
||||||
put(e, data[e]);
|
put(e, data[e]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case UINT16:
|
||||||
|
data = ArrayUtil.cutBelowZero(data);
|
||||||
case SHORT: {
|
case SHORT: {
|
||||||
val pointer = new ShortPointer(ArrayUtil.toShorts(data));
|
val pointer = new ShortPointer(ArrayUtil.toShorts(data));
|
||||||
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
||||||
|
@ -714,6 +724,7 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case UINT32:
|
case UINT32:
|
||||||
|
data = ArrayUtil.cutBelowZero(data);
|
||||||
case INT: {
|
case INT: {
|
||||||
val pointer = new IntPointer(ArrayUtil.toInts(data));
|
val pointer = new IntPointer(ArrayUtil.toInts(data));
|
||||||
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
||||||
|
@ -725,6 +736,7 @@ public abstract class BaseCudaDataBuffer extends BaseDataBuffer implements JCuda
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case UINT64:
|
case UINT64:
|
||||||
|
data = ArrayUtil.cutBelowZero(data);
|
||||||
case LONG: {
|
case LONG: {
|
||||||
val pointer = new LongPointer(data);
|
val pointer = new LongPointer(data);
|
||||||
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
val srcPtr = new CudaPointer(pointer.address() + (dstOffset * elementSize));
|
||||||
|
|
|
@ -187,6 +187,11 @@ public class CudaBfloat16DataBuffer extends BaseCudaDataBuffer {
|
||||||
setData(ArrayUtil.toShorts(data));
|
setData(ArrayUtil.toShorts(data));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setData(long[] data) {
|
||||||
|
setData(ArrayUtil.toShorts(data));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -414,6 +414,8 @@ public class CudaDataBufferFactory implements DataBufferFactory {
|
||||||
return new CudaFloatDataBuffer(length, initialize, workspace);
|
return new CudaFloatDataBuffer(length, initialize, workspace);
|
||||||
case HALF:
|
case HALF:
|
||||||
return new CudaHalfDataBuffer(length, initialize, workspace);
|
return new CudaHalfDataBuffer(length, initialize, workspace);
|
||||||
|
case BFLOAT16:
|
||||||
|
return new CudaBfloat16DataBuffer(length, initialize, workspace);
|
||||||
case BOOL:
|
case BOOL:
|
||||||
return new CudaBoolDataBuffer(length, initialize, workspace);
|
return new CudaBoolDataBuffer(length, initialize, workspace);
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -61,6 +61,7 @@ import org.nd4j.linalg.exception.ND4JIllegalArgumentException;
|
||||||
import org.nd4j.linalg.exception.ND4JIllegalStateException;
|
import org.nd4j.linalg.exception.ND4JIllegalStateException;
|
||||||
import org.nd4j.linalg.factory.Nd4j;
|
import org.nd4j.linalg.factory.Nd4j;
|
||||||
import org.nd4j.linalg.jcublas.buffer.AddressRetriever;
|
import org.nd4j.linalg.jcublas.buffer.AddressRetriever;
|
||||||
|
import org.nd4j.linalg.jcublas.buffer.BaseCudaDataBuffer;
|
||||||
import org.nd4j.linalg.jcublas.buffer.CudaLongDataBuffer;
|
import org.nd4j.linalg.jcublas.buffer.CudaLongDataBuffer;
|
||||||
import org.nd4j.linalg.jcublas.context.CudaContext;
|
import org.nd4j.linalg.jcublas.context.CudaContext;
|
||||||
import org.nd4j.linalg.primitives.AtomicBoolean;
|
import org.nd4j.linalg.primitives.AtomicBoolean;
|
||||||
|
@ -1495,7 +1496,8 @@ public class CudaExecutioner extends DefaultOpExecutioner {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public <T extends Aggregate> void exec(Batch<T> batch) {
|
public <T extends Aggregate> void exec(Batch<T> batch) {
|
||||||
DataBuffer surfaceBuffer = getBuffer(batch);
|
val surfaceBuffer = (BaseCudaDataBuffer) getBuffer(batch);
|
||||||
|
surfaceBuffer.lazyAllocateHostPointer();
|
||||||
|
|
||||||
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
|
CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
|
||||||
|
|
||||||
|
@ -2238,152 +2240,6 @@ public class CudaExecutioner extends DefaultOpExecutioner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (op.opName().equalsIgnoreCase("im2col")) {
|
|
||||||
val xArr = op.inputArguments()[0];
|
|
||||||
val zArr = op.outputArguments()[0];
|
|
||||||
|
|
||||||
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
|
|
||||||
|
|
||||||
if (extraz.get() == null)
|
|
||||||
extraz.set(new PointerPointer(32));
|
|
||||||
|
|
||||||
PointerPointer xShapeHost =
|
|
||||||
extraz.get().put(AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
|
|
||||||
context.getOldStream(), // 1
|
|
||||||
AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
|
|
||||||
context.getBufferAllocation(), // 3
|
|
||||||
context.getBufferReduction(), // 4
|
|
||||||
context.getBufferScalar(), // 5
|
|
||||||
context.getBufferSpecial(),
|
|
||||||
null,
|
|
||||||
AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
|
|
||||||
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
|
|
||||||
|
|
||||||
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
|
|
||||||
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
|
|
||||||
|
|
||||||
val hxShape = AtomicAllocator.getInstance().getHostPointer(xArr.shapeInfoDataBuffer());
|
|
||||||
val hzShape = AtomicAllocator.getInstance().getHostPointer(zArr.shapeInfoDataBuffer());
|
|
||||||
|
|
||||||
double zeroPad = 0.0;
|
|
||||||
if(op.tArgs() != null && op.tArgs().length > 0){
|
|
||||||
zeroPad = op.tArgs()[0];
|
|
||||||
}
|
|
||||||
val extrass = new double[]{op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8], zeroPad};
|
|
||||||
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass, xArr.dataType());
|
|
||||||
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
|
|
||||||
|
|
||||||
nativeOps.execTransformSame(xShapeHost, 9,
|
|
||||||
null, (LongPointer) hxShape, x, (LongPointer) xShape,
|
|
||||||
null, (LongPointer) hzShape, z, (LongPointer) zShape, extraArgs);
|
|
||||||
|
|
||||||
//AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
|
|
||||||
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
|
|
||||||
|
|
||||||
Nd4j.getExecutioner().commit();
|
|
||||||
|
|
||||||
return op.outputArguments();
|
|
||||||
} else if (op.opName().equalsIgnoreCase("col2im")) {
|
|
||||||
val dtype = Nd4j.dataType();
|
|
||||||
|
|
||||||
val xArr = op.inputArguments()[0];
|
|
||||||
val zArr = op.outputArguments()[0];
|
|
||||||
|
|
||||||
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
|
|
||||||
|
|
||||||
if (extraz.get() == null)
|
|
||||||
extraz.set(new PointerPointer(32));
|
|
||||||
|
|
||||||
PointerPointer xShapeHost =
|
|
||||||
extraz.get().put(AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
|
|
||||||
context.getOldStream(), // 1
|
|
||||||
AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
|
|
||||||
context.getBufferAllocation(), // 3
|
|
||||||
context.getBufferReduction(), // 4
|
|
||||||
context.getBufferScalar(), // 5
|
|
||||||
context.getBufferSpecial(),
|
|
||||||
null,
|
|
||||||
AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
|
|
||||||
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
|
|
||||||
|
|
||||||
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
|
|
||||||
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
|
|
||||||
|
|
||||||
val hxShape = AtomicAllocator.getInstance().getHostPointer(xArr.shapeInfoDataBuffer());
|
|
||||||
val hzShape = AtomicAllocator.getInstance().getHostPointer(zArr.shapeInfoDataBuffer());
|
|
||||||
|
|
||||||
val extrass = new double[]{op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7]};
|
|
||||||
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass, xArr.dataType());
|
|
||||||
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
|
|
||||||
|
|
||||||
|
|
||||||
nativeOps.execTransformSame(xShapeHost, 8,
|
|
||||||
null, (LongPointer) hxShape, x, (LongPointer) xShape,
|
|
||||||
null, (LongPointer) hzShape, z, (LongPointer) zShape, extraArgs);
|
|
||||||
|
|
||||||
//AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
|
|
||||||
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
|
|
||||||
|
|
||||||
//Nd4j.getExecutioner().commit();
|
|
||||||
return op.outputArguments();
|
|
||||||
} else if (op.opName().equalsIgnoreCase("pooling2d")) {
|
|
||||||
val dtype = Nd4j.dataType();
|
|
||||||
|
|
||||||
val xArr = op.inputArguments()[0];
|
|
||||||
val zArr = op.outputArguments()[0];
|
|
||||||
|
|
||||||
CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(zArr, xArr);
|
|
||||||
|
|
||||||
if (extraz.get() == null)
|
|
||||||
extraz.set(new PointerPointer(32));
|
|
||||||
|
|
||||||
PointerPointer xShapeHost =
|
|
||||||
extraz.get().put(AddressRetriever.retrieveHostPointer(xArr.shapeInfoDataBuffer()), // 0
|
|
||||||
context.getOldStream(), // 1
|
|
||||||
AtomicAllocator.getInstance().getDeviceIdPointer(), // 2
|
|
||||||
context.getBufferAllocation(), // 3
|
|
||||||
context.getBufferReduction(), // 4
|
|
||||||
context.getBufferScalar(), // 5
|
|
||||||
context.getBufferSpecial(),
|
|
||||||
null,
|
|
||||||
AddressRetriever.retrieveHostPointer(zArr.shapeInfoDataBuffer())
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
val x = AtomicAllocator.getInstance().getPointer(xArr, context);
|
|
||||||
val z = AtomicAllocator.getInstance().getPointer(zArr, context);
|
|
||||||
|
|
||||||
val xShape = AtomicAllocator.getInstance().getPointer(xArr.shapeInfoDataBuffer(), context);
|
|
||||||
val zShape = AtomicAllocator.getInstance().getPointer(zArr.shapeInfoDataBuffer(), context);
|
|
||||||
|
|
||||||
val hxShape = AtomicAllocator.getInstance().getHostPointer(xArr.shapeInfoDataBuffer());
|
|
||||||
val hzShape = AtomicAllocator.getInstance().getHostPointer(zArr.shapeInfoDataBuffer());
|
|
||||||
|
|
||||||
val extrass = new double[]{op.iArgs()[0], op.iArgs()[1], op.iArgs()[2], op.iArgs()[3], op.iArgs()[4], op.iArgs()[5], op.iArgs()[6], op.iArgs()[7], op.iArgs()[8]};
|
|
||||||
val extraArgsBuff = Nd4j.getConstantHandler().getConstantBuffer(extrass, zArr.dataType());
|
|
||||||
val extraArgs = AtomicAllocator.getInstance().getPointer(extraArgsBuff, context);
|
|
||||||
|
|
||||||
|
|
||||||
nativeOps.execTransformFloat(xShapeHost, 23,
|
|
||||||
null, (LongPointer) hxShape, x, (LongPointer) xShape,
|
|
||||||
zArr.data().addressPointer(), (LongPointer) hzShape, z, (LongPointer) zShape,
|
|
||||||
extraArgs);
|
|
||||||
|
|
||||||
// AtomicAllocator.getInstance().getAllocationPoint(zArr).tickDeviceWrite();
|
|
||||||
AtomicAllocator.getInstance().getFlowController().registerAction(context, zArr, xArr);
|
|
||||||
|
|
||||||
return op.outputArguments();
|
|
||||||
}
|
|
||||||
|
|
||||||
Nd4j.getExecutioner().commit();
|
|
||||||
val ctx = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
|
val ctx = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
|
||||||
|
|
||||||
val context = (CudaOpContext) buildContext();
|
val context = (CudaOpContext) buildContext();
|
||||||
|
|
|
@ -15377,7 +15377,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
|
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
|
||||||
* 1: 2d tensor of weights [3K x K]
|
* 1: 2d tensor of weights [3K x K]
|
||||||
* 2: row of biases with twice length [1 × 2K]
|
* 2: row of biases with twice length [1 x 2K]
|
||||||
* 3: 2d tensor of previous cell state [bS x K]
|
* 3: 2d tensor of previous cell state [bS x K]
|
||||||
* 4: optional, 2d tensor of dropout mask [bS x K]
|
* 4: optional, 2d tensor of dropout mask [bS x K]
|
||||||
*
|
*
|
||||||
|
@ -15410,7 +15410,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
|
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
|
||||||
* 1: 2d tensor of weights [2K x 6K]
|
* 1: 2d tensor of weights [2K x 6K]
|
||||||
* 2: row of biases with twice length [1 × 4K]
|
* 2: row of biases with twice length [1 x 4K]
|
||||||
* 3: 2d tensor of previous cell state [bS x 2K]
|
* 3: 2d tensor of previous cell state [bS x 2K]
|
||||||
* 4: optional, 2d tensor of dropout mask [bS x 2K]
|
* 4: optional, 2d tensor of dropout mask [bS x 2K]
|
||||||
*
|
*
|
||||||
|
@ -15444,7 +15444,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
|
* 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features
|
||||||
* 1: 2d tensor of weights [3K x K]
|
* 1: 2d tensor of weights [3K x K]
|
||||||
* 2: row of biases with twice length [1 × 2K]
|
* 2: row of biases with twice length [1 x 2K]
|
||||||
* 3: 2d tensor of previous cell state [bS x K]
|
* 3: 2d tensor of previous cell state [bS x K]
|
||||||
* 4: 3d tensor of cell state [bS x K x N]
|
* 4: 3d tensor of cell state [bS x K x N]
|
||||||
* 5: 2d tensor of cell state gradients [bS x K]
|
* 5: 2d tensor of cell state gradients [bS x K]
|
||||||
|
@ -15482,7 +15482,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
||||||
* Input arrays:
|
* Input arrays:
|
||||||
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
|
* 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features
|
||||||
* 1: 2d tensor of weights [2K x 6K]
|
* 1: 2d tensor of weights [2K x 6K]
|
||||||
* 2: row of biases with twice length [1 × 4K]
|
* 2: row of biases with twice length [1 x 4K]
|
||||||
* 3: 2d tensor of previous cell state [bS x 2K]
|
* 3: 2d tensor of previous cell state [bS x 2K]
|
||||||
* 4: 3d tensor of cell state [N x bS x 2K]
|
* 4: 3d tensor of cell state [N x bS x 2K]
|
||||||
* 5: 2d tensor of cell state gradients [bS x 2K]
|
* 5: 2d tensor of cell state gradients [bS x 2K]
|
||||||
|
@ -15681,7 +15681,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD();
|
||||||
* 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
|
* 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features
|
||||||
* 1: previous cell state [batchSize x inSize], that is at previous time step t-1
|
* 1: previous cell state [batchSize x inSize], that is at previous time step t-1
|
||||||
* 2: weights [inSize x 3*inSize]
|
* 2: weights [inSize x 3*inSize]
|
||||||
* 3: biases [1 × 2*inSize]
|
* 3: biases [1 x 2*inSize]
|
||||||
*
|
*
|
||||||
* Output arrays:
|
* Output arrays:
|
||||||
* 0: current cell output [batchSize x inSize], that is at current time step t
|
* 0: current cell output [batchSize x inSize], that is at current time step t
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.junit.Test;
|
||||||
import org.junit.runner.RunWith;
|
import org.junit.runner.RunWith;
|
||||||
import org.junit.runners.Parameterized;
|
import org.junit.runners.Parameterized;
|
||||||
import org.nd4j.linalg.BaseNd4jTest;
|
import org.nd4j.linalg.BaseNd4jTest;
|
||||||
|
import org.nd4j.linalg.api.buffer.DataType;
|
||||||
import org.nd4j.linalg.api.ndarray.INDArray;
|
import org.nd4j.linalg.api.ndarray.INDArray;
|
||||||
import org.nd4j.linalg.api.ops.aggregates.impl.AggregateCBOW;
|
import org.nd4j.linalg.api.ops.aggregates.impl.AggregateCBOW;
|
||||||
import org.nd4j.linalg.api.ops.aggregates.impl.AggregateSkipGram;
|
import org.nd4j.linalg.api.ops.aggregates.impl.AggregateSkipGram;
|
||||||
|
@ -95,17 +96,17 @@ public class HierarchicSoftmaxTests extends BaseNd4jTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSGGradient1() {
|
public void testSGGradient1() {
|
||||||
INDArray syn0 = Nd4j.create(10, 10).assign(0.01f);
|
INDArray syn0 = Nd4j.create(DataType.DOUBLE, 10, 10).assign(0.01f);
|
||||||
INDArray syn1 = Nd4j.create(10, 10).assign(0.02f);
|
INDArray syn1 = Nd4j.create(DataType.DOUBLE,10, 10).assign(0.02f);
|
||||||
INDArray syn1Neg = Nd4j.ones(10, 10).assign(0.03f);
|
INDArray syn1Neg = Nd4j.create(DataType.DOUBLE,10, 10).assign(0.03f);
|
||||||
INDArray expTable = Nd4j.create(10000).assign(0.5f);
|
INDArray expTable = Nd4j.create(DataType.DOUBLE,10000).assign(0.5f);
|
||||||
|
|
||||||
double lr = 0.001;
|
double lr = 0.001;
|
||||||
|
|
||||||
int idxSyn0 = 0;
|
int idxSyn0 = 0;
|
||||||
|
|
||||||
INDArray expSyn0 = Nd4j.create(10).assign(0.01001f);
|
INDArray expSyn0 = Nd4j.create(DataType.DOUBLE,10).assign(0.01001f);
|
||||||
INDArray expSyn1_1 = Nd4j.create(10).assign(0.020005);
|
INDArray expSyn1_1 = Nd4j.create(DataType.DOUBLE,10).assign(0.020005);
|
||||||
|
|
||||||
INDArray syn0row = syn0.getRow(idxSyn0);
|
INDArray syn0row = syn0.getRow(idxSyn0);
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.junit.Test;
|
||||||
import org.junit.runner.RunWith;
|
import org.junit.runner.RunWith;
|
||||||
import org.junit.runners.Parameterized;
|
import org.junit.runners.Parameterized;
|
||||||
import org.nd4j.linalg.BaseNd4jTest;
|
import org.nd4j.linalg.BaseNd4jTest;
|
||||||
|
import org.nd4j.linalg.api.concurrency.AffinityManager;
|
||||||
import org.nd4j.linalg.api.memory.MemoryWorkspace;
|
import org.nd4j.linalg.api.memory.MemoryWorkspace;
|
||||||
import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration;
|
import org.nd4j.linalg.api.memory.conf.WorkspaceConfiguration;
|
||||||
import org.nd4j.linalg.api.memory.enums.AllocationPolicy;
|
import org.nd4j.linalg.api.memory.enums.AllocationPolicy;
|
||||||
|
@ -288,6 +289,8 @@ public class DataBufferTests extends BaseNd4jTest {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.info("Testing source [{}]; target: [{}]", sourceType, dt);
|
||||||
|
|
||||||
for (boolean useWs : new boolean[]{false, true}) {
|
for (boolean useWs : new boolean[]{false, true}) {
|
||||||
|
|
||||||
try (MemoryWorkspace ws = (useWs ? workspace.notifyScopeEntered() : null)) {
|
try (MemoryWorkspace ws = (useWs ? workspace.notifyScopeEntered() : null)) {
|
||||||
|
@ -334,7 +337,6 @@ public class DataBufferTests extends BaseNd4jTest {
|
||||||
assertFalse(db2.isAttached());
|
assertFalse(db2.isAttached());
|
||||||
|
|
||||||
if(!sourceType.equals("boolean")){
|
if(!sourceType.equals("boolean")){
|
||||||
log.info("Testing source [{}]; target: [{}]", sourceType, dt);
|
|
||||||
testDBOps(db1);
|
testDBOps(db1);
|
||||||
testDBOps(db2);
|
testDBOps(db2);
|
||||||
}
|
}
|
||||||
|
@ -375,6 +377,8 @@ public class DataBufferTests extends BaseNd4jTest {
|
||||||
bb.position(0);
|
bb.position(0);
|
||||||
bb.put(b);
|
bb.put(b);
|
||||||
|
|
||||||
|
Nd4j.getAffinityManager().tagLocation(arr2, AffinityManager.Location.HOST);
|
||||||
|
|
||||||
assertEquals(arr.toString(), arr2.toString());
|
assertEquals(arr.toString(), arr2.toString());
|
||||||
assertEquals(arr, arr2);
|
assertEquals(arr, arr2);
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,7 @@ import static org.junit.Assert.*;
|
||||||
/**
|
/**
|
||||||
* @author raver119@gmail.com
|
* @author raver119@gmail.com
|
||||||
*/
|
*/
|
||||||
|
@Ignore
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@RunWith(Parameterized.class)
|
@RunWith(Parameterized.class)
|
||||||
public class CompressionTests extends BaseNd4jTest {
|
public class CompressionTests extends BaseNd4jTest {
|
||||||
|
|
|
@ -1133,6 +1133,33 @@ public class ArrayUtil {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int[] cutBelowZero(int[] data) {
|
||||||
|
val ret = new int[data.length];
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
ret[i] = data[i] < 0 ? 0 : data[i];
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long[] cutBelowZero(long[] data) {
|
||||||
|
val ret = new long[data.length];
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
ret[i] = data[i] < 0 ? 0 : data[i];
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static short[] cutBelowZero(short[] data) {
|
||||||
|
val ret = new short[data.length];
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
ret[i] = data[i] < 0 ? 0 : data[i];
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte[] cutBelowZero(byte[] data) {
|
||||||
|
val ret = new byte[data.length];
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
ret[i] = data[i] < 0 ? 0 : data[i];
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a copy of this array with the
|
* Return a copy of this array with the
|
||||||
|
|
Loading…
Reference in New Issue