From 9e3c1b02b1648354a700f3e078162d4f197d33c8 Mon Sep 17 00:00:00 2001 From: raver119 Date: Fri, 14 Feb 2020 16:20:31 +0300 Subject: [PATCH] Perf improvements (#242) * initial commit Signed-off-by: raver119 * meh Signed-off-by: raver119 * better ExpandDims impl Signed-off-by: raver119 * better Squeeze impl Signed-off-by: raver119 * better Softmax impl Signed-off-by: raver119 * one test disabled Signed-off-by: raver119 * more accurate impl Signed-off-by: raver119 * - GraphProfiler now prints full shapeInfo instead of shape - softmax typo fix Signed-off-by: raver119 --- libnd4j/include/graph/impl/Graph.cpp | 15 + .../graph/profiling/impl/NodeProfile.cpp | 4 +- libnd4j/include/helpers/ShapeUtils.h | 2 + libnd4j/include/helpers/impl/ShapeUtils.cpp | 20 + .../declarable/generic/shape/expand_dims.cpp | 11 +- .../ops/declarable/generic/shape/squeeze.cpp | 10 +- .../declarable/helpers/cpu/activations.cpp | 91 +++-- libnd4j/include/templatemath.h | 46 ++- .../layers_tests/DeclarableOpsTests18.cpp | 11 + .../layers_tests/DeclarableOpsTests19.cpp | 10 + .../layers_tests/PlaygroundTests.cpp | 47 +++ .../java/org/nd4j/nativeblas/Nd4jCuda.java | 353 +++++++++++++---- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 374 ++++++++++++++---- 13 files changed, 822 insertions(+), 172 deletions(-) diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index 2acedcea3..4b337dd0d 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -1088,8 +1088,23 @@ namespace nd4j { if (e < node->input()->size() - 1) nd4j_printf(", ", ""); } + + if (node->opType() == OpType_CUSTOM) { + auto ctx = node->protoContext(); + if (ctx->getIArguments()->size() > 0) { + printf("]; iArgs: ["); + + for (int e = 0; e < ctx->getIArguments()->size(); e++) { + printf("%i", ctx->getIArguments()->at(e)); + if (e < ctx->getIArguments()->size() - 1) + nd4j_printf(", ", ""); + } + } + } + nd4j_printf("]; \n", ""); + // printf("\n"); fflush(stdout); } diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index c8b00e788..a6a990eb8 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -117,11 +117,11 @@ namespace nd4j { } void NodeProfile::addInputShape(Nd4jLong *shapeInfo) { - _inputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + _inputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); } void NodeProfile::addOutputShape(Nd4jLong *shapeInfo) { - _outputShapes.emplace_back(ShapeUtils::shapeAsString(shapeInfo)); + _outputShapes.emplace_back(ShapeUtils::shapeInfoAsString(shapeInfo)); } void NodeProfile::merge(NodeProfile *other) { diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index c99a0b0de..ec31f479a 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -97,6 +97,8 @@ namespace nd4j { static std::string shapeAsString(const int rank, const Nd4jLong* shapeInfo); static std::string strideAsString(const NDArray* array); + static std::string shapeInfoAsString(const Nd4jLong* shapeInfo); + static std::vector shapeAsVector(const Nd4jLong* shapeInfo); // evaluate shapeInfo for diagonal array which is made using input arr elements as diagonal diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 9d002e238..235ab3d10 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -666,6 +666,26 @@ Nd4jLong* ShapeUtils::evalTileShapeInfo(const NDArray& arr, const std::vectorreshape(input->ordering(), shape); - output->assign(tmp); - - STORE_RESULT(output); - + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp index 085d7f09c..22e229643 100644 --- a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp @@ -25,7 +25,7 @@ namespace nd4j { namespace ops { - CUSTOM_OP_IMPL(squeeze, 1, 1, true, 0, -2) { + CUSTOM_OP_IMPL(squeeze, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); @@ -73,8 +73,12 @@ namespace nd4j { if (block.isInplace()) { output->reshapei(input->ordering(), shape, false); } else { - auto tmp = input->reshape(input->ordering(), shape); - output->assign(tmp); + if (input->ews() == 1 && output->ews() == 1 && input->ordering() == output->ordering()) { + output->dataBuffer()->copyBufferFrom(*input->dataBuffer().get(), output->lengthOf() * DataTypeUtils::sizeOfElement(output->dataType()), 0, input->bufferOffset()); + } else { + auto tmp = input->reshape(input->ordering(), shape); + output->assign(tmp); + } } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 9a11baf37..56c93b611 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -191,6 +191,70 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr BUILD_SINGLE_SELECTOR(xType, logSoftMaxForVector_, (input.getBuffer(), input.getShapeInfo(), output.buffer(), output.shapeInfo()), FLOAT_TYPES); } + template + void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen); + + template <> + FORCEINLINE void softmax_loop(float *input, float *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto outBuff = output + offsets[i]; + + float max = -DataTypeUtils::max(); + float sum = 0.f; + + #pragma omp simd reduction(max:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(+:sum) + for (uint j = 0; j < tadLen; ++j) { + float temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + + + template + FORCEINLINE void softmax_loop(T *input, T *output, Nd4jLong *offsets, Nd4jLong numOfSubArrs, uint32_t tadLen) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + auto inBuff = input + offsets[i]; + auto outBuff = output + offsets[i]; + + T max = -DataTypeUtils::max(); + T sum(0.f); + + #pragma omp simd reduction(maxT:max) + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); + + #pragma omp simd reduction(sumT:sum) + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + #pragma omp simd + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; + } + }; + + samediff::Threads::parallel_tad(func,0, numOfSubArrs); + } + ////////////////////////////////////////////////////////////////////////// template static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { @@ -213,31 +277,10 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra const uint tadLen = shape::length(tadShapeInfo); if(shape::elementWiseStride(tadShapeInfo) == 1){ + T *inBuff = input.bufferAsT(); + T *outBuff = output.bufferAsT(); - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - - T *inBuff = input.bufferAsT() + tadOffsets[i]; - T *outBuff = output.bufferAsT() + tadOffsets[i]; - - T max = -DataTypeUtils::max(); - T sum = 0; - - for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - }; - - samediff::Threads::parallel_tad(func,0, numOfSubArrs); + softmax_loop(inBuff, outBuff, tadOffsets, numOfSubArrs, tadLen); } else { diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index b412befd8..48021d734 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -127,6 +127,32 @@ namespace nd4j { template math_def inline Z nd4j_erfc(T num); + math_def inline int32_t floatToRawIntBits(float d) { + union { + float f; + int32_t i; + } tmp; + tmp.f = d; + return tmp.i; + } + + math_def inline float intBitsToFloat(int32_t i) { + union { + float f; + int32_t i; + } tmp; + tmp.i = i; + return tmp.f; + } + + math_def inline float mulsignf(float x, float y) { + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); + } + + math_def inline float copysignfk(float x, float y) { + return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); + } + template math_def inline Z nd4j_sigmoid(T val) { return (Z) 1.0f / ((Z) 1.0f + nd4j_exp(-val)); @@ -660,6 +686,11 @@ namespace nd4j { * @param val2 * @return */ + template <> + math_def inline float nd4j_pow(float val, float val2) { + return p_pow(val, val2); + } + template math_def inline Z nd4j_pow(X val, Y val2) { return p_pow(static_cast(val), static_cast(val2)); @@ -767,10 +798,23 @@ namespace nd4j { } + math_def inline float neu_tanh(float val, float sign) { + float e(M_E); + float av = sign * val; + auto p = nd4j::math::nd4j_pow(e, -av * 2.f); + return (1 - p) / (1 + p); + } + + template <> + math_def inline float nd4j_tanh(float val) { + float sign = copysignfk(1.0f, val); + return sign * neu_tanh(val, sign); + } + + template math_def inline Z nd4j_tanh(X val) { return val <= 0 ? neg_tanh(val) : pos_tanh(val); - //return p_tanh(static_cast(val)); } template diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp index 93864af8c..2c7737a31 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp @@ -48,5 +48,16 @@ TEST_F(DeclarableOpsTests18, test_bitcast_1) { auto status = op.execute({&x}, {&z}, {}, {(Nd4jLong) nd4j::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), status); + ASSERT_EQ(e, z); +} + +TEST_F(DeclarableOpsTests18, test_tanh_1) { + auto x = NDArrayFactory::create('c', {8}, {0.23f, -0.23f, 0.35f, -0.35f, 0.64f, -0.64f, 100000.f, -100000.f}); + auto z = x.ulike(); + auto e = NDArrayFactory::create('c', {8}, {0.226028f, -0.226028f, 0.336376f, -0.336376f, 0.564900f, -0.564900f, 1.f, -1.f}); + + nd4j::ops::tanh op; + op.execute({&x}, {&z}); + ASSERT_EQ(e, z); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp index 9883a9d79..b0a547a7d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp @@ -66,4 +66,14 @@ TEST_F(DeclarableOpsTests19, test_conv1d_bp_1) { delete result; +} + +TEST_F(DeclarableOpsTests19, test_squeeze_1) { + auto x = NDArrayFactory::create('c', {3, 4, 1}); + auto e = NDArrayFactory::create('c', {3, 4}); + int axis = 2; + + nd4j::ops::squeeze op; + auto status = op.execute({&x}, {&e}, {axis}); + ASSERT_EQ(Status::OK(), status); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 7db7a791a..83d3ee3b8 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -169,6 +169,53 @@ TEST_F(PlaygroundTests, test_broadcast_1) { } */ +/* +TEST_F(PlaygroundTests, test_broadcast_1) { + int pool = 500; + std::vector aX(pool); + std::vector aY(pool); + std::vector aZ(pool); + + for (int e = 0; e < pool; e++) { + aX[e] = NDArrayFactory::create_('c', {512, 3072}); + aY[e] = NDArrayFactory::create_('c', {768}); + aZ[e] = NDArrayFactory::create_('c', {512, 3072}); + + aX[e]->assign( (e+1) / 119); + aY[e]->assign( (e+3) / 119); + } + + + + std::vector values; + + for (int e = 0; e < 1000; e++) { + auto x = aX[e < pool ? e : e % pool]; + auto y = aY[e < pool ? e : e % pool]; + auto z = aZ[e < pool ? e : e % pool]; + + auto timeStart = std::chrono::system_clock::now(); + + //x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z); + x->applyTransform(transform::Tanh, *z, nullptr); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); + + for (int e = 0; e < pool; e++) { + delete aX[e]; + delete aY[e]; + delete aZ[e]; + } +} + +*/ /* TEST_F(PlaygroundTests, test_s_0) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index e7ddcda11..c8b15c1a2 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -4250,14 +4250,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4267,8 +4273,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6203,6 +6212,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -6235,11 +6245,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6835,9 +6854,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7145,6 +7170,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7163,9 +7197,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. @@ -7903,40 +7937,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7946,7 +7962,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7957,6 +7973,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8186,6 +8220,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8735,69 +8771,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8838,9 +8865,198 @@ public static final int PREALLOC_SIZE = 33554432; ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } ////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////// @@ -9065,6 +9281,9 @@ public static final int PREALLOC_SIZE = 33554432; // returns TRUE if this op allows in-place execution public native @Cast("bool") boolean allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + public native void allowInplace(@Cast("bool") boolean reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) public native int getOpNum(); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 49d088f27..71614c20f 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -4253,14 +4253,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * set new order and shape in case of suitable array length (in-place operation) * order - order to set * shape - shape to set - * + * copyToNewBuff - if true then old buffer will be copied to new buffer if last one will be allocated after reshaping * if there was permute applied before or there are weird strides, then new buffer is allocated for array */ + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @Cast("bool") boolean reshapei(@Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -4270,8 +4276,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** @@ -6206,6 +6215,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -6238,11 +6248,20 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setObjectsSize(@Cast("Nd4jLong") long bytes); public native void setTotalSize(@Cast("Nd4jLong") long bytes); + public native void addInputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addInputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongPointer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") LongBuffer shapeInfo); + public native void addOutputShape(@Cast("Nd4jLong*") long[] shapeInfo); + public native @Cast("Nd4jLong") long getActivationsSize(); public native @Cast("Nd4jLong") long getTemporarySize(); public native @Cast("Nd4jLong") long getObjectsSize(); public native @Cast("Nd4jLong") long getTotalSize(); + public native @Cast("Nd4jLong") long getExecutionTime(); + public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); public native void merge(NodeProfile other); @@ -6838,9 +6857,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") LongBuffer oldShape, int newRank, @Cast("Nd4jLong*") LongBuffer newShape, @Cast("bool") boolean isFOrder); @Namespace("shape") public static native @Cast("bool") boolean canReshape(int oldRank, @Cast("Nd4jLong*") long[] oldShape, int newRank, @Cast("Nd4jLong*") long[] newShape, @Cast("bool") boolean isFOrder); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongPointer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") LongBuffer oldShapeInfo, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); - @Namespace("shape") public static native @Cast("bool") boolean reshapeC(int oldRank, @Cast("const Nd4jLong*") long[] oldShapeInfo, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongPointer newShape, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") LongBuffer newShape, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, byte newOrder, int newRank, @Cast("const Nd4jLong*") long[] newShape, @Cast("Nd4jLong*") long[] newShapeInfo); + /** + * newShapeInfo contains rank, shape and order only, no strides/ews/type + */ + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongPointer oldShapeInfo, @Cast("Nd4jLong*") LongPointer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") LongBuffer oldShapeInfo, @Cast("Nd4jLong*") LongBuffer newShapeInfo); + @Namespace("shape") public static native @Cast("bool") boolean reshapeC(@Cast("const Nd4jLong*") long[] oldShapeInfo, @Cast("Nd4jLong*") long[] newShapeInfo); /** * Get the shape info buffer @@ -7148,6 +7173,15 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongPointer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") LongBuffer shapeInfo); @Namespace("shape") public static native @Cast("bool") boolean isColumnVector(@Cast("Nd4jLong*") long[] shapeInfo); + + /** + * shape - input inShape is shape only, not shapeInfo + * returns number of non-unity dimensions in inShape + */ + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongPointer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") LongBuffer inShape); + @Namespace("shape") public static native int numOfNonUnitDims(int rank, @Cast("const Nd4jLong*") long[] inShape); + /** * Returns whether the * given shape is a vector or not @@ -7166,9 +7200,9 @@ public static final int PREALLOC_SIZE = 33554432; * Returns the shape portion of an information * buffer */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeOf(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeOf(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeOf(@Cast("Nd4jLong*") long[] shapeInfo); /** * Return a copy of a buffer. @@ -7906,40 +7940,22 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong*") LongBuffer offsets); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets, byte order/*='c'*/); @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongPointer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer xOffsets, @Cast("const Nd4jLong*") LongPointer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer yOffsets, @Cast("const Nd4jLong*") LongPointer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") LongBuffer xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer xOffsets, @Cast("const Nd4jLong*") LongBuffer yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer yOffsets, @Cast("const Nd4jLong*") LongBuffer zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer zOffsets); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets, byte order/*='c'*/); - @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] xShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] xOffsets, @Cast("const Nd4jLong*") long[] yShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] yOffsets, @Cast("const Nd4jLong*") long[] zShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] zOffsets); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); + // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); - // deduce element-wise stride - // if array is scalar or unit length vector then ews = 1 - // if array is common vector then ews = stride of non-unity dimension - // if strides are normal set ews = 1, otherwise ews = 0 - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len); - @Namespace("shape") public static native void setEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len); - // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong") long len/*=-1*/); - @Namespace("shape") public static native void setOrderAndEws(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7949,7 +7965,7 @@ public static final int PREALLOC_SIZE = 33554432; * numOfSubArrs - number of sub-arrays, size of subArrOffsets is equal to numOfSubArrs * dimsSize - size of dimsToExclude, if dimsSize = array rank or dimsSize = 0 it means sub-array is whole array, copy of wholeShapeInfo and one zero offset will be returned * dimsToExclude - MUST BE SORTED, dimensions to evaluate sub-array along, i.e. when shape is [2,3,4,5] and dimsToExclude={0,2}, then there will be 8 sub-arrays with shape [3,5] - * subArrShapeInfo - output argument, contains shapeInfo common for all sub-arrays + * subArrShapeInfo - output argument, contains shapeInfo (same for all sub-arrays) * subArrOffsets - output argument, contains successive sub-arrays offsets from original this-buffer * keepUnitiesInShape - if false then eliminate unities from sub-array shapeInfo, for example {1,a,1,b} -> {a,b} */ @@ -7960,6 +7976,24 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets, @Cast("bool") boolean keepUnitiesInShape/*=false*/); @Namespace("shape") public static native void calcSubArrShapeAndOffsets(@Cast("const Nd4jLong*") long[] wholeShapeInfo, @Cast("const Nd4jLong") long numOfSubArrs, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] subArrShapeInfo, @Cast("Nd4jLong*") long[] subArrOffsets); + /** + * for example inShapeInfo is {3, 2,1,4, 4,4,1, 16384,1,99} + * then output shapeNoUnities will contain {2,4, 4,1} - that is only shape and strides, no rank/type/ews/order + * stridesNoUnities will point on strides in shapeNoUnities that is on {4,1} + * returns number of non-unity dimensions in inShapeInfo + * if there is no unities in inShapeInfo, then no copy procedure will be performed and shapeNoUnities/stridesNoUnities will point on corresponding places in inShapeInfo + */ + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongPointer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongPointer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef LongBuffer stridesNoUnities); + @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); + + /** + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} + */ + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongBuffer inShapeInfo, int dimsSize, @Const IntBuffer dimsToExclude, @Cast("Nd4jLong*") LongBuffer outShapeInfo); + @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, int dimsSize, @Const int[] dimsToExclude, @Cast("Nd4jLong*") long[] outShapeInfo); @@ -8189,6 +8223,8 @@ public static final int PREALLOC_SIZE = 33554432; * @param rank the rank of the shape */ +////////////////////////////////////////////////////////////////////// + /** * Returns whether the * given shape is a vector or not @@ -8738,69 +8774,60 @@ public static final int PREALLOC_SIZE = 33554432; // return true; // } -// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, const bool isFOrder, Nd4jLong* newShapeInfo) { +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_H bool reshapeC(const int oldRank, const Nd4jLong* oldShapeInfo, const int newRank, const Nd4jLong* newShape, Nd4jLong* newShapeInfo) { // // PLEASE NOTE !: reshaping not-permuted (ews=1) array in f order (except insertion/elimination of unities) will definitely cause allocation of new buffer for array elements // // also this function takes into account identical shapes automatically, namely in that case oldShapeInfo is completely copied to newShapeInfo -// const int newOrder = isFOrder ? 102 : 99; -// const int oldOrder = oldShapeInfo[2 * oldRank + 3]; - // newShapeInfo[0] = newRank; // memcpy(newShapeInfo + 1, newShape, newRank * sizeof(Nd4jLong)); -// Nd4jLong* newStrides = shape::stride(newShapeInfo); -// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); +// Nd4jLong* newStrides = shape::stride(newShapeInfo); +// const Nd4jLong* oldShape = shape::shapeOf(const_cast(oldShapeInfo)); // const Nd4jLong* oldStrides = shape::stride(const_cast(oldShapeInfo)); -// int oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; - +// Nd4jLong oldStart(0), oldStop(1), newStart(0), newStop(1), newDim, oldDim; // while (newStart < newRank && oldStart < oldRank) { // newDim = newShape[newStart]; // oldDim = oldShape[oldStart]; -// while (newDim != oldDim) +// while (newDim != oldDim && newDim > 0 && oldDim > 0) // if (newDim < oldDim) newDim *= newShape[newStop++]; // else oldDim *= oldShape[oldStop++]; // // ------ Check whether the original axes can be combined ------ // -// for (int i = oldStart; i < oldStop - 1; i++) { - -// if(oldShape[i] == 1) { // ignore strides like {...,1,1,...} -// if(oldOrder == 102) ++oldStart; +// for (int step = 1, i = oldStart; i < oldStop - 1; ++i) { +// if(oldShape[i] == 1) // skip unity-dimension and its stride // continue; -// } - -// if(oldOrder == 102 && oldStrides[i + 1] != oldShape[i] * oldStrides[i]) -// return false; // not contiguous enough -// if(oldOrder == 99 && oldStrides[i] != oldShape[i + 1] * oldStrides[i + 1]) -// return false; // not contiguous enough +// while((i + step) < oldRank && oldShape[i + step] == 1) +// ++step; // skip following unity-dimensions and its strides if such are present +// if((i + step) < oldRank && oldStrides[i] != oldShape[i + step] * oldStrides[i + step]) +// return false; // not contiguous enough // } -// // ------ Calculate new strides for all axes currently worked with ------ // -// if(isFOrder) { -// newStrides[newStart] = oldStrides[oldStart]; -// for (int i = newStart + 1; i < newStop; ++i) -// newStrides[i] = newStrides[i - 1] * newShape[i - 1]; -// } -// else { -// newStrides[newStop - 1] = oldStrides[oldStop - 1]; -// for (int i = newStop - 1; i > newStart; --i) -// newStrides[i - 1] = newStrides[i] * newShape[i]; -// } +// newStrides[newStop - 1] = oldStrides[oldStop - 1]; +// for (int i = newStop - 1; i > newStart; --i) +// newStrides[i - 1] = newStrides[i] * newShape[i]; // newStart = newStop++; // oldStart = oldStop++; // } -// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order -// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews -// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type +// // rest of strides should be unities (if there is remainder in strides space, that is newStart < newRank) +// for (int i = newStart; i < newRank; ++i) +// newStrides[i] = 1; + +// newShapeInfo[2 * newRank + 3] = shape::order(oldShapeInfo); // order +// newShapeInfo[2 * newRank + 2] = shape::elementWiseStride(oldShapeInfo); // ews +// newShapeInfo[2 * newRank + 1] = shape::type(oldShapeInfo); // type // return true; // } +////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// // this function checks the consistence of dimensions with array rank (negative dimensions, too large dimensions, too big number of dimensions) @@ -8841,9 +8868,198 @@ public static final int PREALLOC_SIZE = 33554432; ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); +// const Nd4jLong zEws = shape::elementWiseStride(zShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); +// const char zOrder = shape::order(zShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo, zShapeInfo); + +// if (xEws == 1 && yEws == 1 && zEws == 1 && xOrder == yOrder && xOrder == zOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = zOffsets = nullptr; +// } +// else if(xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, yShapeInfo))) { +// xOffsets = yOffsets = nullptr; +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// else if(xEws == 1 && zEws == 1 && xOrder == zOrder && (xOrder == 'c' || shape::shapeEquals(xShapeInfo, zShapeInfo))) { +// xOffsets = zOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1 && zEws == 1 && yOrder == zOrder && (yOrder == 'c' || shape::shapeEquals(yShapeInfo, zShapeInfo))) { +// yOffsets = zOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, xOrder); +// } +// } +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets, yOrder); +// } +// } +// } +// else if(zEws == 1) { +// zOffsets = nullptr; +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, zOrder); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, zOrder); +// } +// } +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = zOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// yOffsets = xOffsets; +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// zOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// zOffsets = new Nd4jLong[len]; +// shape::calcOffsets(zShapeInfo, zOffsets); +// } +// } +// } +// } + +////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order) { + +// // we assume all array have same length +// const Nd4jLong len = shape::length(xShapeInfo); + +// const Nd4jLong xEws = shape::elementWiseStride(xShapeInfo); +// const Nd4jLong yEws = shape::elementWiseStride(yShapeInfo); + +// const char xOrder = shape::order(xShapeInfo); +// const char yOrder = shape::order(yShapeInfo); + +// const bool shapesSame = shape::shapeEquals(xShapeInfo, yShapeInfo); + +// if (xEws == 1 && yEws == 1 && xOrder == yOrder && (xOrder == 'c' || shapesSame)) { +// xOffsets = yOffsets = nullptr; +// } +// else if(xEws == 1) { +// xOffsets = nullptr; +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets, xOrder); +// } +// else if(yEws == 1) { +// yOffsets = nullptr; +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets, yOrder); +// } +// else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// yOffsets = xOffsets; +// } +// else { +// PRAGMA_OMP_PARALLEL_SECTIONS +// { +// PRAGMA_OMP_SECTION +// { +// xOffsets = new Nd4jLong[len]; +// shape::calcOffsets(xShapeInfo, xOffsets); +// } +// PRAGMA_OMP_SECTION +// { +// yOffsets = new Nd4jLong[len]; +// shape::calcOffsets(yShapeInfo, yOffsets); +// } +// } +// } +// } ////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////// @@ -11382,6 +11598,9 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // returns TRUE if this op allows in-place execution public native @Cast("bool") boolean allowsInplace(); + // this method allows you to enable/disable inplace call for a given op + public native void allowInplace(@Cast("bool") boolean reallyAllow); + // this method returns opNum (applicable for legacy XYZ ops only) public native int getOpNum(); @@ -21093,7 +21312,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public permute() { super((Pointer)null); allocate(); } private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } // #endif // #if NOT_EXCLUDED(OP_reshapeas) @@ -21111,7 +21330,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public reshapeas() { super((Pointer)null); allocate(); } private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } // #endif // #if NOT_EXCLUDED(OP_transpose) @@ -22222,7 +22441,22 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public tensormmul() { super((Pointer)null); allocate(); } private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); - } + } + @Namespace("nd4j::ops") public static class tensormmul_bp extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public tensormmul_bp(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public tensormmul_bp(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public tensormmul_bp position(long position) { + return (tensormmul_bp)super.position(position); + } + + public tensormmul_bp() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } // #endif /**