MKLDNN tweaks (#415)

* one simple test Signed-off-by: raver119 <raver119@gmail.com>
* fix Signed-off-by: raver119 <raver119@gmail.com>
* hmmmm... Signed-off-by: raver119 <raver119@gmail.com>
* mkl matmul skip tweaks Signed-off-by: raver119 <raver119@gmail.com>
* minor fix for MemoryTracker
* long shapes in matmul
* - 2 new tests for mkldnn tanh
  - mkldnn isn't used for scalar tanh
2020-04-27 17:37:53 +03:00 · 2020-04-27 17:37:53 +03:00 · c9d1454743
commit c9d1454743
parent dbebaa9c51
5 changed files with 118 additions and 30 deletions
--- a/libnd4j/include/memory/impl/MemoryTracker.cpp
+++ b/libnd4j/include/memory/impl/MemoryTracker.cpp
@ -90,6 +90,9 @@ namespace sd {
                    return result;
                }
            }
+
+            // safe return
+            return std::string("");
        }

 #endif
--- a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp
@ -31,6 +31,20 @@ namespace sd      {
 namespace ops       {
 namespace platforms {

+    /**
+     * Picks the plain dnnl format tag that corresponds to an array's rank and ordering.
+     *
+     *   rank 1 -> ab  (ordering is not consulted)
+     *   rank 2 -> ab  for 'c' ordering, ba  for any other ordering
+     *   rank 3 -> abc for 'c' ordering, cba for any other ordering
+     *
+     * @param array array whose rankOf() and ordering() are inspected
+     * @return format tag to be used when building the array's memory descriptor
+     * @throws std::runtime_error if the rank is anything other than 1, 2 or 3
+     */
+    dnnl::memory::format_tag get_format_tag(const sd::NDArray &array) {
+        switch (array.rankOf()) {
+            case 1:
+                // rank-1 arrays get the 2D tag; NOTE(review): relies on the caller describing vectors with 2D dims
+                return dnnl::memory::format_tag::ab;
+            case 2:
+                return array.ordering() == 'c' ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
+            case 3:
+                return array.ordering() == 'c' ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba;
+            default:
+                // message lists every rank accepted above (rank 1 is handled too)
+                throw std::runtime_error("MKLDNN matmul only supports 1D/2D/3D arrays");
+        }
+    }
+
+
 //////////////////////////////////////////////////////////////////////////
 static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY, float alpha = 1.f, float beta = 0.f) {

@ -69,17 +83,15 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
          NDArray* zR =  xRank <= 3 ? z  : new NDArray(z->reshape(z->ordering(), {z->lengthOf() / (z->sizeAt(-2) * z->sizeAt(-1)), z->sizeAt(-2),  z->sizeAt(-1)})/*, false*/);

    // [M,K] x [K,N] = [M,N]
-    const int M  = (xRank > 1) ? xTR->sizeAt(-2) : 1;
-    const int K  = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf();
-    const int N  = (yRank > 1) ? yTR->sizeAt(-1) : 1;
-    const int bS = (xRank > 2) ? xTR->sizeAt(0)  : 1;                   // [bS, M,K] x [bS, K,N] = [bS, M,N]
+    const int64_t M  = (xRank > 1) ? xTR->sizeAt(-2) : 1;
+    const int64_t K  = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf();
+    const int64_t N  = (yRank > 1) ? yTR->sizeAt(-1) : 1;
+    const int64_t bS = (xRank > 2) ? xTR->sizeAt(0)  : 1;                   // [bS, M,K] x [bS, K,N] = [bS, M,N]

    dnnl::memory::dims xShape = xRank < 3 ? dnnl::memory::dims({M, K}) : dnnl::memory::dims({bS, M, K});
    dnnl::memory::dims yShape = xRank < 3 ? dnnl::memory::dims({K, N}) : dnnl::memory::dims({bS, K, N});
    dnnl::memory::dims zShape = xRank < 3 ? dnnl::memory::dims({M, N}) : dnnl::memory::dims({bS, M, N});

-    dnnl::memory::format_tag format = xRank < 3 ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc;
-
    // x type
    dnnl::memory::data_type xType;
    if(x->dataType() == DataType::FLOAT32)
@ -114,9 +126,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
    // memory descriptors for arrays

    // x
-    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any);
-    dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format);
-    if(xTR->ews() != 1 || xTR->ordering() != 'c') {
+    dnnl::memory::desc x_mkl_md  = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
+    dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
+    if(xTR->ews() != 1) {
        x_user_md.data.format_kind = dnnl_blocked;    // overrides format
        x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0);
        x_user_md.data.format_desc.blocking.strides[1] = xRank == 1 ? xTR->strideAt(0) : xTR->strideAt(1);
@ -125,9 +137,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
    }

    // y
-    dnnl::memory::desc y_mkl_md  = dnnl::memory::desc(yShape, yType, dnnl::memory::format_tag::any);
-    dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, format);
-    if(yTR->ews() != 1 || yTR->ordering() != 'c') {
+    dnnl::memory::desc y_mkl_md  = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
+    dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
+    if(yTR->ews() != 1) {
        y_user_md.data.format_kind = dnnl_blocked;    // overrides format
        y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0);
        y_user_md.data.format_desc.blocking.strides[1] = yRank == 1 ? yTR->strideAt(0) : yTR->strideAt(1);
@ -136,9 +148,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
    }

    // z
-    dnnl::memory::desc z_mkl_md  = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any);
-    dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format);
-    if(zR->ews() != 1 || zR->ordering() != 'c') {
+    dnnl::memory::desc z_mkl_md  = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
+    dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
+    if(zR->ews() != 1) {
        z_user_md.data.format_kind = dnnl_blocked;    // overrides format
        z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0);
        z_user_md.data.format_desc.blocking.strides[1] = zRank == 1 ? zR->strideAt(0) : zR->strideAt(1);
@ -289,14 +301,20 @@ PLATFORM_CHECK(matmul, ENGINE_CPU) {

    auto z = OUTPUT_VARIABLE(0);

-    const DataType xType = x->dataType();
-    const DataType yType = y->dataType();
-    const DataType zType = z->dataType();
+    const auto xType = x->dataType();
+    const auto yType = y->dataType();
+    const auto zType = z->dataType();

-    float alpha = block.numT() > 0 ? T_ARG(0) : 1.0;
-    float beta = block.numT() > 1 ? T_ARG(1) : 0.0;
+    float alpha = block.numT() > 0 ? T_ARG(0) : 1.0f;
+    float beta = block.numT() > 1 ? T_ARG(1) : 0.0f;

-    return !(z->ordering() == 'f' && beta != 0.f) && block.isUseMKLDNN() && x->rankOf() < 3 &&
+    // we're skipping if result order is F or arrays are not contiguous
+    bool skip2D = z->rankOf() == 2 && (z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1);
+
+    // we're skipping 3D cases if they are not C-contiguous
+    bool skip3D = z->rankOf() == 3 && (x->ordering() == 'f' || y->ordering() == 'f' || z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1);
+
+    return !skip2D && !skip3D && block.isUseMKLDNN() && x->rankOf() < 3 &&
          (
            (xType==DataType::FLOAT32  && yType==DataType::FLOAT32  && zType==DataType::FLOAT32)  ||
            (xType==DataType::HALF     && yType==DataType::HALF     && zType==DataType::FLOAT32)  ||
--- a/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp
+++ b/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp
@ -109,7 +109,7 @@ namespace sd {
                const DataType zType = z->dataType();

                const int xRank = x->rankOf();
-                bool bSupportedRanks = !x->isEmpty() && xRank < 7 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32);
+                bool bSupportedRanks = !x->isEmpty() && xRank < 7 && xRank > 0 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32);
                /*
                Source     Destination
                f32 	    f32
@ -214,7 +214,7 @@ namespace sd {
                const int xRank = x->rankOf();
                const int dLdzRank = dLdz->rankOf();

-                bool bSupportedRanks = xRank < 7 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty());
+                bool bSupportedRanks = xRank < 7 && xRank > 0 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty());
                bSupportedRanks &= (xType == DataType::FLOAT32 && dLdzType == DataType::FLOAT32 && dLdxType == DataType::FLOAT32);

                if (bSupportedRanks) {
--- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp
@ -19,15 +19,17 @@
 // @author raver119@gmail.com
 //

+#ifdef HAVE_MKLDNN
+
 #include "testlayers.h"
 #include <initializer_list>
 #include <ops/declarable/PlatformHelper.h>
-
-#ifdef HAVE_MKLDNN
-
 #include <ops/declarable/platform/mkldnn/mkldnnUtils.h>
+#include <array/NDArrayFactory.h>
+#include <ops/declarable/CustomOperations.h>

-#endif
+
+using namespace sd;

 class MklDnnTests : public testing::Test {
 public:
@ -44,7 +46,6 @@ static void printer(std::initializer_list<sd::ops::platforms::PlatformHelper*> h

 TEST_F(MklDnnTests, helpers_includer) {
    // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
-#ifdef HAVE_MKLDNN
    sd::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d;
    sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp;

@ -83,6 +84,26 @@ TEST_F(MklDnnTests, helpers_includer) {


    printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul, &softmax, &softmax_bp, &tanh, &tanh_bp, &xw_plus_b, &xw_plus_b_bp });
-    
-#endif
 }
+
+// Scalar input and output: the tanh op must execute and report an OK status.
+TEST_F(MklDnnTests, test_tanh_1) {
+    auto input  = NDArrayFactory::create<float>(1.0f);
+    auto output = NDArrayFactory::create<float>(0.0f);
+
+    sd::ops::tanh tanhOp;
+    const auto result = tanhOp.execute({&input}, {&output});
+
+    ASSERT_EQ(Status::OK(), result);
+}
+
+// Single-element vector ('c' order, shape {1}): the tanh op must execute and report an OK status.
+TEST_F(MklDnnTests, test_tanh_2) {
+    auto input  = NDArrayFactory::create<float>('c', {1}, {1.0f});
+    auto output = NDArrayFactory::create<float>('c', {1}, {0.0f});
+
+    sd::ops::tanh tanhOp;
+    const auto result = tanhOp.execute({&input}, {&output});
+
+    ASSERT_EQ(Status::OK(), result);
+}
+
+#endif
--- a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp
@ -60,6 +60,52 @@ public:

 #ifdef RELEASE_BUILD

+/**
+ * Benchmark: times matmul of float [512, 2048] x [2048, 512] -> [512, 512] for
+ * 'f'-ordered and 'c'-ordered operands, and prints the median time of each
+ * in nanoseconds. It makes no assertions.
+ */
+TEST_F(PerformanceTests, test_matmul_c_f_1) {
+    constexpr int iterations = 500;
+
+    // Nd4jLong is the sample type used by the other benchmarks in this file;
+    // the previous ino64_t is a POSIX inode type and is not meant for durations
+    std::vector<Nd4jLong> valuesC, valuesF;
+    valuesC.reserve(iterations);
+    valuesF.reserve(iterations);
+
+    for (int e = 0; e < iterations; e++) {
+        auto xc = NDArrayFactory::create<float>('c', {512, 2048});
+        auto yc = NDArrayFactory::create<float>('c', {2048, 512});
+        auto zc = NDArrayFactory::create<float>('c', {512, 512});
+
+        auto xf = NDArrayFactory::create<float>('f', {512, 2048});
+        auto yf = NDArrayFactory::create<float>('f', {2048, 512});
+        auto zf = NDArrayFactory::create<float>('f', {512, 512});
+
+        // result is unused; NOTE(review): presumably here to touch a large buffer before timing - confirm
+        auto warm = xc.like();
+        warm.linspace(1.0);
+
+        //zc.linspace(1.0);
+        //zf.linspace(1.0);
+
+        sd::ops::matmul op;
+
+        // steady_clock is monotonic, so the measured interval cannot be skewed by wall-clock adjustments
+        const auto timeStartF = std::chrono::steady_clock::now();
+
+        op.execute({&xf, &yf}, {&zf});
+
+        const auto timeEndF = std::chrono::steady_clock::now();
+        const auto outerTimeF = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEndF - timeStartF).count();
+
+
+        const auto timeStartC = std::chrono::steady_clock::now();
+
+        op.execute({&xc, &yc}, {&zc});
+
+        const auto timeEndC = std::chrono::steady_clock::now();
+        const auto outerTimeC = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEndC - timeStartC).count();
+
+        valuesF.emplace_back(outerTimeF);
+        valuesC.emplace_back(outerTimeC);
+    }
+
+    std::sort(valuesC.begin(), valuesC.end());
+    std::sort(valuesF.begin(), valuesF.end());
+
+    // explicit casts keep the arguments in sync with the %lld conversions
+    const auto medianC = static_cast<long long>(valuesC[valuesC.size() / 2]);
+    const auto medianF = static_cast<long long>(valuesF[valuesF.size() / 2]);
+
+    nd4j_printf("Median time C: [%lld]; Median time F: [%lld];", medianC, medianF);
+}
+
 TEST_F(PerformanceTests, test_maxpooling2d_1) {
    std::vector<Nd4jLong> valuesX;
    // auto x = NDArrayFactory::create<float>('c', {32, 3, 224, 224});