MKLDNN tweaks (#415)

* one simple test

Signed-off-by: raver119 <raver119@gmail.com>

* fix

Signed-off-by: raver119 <raver119@gmail.com>

* hmmmm...

Signed-off-by: raver119 <raver119@gmail.com>

* mkl matmul skip tweaks

Signed-off-by: raver119 <raver119@gmail.com>

* minor fix for MemoryTracker

* long shapes in matmul

* - 2 new tests for mkldnn tanh
- mkldnn isn't used for scalar tanh
master
raver119 2020-04-27 17:37:53 +03:00 committed by GitHub
parent dbebaa9c51
commit c9d1454743
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 118 additions and 30 deletions

View File

@ -90,6 +90,9 @@ namespace sd {
return result;
}
}
// safe return
return std::string("");
}
#endif

View File

@ -31,6 +31,20 @@ namespace sd {
namespace ops {
namespace platforms {
// Maps an NDArray's rank and ordering onto the matching plain oneDNN memory format tag.
// Supports ranks 1..3; throws std::runtime_error for anything else.
dnnl::memory::format_tag get_format_tag(const sd::NDArray &array) {
switch (array.rankOf()) {
// NOTE(review): rank-1 returns the 2D tag `ab` — presumably because the matmul caller
// describes 1D operands with 2D dims ({1,K} / {K,1}); confirm against matmulMKLDNN.
case 1:
return dnnl::memory::format_tag::ab;
// rank-2: row-major ('c') -> ab, column-major ('f') -> ba (dims reversed)
case 2:
return array.ordering() == 'c' ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
// rank-3: row-major -> abc, column-major -> cba (fully reversed strides)
case 3:
return array.ordering() == 'c' ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba;
default:
// NOTE(review): message says 2D/3D, but rank-1 is accepted above as well
throw std::runtime_error("MKLDNN matmul only supports 2D/3D arrays");
}
}
//////////////////////////////////////////////////////////////////////////
static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY, float alpha = 1.f, float beta = 0.f) {
@ -69,17 +83,15 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
NDArray* zR = xRank <= 3 ? z : new NDArray(z->reshape(z->ordering(), {z->lengthOf() / (z->sizeAt(-2) * z->sizeAt(-1)), z->sizeAt(-2), z->sizeAt(-1)})/*, false*/);
// [M,K] x [K,N] = [M,N]
const int M = (xRank > 1) ? xTR->sizeAt(-2) : 1;
const int K = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf();
const int N = (yRank > 1) ? yTR->sizeAt(-1) : 1;
const int bS = (xRank > 2) ? xTR->sizeAt(0) : 1; // [bS, M,K] x [bS, K,N] = [bS, M,N]
const int64_t M = (xRank > 1) ? xTR->sizeAt(-2) : 1;
const int64_t K = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf();
const int64_t N = (yRank > 1) ? yTR->sizeAt(-1) : 1;
const int64_t bS = (xRank > 2) ? xTR->sizeAt(0) : 1; // [bS, M,K] x [bS, K,N] = [bS, M,N]
dnnl::memory::dims xShape = xRank < 3 ? dnnl::memory::dims({M, K}) : dnnl::memory::dims({bS, M, K});
dnnl::memory::dims yShape = xRank < 3 ? dnnl::memory::dims({K, N}) : dnnl::memory::dims({bS, K, N});
dnnl::memory::dims zShape = xRank < 3 ? dnnl::memory::dims({M, N}) : dnnl::memory::dims({bS, M, N});
dnnl::memory::format_tag format = xRank < 3 ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc;
// x type
dnnl::memory::data_type xType;
if(x->dataType() == DataType::FLOAT32)
@ -114,9 +126,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
// memory descriptors for arrays
// x
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any);
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format);
if(xTR->ews() != 1 || xTR->ordering() != 'c') {
dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR));
if(xTR->ews() != 1) {
x_user_md.data.format_kind = dnnl_blocked; // overrides format
x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0);
x_user_md.data.format_desc.blocking.strides[1] = xRank == 1 ? xTR->strideAt(0) : xTR->strideAt(1);
@ -125,9 +137,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
}
// y
dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, dnnl::memory::format_tag::any);
dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, format);
if(yTR->ews() != 1 || yTR->ordering() != 'c') {
dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR));
if(yTR->ews() != 1) {
y_user_md.data.format_kind = dnnl_blocked; // overrides format
y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0);
y_user_md.data.format_desc.blocking.strides[1] = yRank == 1 ? yTR->strideAt(0) : yTR->strideAt(1);
@ -136,9 +148,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b
}
// z
dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any);
dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format);
if(zR->ews() != 1 || zR->ordering() != 'c') {
dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR));
if(zR->ews() != 1) {
z_user_md.data.format_kind = dnnl_blocked; // overrides format
z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0);
z_user_md.data.format_desc.blocking.strides[1] = zRank == 1 ? zR->strideAt(0) : zR->strideAt(1);
@ -289,14 +301,20 @@ PLATFORM_CHECK(matmul, ENGINE_CPU) {
auto z = OUTPUT_VARIABLE(0);
const DataType xType = x->dataType();
const DataType yType = y->dataType();
const DataType zType = z->dataType();
const auto xType = x->dataType();
const auto yType = y->dataType();
const auto zType = z->dataType();
float alpha = block.numT() > 0 ? T_ARG(0) : 1.0;
float beta = block.numT() > 1 ? T_ARG(1) : 0.0;
float alpha = block.numT() > 0 ? T_ARG(0) : 1.0f;
float beta = block.numT() > 1 ? T_ARG(1) : 0.0f;
return !(z->ordering() == 'f' && beta != 0.f) && block.isUseMKLDNN() && x->rankOf() < 3 &&
// we're skipping if result order is F or arrays are not continuous
bool skip2D = z->rankOf() == 2 && (z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1);
// we're skipping 3D cases if they are not C continuous
bool skip3D = z->rankOf() == 3 && (x->ordering() == 'f' || y->ordering() == 'f' || z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1);
return !skip2D && !skip3D && block.isUseMKLDNN() && x->rankOf() < 3 &&
(
(xType==DataType::FLOAT32 && yType==DataType::FLOAT32 && zType==DataType::FLOAT32) ||
(xType==DataType::HALF && yType==DataType::HALF && zType==DataType::FLOAT32) ||

View File

@ -109,7 +109,7 @@ namespace sd {
const DataType zType = z->dataType();
const int xRank = x->rankOf();
bool bSupportedRanks = !x->isEmpty() && xRank < 7 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32);
bool bSupportedRanks = !x->isEmpty() && xRank < 7 && xRank > 0 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32);
/*
Source Destination
f32 f32
@ -214,7 +214,7 @@ namespace sd {
const int xRank = x->rankOf();
const int dLdzRank = dLdz->rankOf();
bool bSupportedRanks = xRank < 7 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty());
bool bSupportedRanks = xRank < 7 && xRank > 0 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty());
bSupportedRanks &= (xType == DataType::FLOAT32 && dLdzType == DataType::FLOAT32 && dLdxType == DataType::FLOAT32);
if (bSupportedRanks) {

View File

@ -19,15 +19,17 @@
// @author raver119@gmail.com
//
#ifdef HAVE_MKLDNN
#include "testlayers.h"
#include <initializer_list>
#include <ops/declarable/PlatformHelper.h>
#ifdef HAVE_MKLDNN
#include <ops/declarable/platform/mkldnn/mkldnnUtils.h>
#include <array/NDArrayFactory.h>
#include <ops/declarable/CustomOperations.h>
#endif
using namespace sd;
class MklDnnTests : public testing::Test {
public:
@ -44,7 +46,6 @@ static void printer(std::initializer_list<sd::ops::platforms::PlatformHelper*> h
TEST_F(MklDnnTests, helpers_includer) {
// we need this block, to make sure all helpers are still available within binary, and not optimized out by linker
#ifdef HAVE_MKLDNN
sd::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d;
sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp;
@ -83,6 +84,26 @@ TEST_F(MklDnnTests, helpers_includer) {
printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul, &softmax, &softmax_bp, &tanh, &tanh_bp, &xw_plus_b, &xw_plus_b_bp });
#endif
}
// Regression test: tanh on a rank-0 (scalar) NDArray must execute successfully.
// Per this commit, the MKL-DNN tanh helper is skipped for scalars (rank > 0 required),
// so this exercises the fallback path.
TEST_F(MklDnnTests, test_tanh_1) {
auto x = NDArrayFactory::create<float>(1.0f);   // scalar input
auto z = NDArrayFactory::create<float>(0.0f);   // scalar output buffer
sd::ops::tanh op;
auto status = op.execute({&x}, {&z});
ASSERT_EQ(Status::OK(), status);
}
// Regression test: tanh on a rank-1, single-element array must execute successfully.
// Complements test_tanh_1 (rank-0) — rank-1 is the smallest rank the MKL-DNN tanh
// helper may still accept, so both boundary ranks are covered.
TEST_F(MklDnnTests, test_tanh_2) {
auto x = NDArrayFactory::create<float>('c', {1}, {1.0f});   // rank-1, one element
auto z = NDArrayFactory::create<float>('c', {1}, {0.0f});   // matching output buffer
sd::ops::tanh op;
auto status = op.execute({&x}, {&z});
ASSERT_EQ(Status::OK(), status);
}
#endif

View File

@ -60,6 +60,52 @@ public:
#ifdef RELEASE_BUILD
// Benchmark (RELEASE_BUILD only): compares median wall time of matmul on
// C-ordered vs F-ordered 512x2048 * 2048x512 operands over `iterations` runs.
// Prints the medians; makes no assertions.
TEST_F(PerformanceTests, test_matmul_c_f_1) {
    int iterations = 500;
    // FIX: `ino64_t` was a typo — that is glibc's inode-number type, not a
    // portable integer. Use the fixed-width int64_t for nanosecond timings.
    std::vector<int64_t> valuesC, valuesF;

    for (int e = 0; e < iterations; e++) {
        auto xc = NDArrayFactory::create<float>('c', {512, 2048});
        auto yc = NDArrayFactory::create<float>('c', {2048, 512});
        auto zc = NDArrayFactory::create<float>('c', {512, 512});

        auto xf = NDArrayFactory::create<float>('f', {512, 2048});
        auto yf = NDArrayFactory::create<float>('f', {2048, 512});
        auto zf = NDArrayFactory::create<float>('f', {512, 512});

        // touch some memory before timing so first-run allocation noise is reduced
        auto warm = xc.like();
        warm.linspace(1.0);

        //zc.linspace(1.0);
        //zf.linspace(1.0);

        sd::ops::matmul op;

        // F-ordered operands
        auto timeStartF = std::chrono::system_clock::now();
        op.execute({&xf, &yf}, {&zf});
        auto timeEndF = std::chrono::system_clock::now();
        auto outerTimeF = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEndF - timeStartF).count();

        // C-ordered operands
        auto timeStartC = std::chrono::system_clock::now();
        op.execute({&xc, &yc}, {&zc});
        auto timeEndC = std::chrono::system_clock::now();
        auto outerTimeC = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEndC - timeStartC).count();

        valuesF.emplace_back(outerTimeF);
        valuesC.emplace_back(outerTimeC);
    }

    std::sort(valuesC.begin(), valuesC.end());
    std::sort(valuesF.begin(), valuesF.end());

    // FIX: cast to long long — "%lld" with int64_t is undefined behavior on
    // platforms where int64_t is long (e.g. LP64 Linux).
    nd4j_printf("Median time C: [%lld]; Median time F: [%lld];",
                (long long) valuesC[valuesC.size() / 2],
                (long long) valuesF[valuesF.size() / 2]);
}
TEST_F(PerformanceTests, test_maxpooling2d_1) {
std::vector<Nd4jLong> valuesX;
// auto x = NDArrayFactory::create<float>('c', {32, 3, 224, 224});