cavis/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp

1684 lines
59 KiB
C++
Raw Normal View History

2019-06-06 14:21:15 +02:00
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
* Copyright (c) 2019 Konduit K.K.
2019-06-06 14:21:15 +02:00
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// Created by raver119 on 20.11.17.
//
#include "testlayers.h"
#include <graph/Graph.h>
2019-06-06 14:21:15 +02:00
#include <chrono>
#include <graph/Node.h>
2019-06-06 14:21:15 +02:00
#include <ops/declarable/CustomOperations.h>
#include <graph/profiling/GraphProfilingHelper.h>
#include <loops/type_conversions.h>
2019-06-06 14:21:15 +02:00
#include <helpers/threshold.h>
#include <helpers/MmulHelper.h>
#include <ops/ops.h>
#include <helpers/OmpLaunchHelper.h>
#include <helpers/GradCheck.h>
2019-06-06 14:21:15 +02:00
#include <ops/declarable/helpers/im2col.h>
#include <helpers/Loops.h>
#include <helpers/RandomLauncher.h>
#include <ops/declarable/helpers/convolutions.h>
2019-06-06 14:21:15 +02:00
#include <helpers/BenchmarkHelper.h>
#include <ops/declarable/helpers/scatter.h>
#include <helpers/ConstantShapeHelper.h>
#include <helpers/ConstantTadHelper.h>
#include <array>
#include <performance/benchmarking/FullBenchmarkSuit.h>
#include <performance/benchmarking/LightBenchmarkSuit.h>
#include <random>
#include <ops/declarable/helpers/legacy_helpers.h>
#include <ops/declarable/helpers/addBias.h>
#include <ops/declarable/helpers/axis.h>
#include <ops/declarable/helpers/reductions.h>
#include <helpers/LoopsCoordsHelper.h>
2021-02-01 06:31:20 +01:00
#include <build_info.h>
using namespace sd;
using namespace sd::graph;
2019-06-06 14:21:15 +02:00
class PlaygroundTests : public testing::Test {
public:
int numIterations = 3;
int poolSize = 10;
PlaygroundTests() {
}
};
// Prints the values returned by ::optimalLevel() and ::binaryLevel().
TEST_F(PlaygroundTests, test_avx) {
  const auto optimal = ::optimalLevel();
  const auto binary = ::binaryLevel();
  nd4j_printf("Optimal level: %i; Binary level: %i;\n", optimal, binary);
}
2021-02-01 06:31:20 +01:00
// Prints the string returned by buildInfo(), followed by a newline.
TEST_F(PlaygroundTests, buildver) {
  const auto info = buildInfo();
  nd4j_printf("%s\n", info);
}
/**
 * Micro-benchmark for the biasadd op.
 *
 * Executes biasadd 100 times on a float [512, 3072] array with a [3072] bias,
 * writing the result back into the input array, and prints the median duration
 * of a single call in microseconds.
 */
TEST_F(PlaygroundTests, test_biasAdd_1) {
  // steady_clock is monotonic; system_clock may be adjusted while a sample is
  // being taken (NTP, manual changes), which would corrupt the measurement.
  using Clock = std::chrono::steady_clock;
  constexpr int kRuns = 100;

  auto x = NDArrayFactory::create<float>('c', {512, 3072});
  auto y = NDArrayFactory::create<float>('c', {3072});

  std::vector<Nd4jLong> values;
  values.reserve(kRuns);  // keep reallocations out of the benchmark loop

  sd::ops::biasadd op;

  for (int e = 0; e < kRuns; e++) {
    const auto timeStart = Clock::now();

    // x is both the input and the output array
    op.execute({&x, &y}, {&x});

    const auto timeEnd = Clock::now();
    const auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
    values.emplace_back(outerTime);
  }

  // the median is the middle element of the sorted samples
  std::sort(values.begin(), values.end());

  // explicit cast so the argument always matches the %lld conversion
  nd4j_printf("Time: %lld us;\n", static_cast<long long>(values[values.size() / 2]));
}
/**
 * Profiles the full BERT graph stored on the author's machine.
 *
 * The body is compiled only when _RELEASE is defined, and the test returns
 * right away when getFileSize() reports a negative size for the model file.
 */
TEST_F(PlaygroundTests, test_bert_full_1) {
#ifdef _RELEASE
  const char* modelPath = "/home/raver119/Downloads/BertFull/model.fb";

  // this test will run ONLY if this model exists
  if (sd::graph::getFileSize(modelPath) < 0)
    return;

  auto graph = GraphExecutioner::importFromFlatBuffers(modelPath);

  auto input0 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/BertFull/in0_IteratorGetNext.npy");
  auto input1 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/BertFull/in1_IteratorGetNext_1.npy");
  auto input2 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/BertFull/in2_IteratorGetNext_4.npy");
  // only referenced by the disabled validation block below
  auto expected = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/BertFull/out_loss-Softmax.npy");

  //graph->printOut();

  graph->tagInplaceNodes();

  // register the three inputs under variable ids 658, 659 and 660 (index 0)
  auto variableSpace = graph->getVariableSpace();
  variableSpace->putVariable(658, 0, input0);
  variableSpace->putVariable(659, 0, input1);
  variableSpace->putVariable(660, 0, input2);

  /*
  // validating graph now
  auto status = GraphExecutioner::execute(graph);
  ASSERT_EQ(Status::OK(), status);
  ASSERT_TRUE(graph->getVariableSpace()->hasVariable(1620));
  auto array = graph->getVariableSpace()->getVariable(1620)->getNDArray();
  ASSERT_EQ(expected, *array);
  */

  // profiling is switched on only for the duration of the profile() call
  sd::Environment::getInstance().setProfiling(true);
  auto report = GraphProfilingHelper::profile(graph, 1);
  report->printOut();
  sd::Environment::getInstance().setProfiling(false);
  delete report;

  /*
  std::vector<Nd4jLong> values;
  for (int e = 0; e < 1; e++) {
  auto timeStart = std::chrono::system_clock::now();
  GraphExecutioner::execute(graph);
  auto timeEnd = std::chrono::system_clock::now();
  auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
  values.emplace_back(outerTime);
  }
  std::sort(values.begin(), values.end());
  nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
  */

  delete graph;
#endif
}
/**
 * Profiles the minimal BERT graph stored on the author's machine.
 *
 * The body is compiled only when _RELEASE is defined, and the test returns
 * right away when getFileSize() reports a negative size for the model file.
 */
TEST_F(PlaygroundTests, test_bert_1) {
#ifdef _RELEASE
  const char* modelPath = "/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb";

  // this test will run ONLY if this model exists
  if (sd::graph::getFileSize(modelPath) < 0)
    return;

  auto graph = GraphExecutioner::importFromFlatBuffers(modelPath);

  auto input0 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext.numpy");
  auto input1 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_1.numpy");
  auto input2 = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_input_IteratorGetNext_4.numpy");
  // only referenced by the disabled validation block below
  auto expected = NDArrayFactory::fromNpyFile("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model_output.numpy");

  //graph->printOut();

  graph->tagInplaceNodes();

  // register the three inputs under variable ids 85, 86 and 87 (index 0)
  auto variableSpace = graph->getVariableSpace();
  variableSpace->putVariable(85, 0, input0);
  variableSpace->putVariable(86, 0, input1);
  variableSpace->putVariable(87, 0, input2);

  /*
  // validating graph now
  auto status = GraphExecutioner::execute(graph);
  ASSERT_EQ(Status::OK(), status);
  ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198));
  auto array = graph->getVariableSpace()->getVariable(198)->getNDArray();
  ASSERT_EQ(expected, *array);
  */

  // profiling is switched on only for the duration of the profile() call
  sd::Environment::getInstance().setProfiling(true);
  auto report = GraphProfilingHelper::profile(graph, 1);
  report->printOut();
  sd::Environment::getInstance().setProfiling(false);
  delete report;

  /*
  std::vector<Nd4jLong> values;
  for (int e = 0; e < 1; e++) {
  auto timeStart = std::chrono::system_clock::now();
  GraphExecutioner::execute(graph);
  auto timeEnd = std::chrono::system_clock::now();
  auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
  values.emplace_back(outerTime);
  }
  std::sort(values.begin(), values.end());
  nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
  */

  delete graph;
#endif
}
/**
 * Profiles the "bert_like_ops" graph stored on the author's machine.
 *
 * The body is compiled only when _RELEASE is defined, and the test returns
 * right away when getFileSize() reports a negative size for the model file.
 * No input variables are set before profiling.
 */
TEST_F(PlaygroundTests, test_bert_2) {
#ifdef _RELEASE
  const char* modelPath = "/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb";

  // this test will run ONLY if this model exists
  if (sd::graph::getFileSize(modelPath) < 0)
    return;

  auto graph = GraphExecutioner::importFromFlatBuffers(modelPath);

  //graph->printOut();

  graph->tagInplaceNodes();

  /*
  // validating graph now
  auto status = GraphExecutioner::execute(graph);
  ASSERT_EQ(Status::OK(), status);
  ASSERT_TRUE(graph->getVariableSpace()->hasVariable(198));
  auto array = graph->getVariableSpace()->getVariable(198)->getNDArray();
  ASSERT_EQ(z, *array);
  */

  // profiling is switched on only for the duration of the profile() call
  sd::Environment::getInstance().setProfiling(true);
  auto report = GraphProfilingHelper::profile(graph, 1);
  report->printOut();
  sd::Environment::getInstance().setProfiling(false);
  delete report;

  /*
  std::vector<Nd4jLong> values;
  for (int e = 0; e < 1; e++) {
  auto timeStart = std::chrono::system_clock::now();
  GraphExecutioner::execute(graph);
  auto timeEnd = std::chrono::system_clock::now();
  auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
  values.emplace_back(outerTime);
  }
  std::sort(values.begin(), values.end());
  nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
  */

  delete graph;
#endif
}
// Executes squaredsubtract once on a [4, 128, 768] array and a [4, 128, 1] array,
// writing into a separate array created with ulike() from the first operand.
// Nothing is asserted about the result.
TEST_F(PlaygroundTests, test_one_off_ops_1) {
  auto lhs = NDArrayFactory::create<float>('c', {4, 128, 768});
  auto rhs = NDArrayFactory::create<float>('c', {4, 128, 1});
  auto out = lhs.ulike();

  sd::ops::squaredsubtract op;
  op.execute({&lhs, &rhs}, {&out});
}
#if defined(INDEX_REDUCTIONS_BENCH_TESTS)
// temporarily, testing against the original one
/**
 * Reference argmax built on NDArray::applyIndexReduce with IndexMax.
 *
 * @param input  array to reduce
 * @param axis   reduction axes; handed to adjustAxis() by non-const reference
 *               before the reduction (presumably normalised in place - confirm)
 * @param output array that receives the result of the index reduction
 */
void original_argmax(const NDArray& input, std::vector<int>& axis, NDArray& output) {
  const auto rank = input.rankOf();
  sd::ops::helpers::adjustAxis(rank, axis);

  input.applyIndexReduce(sd::indexreduce::IndexMax, output, axis);
}
/**
 * Overwrites the elements of arr with uniformly distributed random values
 * drawn from [-10.0, 22.9).
 *
 * The generator is seeded from std::random_device, so the content differs
 * between runs. Elements are visited by stepping a coordinate vector with
 * sd::inc_coords; the <false> variant is used when the array is not 'c' ordered.
 *
 * @tparam T floating point element type used to view the buffer
 * @param arr array to fill
 */
template<typename T>
void fill_random(sd::NDArray& arr) {
  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  //for floats
  std::uniform_real_distribution<T> uniform(static_cast<T>(-10.0), static_cast<T>(22.9));

  T* data = arr.bufferAsT<T>();
  Nd4jLong* shapeInfo = arr.getShapeInfo();
  Nd4jLong* strides = arr.stridesOf();

  // shapeInfo[0] holds the rank, the dimension sizes follow immediately after it
  Nd4jLong rank = shapeInfo[0];
  Nd4jLong* bases = shapeInfo + 1;

  // total number of elements = product of all dimension sizes
  size_t total = 1;
  for (size_t d = 0; d < rank; d++)
    total *= bases[d];

  Nd4jLong coords[MAX_RANK] = {};
  size_t offset = 0;
  const bool isCOrder = arr.ordering() == 'c';

  if (isCOrder) {
    for (size_t n = 0; n < total; n++) {
      data[offset] = uniform(engine);
      offset = sd::inc_coords(bases, strides, coords, offset, rank);
    }
    return;
  }

  for (size_t n = 0; n < total; n++) {
    data[offset] = uniform(engine);
    offset = sd::inc_coords<false>(bases, strides, coords, offset, rank);
  }
}
/**
 * Benchmarks original_argmax() over every non-empty subset of the axes of a
 * 5-dimensional float array in 'c' order.
 *
 * For each axis subset the reduction is timed Loop times and the median
 * duration (in microseconds) is printed.
 *
 * @param random when true the input is filled by fill_random<float>(),
 *               otherwise by linspace(1)
 */
void testLegacy(bool random) {
// the small shape is disabled; the #else branch below is the one compiled
#if 0
int bases[] = { 3, 2, 4, 5, 7 };
constexpr int Loop = 1;
#else
int bases[] = { 8, 32, 64, 32, 64 };
constexpr int Loop = 10;
#endif
constexpr int N = 5;
auto x = NDArrayFactory::create<float>('c', { bases[0], bases[1], bases[2], bases[3], bases[4] });
if (!random) {
x.linspace(1);
}
else{
fill_random<float>(x);
}
// NOTE(review): the opening/closing braces of the two loops below live inside
// #if COMBINATIONS sections, so the macro decides whether the loops exist at all.
#define COMBINATIONS 1
#if COMBINATIONS
//https://www.rosettacode.org/wiki/Combinations#C.2B.2B
// k is the number of axes reduced at once, from all N down to a single axis
for (int k = N; k >= 1; k--) {
// the chars hold the numeric values 1 / 0 (not '1' / '0') and act as per-axis flags
std::string bitmask(k, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
do {
// flagged axes are reduced; the sizes of the remaining axes form the output shape
std::vector<int> dimension;
std::vector<Nd4jLong> output_bases;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) dimension.push_back(i);
else {
output_bases.push_back(bases[i]);
}
}
#else
// NOTE(review): this branch does not declare output_bases, which is used
// further down, so it would not compile if COMBINATIONS were set to 0.
std::vector<int> dimension = { 0,1,2,3 };
int k = 4;
#endif
// dim is only used for the diagnostic print below
auto dim = NDArrayFactory::create<int>(dimension);
#if 1
nd4j_printf("C(N:%d K:%d) \n", N, k);
dim.printIndexedBuffer("Dimension");
for (int xind : dimension) {
nd4j_printf(" %d ,", bases[xind]);
}
nd4j_printf("%s", "\n");
#endif
std::vector<Nd4jLong> values;
// NOTE(review): result is never used in this function
sd::ResultSet result;
for (int e = 0; e < Loop; e++) {
auto timeStart = std::chrono::system_clock::now();
// the allocation of the output array is part of the timed region;
// an empty output shape (all axes reduced) yields a scalar output
NDArray exp = output_bases.size() > 0 ? NDArrayFactory::create<Nd4jLong>('c', output_bases) : NDArrayFactory::create<Nd4jLong>(0);
original_argmax(x, dimension, exp);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
// median = middle element of the sorted samples
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
#if COMBINATIONS
// advance to the next combination of k flagged axes
} while (std::prev_permutation(bitmask.begin(), bitmask.end()));
}
#endif
}
#define DEBUG 1
/**
 * Benchmarks the sd::ops::argmax op over every non-empty subset of the axes of
 * a float array and optionally compares its output with original_argmax().
 *
 * For each axis subset the op is evaluated Loop times and the median duration
 * (in microseconds) is printed.
 *
 * @param random           when true the input is filled by fill_random<float>(),
 *                         otherwise by linspace(1)
 * @param checkCorrectness when true the result of the last evaluation is
 *                         asserted to have the same shape and content as the
 *                         output of original_argmax()
 * @param order            ordering character passed to NDArrayFactory::create
 *                         for the input array
 */
void testNewReduction(bool random, bool checkCorrectness = false , char order ='c') {
std::vector<Nd4jLong> arr_dimensions;
// a 10-dimensional shape with a single timing run is used whenever DEBUG is defined
#if defined(DEBUG)
int bases[] = { 3, 2, 3, 3, 5 ,4,7,4,7,7 };
constexpr int Loop = 1;
constexpr int N = 10;
#else
int bases[] = { 8, 32, 64, 32, 64 };
constexpr int Loop = 10;
constexpr int N = 5;
#endif
for (int i = 0; i < N; i++) {
arr_dimensions.push_back(bases[i]);
}
auto x = NDArrayFactory::create<float>(order,arr_dimensions);
if (!random) {
x.linspace(1);
}
else {
fill_random<float>(x);
}
// NOTE(review): the opening/closing braces of the two loops below live inside
// #if COMBINATIONS sections, so the macro decides whether the loops exist at all.
#define COMBINATIONS 1
#if COMBINATIONS
//https://www.rosettacode.org/wiki/Combinations#C.2B.2B
// k is the number of axes reduced at once, from all N down to a single axis
for (int k = N; k >= 1; k--) {
// the chars hold the numeric values 1 / 0 (not '1' / '0') and act as per-axis flags
std::string bitmask(k, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
do {
// flagged axes are reduced; the sizes of the remaining axes form the expected output shape
std::vector<int> dimension;
std::vector<Nd4jLong> output_bases;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) dimension.push_back(i);
else {
output_bases.push_back(bases[i]);
}
}
#else
// NOTE(review): this branch does not declare output_bases, which is used
// further down, so it would not compile if COMBINATIONS were set to 0.
std::vector<int> dimension = { 0,1,2,3 };
int k = 4;
#endif
// the axes are handed to the op as a second input array
auto dim = NDArrayFactory::create<int>(dimension);
#if 1
nd4j_printf("C(N:%d K:%d) \n", N, k);
dim.printIndexedBuffer("Dimension");
for (int xind : dimension) {
nd4j_printf(" %d ,", bases[xind]);
}
nd4j_printf("%s", "\n");
#endif
sd::ops::argmax op;
std::vector<Nd4jLong> values;
sd::ResultSet result;
for (int e = 0; e < Loop; e++) {
auto timeStart = std::chrono::system_clock::now();
result = op.evaluate({ &x, &dim }, {}, {});
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
// only the result of the last evaluation is kept and inspected
auto z = result.at(0);
if (checkCorrectness) {
//check for the correctness
// an empty output shape (all axes reduced) yields a scalar reference output
NDArray exp = output_bases.size() > 0 ? NDArrayFactory::create<Nd4jLong>('c', output_bases) : NDArrayFactory::create<Nd4jLong>(0);
original_argmax(x, dimension, exp);
#if 0// defined(DEBUG)
x.printIndexedBuffer("X");
exp.printIndexedBuffer("Expected");
z->printIndexedBuffer("Z");
#endif
// NOTE(review): a failing ASSERT_* here returns from this helper only,
// not from the TEST_F that called it.
ASSERT_TRUE(exp.isSameShape(z));
ASSERT_TRUE(exp.equalsTo(z));
}
// median = middle element of the sorted samples
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
#if COMBINATIONS
// advance to the next combination of k flagged axes
} while (std::prev_permutation(bitmask.begin(), bitmask.end()));
}
#endif
}
// Passed as the checkCorrectness argument by the ArgMaxPerf* tests below.
constexpr bool test_corr{true};
#if !defined(DEBUG)
// Runs testNewReduction on linspace-filled input with the default ordering.
TEST_F(PlaygroundTests, ArgMaxPerfLinspace) {
  constexpr bool useRandomInput = false;
  testNewReduction(useRandomInput, test_corr);
}
#endif
// Runs testNewReduction on randomly filled input with the default ordering.
TEST_F(PlaygroundTests, ArgMaxPerfRandom) {
  constexpr bool useRandomInput = true;
  testNewReduction(useRandomInput, test_corr);
}
// Runs testNewReduction on randomly filled input created with ordering 'f'.
TEST_F(PlaygroundTests, ArgMaxPerfRandomOrderF) {
  constexpr bool useRandomInput = true;
  constexpr char inputOrder = 'f';
  testNewReduction(useRandomInput, test_corr, inputOrder);
}
#if !defined(DEBUG)
// Runs the legacy benchmark (testLegacy) on linspace-filled input.
TEST_F(PlaygroundTests, ArgMaxPerfLegacyLinspace) {
  constexpr bool useRandomInput = false;
  testLegacy(useRandomInput);
}
// Runs the legacy benchmark (testLegacy) on randomly filled input.
TEST_F(PlaygroundTests, ArgMaxPerfLegacyRandom) {
  constexpr bool useRandomInput = true;
  testLegacy(useRandomInput);
}
#endif
#endif
/*
TEST_F(PlaygroundTests, test_broadcast_1) {
int pool = 1000;
std::vector<NDArray*> aX(pool);
std::vector<NDArray*> aY(pool);
std::vector<NDArray*> aZ(pool);
for (int e = 0; e < pool; e++) {
aX[e] = NDArrayFactory::create_<float>('c', {512, 3072});
aY[e] = NDArrayFactory::create_<float>('c', {3072});
aZ[e] = NDArrayFactory::create_<float>('c', {512, 3072});
aX[e]->assign(119 * (e+1));
aY[e]->assign(119 * (e+3));
}
std::vector<Nd4jLong> values;
Context ctx(1);
sd::ops::biasadd op;
for (int e = 0; e < 1000; e++) {
auto x = aX[e < pool ? e : e % pool];
auto y = aY[e < pool ? e : e % pool];
auto z = aZ[e < pool ? e : e % pool];
auto timeStart = std::chrono::system_clock::now();
//op.execute({x, y}, {z});
sd::ops::helpers::addBias(ctx, *x, *y, *z, false);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
for (int e = 0; e < pool; e++) {
delete aX[e];
delete aY[e];
delete aZ[e];
}
}
*/
/*
TEST_F(PlaygroundTests, test_broadcast_1) {
int pool = 500;
std::vector<NDArray*> aX(pool);
std::vector<NDArray*> aY(pool);
std::vector<NDArray*> aZ(pool);
for (int e = 0; e < pool; e++) {
aX[e] = NDArrayFactory::create_<float>('c', {512, 3072});
aY[e] = NDArrayFactory::create_<float>('c', {768});
aZ[e] = NDArrayFactory::create_<float>('c', {512, 3072});
aX[e]->assign( (e+1) / 119);
aY[e]->assign( (e+3) / 119);
}
std::vector<Nd4jLong> values;
for (int e = 0; e < 1000; e++) {
auto x = aX[e < pool ? e : e % pool];
auto y = aY[e < pool ? e : e % pool];
auto z = aZ[e < pool ? e : e % pool];
auto timeStart = std::chrono::system_clock::now();
//x->applyTrueBroadcast(BroadcastOpsTuple::Multiply(), *y, *z);
x->applyTransform(transform::Tanh, *z, nullptr);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
for (int e = 0; e < pool; e++) {
delete aX[e];
delete aY[e];
delete aZ[e];
}
}
*/
/*
TEST_F(PlaygroundTests, test_s_0) {
std::vector<std::vector<Nd4jLong>> shapes = {{32, 224, 224, 3}, {32, 56, 56, 64}, {32, 7, 7, 512}};
std::vector<int> threads = {1, 2, 4, 8, 16};
for (auto shape: shapes) {
for (auto t: threads) {
sd::Environment::getInstance().setMaxMasterThreads(t);
auto x = NDArrayFactory::create<float>('c', shape);
auto y = NDArrayFactory::create<float>('c', {shape[3]});
auto z = x.ulike();
std::vector<Nd4jLong> values;
Context ctx(1);
ctx.setInputArray(0, &x);
ctx.setInputArray(1, &y);
ctx.setOutputArray(0, &z);
sd::ops::biasadd op;
for (int e = 0; e < 10000; e++) {
auto timeStart = std::chrono::system_clock::now();
op.execute(&ctx);
sd::ops::helpers::addBias(ctx, x, y, z, false);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Shape: [%lld, %lld, %lld, %lld]; Threads: [%i]; Time: %lld us;\n", shape[0], shape[1], shape[2], shape[3], t, values[values.size() / 2]);
}
}
}
TEST_F(PlaygroundTests, test_s_1) {
std::vector<std::vector<Nd4jLong>> shapes = {{32, 3, 224, 224}, {32, 64, 56, 56}, {32, 512, 7, 7}};
std::vector<int> threads = {1, 2, 4, 8, 16};
for (auto shape: shapes) {
for (auto t: threads) {
sd::Environment::getInstance().setMaxMasterThreads(t);
auto x = NDArrayFactory::create<float>('c', shape);
auto y = NDArrayFactory::create<float>('c', {shape[1]});
auto z = x.ulike();
std::vector<Nd4jLong> values;
Context ctx(1);
ctx.setInputArray(0, &x);
ctx.setInputArray(1, &y);
ctx.setOutputArray(0, &z);
sd::ops::biasadd op;
for (int e = 0; e < 10000; e++) {
auto timeStart = std::chrono::system_clock::now();
//op.execute({&x, &y}, {&z}, {true});
sd::ops::helpers::addBias(ctx, x, y, z, true);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Shape: [%lld, %lld, %lld, %lld]; Threads: [%i]; Time: %lld us;\n", shape[0], shape[1], shape[2], shape[3], t, values[values.size() / 2]);
}
}
}
*/
/*
TEST_F(PlaygroundTests, test_s_0) {
auto x = NDArrayFactory::create<float>('c', {32, 112, 112, 16});
auto y = NDArrayFactory::create<float>('c', {16});
auto z = x.ulike();
std::vector<Nd4jLong> values;
Context ctx(1);
ctx.setInputArray(0, &x);
ctx.setInputArray(1, &y);
ctx.setOutputArray(0, &z);
sd::ops::biasadd op;
for (int e = 0; e < 10000; e++) {
auto timeStart = std::chrono::system_clock::now();
op.execute(&ctx);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
}
*/
/*
TEST_F(PlaygroundTests, test_s_1) {
auto x0 = NDArrayFactory::create<float>('c', {32, 7, 7, 176});
auto x1 = x0.ulike();
auto x2 = x0.ulike();
auto x3 = x0.ulike();
auto x4 = x0.ulike();
auto x5 = x0.ulike();
auto y = NDArrayFactory::create<int >(3);
auto z = NDArrayFactory::create<float>('c', {32, 7, 7, 1056});
Context ctx(1);
ctx.setInputArray(0, &x0);
ctx.setInputArray(1, &x1);
ctx.setInputArray(2, &x2);
ctx.setInputArray(3, &x3);
ctx.setInputArray(4, &x4);
ctx.setInputArray(5, &x5);
ctx.setInputArray(6, &y);
ctx.setOutputArray(0, &z);
ctx.setBArguments({true});
std::vector<Nd4jLong> values;
sd::ops::concat op;
op.execute(&ctx);
for (int e = 0; e < 1000; e++) {
auto timeStart = std::chrono::system_clock::now();
op.execute(&ctx);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld us;\n", values[values.size() / 2]);
}
*/
/*
TEST_F(PlaygroundTests, test_s_1) {
auto t = ::runLightBenchmarkSuit(true);
delete[] t;
}
TEST_F(PlaygroundTests, test_s_2) {
std::atomic<int> s;
s = 0;
auto func = PRAGMA_THREADS_FOR {
s++;
};
samediff::Threads::parallel_for(func, 0, 8192, 1, 4);
std::vector<Nd4jLong> values;
for (int e = 0; e < 100000; e++) {
s = 0;
auto timeStart = std::chrono::system_clock::now();
//samediff::Threads::parallel_for(func, 0, 8192, 1, 4);
PRAGMA_OMP_PARALLEL_THREADS(4) {
s++;
}
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds> (timeEnd - timeStart).count();
values.emplace_back(outerTime);
};
std::sort(values.begin(), values.end());
nd4j_printf("Time: %lld;\n", values[values.size() / 2]);
}
*/
/*
TEST_F(PlaygroundTests, test_s_4) {
std::atomic<float> f;
std::atomic<int> s;
std::vector<Nd4jLong> valuesX, valuesY;
int iterations = 1000;
s = 0;
auto func = PRAGMA_THREADS_FOR {
s++;
};
samediff::Threads::parallel_for(func, 0, 8192, 1, 4);
////////
auto x = NDArrayFactory::create<float>('c', {32, 3, 256, 256});
auto z = NDArrayFactory::create<float>('c', {32, 3, 256, 256});
x.linspace(1.0);
auto xs0 = x.sizeAt(0);
auto xs1 = x.sizeAt(1);
auto xs2 = x.sizeAt(2);
auto xs3 = x.sizeAt(3);
auto buffer = x.bufferAsT<float>();
auto zbuffer = z.bufferAsT<float>();
for (int e = 0; e < iterations; e++) {
auto timeStart = std::chrono::system_clock::now();
PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
for (int i = 0; i < xs0; i++) {
for (int j = 0; j < xs1; j++) {
auto thread_id = omp_get_thread_num();
for (int k = 0; k < xs2; k++) {
for (int l = 0; l < xs3; l++) {
zbuffer[thread_id] += buffer[i * j + (k*l)] * 2.5f;
}
}
}
}
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
valuesX.emplace_back(outerTime);
}
for (int e = 0; e < iterations; e++) {
auto timeStart = std::chrono::system_clock::now();
auto f2d = PRAGMA_THREADS_FOR_2D {
for (auto i = start_x; i < stop_x; i++) {
for (auto j = start_y; j < stop_y; j++) {
for (auto k = 0; k < xs2; k++) {
for (auto l = 0; l < xs3; l++) {
zbuffer[thread_id] += buffer[i * j + (k * l)] * 2.5f;
}
}
}
}
};
samediff::Threads::parallel_for(f2d, 0, xs0, 1, 0, xs1, 1);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
valuesY.emplace_back(outerTime);
}
if (valuesX.size() > 0) {
std::sort(valuesX.begin(), valuesX.end());
nd4j_printf("OpenMP time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]);
}
if (valuesY.size() > 0) {
std::sort(valuesY.begin(), valuesY.end());
nd4j_printf("Threads time: %lld; Min: %lld; Max: %lld;\n", valuesY[valuesY.size() / 2], valuesY[0], valuesY[valuesY.size() - 1]);
}
nd4j_printf("Sum: %f\n", z.sumNumber().e<float>(0));
}
TEST_F(PlaygroundTests, test_s_5) {
auto x = NDArrayFactory::create<float>('c', {32, 1, 28, 28});
std::vector<Nd4jLong> values;
auto iterations = 100;
auto startX = 0;
auto stopX = x.sizeAt(0);
auto incX = 1;
auto startY = 0;
auto stopY = x.sizeAt(1);
auto incY = 1;
auto numThreads = 4;
// number of elements per loop
auto delta_x = (stopX - startX);
auto delta_y = (stopY - startY);
// number of iterations per loop
auto itersX = delta_x / incX;
auto itersY = delta_y / incY;
for (int e = 0; e < iterations; e++) {
auto timeStart = std::chrono::system_clock::now();
// picking best fit here
auto splitLoop = samediff::ThreadsHelper::pickLoop2d(numThreads, itersX, itersY);
auto span = samediff::Span2::build(splitLoop, 0, numThreads, startX, stopX, incX, startY, stopY, incY);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]);
}
TEST_F(PlaygroundTests, test_s_6) {
auto x = NDArrayFactory::create<float>('c', {1024 * 1024 * 64});
auto buffer = x.bufferAsT<float>();
auto len = x.lengthOf();
std::vector<Nd4jLong> values;
auto iterations = 1000;
for (int i = 0; i < iterations; i++) {
auto timeStart = std::chrono::system_clock::now();
// picking best fit here
for (int e = 0; e < len; e++) {
buffer[e] = (buffer[e] + 1.72f) * 3.17f - 0.0012f;
}
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::nanoseconds>(timeEnd - timeStart).count();
values.emplace_back(outerTime);
}
std::sort(values.begin(), values.end());
nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]);
}
TEST_F(PlaygroundTests, test_s_3) {
std::atomic<int> s;
s = 0;
auto func = PRAGMA_THREADS_FOR {
s++;
};
for (int e = 0; e < 10000; e++) {
samediff::Threads::parallel_for(func, 0, 8192, 1, 4);
}
}
*/
/*
TEST_F(PlaygroundTests, test_relubp_1) {
auto x = NDArrayFactory::create<float>('c', {128, 64, 224, 224});
auto y = x.ulike();
auto z = x.ulike();
RandomGenerator rng(119, 120);
RandomLauncher::fillUniform(LaunchContext::defaultContext(), rng, &x, -1.0, 1.0);
RandomLauncher::fillUniform(LaunchContext::defaultContext(), rng, &y, -1.0, 1.0);
int iterations = 10;
auto timeStart = std::chrono::system_clock::now();
for (int e = 0; e < iterations; e++)
ops::helpers::reluDerivative(LaunchContext::defaultContext(), &x, &y, &z);
auto timeEnd = std::chrono::system_clock::now();
auto outerTime = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
auto time = (Nd4jLong) outerTime / iterations;
auto bw = (1000000L * (float) (x.lengthOf() * x.sizeOfT()) / time) / 1024 / 1024 / 1024;
nd4j_printf("Time: %lld; BW: %f GB/s\n", time, bw);
}
//////////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, my) {
int bS=8, iD=32,iH=32,iW=32, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2;
int oD,oH,oW;
sd::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0);
printf("!!%i, %i, %i\n", oD,oH,oW);
NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, sd::DataType::DOUBLE);
NDArray vol('c', {bS, iC, oD, oH, oW}, sd::DataType::DOUBLE);
col = 3.77;
vol = -10.33;
auto variableSpace = new VariableSpace();
auto block = new Context(1, variableSpace, false); // not-in-place
auto timeStart = std::chrono::system_clock::now();
sd::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW);
auto timeEnd = std::chrono::system_clock::now();
auto time = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
printf("time: %i \n", time);
delete block;
delete variableSpace;
}
TEST_F(PlaygroundTests, my) {
int bS=32, iD=32,iH=64,iW=64, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2;
int oD,oH,oW;
// sd::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0);
sd::ops::ConvolutionUtils::calcOutSizeDeconv2D(oH, oW, kH, kW, sH, sW, pH, pW,dH, dW, iH, iW, 0);
printf("!!%i, %i, %i\n", oD,oH,oW);
// NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, sd::DataType::DOUBLE);
// NDArray vol('c', {bS, iC, oD, oH, oW}, sd::DataType::DOUBLE);
NDArray col('c', {bS, iC, kH, kW, iH, iW}, sd::DataType::DOUBLE);
NDArray im('c', {bS, iC, oH, oW}, sd::DataType::DOUBLE);
col = 3.77;
// vol = -10.33;
im = -10.33;
auto variableSpace = new VariableSpace();
auto block = new Context(1, variableSpace, false); // not-in-place
auto timeStart = std::chrono::system_clock::now();
// sd::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW);
sd::ops::helpers::col2im(*col.getContext(), col, im, sH, sW, pH, pW, iH, iW, dH, dW);
auto timeEnd = std::chrono::system_clock::now();
auto time = std::chrono::duration_cast<std::chrono::microseconds> (timeEnd - timeStart).count();
printf("time: %i \n", time);
delete block;
delete variableSpace;
}
Shyrma lstm layer bp (#370) * - start working on bp for lstm Signed-off-by: Yurii <iuriish@yahoo.com> * - further working on bp for lstmLayer Signed-off-by: Yurii <iuriish@yahoo.com> * - minor change Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 3 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 4 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 5 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 6 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 7 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 8 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 9 Signed-off-by: Yurii <iuriish@yahoo.com> * - provide lstmLayerCell and lstmLayerCellBp as separate CUSTOM_OPs Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing lstmLayerCellBp helper Signed-off-by: Yurii <iuriish@yahoo.com> * - implement lstmLayerCellBp as separate op Signed-off-by: Yurii <iuriish@yahoo.com> * - implement lstmLayerBp as separate op (not tested) Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing calculations of dLdWp and dLdb in lstmLayerCellBp Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 10 Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing typo in lstmLayerTimeLoop Signed-off-by: Yurii <iuriish@yahoo.com> * - forgot to perform clipping of c array and calculate corresponding derivative in lstmLayerCellBp Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 10 Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in lstmLayer_bp op 1 Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in lstmLayer_bp op 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - turn off heavy tests for cuda for lstmLayer_bp op Signed-off-by: Yurii 
<iuriish@yahoo.com> * - forgot to nullify gradients at eliminated time steps (when sequnce length array is present ) Signed-off-by: Yurii <iuriish@yahoo.com>
2020-04-13 12:21:51 +02:00
///////////////////////////////////////////////////////////////////
TEST_F(PlaygroundTests, lstmLayerCellBp_1) {
    // Gradient check of sd::ops::lstmLayerCellBp against sd::ops::lstmLayerCell for a
    // single cell step with biases and peephole weights supplied (bArgs = {true, true}).
    // Only the upstream gradient dLdh is passed to the backprop op.

    const int bS   = 2;
    const int nIn  = 4;
    const int nOut = 3;
    // const int nIn   = 8;
    // const int nOut  = 6;

    const float cellClip = 1.1;       // clipping value

    // Activation codes; per the original notes sigmoid and tanh need no alpha/beta
    // values, so tArgs below carries the clipping value only.
    const Nd4jLong gateAct = 2;       // sigmoid activation for input (i), forget (f) and output (o) gates
    const Nd4jLong cellAct = 0;       // tanh activation for cell state
    const Nd4jLong outAct  = 0;       // tanh activation for output

    NDArray x ('c', {bS, nIn}, sd::DataType::DOUBLE);
    NDArray hI('c', {bS, nOut}, sd::DataType::DOUBLE);
    NDArray cI('c', {bS, nOut}, sd::DataType::DOUBLE);
    NDArray dLdh('c', {bS, nOut}, sd::DataType::DOUBLE);

    // rank-1 (no batch dimension) variant:
    // NDArray x ('c', {nIn}, sd::DataType::DOUBLE);
    // NDArray hI('c', {nOut}, sd::DataType::DOUBLE);
    // NDArray cI('c', {nOut}, sd::DataType::DOUBLE);
    // NDArray dLdh('c', {nOut}, sd::DataType::DOUBLE);

    NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::DOUBLE);
    NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::DOUBLE);
    NDArray b ('c', {4*nOut}, sd::DataType::DOUBLE);
    NDArray Wp('c', {3*nOut}, sd::DataType::DOUBLE);

    x.linspace(-4,1);
    hI.linspace(-2.5,0.5);
    cI.linspace(-3,0.5);
    Wx.linspace(0,0.1);
    Wr.linspace(3,-0.1);
    Wp.linspace(0.2,0.2);
    b.linspace(1,-0.15);

    // constant-fill variant:
    // x.assign(1.);
    // hI.assign(2.);
    // cI.assign(3.);
    // Wx.assign(0.5);
    // Wr.assign(0.5);
    // Wp.assign(0.75);
    // b.assign(0.7);

    std::vector<double> tArgs = {cellClip};
    std::vector<Nd4jLong> iArgs = {gateAct, cellAct, outAct};

    // variant without biases and peepholes:
    // std::vector<bool> bArgs = {false, false};
    // const OpArgsHolder argsHolderFF({&x, &Wx, &Wr, &hI, &cI}, tArgs, iArgs, bArgs);
    // const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &hI, &cI, &dLdh}, tArgs, iArgs, bArgs);

    std::vector<bool> bArgs = {true, true};
    const OpArgsHolder argsHolderFF({&x, &Wx, &Wr, &b, &hI, &cI, &Wp}, tArgs, iArgs, bArgs);
    const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &hI, &cI, &Wp, &dLdh}, tArgs, iArgs, bArgs);

    sd::ops::lstmLayerCell opFF;
    sd::ops::lstmLayerCellBp opBP;

    // One mask entry per forward input (7 inputs, all set to true).
    const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {true, true, true, true, true, true, true});

    // Without this assertion the check result was discarded and the test could never fail.
    ASSERT_TRUE(isGradCorrect);
}
Development updates (#9053) * RL4J: Add generic update rule (#502) Signed-off-by: Alexandre Boulanger <aboulang2002@yahoo.com> * Shyrma reduce (#481) * - start working on improving of cpu legacy code for reduce ops Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on improving legacy loops Signed-off-by: Yurii <iuriish@yahoo.com> * - still working on improving reduce ops Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on improving reduce ops Signed-off-by: Yurii <iuriish@yahoo.com> * - testing speed run of new reduce op Signed-off-by: Yurii <iuriish@yahoo.com> * - working on improvement of default loop for reduce op Signed-off-by: Yurii <iuriish@yahoo.com> * - update signatures of stuff which calls reduce ops Signed-off-by: Yurii <iuriish@yahoo.com> * - make corrections in cuda reduce kernels Signed-off-by: Yurii <iuriish@yahoo.com> * - change loop for default case in broadcast legacy ops Signed-off-by: Yurii <iuriish@yahoo.com> * - comment some shape stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - comment unnecessary prints in RNGtests Signed-off-by: Yurii <iuriish@yahoo.com> * - finish to resolve conflicts after master has been merged Signed-off-by: Yurii <iuriish@yahoo.com> * - get rid of some compilation mistakes of cuda stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - minor changes Signed-off-by: Yurii <iuriish@yahoo.com> * - further search for bug causing crash on java test Signed-off-by: Yurii <iuriish@yahoo.com> * - add scalar case in reduce_ ... 
exec stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - minor corrections in NAtiveOps.cu Signed-off-by: Yurii <iuriish@yahoo.com> * - add switch to scalar case execReduceXD functions Signed-off-by: Yurii <iuriish@yahoo.com> * - add support for vectors old shape in ConstantShapeHelper::createShapeInfoWithNoUnitiesForReduce Signed-off-by: Yurii <iuriish@yahoo.com> * - correct cuda mirrorPad Signed-off-by: Yurii <iuriish@yahoo.com> * - add support for vectors old shape in cuda createShapeInfoWithNoUnitiesForReduce Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: raver119 <raver119@gmail.com> * Add support for CUDA 11.0 (#492) * Add support for CUDA 11.0 * libnd4j tweaks for CUDA 11 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * bindings update, again? Signed-off-by: raver119@gmail.com <raver119@gmail.com> * * Update versions of JavaCPP Presets for FFmpeg, OpenBLAS, and NumPy * update API to match CUDA 8 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * * Update version of JavaCPP Presets for CPython * C++ updated for cuDNN 8.0 Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more test Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more test Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more test Signed-off-by: raver119@gmail.com <raver119@gmail.com> * 128-bit alignment for workspaces Signed-off-by: raver119@gmail.com <raver119@gmail.com> * change seed in 1 test Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Fix dependecy duplication in python4j-parent pom * Fix group id for in python4j-numpy * few tests tweaked Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Remove macosx-x86_64-gpu from nd4j-tests-tensorflow * few minor tweaks for IndexReduce Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one test removed Signed-off-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: Serhii Shepel 
<9946053+sshepel@users.noreply.github.com> * RL4J: Add SyncTrainer and AgentLearnerBuilder for a few algorithms (#504) Signed-off-by: Alexandre Boulanger <aboulang2002@yahoo.com> Co-authored-by: Alexandre Boulanger <44292157+aboulang2002@users.noreply.github.com> Co-authored-by: Yurii Shyrma <iuriish@yahoo.com> Co-authored-by: raver119 <raver119@gmail.com> Co-authored-by: Serhii Shepel <9946053+sshepel@users.noreply.github.com>
2020-07-26 14:59:27 +02:00
TEST_F(PlaygroundTests, my) {
    // Ad-hoc timing probe: runs an in-place broadcast Add over dimensions {1,3} of x
    // N times and prints the average duration of one call in microseconds.

    const int N = 40;

    NDArray x('c', {256,256,128,128}, sd::DataType::FLOAT32);
    NDArray z1('c', {256,2,128}, sd::DataType::DOUBLE);
    NDArray z = z1({0,0,0,1,0,0});      // sub-array view of z1 used as the broadcast operand

    z.printShapeInfo();

    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto timeStart = std::chrono::steady_clock::now();

    for (int i = 0; i < N; ++i) {
        // x.reduceAlongDimension(sd::reduce::Mean, z, {1,3});
        x.applyBroadcast(sd::broadcast::Ops::Add, {1,3}, z, x);
    }

    const auto timeEnd = std::chrono::steady_clock::now();

    const auto time = std::chrono::duration_cast<std::chrono::microseconds>((timeEnd - timeStart) / N).count();

    // count() returns the duration's rep type, which is not guaranteed to be `long`;
    // cast explicitly so that the format specifier always matches the argument.
    printf("old %lld\n", static_cast<long long>(time));
}
Shyrma lstm layer bp (#370) * - start working on bp for lstm Signed-off-by: Yurii <iuriish@yahoo.com> * - further working on bp for lstmLayer Signed-off-by: Yurii <iuriish@yahoo.com> * - minor change Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 3 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 4 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 5 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 6 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 7 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 8 Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 9 Signed-off-by: Yurii <iuriish@yahoo.com> * - provide lstmLayerCell and lstmLayerCellBp as separate CUSTOM_OPs Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing lstmLayerCellBp helper Signed-off-by: Yurii <iuriish@yahoo.com> * - implement lstmLayerCellBp as separate op Signed-off-by: Yurii <iuriish@yahoo.com> * - implement lstmLayerBp as separate op (not tested) Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing calculations of dLdWp and dLdb in lstmLayerCellBp Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 10 Signed-off-by: Yurii <iuriish@yahoo.com> * - fixing typo in lstmLayerTimeLoop Signed-off-by: Yurii <iuriish@yahoo.com> * - forgot to perform clipping of c array and calculate corresponding derivative in lstmLayerCellBp Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on bp for lstmLayer 10 Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in lstmLayer_bp op 1 Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in lstmLayer_bp op 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - turn off heavy tests for cuda for lstmLayer_bp op Signed-off-by: Yurii 
<iuriish@yahoo.com> * - forgot to nullify gradients at eliminated time steps (when sequnce length array is present ) Signed-off-by: Yurii <iuriish@yahoo.com>
2020-04-13 12:21:51 +02:00
Shyrma sqrtm (#429) * - start working on implementation of sqrtm op Signed-off-by: Yurii <iuriish@yahoo.com> * - improving householder procedure Signed-off-by: Yurii <iuriish@yahoo.com> * - further polishing householder stuff Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing hh pivoting qr procedure Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing BiDiagonalUp procedure Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing householder sequence class Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing jacobi svd class Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing svd stuff 1 Signed-off-by: Yurii <iuriish@yahoo.com> * - polishing svd stuff 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation and testing class which performs Hessenberg decomposition of square matrix Signed-off-by: Yurii <iuriish@yahoo.com> * - add static method to JacobiSVD class which makes the continuous Givens rotation generation algorithm Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation and testing auxiliary methods of Schur decomp class Signed-off-by: Yurii <iuriish@yahoo.com> * some references here and there Signed-off-by: raver119 <raver119@gmail.com> * - trying figure out difference between eigen and our Schur alg Signed-off-by: Yurii <iuriish@yahoo.com> * - testing fixing bugs in Schur decomposition op Signed-off-by: Yurii <iuriish@yahoo.com> * - start to implement class which performs calculation of eigen values and vectors Signed-off-by: Yurii <iuriish@yahoo.com> * - add to EigenValsAndVecs method which calculates complex eigen vectors Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in EigenValsAndVecs class Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation and testing triangularSolver class Signed-off-by: Yurii <iuriish@yahoo.com> * Added a 2D routine for triangular systems solve. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored triangularSolve2D routine and tests. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored another test for triangularSolve2D. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored test for triangularSolve for vector-bar case. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored triangularSolve2D routine and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * - implementation of FullPivLU class Signed-off-by: Yurii <iuriish@yahoo.com> * - fix bugs in FullPivLU::solve method Signed-off-by: Yurii <iuriish@yahoo.com> * - correct permutation vector in FullPivLU::solve Signed-off-by: Yurii <iuriish@yahoo.com> * - correct include headers Signed-off-by: Yurii <iuriish@yahoo.com> * - implementation of Sqrtm class Signed-off-by: Yurii <iuriish@yahoo.com> * - testing and fixing bugs in Sqrtm class Signed-off-by: Yurii <iuriish@yahoo.com> * - include sqrtm classes to cuda folder, investigate in what places synchronization doesn't work Signed-off-by: Yurii <iuriish@yahoo.com> * Added implementation for cuda triangularSolve2D and also refactored triangularSolve2D for cpu. Signed-off-by: shugeo <sgazeos@gmail.com> * Eliminated waste implementations. Signed-off-by: shugeo <sgazeos@gmail.com> * - make offset calculation faster in t<> methods Signed-off-by: Yurii <iuriish@yahoo.com> * - rename refference T& NDArray::t<> method Signed-off-by: Yurii <iuriish@yahoo.com> * - further work on cuda sqrtm Signed-off-by: Yurii <iuriish@yahoo.com> * - provide correct synchronization to device in Sqrtm class Signed-off-by: Yurii <iuriish@yahoo.com> * - add tests for sqrtm op Signed-off-by: Yurii <iuriish@yahoo.com> * - correct fails which appeared while testing on jenkins Signed-off-by: Yurii <iuriish@yahoo.com> * - trying to find out mistake in svd::deflation method Signed-off-by: Yurii <iuriish@yahoo.com> * Revert "- trying to find out mistake in svd::deflation method" This reverts commit 19d37baddbc509028e4bc67bc932fe7449becdb6. 
* Revert "- trying to find out mistake in svd::deflation method" This reverts commit 19d37baddbc509028e4bc67bc932fe7449becdb6. Signed-off-by: Yurii <iuriish@yahoo.com> * - change call semantic of r<> and t<> methods Signed-off-by: Yurii <iuriish@yahoo.com> * - ged rid of ambiguity in * operator overloads for windows buikd Signed-off-by: Yurii <iuriish@yahoo.com> * - get rid of ambiguity in * operator overloads for windows build 2 Signed-off-by: Yurii <iuriish@yahoo.com> * - get rid of ambiguity in * operator overloads for windows build 3 Signed-off-by: Yurii <iuriish@yahoo.com> * - resolve conflicts with master Signed-off-by: Yurii <iuriish@yahoo.com> * cmakelists updated Signed-off-by: raver119@gmail.com <raver119@gmail.com> * - minor fix in merge cpu helper - make use of reference getter Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: raver119 <raver119@gmail.com> Co-authored-by: shugeo <sgazeos@gmail.com>
2020-05-14 17:06:13 +02:00
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_1) {
    // Gradient check of lstmLayer_bp vs lstmLayer: forward direction, input laid out as
    // [sL, bS, nIn]; biases, initial states and peephole weights are supplied, no seqLen.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 3;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 0;            // dataFormat: [sL,bS,nIn]
    constexpr int direction = 0;            // directionMode: forward
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = false;    // no seqLen array
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {seqLength, batchSize, inSize}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {4*outSize}, sd::DataType::DOUBLE);
    NDArray initialH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {seqLength, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs);
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_2) {
    // Gradient check of lstmLayer_bp vs lstmLayer: forward direction, input laid out as
    // [bS, sL, nIn]; last-step output is not requested, so only dLdh and dLdcL are passed.
    // The check is invoked with an explicit index range and the MEAN loss function.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 3;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 1;            // dataFormat: [bS,sL,nIn]
    constexpr int direction = 0;            // directionMode: forward
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = false;    // no seqLen array
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // whole h sequence {h_0, h_1, ... , h_sL-1}
    constexpr bool wantLastH    = false;    // output at last time step is not requested
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, seqLength, inSize}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {4*outSize}, sd::DataType::DOUBLE);
    NDArray initialH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, seqLength, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    // empty mask vector, index range {0., 1.}, MEAN loss
    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs,
                                                 std::vector<bool>(), {0., 1.},
                                                 GradCheck::LossFunc::MEAN);
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_3) {
    // Gradient check of lstmLayer_bp vs lstmLayer: forward direction, input laid out as
    // [bS, nIn, sL], with a seqLen array ({2,0,4}) supplied alongside biases, initial
    // states and peephole weights.

    // --- sizes ---
    constexpr int seqLength = 4;            // sL
    constexpr int batchSize = 3;            // bS
    constexpr int inSize    = 3;            // nIn
    constexpr int outSize   = 2;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 2;            // dataFormat: [bS, nIn, sL]
    constexpr int direction = 0;            // directionMode: forward
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = true;     // seqLen array is passed
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, inSize, seqLength}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {4*outSize}, sd::DataType::DOUBLE);
    NDArray seqLengths('c', {batchSize}, {2,0,4}, sd::DataType::DOUBLE);
    NDArray initialH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, outSize, seqLength}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    // One mask entry per forward input; the entry at the seqLen position (index 4) is false.
    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs,
                                                 {true, true, true, true, false, true, true, true});
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_4) {
    // Gradient check of lstmLayer_bp vs lstmLayer: backward direction, input laid out as
    // [bS, sL, nIn]; biases, initial states and peephole weights are supplied, no seqLen.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 3;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 1;            // dataFormat: [bS,sL,nIn]
    constexpr int direction = 1;            // directionMode: backward
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = false;    // no seqLen array
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, seqLength, inSize}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {4*outSize}, sd::DataType::DOUBLE);
    NDArray initialH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, seqLength, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs);
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_5) {
    // Gradient check of lstmLayer_bp vs lstmLayer: backward direction, input laid out as
    // [bS, nIn, sL], with a seqLen array ({0,2}) supplied alongside biases, initial
    // states and peephole weights.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 2;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 2;            // dataFormat: [bS, nIn, sL]
    constexpr int direction = 1;            // directionMode: backward
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = true;     // seqLen array is passed
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, inSize, seqLength}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {4*outSize}, sd::DataType::DOUBLE);
    NDArray seqLengths('c', {batchSize}, {0,2}, sd::DataType::DOUBLE);
    NDArray initialH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, outSize, seqLength}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    // One mask entry per forward input; the entry at the seqLen position (index 4) is false.
    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs,
                                                 {true, true, true, true, false, true, true, true});
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_6) {
    // Gradient check of lstmLayer_bp vs lstmLayer: "bidirectional sum" mode, input laid
    // out as [bS, nIn, sL], seqLen array ({0,2}) supplied. Weights, biases, initial
    // states and last-step gradients carry a leading dimension of 2.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 2;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 2;            // dataFormat: [bS, nIn, sL]
    constexpr int direction = 2;            // directionMode: bidirectional sum
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = true;     // seqLen array is passed
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, inSize, seqLength}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {2, inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {2, outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {2, 4*outSize}, sd::DataType::DOUBLE);
    NDArray seqLengths('c', {batchSize}, {0,2}, sd::DataType::DOUBLE);
    NDArray initialH('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {2, 3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, outSize, seqLength}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);

    // All three upstream gradients are passed. Commented-out alternatives kept for
    // reference used these gradient subsets after the eight forward inputs:
    //   {gradH, gradLastH}, {gradH, gradLastC}, {gradLastH, gradLastC},
    //   {gradH}, {gradLastH}, {gradLastC}
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    // One mask entry per forward input; the entry at the seqLen position (index 4) is false.
    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs,
                                                 {true, true, true, true, false, true, true, true});
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, lstmLayer_bp_7) {
    // Gradient check of lstmLayer_bp vs lstmLayer: "bidirectional concat" mode, input
    // laid out as [bS, sL, nIn], seqLen array ({0,2}) supplied. Weights, biases, initial
    // states and last-step gradients carry a leading dimension of 2; dLdh has 2*nOut
    // as its last dimension.

    // --- sizes ---
    constexpr int seqLength = 3;            // sL
    constexpr int batchSize = 2;            // bS
    constexpr int inSize    = 2;            // nIn
    constexpr int outSize   = 2;            // nOut

    // --- integer op arguments ---
    constexpr int layout    = 1;            // dataFormat: [bS,sL,nIn]
    constexpr int direction = 3;            // directionMode: bidirectional concat
    constexpr int gateActFn = 2;            // sigmoid for input (i), forget (f) and output (o) gates
    constexpr int cellActFn = 0;            // tanh for cell state
    constexpr int outActFn  = 0;            // tanh for output

    // --- boolean op arguments ---
    constexpr bool withBiases   = true;     // biases array is passed
    constexpr bool withSeqLen   = true;     // seqLen array is passed
    constexpr bool withInitH    = true;     // initial output is passed
    constexpr bool withInitC    = true;     // initial cell state is passed
    constexpr bool withPeephole = true;     // peephole weights (Wp) are passed
    constexpr bool wantFullSeq  = true;     // dLdh per each time step
    constexpr bool wantLastH    = true;     // output at last time step
    constexpr bool wantLastC    = true;     // cell state at last time step

    constexpr double clipValue = 0.5;       // cell clipping

    // --- forward inputs ---
    NDArray input('c', {batchSize, seqLength, inSize}, sd::DataType::DOUBLE);
    NDArray inputWeights('c', {2, inSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray recurrentWeights('c', {2, outSize, 4*outSize}, sd::DataType::DOUBLE);
    NDArray biases('c', {2, 4*outSize}, sd::DataType::DOUBLE);
    NDArray seqLengths('c', {batchSize}, {0,2}, sd::DataType::DOUBLE);
    NDArray initialH('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray initialC('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray peepholeWeights('c', {2, 3*outSize}, sd::DataType::DOUBLE);

    // --- upstream gradients (never filled explicitly here) ---
    NDArray gradH('c', {batchSize, seqLength, 2*outSize}, sd::DataType::DOUBLE);
    NDArray gradLastH('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);
    NDArray gradLastC('c', {2, batchSize, outSize}, sd::DataType::DOUBLE);

    // populate the forward inputs via linspace
    input.linspace(-2,0.1);
    initialH.linspace(-1.5,0.1);
    initialC.linspace(0.7,-0.1);
    inputWeights.linspace(1,-0.1);
    recurrentWeights.linspace(-1,0.1);
    peepholeWeights.linspace(0.2,0.2);
    biases.linspace(1,-0.15);

    std::vector<double> floatArgs = {clipValue};
    std::vector<Nd4jLong> intArgs = {layout, direction, gateActFn, cellActFn, outActFn};
    std::vector<bool> boolArgs = {withBiases, withSeqLen, withInitH, withInitC,
                                  withPeephole, wantFullSeq, wantLastH, wantLastC};

    const OpArgsHolder forwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                    &initialH, &initialC, &peepholeWeights},
                                   floatArgs, intArgs, boolArgs);

    // All three upstream gradients are passed. Commented-out alternatives kept for
    // reference used these gradient subsets after the eight forward inputs:
    //   {gradH, gradLastH}, {gradH, gradLastC}, {gradLastH, gradLastC},
    //   {gradH}, {gradLastH}, {gradLastC}
    const OpArgsHolder backwardArgs({&input, &inputWeights, &recurrentWeights, &biases, &seqLengths,
                                     &initialH, &initialC, &peepholeWeights,
                                     &gradH, &gradLastH, &gradLastC},
                                    floatArgs, intArgs, boolArgs);

    sd::ops::lstmLayer forwardOp;
    sd::ops::lstmLayer_bp backwardOp;

    // One mask entry per forward input; the entry at the seqLen position (index 4) is false.
    const bool gradsMatch = GradCheck::checkGrad(forwardOp, backwardOp, forwardArgs, backwardArgs,
                                                 {true, true, true, true, false, true, true, true});
    ASSERT_TRUE(gradsMatch);
}
///////////////////////////////////////////////////////////////////
// Gradient check for lstmLayer_bp against lstmLayer with every optional input enabled
// (biases, seqLen, initial h/c, peepholes), cell clipping, directionMode = 4 and all three
// outputs requested. Passes when GradCheck::checkGrad reports success.
TEST_F(DeclarableOpsTests13, lstmLayer_bp_8) {
// Sizes used for the array shapes below (presumably sequence length, batch size,
// input size and number of units - confirm against the lstmLayer op description).
const int sL = 3;
const int bS = 2;
const int nIn = 2;
const int nOut = 2;
// Integer op arguments, passed in this order through iArgs.
const int dataFormat = 3; // [sL, bS, nIn]
const int directionMode = 4; // bidirectional extra output dim
const int gateAct = 2; // sigmoid activation for input (i), forget (f) and output (o) gates
const int cellAct = 0; // tanh activation for cell state
const int outAct = 0; // tanh activation for output
// Boolean op arguments, passed in this order through bArgs.
const bool hasBiases = true; // biases array is provided
const bool hasSeqLen = true; // seqLen array is provided (it is passed as the 5th input below)
const auto hasInitH = true; // initial output is provided
const auto hasInitC = true; // initial cell state is provided
const auto hasPH = true; // peephole connections are present (Wp is passed as the last forward input)
const auto retFullSeq = true; // output for each time step is returned (its gradient is dLdh)
const auto retLastH = true; // output at last time step
const auto retLastC = true; // cell state at last time step
const double cellClip = 0.5; // clipping value, passed as the only tArg
// Forward inputs. The leading dimension of 2 on the weights, biases and initial states
// presumably holds one slice per direction - TODO confirm.
NDArray x('c', {sL, bS, nIn}, sd::DataType::DOUBLE);
NDArray Wx('c', {2, nIn, 4*nOut}, sd::DataType::DOUBLE);
NDArray Wr('c', {2, nOut, 4*nOut}, sd::DataType::DOUBLE);
NDArray b('c', {2, 4*nOut}, sd::DataType::DOUBLE);
// One value per batch entry: 0 and 2 (presumably per-sample sequence lengths; note the first is 0).
NDArray seqLen('c', {bS}, {0,2}, sd::DataType::DOUBLE);
NDArray hI('c', {2, bS, nOut}, sd::DataType::DOUBLE);
NDArray cI('c', {2, bS, nOut}, sd::DataType::DOUBLE);
NDArray Wp('c', {2, 3*nOut}, sd::DataType::DOUBLE);
// Incoming gradients for the three requested outputs; they are not filled explicitly in this test.
NDArray dLdh('c', {sL, 2, bS, nOut}, sd::DataType::DOUBLE);
NDArray dLdhL('c', {2, bS, nOut}, sd::DataType::DOUBLE);
NDArray dLdcL('c', {2, bS, nOut}, sd::DataType::DOUBLE);
// Fill the inputs with deterministic values (linspace(a, b) presumably starts at a with step b).
x.linspace(-2,0.1);
hI.linspace(-1.5,0.1);
cI.linspace(0.7,-0.1);
Wx.linspace(1,-0.1);
Wr.linspace(-1,0.1);
Wp.linspace(0.2,0.2);
b.linspace(1,-0.15);
std::vector<double> tArgs = {cellClip};
std::vector<Nd4jLong> iArgs = {dataFormat, directionMode, gateAct, cellAct, outAct};
std::vector<bool> bArgs = {hasBiases, hasSeqLen, hasInitH, hasInitC, hasPH, retFullSeq, retLastH, retLastC};
// The backprop holder takes the same eight forward inputs followed by the output gradients.
const OpArgsHolder argsHolderFF({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp}, tArgs, iArgs, bArgs);
const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdh, &dLdhL, &dLdcL}, tArgs, iArgs, bArgs);
// Disabled alternatives for argsHolderBP that pass only a subset of dLdh / dLdhL / dLdcL.
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdh, &dLdhL}, tArgs, iArgs, bArgs);
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdh, &dLdcL}, tArgs, iArgs, bArgs);
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdhL, &dLdcL}, tArgs, iArgs, bArgs);
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdh}, tArgs, iArgs, bArgs);
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdhL}, tArgs, iArgs, bArgs);
// const OpArgsHolder argsHolderBP({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp, &dLdcL}, tArgs, iArgs, bArgs);
sd::ops::lstmLayer opFF;
sd::ops::lstmLayer_bp opBP;
// The bool list has one entry per forward input; the only false entry lines up with seqLen
// (presumably excluding it from the gradient check - confirm against GradCheck::checkGrad).
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {true, true, true, true, false, true, true, true});
ASSERT_TRUE(isGradCorrect);
}
//////////////////////////////////////////////////////////////////////
// Gradient check for gru_bp against gru on a small fixed problem.
// Passes when GradCheck::checkGrad reports success for the inputs x, hI, Wx, Wh and b
// (presumably by comparing numerically estimated gradients of the forward op with the
// gradients produced by the backprop op - confirm against GradCheck).
TEST_F(DeclarableOpsTests15, gru_bp_1) {
    // Sizes used for the array shapes below (presumably sequence length, batch size,
    // input size and number of units - confirm against the gru op description).
    const int sL = 3;
    const int bS = 2;
    const int nIn = 5;
    const int nOut = 4;

    // Forward inputs; x and hI use explicit values, the weights and biases are filled below.
    NDArray x('c', {sL, bS, nIn},
              {0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ,
               5.5, 6. , 6.5, 7. , 7.5, 8. , 8.5, 9. , 9.5, 10. ,
               10.5, 11. , 11.5, 12. , 12.5, 13. , 13.5, 14. , 14.5, 15.},
              sd::DataType::DOUBLE);
    NDArray hI('c', {bS, nOut}, {-3,-2,-1,0,1,2,3,4}, sd::DataType::DOUBLE);
    NDArray Wx('c', {nIn, 3*nOut}, sd::DataType::DOUBLE);
    NDArray Wh('c', {nOut, 3*nOut}, sd::DataType::DOUBLE);
    NDArray b('c', {3*nOut}, sd::DataType::DOUBLE);

    // Incoming gradient for the op output; it is not filled explicitly in this test.
    NDArray dLdh('c', {sL, bS, nOut}, sd::DataType::DOUBLE);

    // Deterministic fill (linspace(a, b) presumably starts at a with step b).
    Wx.linspace(1,-0.1);
    Wh.linspace(0.2,0.2);
    b.linspace(1,-0.15);

    // The backprop holder takes the forward inputs followed by dLdh; no t/i arguments are used.
    const OpArgsHolder argsHolderFF({&x, &hI, &Wx, &Wh, &b}, {}, {});
    const OpArgsHolder argsHolderBP({&x, &hI, &Wx, &Wh, &b, &dLdh}, {}, {});

    sd::ops::gru opFF;
    sd::ops::gru_bp opBP;

    const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP);

    // Previously the result was computed and then discarded, so this test could never fail
    // on a gradient mismatch; assert on it like the other gradient-check tests in this file do.
    ASSERT_TRUE(isGradCorrect);
}
*/