/******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. * Copyright (c) 2019 Konduit K.K. * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at * https://www.apache.org/licenses/LICENSE-2.0. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ // // Created by raver119 on 20.11.17. // #include "testlayers.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace nd4j; using namespace nd4j::graph; class PlaygroundTests : public testing::Test { public: int numIterations = 3; int poolSize = 10; PlaygroundTests() { printf("\n"); fflush(stdout); } }; TEST_F(PlaygroundTests, test_avx) { nd4j_printf("Optimal level: %i; Binary level: %i;\n", ::optimalLevel(), ::binaryLevel()); } /* TEST_F(PlaygroundTests, test_s_0) { auto x = NDArrayFactory::create('c', {32, 112, 112, 16}); auto y = NDArrayFactory::create('c', {16}); auto z = x.ulike(); std::vector values; Context ctx(1); ctx.setInputArray(0, &x); ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); nd4j::ops::biasadd op; for (int e = 0; e < 10000; e++) { auto timeStart = std::chrono::system_clock::now(); op.execute(&ctx); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); values.emplace_back(outerTime); } std::sort(values.begin(), values.end()); nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); } */ /* TEST_F(PlaygroundTests, test_s_1) { auto x0 = NDArrayFactory::create('c', {32, 7, 7, 176}); auto x1 = x0.ulike(); auto x2 = x0.ulike(); auto x3 = x0.ulike(); auto x4 = x0.ulike(); auto x5 = x0.ulike(); auto y = NDArrayFactory::create(3); auto z = NDArrayFactory::create('c', {32, 7, 7, 1056}); Context ctx(1); ctx.setInputArray(0, &x0); ctx.setInputArray(1, &x1); ctx.setInputArray(2, &x2); ctx.setInputArray(3, &x3); ctx.setInputArray(4, &x4); ctx.setInputArray(5, &x5); ctx.setInputArray(6, &y); ctx.setOutputArray(0, &z); ctx.setBArguments({true}); std::vector values; nd4j::ops::concat op; op.execute(&ctx); for (int e = 0; e < 1000; e++) { auto timeStart = std::chrono::system_clock::now(); op.execute(&ctx); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); values.emplace_back(outerTime); } std::sort(values.begin(), values.end()); nd4j_printf("Time: %lld us;\n", values[values.size() / 2]); } */ /* TEST_F(PlaygroundTests, test_s_1) { auto t = ::runLightBenchmarkSuit(true); delete[] t; } TEST_F(PlaygroundTests, test_s_2) { std::atomic s; s = 0; auto func = PRAGMA_THREADS_FOR { s++; }; samediff::Threads::parallel_for(func, 0, 8192, 1, 4); std::vector values; for (int e = 0; e < 100000; e++) { s = 0; auto timeStart = std::chrono::system_clock::now(); //samediff::Threads::parallel_for(func, 0, 8192, 1, 4); PRAGMA_OMP_PARALLEL_THREADS(4) { s++; } auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); values.emplace_back(outerTime); }; std::sort(values.begin(), values.end()); nd4j_printf("Time: %lld;\n", values[values.size() / 2]); } */ /* TEST_F(PlaygroundTests, test_s_4) { std::atomic f; std::atomic s; std::vector valuesX, valuesY; int iterations = 1000; s = 0; auto func = PRAGMA_THREADS_FOR { s++; }; samediff::Threads::parallel_for(func, 0, 8192, 1, 4); //////// auto x = NDArrayFactory::create('c', {32, 3, 256, 256}); auto z = NDArrayFactory::create('c', {32, 3, 256, 256}); x.linspace(1.0); auto xs0 = x.sizeAt(0); auto xs1 = x.sizeAt(1); auto xs2 = x.sizeAt(2); auto xs3 = x.sizeAt(3); auto buffer = x.bufferAsT(); auto zbuffer = z.bufferAsT(); for (int e = 0; e < iterations; e++) { auto timeStart = std::chrono::system_clock::now(); PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) for (int i = 0; i < xs0; i++) { for (int j = 0; j < xs1; j++) { auto thread_id = omp_get_thread_num(); for (int k = 0; k < xs2; k++) { for (int l = 0; l < xs3; l++) { zbuffer[thread_id] += buffer[i * j + (k*l)] * 2.5f; } } } } auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); valuesX.emplace_back(outerTime); } for (int e = 0; e < iterations; e++) { auto timeStart = std::chrono::system_clock::now(); auto f2d = PRAGMA_THREADS_FOR_2D { for (auto i = start_x; i < stop_x; i++) { for (auto j = start_y; j < stop_y; j++) { for (auto k = 0; k < xs2; k++) { for (auto l = 0; l < xs3; l++) { zbuffer[thread_id] += buffer[i * j + (k * l)] * 2.5f; } } } } }; samediff::Threads::parallel_for(f2d, 0, xs0, 1, 0, xs1, 1); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); valuesY.emplace_back(outerTime); } if (valuesX.size() > 0) { std::sort(valuesX.begin(), valuesX.end()); nd4j_printf("OpenMP time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); } if (valuesY.size() > 0) { std::sort(valuesY.begin(), valuesY.end()); nd4j_printf("Threads time: %lld; Min: %lld; Max: %lld;\n", valuesY[valuesY.size() / 2], valuesY[0], valuesY[valuesY.size() - 1]); } nd4j_printf("Sum: %f\n", z.sumNumber().e(0)); } TEST_F(PlaygroundTests, test_s_5) { auto x = NDArrayFactory::create('c', {32, 1, 28, 28}); std::vector values; auto iterations = 100; auto startX = 0; auto stopX = x.sizeAt(0); auto incX = 1; auto startY = 0; auto stopY = x.sizeAt(1); auto incY = 1; auto numThreads = 4; // number of elements per loop auto delta_x = (stopX - startX); auto delta_y = (stopY - startY); // number of iterations per loop auto itersX = delta_x / incX; auto itersY = delta_y / incY; for (int e = 0; e < iterations; e++) { auto timeStart = std::chrono::system_clock::now(); // picking best fit here auto splitLoop = samediff::ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); auto span = samediff::Span2::build(splitLoop, 0, numThreads, startX, stopX, incX, startY, stopY, incY); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); values.emplace_back(outerTime); } std::sort(values.begin(), values.end()); nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); } TEST_F(PlaygroundTests, test_s_6) { auto x = NDArrayFactory::create('c', {1024 * 1024 * 64}); auto buffer = x.bufferAsT(); auto len = x.lengthOf(); std::vector values; auto iterations = 1000; for (int i = 0; i < iterations; i++) { auto timeStart = std::chrono::system_clock::now(); // picking best fit here for (int e = 0; e < len; e++) { buffer[e] = (buffer[e] + 1.72f) * 3.17f - 0.0012f; } auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); values.emplace_back(outerTime); } std::sort(values.begin(), values.end()); nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); } TEST_F(PlaygroundTests, test_s_3) { std::atomic s; s = 0; auto func = PRAGMA_THREADS_FOR { s++; }; for (int e = 0; e < 10000; e++) { samediff::Threads::parallel_for(func, 0, 8192, 1, 4); } } */ /* TEST_F(PlaygroundTests, test_relubp_1) { auto x = NDArrayFactory::create('c', {128, 64, 224, 224}); auto y = x.ulike(); auto z = x.ulike(); RandomGenerator rng(119, 120); RandomLauncher::fillUniform(LaunchContext::defaultContext(), rng, &x, -1.0, 1.0); RandomLauncher::fillUniform(LaunchContext::defaultContext(), rng, &y, -1.0, 1.0); int iterations = 10; auto timeStart = std::chrono::system_clock::now(); for (int e = 0; e < iterations; e++) ops::helpers::reluDerivative(LaunchContext::defaultContext(), &x, &y, &z); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); auto time = (Nd4jLong) outerTime / iterations; auto bw = (1000000L * (float) (x.lengthOf() * x.sizeOfT()) / time) / 1024 / 1024 / 1024; nd4j_printf("Time: %lld; BW: %f GB/s\n", time, bw); } ////////////////////////////////////////////////////////////////////// TEST_F(PlaygroundTests, my) { int bS=8, iD=32,iH=32,iW=32, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2; int oD,oH,oW; nd4j::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); printf("!!%i, %i, %i\n", oD,oH,oW); NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, nd4j::DataType::DOUBLE); NDArray vol('c', {bS, iC, oD, oH, oW}, nd4j::DataType::DOUBLE); col = 3.77; vol = -10.33; auto variableSpace = new VariableSpace(); auto block = new Context(1, variableSpace, false); // not-in-place auto timeStart = std::chrono::system_clock::now(); nd4j::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); auto timeEnd = std::chrono::system_clock::now(); auto time = std::chrono::duration_cast (timeEnd - timeStart).count(); printf("time: %i \n", time); delete block; delete variableSpace; } TEST_F(PlaygroundTests, my) { int bS=32, iD=32,iH=64,iW=64, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2; int oD,oH,oW; // nd4j::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); nd4j::ops::ConvolutionUtils::calcOutSizeDeconv2D(oH, oW, kH, kW, sH, sW, pH, pW,dH, dW, iH, iW, 0); printf("!!%i, %i, %i\n", oD,oH,oW); // NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, nd4j::DataType::DOUBLE); // NDArray vol('c', {bS, iC, oD, oH, oW}, nd4j::DataType::DOUBLE); NDArray col('c', {bS, iC, kH, kW, iH, iW}, nd4j::DataType::DOUBLE); NDArray im('c', {bS, iC, oH, oW}, nd4j::DataType::DOUBLE); col = 3.77; // vol = -10.33; im = -10.33; auto variableSpace = new VariableSpace(); auto block = new Context(1, variableSpace, false); // not-in-place auto timeStart = std::chrono::system_clock::now(); // nd4j::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); nd4j::ops::helpers::col2im(*col.getContext(), col, im, sH, sW, pH, pW, iH, iW, dH, dW); auto timeEnd = std::chrono::system_clock::now(); auto time = std::chrono::duration_cast (timeEnd - timeStart).count(); printf("time: %i \n", time); delete block; delete variableSpace; } #include TEST_F(PlaygroundTests, my) { const int N = 10000; const Nd4jLong dim0(128), dim1(128), dim2(128); NDArray input('c', {dim0,dim1,dim2}, nd4j::DataType::DOUBLE); NDArray mean('c', {dim1}, nd4j::DataType::DOUBLE); NDArray variance('c', {dim1}, nd4j::DataType::DOUBLE); NDArray gamma('c', {dim1}, nd4j::DataType::DOUBLE); NDArray beta ('c', {dim1}, nd4j::DataType::DOUBLE); NDArray output('c', {dim0,dim1,dim2}, nd4j::DataType::DOUBLE); input.linspace(-100, 0.1); mean.linspace(-50, 0.15); variance.linspace(-5, 0.2); gamma = 1.5; beta = -2.5; // warm up ops::helpers::batchnorm(&input, &mean, &variance, &gamma, &beta, &output, {1}, 1e-5); auto timeStart = std::chrono::system_clock::now(); for (int i = 0; i < N; ++i) ops::helpers::batchnorm(&input, &mean, &variance, &gamma, &beta, &output, {1}, 1e-5); auto timeEnd = std::chrono::system_clock::now(); auto time = std::chrono::duration_cast ((timeEnd - timeStart)/N).count(); printf("time: %li \n", time); } */