cavis/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp

/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author raver119@gmail.com
//

#include "testlayers.h"
#include <ops/declarable/CustomOperations.h>
#include <loops/type_conversions.h>
#include <execution/Threads.h>
#include <chrono>
#include <execution/ThreadPool.h>

using namespace samediff;
using namespace sd;
using namespace sd::ops;
using namespace sd::graph;

class ThreadsTests : public testing::Test {
public:
    ThreadsTests() {
        nd4j_printf("\n","");
    }
};

// Checks ThreadsHelper::numberOfThreads with 6 passed as the first argument and
// a handful of values around 1024 and 2048 as the second one.
TEST_F(ThreadsTests, th_test_1) {
    // each of these values (all below 2048) is expected to resolve to one thread
    const int singleThreadLengths[] = {1023, 1024, 1026, 2043};
    for (const int length : singleThreadLengths) {
        ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, length));
    }

    // 2048 is the first checked value for which two threads are expected
    ASSERT_EQ(2, ThreadsHelper::numberOfThreads(6, 2048));
}


// Checks which loop ThreadsHelper::pickLoop2d selects for a range of
// (threads, first loop size, second loop size) combinations.
TEST_F(ThreadsTests, th_test_2) {
    // pickLoop2d answers with the 1-based index of the loop to split
    const int firstLoop = 1;
    const int secondLoop = 2;

    // here the second loop gives the better split - exactly 32 elements per thread
    ASSERT_EQ(secondLoop, ThreadsHelper::pickLoop2d(32, 48, 1024));
    ASSERT_EQ(secondLoop, ThreadsHelper::pickLoop2d(6, 4, 16384));

    // here the first loop gives the better split - 2 loops/2048 elements per thread
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop2d(32, 64, 1024));
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop2d(6, 6, 16384));

    // neither loop is good enough, but the second loop is too small to be split
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop2d(6, 64, 32));

    // all loops are good enough, but we go with the bigger one
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop2d(2, 64, 32));

    // obviously the split goes into the second loop, to give 1024 elements per thread
    ASSERT_EQ(secondLoop, ThreadsHelper::pickLoop2d(2, 1, 2048));
}

// Checks ThreadsHelper::pickLoop3d and ThreadsHelper::numberOfThreads3d on
// loop sizes described by the original author as typical for convolutions.
TEST_F(ThreadsTests, th_test_3) {
    // 1-based index of the loop pickLoop3d is expected to choose
    const int firstLoop = 1;
    const int secondLoop = 2;
    const int thirdLoop = 3;

    // typical conv cases
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop3d(4, 32, 3, 128));
    ASSERT_EQ(secondLoop, ThreadsHelper::pickLoop3d(4, 1, 128, 64));
    ASSERT_EQ(thirdLoop, ThreadsHelper::pickLoop3d(4, 1, 3, 128));

    // checking for optimal threads for conv inference:
    // every offered thread count is expected to come back unchanged
    const int inferenceThreadCounts[] = {6, 4, 8};
    for (const int maxThreads : inferenceThreadCounts) {
        ASSERT_EQ(maxThreads, ThreadsHelper::numberOfThreads3d(maxThreads, 1, 3, 128));
    }

    // checking for optimal threads for conv training:
    // all 6 threads are expected for both first-loop sizes
    const int trainingFirstLoopSizes[] = {16, 8};
    for (const int firstLoopSize : trainingFirstLoopSizes) {
        ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, firstLoopSize, 3, 128));
    }

    // same thread budget with a shorter third loop
    ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 64));
    ASSERT_EQ(firstLoop, ThreadsHelper::pickLoop3d(6, 8, 3, 64));
}

// Checks thread count and loop choice for a 32 x 112 x 112 loop nest with
// 6 threads, then prints the X range of each span built over the first loop.
TEST_F(ThreadsTests, th_test_5) {
    const int numThreads = 6;

    ASSERT_EQ(numThreads, ThreadsHelper::numberOfThreads3d(6, 32, 112, 112));

    ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 32, 112, 112));

    // informational output only - the printed spans are not asserted on
    for (int threadId = 0; threadId < numThreads; ++threadId) {
        auto span = Span3::build(1, threadId, 6, 0, 32, 1, 0, 112, 1, 0, 112, 1);

        nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX());
    }
}

// Checks ThreadsHelper::numberOfThreads2d / pickLoop2d on conv-like and
// prime-sized 2d loop nests, then prints the spans produced for two of them.
TEST_F(ThreadsTests, th_test_4) {
    // One (thread budget, first loop size, second loop size) combination.
    struct LoopNest {
        int maxThreads;
        int itersX;
        int itersY;
    };

    // Prints the X range of each of the 6 spans built over the first loop
    // of an [itersX, itersY] nest; nothing is asserted on the output.
    const auto dumpSpans = [](int itersX, int itersY) {
        for (int threadId = 0; threadId < 6; ++threadId) {
            auto span = Span2::build(1, threadId, 6, 0, itersX, 1, 0, itersY, 1);

            nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX());
        }
    };

    // typical conv cases: the whole thread budget is expected to be used
    const LoopNest convNests[] = {{2, 32, 3}, {4, 32, 3}, {6, 32, 1}, {8, 16, 64}};
    for (const auto &nest : convNests) {
        ASSERT_EQ(nest.maxThreads, ThreadsHelper::numberOfThreads2d(nest.maxThreads, nest.itersX, nest.itersY));
    }

    ASSERT_EQ(1, ThreadsHelper::pickLoop2d(4, 32, 1));
    ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17));

    // primes edge cases
    const int primeBudgets[] = {6, 8};
    for (const int maxThreads : primeBudgets) {
        ASSERT_EQ(maxThreads, ThreadsHelper::numberOfThreads2d(maxThreads, 19, 17));
    }

    ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17));

    dumpSpans(19, 17);

    nd4j_printf("-----------------------\n","");
    dumpSpans(32, 3);
}


// Exhaustive coverage check for 2d spans: for every first-loop size in
// [1, 128], second-loop size in [1, 64] and thread budget in [1, 64], the
// spans built for all threads must add up to the full length of the loop
// that pickLoop2d selected.
TEST_F(ThreadsTests, test_span_converage_1) {
    for (int itersX = 1; itersX <= 128; itersX++) {
        for (int itersY = 1; itersY <= 64; itersY++) {
            for (int maxThreads = 1; maxThreads <= 64; maxThreads++) {

                auto threads = ThreadsHelper::numberOfThreads2d(maxThreads, itersX, itersY);
                auto loop = ThreadsHelper::pickLoop2d(threads, itersX, itersY);

                // purely informational: report nests that ended up single-threaded
                // although several threads were offered and both loops have > 1 iteration
                const bool collapsedToOne = maxThreads > 1 && threads == 1;
                const bool bothLoopsSplittable = itersX > 1 && itersY > 1;
                if (collapsedToOne && bothLoopsSplittable) {
                    nd4j_printf("Got 1 thread for [%i, %i] loop; initial max threads: %i\n", itersX, itersY, maxThreads);
                }

                // total number of iterations covered by all spans along the chosen loop
                int covered = 0;
                for (int threadId = 0; threadId < threads; threadId++) {
                    auto span = Span2::build(loop, threadId, threads, 0, itersX, 1, 0, itersY, 1);

                    // only loop 1 (X) and loop 2 (Y) are valid answers
                    if (loop != 1 && loop != 2)
                        throw std::runtime_error("Bad loop!");

                    covered += (loop == 1) ? (span.stopX() - span.startX())
                                           : (span.stopY() - span.startY());
                }

                const int expected = (loop == 1) ? itersX : itersY;
                ASSERT_EQ(expected, covered);
            }
        }
    }
}

// Brute-force validation of the 2d parallel_for: for every pair of loop sizes
// and every listed thread count, the body must run exactly e * i times.
// NOTE: the test is currently switched off by the unconditional early return
// below, so nothing after it is executed.
TEST_F(ThreadsTests, validation_test_2d_1) {
    if (1 > 0)
        return;

    const std::vector<int> threadCounts = {1, 2, 4, 6, 8, 12, 16, 20, 32, 48, 64};

    for (int e = 1; e < 1024; e++) {
        for (int i = 1; i <= 1024; i++ ) {
            for (auto numThreads : threadCounts) {
                // shared counter, incremented once per visited (x, y) pair
                std::atomic<int64_t> visited(0);

                auto countCells = PRAGMA_THREADS_FOR_2D {
                    for (auto ix = start_x; ix < stop_x; ix += inc_x) {
                        for (auto iy = start_y; iy < stop_y; iy += inc_y) {
                            visited++;
                        }
                    }
                };

                samediff::Threads::parallel_for(countCells, 0, e, 1, 0, i, 1, numThreads, true);

                const auto expectedCells = e * i;
                ASSERT_EQ(expectedCells, visited.load());
            }
        }

        nd4j_printf("Finished iteration %i\n", e);
    }
}

// Checks Threads::parallel_long: every chunk counts its own iterations one by
// one, partial results are combined by addition, and the total over
// [0, 8192) with step 1 and 4 requested threads must be 8192.
TEST_F(ThreadsTests, reduction_test_1) {

    // per-chunk worker: counts the iterations between start and stop
    auto countIterations = PRAGMA_REDUCE_LONG {
        int64_t counted = 0;

        for (auto idx = start; idx < stop; idx++) {
            counted++;
        }

        return counted;
    };

    // combiner: adds a new partial result to the accumulated one
    auto addPartials = LAMBDA_AL {return _old + _new;};

    auto total = samediff::Threads::parallel_long(countIterations, addPartials, 0, 8192, 1, 4);
    ASSERT_EQ(8192, total);
}

/*
TEST_F(ThreadsTests, basic_test_1) {
    if (!Environment::getInstance()->isCPU())
        return;

    auto instance = samediff::ThreadPool::getInstance();

    auto array = NDArrayFactory::create<float>('c', {512, 768});
    auto like = array.like();
    auto buffer = array.bufferAsT<float>();
    auto lbuffer = like.bufferAsT<float>();

    auto func = PRAGMA_THREADS_FOR {
        PRAGMA_OMP_SIMD
        for (uint64_t e = start; e < stop; e += increment) {
            buffer[e] += 1.0f;
        }
    };

    auto timeStartThreads = std::chrono::system_clock::now();
    samediff::Threads::parallel_for(func, 0, array.lengthOf());
    auto timeEndThreads = std::chrono::system_clock::now();
    auto outerTimeThreads = std::chrono::duration_cast<std::chrono::microseconds> (timeEndThreads - timeStartThreads).count();

    auto timeStartOmp = std::chrono::system_clock::now();
    PRAGMA_OMP_PARALLEL_FOR_SIMD
    for (uint64_t e = 0; e < array.lengthOf(); e ++) {
        lbuffer[e] += 1.0f;
    }
    auto timeEndOmp = std::chrono::system_clock::now();
    auto outerTimeOmp = std::chrono::duration_cast<std::chrono::microseconds> (timeEndOmp - timeStartOmp).count();

    ASSERT_NEAR((float) array.lengthOf(), array.sumNumber().e<float>(0), 1e-5f);

    nd4j_printf("Threads time: %lld us; OMP time: %lld us; %p\n", outerTimeThreads, outerTimeOmp, instance)
}
 */
[WIP] Weekly update of repo (#8390) * [WIP] Fix compilation after nd4j changes (#37) * Fix compilation. * Some tests fixed * Disable tests temporarily. * Restored test * Tests restored. * Test restored. * [WIP] perf tests (#40) * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * Shyrma bnorm bp (#41) Batchnorm backprop mkldnn * Add SameDiff memory reuse memory manager (array cache) (#39) * Attention op comments Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr - first pass Signed-off-by: AlexDBlack <blacka101@gmail.com> * Tweak array cache for use with SameDiff identity arrays Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr javadoc and properly get max memory Signed-off-by: AlexDBlack <blacka101@gmail.com> * LRU cache policy + add tests Signed-off-by: AlexDBlack <blacka101@gmail.com> * Fixes Signed-off-by: AlexDBlack <blacka101@gmail.com> * Resize arrays internally if required for ArrayCacheMemoryMgr Signed-off-by: AlexDBlack <blacka101@gmail.com> * Test improvement Signed-off-by: AlexDBlack <blacka101@gmail.com> * Small polish Signed-off-by: AlexDBlack <blacka101@gmail.com> * SameDiff op runtime benchmarking listener (#42) Signed-off-by: AlexDBlack <blacka101@gmail.com> * INLINE_LOOPS for windows Signed-off-by: raver119 <raver119@gmail.com> * [WIP] ThreadPool (#8) This PR removes OpenMP use in 95% of cases 2019-11-13 15:15:18 +01:00			`/*******************************************************************************`
			`* Copyright (c) 2015-2018 Skymind, Inc.`
			`*`
			`* This program and the accompanying materials are made available under the`
			`* terms of the Apache License, Version 2.0 which is available at`
			`* https://www.apache.org/licenses/LICENSE-2.0.`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* License for the specific language governing permissions and limitations`
			`* under the License.`
			`*`
			`* SPDX-License-Identifier: Apache-2.0`
			`******************************************************************************/`

			`//`
			`// @author raver119@gmail.com`
			`//`

			`#include "testlayers.h"`
			`#include <ops/declarable/CustomOperations.h>`
			`#include <loops/type_conversions.h>`
			`#include <execution/Threads.h>`
			`#include <chrono>`
			`#include <execution/ThreadPool.h>`

			`using namespace samediff;`
libnd4j polishing (#273) * initial set of include changes Signed-off-by: raver119 <raver119@gmail.com> * one more tweak Signed-off-by: raver119 <raver119@gmail.com> * few more rearrangements Signed-off-by: raver119 <raver119@gmail.com> * few more rearrangements Signed-off-by: raver119 <raver119@gmail.com> * few more rearrangements Signed-off-by: raver119 <raver119@gmail.com> * cuda includes rearrangements Signed-off-by: raver119 <raver119@gmail.com> * java update Signed-off-by: raver119 <raver119@gmail.com> * = namespace changed to sd - few CMake variables renamed with SD_ prefix Signed-off-by: raver119 <raver119@gmail.com> * java update Signed-off-by: raver119 <raver119@gmail.com> * LoopKind minor fix Signed-off-by: raver119 <raver119@gmail.com> * few more changes Signed-off-by: raver119 <raver119@gmail.com> * few more changes Signed-off-by: raver119 <raver119@gmail.com> * few more changes Signed-off-by: raver119 <raver119@gmail.com> * sanitizer is optional now Signed-off-by: raver119 <raver119@gmail.com> * dev tests updated Signed-off-by: raver119 <raver119@gmail.com> * few more changes Signed-off-by: raver119 <raver119@gmail.com> * last update Signed-off-by: raver119 <raver119@gmail.com> * java update Signed-off-by: raver119 <raver119@gmail.com> 2020-03-02 10:49:41 +01:00			`using namespace sd;`
			`using namespace sd::ops;`
			`using namespace sd::graph;`
[WIP] Weekly update of repo (#8390) * [WIP] Fix compilation after nd4j changes (#37) * Fix compilation. * Some tests fixed * Disable tests temporarily. * Restored test * Tests restored. * Test restored. * [WIP] perf tests (#40) * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * Shyrma bnorm bp (#41) Batchnorm backprop mkldnn * Add SameDiff memory reuse memory manager (array cache) (#39) * Attention op comments Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr - first pass Signed-off-by: AlexDBlack <blacka101@gmail.com> * Tweak array cache for use with SameDiff identity arrays Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr javadoc and properly get max memory Signed-off-by: AlexDBlack <blacka101@gmail.com> * LRU cache policy + add tests Signed-off-by: AlexDBlack <blacka101@gmail.com> * Fixes Signed-off-by: AlexDBlack <blacka101@gmail.com> * Resize arrays internally if required for ArrayCacheMemoryMgr Signed-off-by: AlexDBlack <blacka101@gmail.com> * Test improvement Signed-off-by: AlexDBlack <blacka101@gmail.com> * Small polish Signed-off-by: AlexDBlack <blacka101@gmail.com> * SameDiff op runtime benchmarking listener (#42) Signed-off-by: AlexDBlack <blacka101@gmail.com> * INLINE_LOOPS for windows Signed-off-by: raver119 <raver119@gmail.com> * [WIP] ThreadPool (#8) This PR removes OpenMP use in 95% of cases 2019-11-13 15:15:18 +01:00
			`class ThreadsTests : public testing::Test {`
			`public:`
- 3d loops parallelism fix (#135) - additional check for maxMasterThreads <= maxThreads Signed-off-by: raver119 <raver119@gmail.com> 2019-12-19 14:50:08 +01:00			`ThreadsTests() {`
			`nd4j_printf("\n","");`
			`}`
[WIP] Weekly update of repo (#8390) * [WIP] Fix compilation after nd4j changes (#37) * Fix compilation. * Some tests fixed * Disable tests temporarily. * Restored test * Tests restored. * Test restored. * [WIP] perf tests (#40) * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * Shyrma bnorm bp (#41) Batchnorm backprop mkldnn * Add SameDiff memory reuse memory manager (array cache) (#39) * Attention op comments Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr - first pass Signed-off-by: AlexDBlack <blacka101@gmail.com> * Tweak array cache for use with SameDiff identity arrays Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr javadoc and properly get max memory Signed-off-by: AlexDBlack <blacka101@gmail.com> * LRU cache policy + add tests Signed-off-by: AlexDBlack <blacka101@gmail.com> * Fixes Signed-off-by: AlexDBlack <blacka101@gmail.com> * Resize arrays internally if required for ArrayCacheMemoryMgr Signed-off-by: AlexDBlack <blacka101@gmail.com> * Test improvement Signed-off-by: AlexDBlack <blacka101@gmail.com> * Small polish Signed-off-by: AlexDBlack <blacka101@gmail.com> * SameDiff op runtime benchmarking listener (#42) Signed-off-by: AlexDBlack <blacka101@gmail.com> * INLINE_LOOPS for windows Signed-off-by: raver119 <raver119@gmail.com> * [WIP] ThreadPool (#8) This PR removes OpenMP use in 95% of cases 2019-11-13 15:15:18 +01:00			`};`

			`TEST_F(ThreadsTests, th_test_1) {`
			`ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1023));`
			`ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1024));`
			`ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1026));`

			`ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 2043));`
			`ASSERT_EQ(2, ThreadsHelper::numberOfThreads(6, 2048));`
			`}`


			`TEST_F(ThreadsTests, th_test_2) {`
			`// in this case we'll get better split over second loop - exactly 32 elements per thread`
			`ASSERT_EQ(2, ThreadsHelper::pickLoop2d(32, 48, 1024));`
			`ASSERT_EQ(2, ThreadsHelper::pickLoop2d(6, 4, 16384));`

			`// in this case we'll get better split over first loop - 2 loops/2048 elements per thread`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(32, 64, 1024));`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 6, 16384));`

			`// in this case none of loops are good enough, but second loop is too small for split`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 64, 32));`

			`// all loops are good enough, but we go with bigger one, since small`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(2, 64, 32));`

			`// obviously split goes into second loop, to give 1024 elements per thread`
			`ASSERT_EQ(2, ThreadsHelper::pickLoop2d(2, 1, 2048));`
			`}`

			`TEST_F(ThreadsTests, th_test_3) {`
			`// typical conv cases`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop3d(4, 32, 3, 128));`
			`ASSERT_EQ(2, ThreadsHelper::pickLoop3d(4, 1, 128, 64));`
			`ASSERT_EQ(3, ThreadsHelper::pickLoop3d(4, 1, 3, 128));`

			`// checking for optimal threads for conv inference`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 1, 3, 128));`
			`ASSERT_EQ(4, ThreadsHelper::numberOfThreads3d(4, 1, 3, 128));`
			`ASSERT_EQ(8, ThreadsHelper::numberOfThreads3d(8, 1, 3, 128));`

			`// checking for optimal threads for conv training`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 16, 3, 128));`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 128));`


			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 64));`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 8, 3, 64));`
			`}`

- 3d loops parallelism fix (#135) - additional check for maxMasterThreads <= maxThreads Signed-off-by: raver119 <raver119@gmail.com> 2019-12-19 14:50:08 +01:00			`TEST_F(ThreadsTests, th_test_5) {`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 32, 112, 112));`

			`ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 32, 112, 112));`

			`for (auto e = 0; e < 6; e++) {`
			`auto span = Span3::build(1, e, 6, 0, 32, 1, 0, 112, 1, 0, 112, 1);`

			`nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX());`
			`}`
			`}`

[WIP] Weekly update of repo (#8390) * [WIP] Fix compilation after nd4j changes (#37) * Fix compilation. * Some tests fixed * Disable tests temporarily. * Restored test * Tests restored. * Test restored. * [WIP] perf tests (#40) * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * special maxpool test Signed-off-by: raver119 <raver119@gmail.com> * Shyrma bnorm bp (#41) Batchnorm backprop mkldnn * Add SameDiff memory reuse memory manager (array cache) (#39) * Attention op comments Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr - first pass Signed-off-by: AlexDBlack <blacka101@gmail.com> * Tweak array cache for use with SameDiff identity arrays Signed-off-by: AlexDBlack <blacka101@gmail.com> * ArrayCacheMemoryMgr javadoc and properly get max memory Signed-off-by: AlexDBlack <blacka101@gmail.com> * LRU cache policy + add tests Signed-off-by: AlexDBlack <blacka101@gmail.com> * Fixes Signed-off-by: AlexDBlack <blacka101@gmail.com> * Resize arrays internally if required for ArrayCacheMemoryMgr Signed-off-by: AlexDBlack <blacka101@gmail.com> * Test improvement Signed-off-by: AlexDBlack <blacka101@gmail.com> * Small polish Signed-off-by: AlexDBlack <blacka101@gmail.com> * SameDiff op runtime benchmarking listener (#42) Signed-off-by: AlexDBlack <blacka101@gmail.com> * INLINE_LOOPS for windows Signed-off-by: raver119 <raver119@gmail.com> * [WIP] ThreadPool (#8) This PR removes OpenMP use in 95% of cases 2019-11-13 15:15:18 +01:00			`TEST_F(ThreadsTests, th_test_4) {`
			`// typical conv cases`
			`ASSERT_EQ(2, ThreadsHelper::numberOfThreads2d(2, 32, 3));`
			`ASSERT_EQ(4, ThreadsHelper::numberOfThreads2d(4, 32, 3));`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 32, 1));`
			`ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 16, 64));`

			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(4, 32, 1));`
			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17));`

			`// primes edge cases`
			`ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 19, 17));`
			`ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 19, 17));`

			`ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17));`

			`for (auto e = 0; e < 6; e++) {`
			`auto span = Span2::build(1, e, 6, 0, 19, 1, 0, 17, 1);`

			`nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX());`
			`}`

			`nd4j_printf("-----------------------\n","");`
			`for (auto e = 0; e < 6; e++) {`
			`auto span = Span2::build(1, e, 6, 0, 32, 1, 0, 3, 1);`

			`nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX());`
			`}`
			`}`


			`TEST_F(ThreadsTests, test_span_converage_1) {`
			`for (int b = 1; b <= 128; b++) {`
			`for (int c = 1; c <= 64; c++) {`
			`for (int t = 1; t <= 64; t++) {`

			`auto threads = ThreadsHelper::numberOfThreads2d(t, b, c);`
			`auto loop = ThreadsHelper::pickLoop2d(threads, b, c);`

			`if (t > 1 && threads == 1 && (b > 1 && c > 1)) {`
			`nd4j_printf("Got 1 thread for [%i, %i] loop; initial max threads: %i\n", b, c, t)`
			`}`

			`auto sum = 0;`
			`for (auto a = 0; a < threads; a++) {`
			`auto span = Span2::build(loop, a,threads, 0, b, 1, 0, c, 1);`

			`if (loop == 1)`
			`sum += span.stopX() - span.startX();`
			`else if (loop == 2)`
			`sum += span.stopY() - span.startY();`
			`else`
			`throw std::runtime_error("Bad loop!");`
			`}`

			`if (loop == 1)`
			`ASSERT_EQ(b, sum);`
			`else`
			`ASSERT_EQ(c, sum);`
			`}`
			`}`
			`}`
			`}`

			`TEST_F(ThreadsTests, validation_test_2d_1) {`
			`if (1 > 0)`
			`return;`

			`std::vector<int> threads({1, 2, 4, 6, 8, 12, 16, 20, 32, 48, 64});`

			`for (int e = 1; e < 1024; e++) {`
			`for (int i = 1; i <= 1024; i++ ) {`
			`for (auto t:threads) {`
			`std::atomic<int64_t> sum;`
			`sum.store(0);`

			`auto func = PRAGMA_THREADS_FOR_2D {`
			`for (auto x = start_x; x < stop_x; x += inc_x) {`
			`for (auto y = start_y; y < stop_y; y += inc_y) {`
			`sum++;`
			`}`
			`}`
			`};`

			`samediff::Threads::parallel_for(func, 0, e, 1, 0, i, 1, t, true);`

			`ASSERT_EQ(e * i, sum.load());`
			`}`
			`}`

			`nd4j_printf("Finished iteration %i\n", e);`
			`}`
			`}`

			`TEST_F(ThreadsTests, reduction_test_1) {`

			`auto func = PRAGMA_REDUCE_LONG {`
			`int64_t sum = 0;`

			`for (auto e = start; e < stop; e++) {`
			`sum++;`
			`};`

			`return sum;`
			`};`

			`auto sum = samediff::Threads::parallel_long(func, LAMBDA_AL {return _old + _new;}, 0, 8192, 1, 4);`
			`ASSERT_EQ(8192, sum);`
			`}`

			`/*`
			`TEST_F(ThreadsTests, basic_test_1) {`
			`if (!Environment::getInstance()->isCPU())`
			`return;`

			`auto instance = samediff::ThreadPool::getInstance();`

			`auto array = NDArrayFactory::create<float>('c', {512, 768});`
			`auto like = array.like();`
			`auto buffer = array.bufferAsT<float>();`
			`auto lbuffer = like.bufferAsT<float>();`

			`auto func = PRAGMA_THREADS_FOR {`
			`PRAGMA_OMP_SIMD`
			`for (uint64_t e = start; e < stop; e += increment) {`
			`buffer[e] += 1.0f;`
			`}`
			`};`

			`auto timeStartThreads = std::chrono::system_clock::now();`
			`samediff::Threads::parallel_for(func, 0, array.lengthOf());`
			`auto timeEndThreads = std::chrono::system_clock::now();`
			`auto outerTimeThreads = std::chrono::duration_cast<std::chrono::microseconds> (timeEndThreads - timeStartThreads).count();`

			`auto timeStartOmp = std::chrono::system_clock::now();`
			`PRAGMA_OMP_PARALLEL_FOR_SIMD`
			`for (uint64_t e = 0; e < array.lengthOf(); e ++) {`
			`lbuffer[e] += 1.0f;`
			`}`
			`auto timeEndOmp = std::chrono::system_clock::now();`
			`auto outerTimeOmp = std::chrono::duration_cast<std::chrono::microseconds> (timeEndOmp - timeStartOmp).count();`

			`ASSERT_NEAR((float) array.lengthOf(), array.sumNumber().e<float>(0), 1e-5f);`

			`nd4j_printf("Threads time: %lld us; OMP time: %lld us; %p\n", outerTimeThreads, outerTimeOmp, instance)`
			`}`
			`*/`