/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author raver119@gmail.com
//

// NOTE: the original include targets were lost during extraction; the headers below
// are reconstructed from the identifiers used in this file and may differ from the
// original list.
#include <execution/Threads.h>
#include <execution/ThreadPool.h>
#include <helpers/logger.h>
#include <math/templatemath.h>
#include <system/Environment.h>
#include <cmath>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

namespace sd {

    int ThreadsHelper::numberOfThreads(int maxThreads, uint64_t numberOfElements) {
        // let's see how many threads we actually need first
        auto optimalThreads = sd::math::nd4j_max<int>(1, numberOfElements / 1024);

        // now return the smallest value
        return sd::math::nd4j_min(optimalThreads, maxThreads);
    }

    Span3::Span3(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) {
        _startX = startX;
        _startY = startY;
        _startZ = startZ;
        _stopX = stopX;
        _stopY = stopY;
        _stopZ = stopZ;
        _incX = incX;
        _incY = incY;
        _incZ = incZ;
    }

    Span3 Span3::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) {
        switch (loop) {
            case 1: {
                    auto span = (stopX - startX) / numThreads;
                    auto s = span * threadID;
                    auto e = s + span;
                    if (threadID == numThreads - 1)
                        e = stopX;

                    return Span3(s, e, incX, startY, stopY, incY, startZ, stopZ, incZ);
                }
                break;
            case 2: {
                    auto span = (stopY - startY) / numThreads;
                    auto s = span * threadID;
                    auto e = s + span;
                    if (threadID == numThreads - 1)
                        e = stopY;

                    return Span3(startX, stopX, incX, s, e, incY, startZ, stopZ, incZ);
                }
                break;
            case 3: {
                    auto span = (stopZ - startZ) / numThreads;
                    auto s = span * threadID;
                    auto e = s + span;
                    if (threadID == numThreads - 1)
                        e = stopZ;

                    return Span3(startX, stopX, incX, startY, stopY, incY, s, e, incZ);
                }
                break;
            default:
                throw std::runtime_error("Span3::build: unsupported loop axis");
        }

        return Span3(startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);
    }

    Span::Span(int64_t startX, int64_t stopX, int64_t incX) {
        _startX = startX;
        _stopX = stopX;
        _incX = incX;
    }

    Span Span::build(uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX) {
        auto span = (stopX - startX) / numThreads;
        auto s = span * threadID;
        auto e = s + span;
        if (threadID == numThreads - 1)
            e = stopX;

        return Span(s, e, incX);
    }

    Span2::Span2(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) {
        _startX = startX;
        _startY = startY;
        _stopX = stopX;
        _stopY = stopY;
        _incX = incX;
        _incY = incY;
    }

    Span2 Span2::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) {
        switch (loop) {
            case 1: {
                    auto span = (stopX - startX) / numThreads;
                    auto s = span * threadID;
                    auto e = s + span;
                    if (threadID == numThreads - 1)
                        e = stopX;

                    return Span2(s, e, incX, startY, stopY, incY);
                }
                break;
            case 2: {
                    auto span = (stopY - startY) / numThreads;
                    auto s = span * threadID;
                    auto e = s + span;
                    if (threadID == numThreads - 1)
                        e = stopY;

                    return Span2(startX, stopX, incX, s, e, incY);
                }
                break;
            default:
                throw std::runtime_error("Span2::build: unsupported loop axis");
        }
    }
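    // Illustrative example (not part of the original source): Span::build(threadID, 4, 0, 103, 1)
    // yields the sub-ranges [0, 25), [25, 50), [50, 75) and [75, 103) for threads 0..3 -
    // the integer division leaves a remainder, and the last thread picks up the tail.
    // Span2::build and Span3::build apply the same scheme along whichever loop axis was selected.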
    int64_t Span::startX() const {
        return _startX;
    }

    int64_t Span::stopX() const {
        return _stopX;
    }

    int64_t Span::incX() const {
        return _incX;
    }

    int64_t Span2::startX() const {
        return _startX;
    }

    int64_t Span2::startY() const {
        return _startY;
    }

    int64_t Span2::stopX() const {
        return _stopX;
    }

    int64_t Span2::stopY() const {
        return _stopY;
    }

    int64_t Span2::incX() const {
        return _incX;
    }

    int64_t Span2::incY() const {
        return _incY;
    }

    int64_t Span3::startX() const {
        return _startX;
    }

    int64_t Span3::startY() const {
        return _startY;
    }

    int64_t Span3::startZ() const {
        return _startZ;
    }

    int64_t Span3::stopX() const {
        return _stopX;
    }

    int64_t Span3::stopY() const {
        return _stopY;
    }

    int64_t Span3::stopZ() const {
        return _stopZ;
    }

    int64_t Span3::incX() const {
        return _incX;
    }

    int64_t Span3::incY() const {
        return _incY;
    }

    int64_t Span3::incZ() const {
        return _incZ;
    }

    int ThreadsHelper::pickLoop2d(int numThreads, uint64_t itersX, uint64_t itersY) {
        // if one of the dimensions is definitely too small - we just pick the other one
        if (itersX < numThreads && itersY >= numThreads)
            return 2;
        if (itersY < numThreads && itersX >= numThreads)
            return 1;

        // next step - we pick the most balanced dimension
        auto remX = itersX % numThreads;
        auto remY = itersY % numThreads;
        auto splitY = itersY / numThreads;

        // if some dimension has no remainder - we pick that one, since it gives the most balanced work distribution
        if (remX == 0)
            return 1;
        if (remY == 0)
            return 2;

        // if no loop divides evenly - we pick the one with the smaller remainder
        if (remX < remY)
            return 1;
        if (remY < remX && splitY >= 64) // we don't want too small splits over the last dimension, or vectorization will fail
            return 2;

        // if the loops are equally sized - give preference to the first loop
        return 1;
    }

    static int threads_(int maxThreads, uint64_t elements) {
        if (elements == maxThreads) {
            return maxThreads;
        } else if (elements > maxThreads) {
            // if we have full load across threads, or at least a third of the threads can be utilized
            auto rem = elements % maxThreads;
            if (rem == 0 || rem >= maxThreads / 3)
                return maxThreads;
            else
                return threads_(maxThreads - 1, elements);
        } else if (elements < maxThreads) {
            return elements;
        }

        return 1;
    }

    int ThreadsHelper::numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y) {
        // in some cases there's nothing to think about, part 1
        if (iters_x < maxThreads && iters_y < maxThreads)
            return sd::math::nd4j_max(iters_x, iters_y);

        auto remX = iters_x % maxThreads;
        auto remY = iters_y % maxThreads;

        // in some cases there's nothing to think about, part 2
        if ((iters_x >= maxThreads && remX == 0) || (iters_y >= maxThreads && remY == 0))
            return maxThreads;

        // at this point we know that no loop perfectly matches the number of our threads,
        // so let's pick something as equal as possible
        if (iters_x > maxThreads || iters_y > maxThreads)
            return maxThreads;
        else
            return numberOfThreads2d(maxThreads - 1, iters_x, iters_y);
    }
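    // Illustrative example (not part of the original source): with maxThreads = 8,
    // iters_x = 100 and iters_y = 64, the second loop divides evenly (64 % 8 == 0), so
    // numberOfThreads2d() returns 8 and pickLoop2d() selects the second loop for splitting.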
    int ThreadsHelper::numberOfThreads3d(int maxThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) {
        // we don't want to run underloaded threads
        if (itersX * itersY * itersZ <= 32)
            return 1;

        auto remX = itersX % maxThreads;
        auto remY = itersY % maxThreads;
        auto remZ = itersZ % maxThreads;

        // if we have perfect balance across one of the dimensions - just go for it
        if ((itersX >= maxThreads && remX == 0) || (itersY >= maxThreads && remY == 0) || (itersZ >= maxThreads && remZ == 0))
            return maxThreads;

        int threadsX = 0, threadsY = 0, threadsZ = 0;

        // now we look into the possible number of threads for each dimension
        threadsX = threads_(maxThreads, itersX);
        threadsY = threads_(maxThreads, itersY);
        threadsZ = threads_(maxThreads, itersZ);

        // we want to split as close to the outer loop as possible, so checking it out first
        if (threadsX >= threadsY && threadsX >= threadsZ)
            return threadsX;
        else if (threadsY >= threadsX && threadsY >= threadsZ)
            return threadsY;
        else if (threadsZ >= threadsX && threadsZ >= threadsY)
            return threadsZ;

        return 1;
    }

    int ThreadsHelper::pickLoop3d(int numThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) {
        auto remX = itersX % numThreads;
        auto remY = itersY % numThreads;
        auto remZ = itersZ % numThreads;

        auto splitX = itersX / numThreads;
        auto splitY = itersY / numThreads;
        auto splitZ = itersZ / numThreads;

        // if some dimension has no remainder - we pick that one, since it gives the most balanced work distribution
        if (remX == 0)
            return 1;
        else if (remY == 0)
            return 2;
        else if (remZ == 0) // TODO: we don't want too small splits over the last dimension? or do we?
            return 3;

        if (itersX > numThreads)
            return 1;
        else if (itersY > numThreads)
            return 2;
        else if (itersZ > numThreads)
            return 3;

        return 1;
    }

#ifdef _OPENMP

    std::mutex Threads::gThreadmutex;
    uint64_t Threads::_nFreeThreads = sd::Environment::getInstance()->maxThreads();

    bool Threads::tryAcquire(int numThreads) {
        std::lock_guard<std::mutex> lock(gThreadmutex);

        auto nThreads = _nFreeThreads - numThreads;
        if (nThreads >= 1) {
            _nFreeThreads = nThreads;
            return true;
        }

        return false;
    }

    bool Threads::freeThreads(int numThreads) {
        std::lock_guard<std::mutex> lock(gThreadmutex);

        _nFreeThreads += numThreads;

        // check whether we released a correct number of threads
        return _nFreeThreads > sd::Environment::getInstance()->maxThreads();
    }

#endif

    int Threads::parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) {
        if (start > stop)
            throw std::runtime_error("Threads::parallel_for got start > stop");

        auto delta = (stop - start) / increment;

        if (numThreads > delta)
            numThreads = delta;

        if (numThreads == 0)
            return 0;

        // shortcut
        if (numThreads == 1) {
            function(0, start, stop, increment);
            return 1;
        }

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads)
            for (auto e = start; e < stop; e += increment) {
                function(omp_get_thread_num(), e, e + 1, 1);
            }
            freeThreads(numThreads);
            return numThreads;
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            function(0, start, stop, increment);

            // we tell that the parallelism request was declined
            return 1;
        }
#else
        sd::Environment::getInstance()->maxThreads();
        auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads);
        if (ticket != nullptr) {
            // if we got our threads - we'll run our jobs here
            auto span = delta / numThreads;

            for (uint32_t e = 0; e < numThreads; e++) {
                auto start_ = span * e + start;
                auto stop_ = start_ + span;

                // the last thread will process the tail
                if (e == numThreads - 1)
                    stop_ = stop;

                // putting the task into the queue for a given thread
                ticket->enqueue(e, numThreads, function, start_, stop_, increment);
            }

            // block and wait till all threads have finished the job
            ticket->waitAndRelease();

            // we tell that the parallelism request succeeded
            return numThreads;
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            function(0, start, stop, increment);

            // we tell that the parallelism request was declined
            return 1;
        }
#endif
    }
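    // Usage sketch (illustrative, not part of the original source; `input`, `output` and
    // `length` are hypothetical). A FUNC_1D callable receives (thread_id, start, stop,
    // increment) and processes its own sub-range:
    //
    //   Threads::parallel_tad([&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) {
    //       for (auto i = start; i < stop; i += increment)
    //           output[i] = input[i] * 2;
    //   }, 0, length, 1, 4);
    //
    // The return value is the number of threads that actually picked up the work
    // (1 if the function ran inline in the calling thread).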
    int Threads::parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) {
        if (start > stop)
            throw std::runtime_error("Threads::parallel_for got start > stop");

        auto delta = (stop - start);

        // in some cases we just fire the func as is
        if (delta == 0 || numThreads == 1) {
            function(0, start, stop, increment);
            return 1;
        }

        auto numElements = delta / increment;

        // we decide what the optimal number of threads is here, and execute it in parallel_tad.
        numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements);
        return parallel_tad(function, start, stop, increment, numThreads);
    }

    int Threads::parallel_for(FUNC_2D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, uint64_t numThreads, bool debug) {
        if (startX > stopX)
            throw std::runtime_error("Threads::parallel_for got startX > stopX");

        if (startY > stopY)
            throw std::runtime_error("Threads::parallel_for got startY > stopY");

        // number of elements per loop
        auto delta_x = (stopX - startX);
        auto delta_y = (stopY - startY);

        // number of iterations per loop
        auto itersX = delta_x / incX;
        auto itersY = delta_y / incY;

        // total number of iterations
        auto iters_t = itersX * itersY;

        // check whether the number of requested threads should be lowered
        numThreads = ThreadsHelper::numberOfThreads2d(numThreads, itersX, itersY);

        // basic shortcut for no-threading cases
        if (numThreads == 1) {
            function(0, startX, stopX, incX, startY, stopY, incY);
            return 1;
        }

        // we have a couple of scenarios:
        // either we split the workload along the 1st loop, or the 2nd
        auto splitLoop = ThreadsHelper::pickLoop2d(numThreads, itersX, itersY);

        // in debug mode we execute things in place, without any threads
        if (debug) {
            for (int e = 0; e < numThreads; e++) {
                auto span = Span2::build(splitLoop, e, numThreads, startX, stopX, incX, startY, stopY, incY);

                function(e, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY());
            }

            // but we still mimic multithreaded execution
            return numThreads;
        } else {
#ifdef _OPENMP
            if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads) collapse(2)
                for (auto x = startX; x < stopX; x += incX) {
                    for (auto y = startY; y < stopY; y += incY) {
                        function(omp_get_thread_num(), x, x + 1, 1, y, y + 1, 1);
                    }
                }
                freeThreads(numThreads);
                return numThreads;
            } else {
                // if there were no threads available - we'll execute the function right within the current thread
                function(0, startX, stopX, incX, startY, stopY, incY);

                // we tell that the parallelism request was declined
                return 1;
            }
#else
            auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads);
            if (ticket != nullptr) {

                for (int e = 0; e < numThreads; e++) {
                    auto threadId = numThreads - e - 1;
                    auto span = Span2::build(splitLoop, threadId, numThreads, startX, stopX, incX, startY, stopY, incY);

                    ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY());
                }

                // block until all threads finish their job
                ticket->waitAndRelease();

                return numThreads;
            } else {
                // if there were no threads available - we'll execute the function right within the current thread
                function(0, startX, stopX, incX, startY, stopY, incY);

                // we tell that the parallelism request was declined
                return 1;
            }
#endif
        }
    }
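    // Usage sketch (illustrative, not part of the original source; `rows`, `cols`, `a`, `b`
    // and `result` are hypothetical buffers, remaining arguments are left at their header
    // defaults). A FUNC_2D callable receives (thread_id, startX, stopX, incX, startY, stopY,
    // incY) and iterates over its own two-dimensional sub-range:
    //
    //   Threads::parallel_for([&](uint64_t thread_id, int64_t startX, int64_t stopX, int64_t incX,
    //                             int64_t startY, int64_t stopY, int64_t incY) {
    //       for (auto x = startX; x < stopX; x += incX)
    //           for (auto y = startY; y < stopY; y += incY)
    //               result[x * cols + y] = a[x * cols + y] + b[x * cols + y];
    //   }, 0, rows, 1, 0, cols, 1);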
    int Threads::parallel_for(FUNC_3D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ, uint64_t numThreads) {
        if (startX > stopX)
            throw std::runtime_error("Threads::parallel_for got startX > stopX");

        if (startY > stopY)
            throw std::runtime_error("Threads::parallel_for got startY > stopY");

        if (startZ > stopZ)
            throw std::runtime_error("Threads::parallel_for got startZ > stopZ");

        if (numThreads == 1) {
            // loop is too small - executing function as is
            function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);
            return 1;
        }

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads) collapse(3)
            for (auto x = startX; x < stopX; x += incX) {
                for (auto y = startY; y < stopY; y += incY) {
                    for (auto z = startZ; z < stopZ; z += incZ) {
                        function(omp_get_thread_num(), x, x + 1, 1, y, y + 1, 1, z, z + 1, 1);
                    }
                }
            }
            freeThreads(numThreads);
            return numThreads;
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);

            // we tell that the parallelism request was declined
            return 1;
        }
#else
        auto delta_x = stopX - startX;
        auto delta_y = stopY - startY;
        auto delta_z = stopZ - startZ;

        auto itersX = delta_x / incX;
        auto itersY = delta_y / incY;
        auto itersZ = delta_z / incZ;

        numThreads = ThreadsHelper::numberOfThreads3d(numThreads, itersX, itersY, itersZ);
        if (numThreads == 1) {
            // loop is too small - executing function as is
            function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);
            return 1;
        }

        auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads);
        if (ticket != nullptr) {
            auto splitLoop = ThreadsHelper::pickLoop3d(numThreads, itersX, itersY, itersZ);

            for (int e = 0; e < numThreads; e++) {
                auto thread_id = numThreads - e - 1;
                auto span = Span3::build(splitLoop, thread_id, numThreads, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);

                ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY(), span.startZ(), span.stopZ(), span.incZ());
            }

            // block until we're done
            ticket->waitAndRelease();

            // we tell that the parallelism request succeeded
            return numThreads;
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ);

            // we tell that the parallelism request was declined
            return 1;
        }
#endif
    }

    int Threads::parallel_do(FUNC_DO function, uint64_t numThreads) {
        if (numThreads == 1) {
            function(0, numThreads);
            return 1;
        }

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads)
            for (int e = 0; e < numThreads; e++) {
                function(e, numThreads);
            }
            freeThreads(numThreads);
            return numThreads;
        } else {
            // if there are no threads available - we'll execute the function sequentially, one call at a time
            for (uint64_t e = 0; e < numThreads; e++)
                function(e, numThreads);

            return numThreads;
        }
#else
        auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1);
        if (ticket != nullptr) {

            // submit tasks one by one
            for (uint64_t e = 0; e < numThreads - 1; e++)
                ticket->enqueue(e, numThreads, function);

            function(numThreads - 1, numThreads);

            ticket->waitAndRelease();

            return numThreads;
        } else {
            // if there are no threads available - we'll execute the function sequentially, one call at a time
            for (uint64_t e = 0; e < numThreads; e++)
                function(e, numThreads);

            return numThreads;
        }
#endif

        return numThreads;
    }
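    // Usage sketch (illustrative, not part of the original source; processShard() is a
    // hypothetical helper). FUNC_DO receives (thread_id, numThreads), so each invocation
    // can derive its own slice of work:
    //
    //   Threads::parallel_do([&](uint64_t thread_id, uint64_t numThreads) {
    //       processShard(thread_id, numThreads);
    //   }, 4);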
    int64_t Threads::parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) {
        if (start > stop)
            throw std::runtime_error("Threads::parallel_long got start > stop");

        auto delta = (stop - start);
        if (delta == 0 || numThreads == 1)
            return function(0, start, stop, increment);

        auto numElements = delta / increment;

        // we decide what the optimal number of threads is here, and execute it
        numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements);
        if (numThreads == 1)
            return function(0, start, stop, increment);

        // create a temporary array for per-thread partial results
        int64_t intermediatery[256];
        auto span = delta / numThreads;

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads)
            for (int e = 0; e < numThreads; e++) {
                auto start_ = span * e + start;
                auto stop_ = span * (e + 1) + start;

                intermediatery[e] = function(e, start_, e == numThreads - 1 ? stop : stop_, increment);
            }
            freeThreads(numThreads);
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            return function(0, start, stop, increment);
        }
#else
        auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1);
        if (ticket == nullptr)
            return function(0, start, stop, increment);

        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
            auto start_ = span * e + start;
            auto stop_ = span * (e + 1) + start;

            if (e == numThreads - 1)
                intermediatery[e] = function(e, start_, stop, increment);
            else
                ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment);
        }

        ticket->waitAndRelease();
#endif

        // aggregate results in a single thread
        for (uint64_t e = 1; e < numThreads; e++)
            intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]);

        // return the accumulated result
        return intermediatery[0];
    }
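    // Usage sketch (illustrative, not part of the original source; `buffer` and `length`
    // are hypothetical): a parallel sum where each thread reduces its sub-range and the
    // aggregator combines the per-thread partial results:
    //
    //   auto total = Threads::parallel_long(
    //           [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> int64_t {
    //               int64_t partial = 0;
    //               for (auto i = start; i < stop; i += increment)
    //                   partial += buffer[i];
    //               return partial;
    //           },
    //           [](int64_t a, int64_t b) -> int64_t { return a + b; },
    //           0, length, 1, 4);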
    double Threads::parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) {
        if (start > stop)
            throw std::runtime_error("Threads::parallel_double got start > stop");

        auto delta = (stop - start);
        if (delta == 0 || numThreads == 1)
            return function(0, start, stop, increment);

        auto numElements = delta / increment;

        // we decide what the optimal number of threads is here, and execute it
        numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements);
        if (numThreads == 1)
            return function(0, start, stop, increment);

        // create a temporary array for per-thread partial results
        double intermediatery[256];
        auto span = delta / numThreads;

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads)
            for (int e = 0; e < numThreads; e++) {
                auto start_ = span * e + start;
                auto stop_ = span * (e + 1) + start;

                intermediatery[e] = function(e, start_, e == numThreads - 1 ? stop : stop_, increment);
            }
            freeThreads(numThreads);
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            return function(0, start, stop, increment);
        }
#else
        auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1);
        if (ticket == nullptr)
            return function(0, start, stop, increment);

        // execute threads in parallel
        for (uint32_t e = 0; e < numThreads; e++) {
            auto start_ = span * e + start;
            auto stop_ = span * (e + 1) + start;

            if (e == numThreads - 1)
                intermediatery[e] = function(e, start_, stop, increment);
            else
                ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment);
        }

        ticket->waitAndRelease();
#endif

        // aggregate results in a single thread
        for (uint64_t e = 1; e < numThreads; e++)
            intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]);

        // return the accumulated result
        return intermediatery[0];
    }
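    // Illustrative note (not part of the original source): parallel_aligned_increment()
    // below partitions delta = (stop - start) / increment iterations rather than raw
    // elements, so per-thread boundaries fall on multiples of the increment. For example,
    // with start = 0, stop = 103, increment = 1 and 4 threads after adjustment:
    // span = ceil(103 / 4) = 26 and tail_add = 103 - 4 * 26 = -1, so the first three
    // threads get 26 iterations each and the last thread gets 25, ending exactly at stop.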
    int Threads::parallel_aligned_increment(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, size_t type_size, uint32_t req_numThreads) {
        if (start > stop)
            throw std::runtime_error("Threads::parallel_for got start > stop");

        auto num_elements = (stop - start);
        // this way we preserve the offset of the increment start,
        // so we will partition considering delta, not the total number of elements
        auto delta = (stop - start) / increment;

        // in some cases we just fire the func as is
        if (delta == 0 || req_numThreads == 1) {
            function(0, start, stop, increment);
            return 1;
        }

        int numThreads = 0;

        struct th_span {
            Nd4jLong start;
            Nd4jLong end;
        };

#ifdef _OPENMP
        constexpr int max_thread_count = 8;
#else
        constexpr int max_thread_count = 1024;
#endif
        th_span thread_spans[max_thread_count];

        req_numThreads = req_numThreads > max_thread_count ? max_thread_count : req_numThreads;

#ifdef _OPENMP
        int adjusted_numThreads = max_thread_count;
#else
        int adjusted_numThreads = sd::ThreadsHelper::numberOfThreads(req_numThreads, (num_elements * sizeof(double)) / (200 * type_size));
#endif

        if (adjusted_numThreads > delta)
            adjusted_numThreads = delta;

        // shortcut
        if (adjusted_numThreads <= 1) {
            function(0, start, stop, increment);
            return 1;
        }

        // take the span as a ceiling
        auto spand = std::ceil((double) delta / (double) adjusted_numThreads);
        numThreads = static_cast<int>(std::ceil((double) delta / spand));
        auto span = static_cast<Nd4jLong>(spand);

        // tail_add is the additional value of the last part;
        // it could be negative or positive,
        // and we will spread that value across the threads
        auto tail_add = delta - numThreads * span;

        Nd4jLong begin = 0;
        Nd4jLong end = 0;

        // we will try to enqueue the bigger parts first
        decltype(span) span1, span2;

        int last = 0;
        if (tail_add >= 0) {
            // for span == 1, tail_add is 0
            last = tail_add;
            span1 = span + 1;
            span2 = span;
        } else {
            last = numThreads + tail_add; // -std::abs(tail_add);
            span1 = span;
            span2 = span - 1;
        }

        for (int i = 0; i < last; i++) {
            end = begin + span1 * increment;

            // putting the task into the queue for a given thread
            thread_spans[i].start = begin;
            thread_spans[i].end = end;

            begin = end;
        }

        for (int i = last; i < numThreads - 1; i++) {
            end = begin + span2 * increment;

            // putting the task into the queue for a given thread
            thread_spans[i].start = begin;
            thread_spans[i].end = end;

            begin = end;
        }

        // for the last one we enqueue the final offset as stop;
        // we need it in case our ((stop - start) % increment) > 0
        thread_spans[numThreads - 1].start = begin;
        thread_spans[numThreads - 1].end = stop;

#ifdef _OPENMP
        if (tryAcquire(numThreads)) {
#pragma omp parallel for num_threads(numThreads)
            for (size_t j = 0; j < numThreads; j++) {
                function(j, thread_spans[j].start, thread_spans[j].end, increment);
            }
            freeThreads(numThreads);
            return numThreads;
        } else {
            function(0, start, stop, increment);
            // we tell that the parallelism request was declined
            return 1;
        }
#else
        auto ticket = sd::ThreadPool::getInstance()->tryAcquire(numThreads);
        if (ticket != nullptr) {

            for (size_t j = 0; j < numThreads; j++) {
                ticket->enqueue(j, numThreads, function, thread_spans[j].start, thread_spans[j].end, increment);
            }

            // block and wait till all threads have finished the job
            ticket->waitAndRelease();

            // we tell that the parallelism request succeeded
            return numThreads;
        } else {
            // if there were no threads available - we'll execute the function right within the current thread
            function(0, start, stop, increment);

            // we tell that the parallelism request was declined
            return 1;
        }
#endif
    }

} // namespace sd