Pi build and initial ArmCompute library support (#494)

* - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai>
2020-06-26 11:03:46 +04:00 · 2020-06-26 11:03:46 +04:00 · 69ebc96068
commit 69ebc96068
parent fb578fdecd
15 changed files with 962 additions and 110 deletions
--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@ -131,6 +131,23 @@ if(NOT SD_CUDA)
    endif()
 endif()
 #arm-compute entry
 # Optional ARM Compute Library (NEON) platform helpers, switched on through HELPERS_armcompute.
 if(${HELPERS_armcompute})
 # REQUIRED makes configuration stop here when the find module cannot locate the library.
 find_package(ARMCOMPUTE REQUIRED)
 if(ARMCOMPUTE_FOUND)
    message("Found ARMCOMPUTE: ${ARMCOMPUTE_LIBRARIES}")
    # HAVE_ARMCOMPUTE feeds "#cmakedefine HAVE_ARMCOMPUTE" in config.h.in and the
    # source selection in blas/CMakeLists.txt.
    set(HAVE_ARMCOMPUTE 1)
    # Add preprocessor definition for ARM Compute NEON
    add_definitions(-DARMCOMPUTENEON_ENABLED)
    # Build our library with NEON support. -mfpu is an option of 32-bit ARM
    # toolchains only; other compilers (AArch64 included) reject it, so probe the
    # compiler instead of appending the flag unconditionally.
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-mfpu=neon" SD_CXX_SUPPORTS_MFPU_NEON)
    if(SD_CXX_SUPPORTS_MFPU_NEON)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
    endif()
    include_directories(${ARMCOMPUTE_INCLUDE})
    message("----${ARMCOMPUTE_INCLUDE}---")
 endif()
 endif()
 # new mkl-dnn entry
 if (${HELPERS_mkldnn})
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@ -146,6 +146,10 @@ if (HAVE_MKLDNN)
    file(GLOB_RECURSE CUSTOMOPS_MKLDNN_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h)
 endif()
 # ARM Compute platform helpers are only compiled when the library was found.
 if(HAVE_ARMCOMPUTE)
    # NOTE(review): the bare `false` argument is kept exactly as in the mkldnn glob
    # just above - confirm it is intentional.
    file(GLOB_RECURSE CUSTOMOPS_ARMCOMPUTE_SOURCES
            false
            ../include/ops/declarable/platform/armcompute/*.cpp
            ../include/ops/declarable/platform/armcompute/*.h)
 endif()
 if(SD_CUDA)
    message("Build cublas")
    find_package(CUDA)
@ -243,7 +247,7 @@ if(SD_CUDA)
                ${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
                ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
                ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES}
-				${CUSTOMOPS_GENERIC_SOURCES}
+				${CUSTOMOPS_ARMCOMPUTE_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
 				)
        if (WIN32)
@ -351,8 +355,8 @@ elseif(SD_CPU)
    add_definitions(-D__CPUBLAS__=true)
    add_library(samediff_obj OBJECT ${LEGACY_SOURCES}
            ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
-            ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
+            ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} 
-            ${OPS_SOURCES} ${PERF_SOURCES})
+            ${CUSTOMOPS_ARMCOMPUTE_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES})
    if(IOS)
        add_library(${SD_LIBRARY_NAME} STATIC $<TARGET_OBJECTS:samediff_obj>)
    else()
@ -378,12 +382,12 @@ elseif(SD_CPU)
    if (NOT BLAS_LIBRARIES)
        set(BLAS_LIBRARIES "")
    endif()
-    target_link_libraries(${SD_LIBRARY_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES})
+    target_link_libraries(${SD_LIBRARY_NAME} ${MKLDNN}  ${MKLDNN_LIBRARIES} ${ARMCOMPUTE_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES})
    if ("${SD_ALL_OPS}" AND "${SD_BUILD_MINIFIER}")
        message(STATUS "Building minifier...")
        add_executable(minifier ../minifier/minifier.cpp ../minifier/graphopt.cpp)
-        target_link_libraries(minifier samediff_obj ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES})
+        target_link_libraries(minifier samediff_obj ${MKLDNN_LIBRARIES} ${ARMCOMPUTE_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES})
    endif()
    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 4.9)
--- a/libnd4j/cmake/FindARMCOMPUTE.cmake
+++ b/libnd4j/cmake/FindARMCOMPUTE.cmake
@ -0,0 +1,74 @@
 ################################################################################
 # Copyright (c) 2020 Konduit K.K.
 #
 # This program and the accompanying materials are made available under the
 # terms of the Apache License, Version 2.0 which is available at
 # https://www.apache.org/licenses/LICENSE-2.0.
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 #
 # SPDX-License-Identifier: Apache-2.0
 ################################################################################
 ### Find ARM COMPUTE LIBRARY STATIC libraries
 # Directories probed for the Arm Compute headers. The ARMCOMPUTE_ROOT entries are
 # presumably meant for a ComputeLibrary checkout or install prefix supplied by the caller.
 set(COMPUTE_INCLUDE_DIRS
     /usr/include
     ${ARMCOMPUTE_ROOT}
     ${ARMCOMPUTE_ROOT}/include
     ${ARMCOMPUTE_ROOT}/applications
     ${ARMCOMPUTE_ROOT}/applications/arm_compute
 )
 # Directories probed for the Arm Compute static libraries.
 set(COMPUTE_LIB_DIRS
     /lib
     /usr/lib
     ${ARMCOMPUTE_ROOT}
     ${ARMCOMPUTE_ROOT}/lib
     ${ARMCOMPUTE_ROOT}/build
 )
 # Arm Compute headers: search the explicit candidate directories first, then fall
 # back to CMake's default search (find_path keeps the first hit in the cache).
 find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/ICLKernel.h
            PATHS ${COMPUTE_INCLUDE_DIRS}
            NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH)
 find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/ICLKernel.h)
 # half.hpp: same order as the other searches, so a copy under ARMCOMPUTE_ROOT is
 # preferred over one that happens to be installed system-wide.
 find_path(HALF_INCLUDE half/half.hpp
              PATHS ${ARMCOMPUTE_ROOT}/include
              NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH)
 find_path(HALF_INCLUDE half/half.hpp)
 # Passing a *-NOTFOUND value to include_directories() aborts generation with an
 # unhelpful "variables are set to NOTFOUND" error, so only add a real directory.
 if(HALF_INCLUDE)
    include_directories(SYSTEM ${HALF_INCLUDE})
 else()
    message(WARNING "half/half.hpp was not found; set ARMCOMPUTE_ROOT or install the half library headers")
 endif()
 # Find the Arm Compute libraries if not already specified
 if (NOT DEFINED ARMCOMPUTE_LIBRARIES)
    find_library(ARMCOMPUTE_LIBRARY NAMES arm_compute-static
                    PATHS ${COMPUTE_LIB_DIRS}
                    PATH_SUFFIXES "Release"
                    NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH)
    find_library(ARMCOMPUTE_CORE_LIBRARY NAMES arm_compute_core-static
                    PATHS ${COMPUTE_LIB_DIRS}
                    PATH_SUFFIXES "Release"
                    NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH)
    # In case it wasn't there, try a default search (will work in cases where
    # the library has been installed into a standard location)
    find_library(ARMCOMPUTE_LIBRARY NAMES arm_compute-static)
    find_library(ARMCOMPUTE_CORE_LIBRARY NAMES arm_compute_core-static)
    # Publish the list only when BOTH libraries were located. A list that mixes a
    # real path with a *-NOTFOUND entry is a non-empty string and could otherwise
    # slip through the found check and be handed to the linker. Leaving the
    # variable unset lets find_package_handle_standard_args report it as missing.
    if (ARMCOMPUTE_LIBRARY AND ARMCOMPUTE_CORE_LIBRARY)
        set(ARMCOMPUTE_LIBRARIES ${ARMCOMPUTE_LIBRARY} ${ARMCOMPUTE_CORE_LIBRARY})
    endif()
 endif()
 # Standard result handling: derives ARMCOMPUTE_FOUND from the required variables.
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(ARMCOMPUTE
     REQUIRED_VARS
         ARMCOMPUTE_INCLUDE
         ARMCOMPUTE_LIBRARIES)
--- a/libnd4j/include/config.h.in
+++ b/libnd4j/include/config.h.in
@ -3,6 +3,8 @@
 #cmakedefine HAVE_MKLDNN
 #cmakedefine HAVE_ARMCOMPUTE
 #cmakedefine MKLDNN_PATH "@MKLDNN_PATH@"
 #cmakedefine HAVE_OPENBLAS
--- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp
@ -215,7 +215,9 @@ namespace helpers {
        auto maxValue = T(0); //sd::math::nd4j_abs(compoundBuffer[xInitialIndex]);
        auto result = -1;
        //auto loop = PRAGMA_THREADS_FOR {
-            auto start = column, stop = rowNum, increment = 1;
+            auto start = column;
 	    auto stop = rowNum;
 	    auto increment = 1;
            for (auto rowCounter = start; rowCounter < stop; rowCounter++) {
                Nd4jLong xPos[] = {rowCounter, column};
                auto xIndex = shape::getOffset(compoundShape, xPos, 0);
--- a/libnd4j/include/ops/declarable/platform/armcompute/armcomputeUtils.cpp
+++ b/libnd4j/include/ops/declarable/platform/armcompute/armcomputeUtils.cpp
@ -0,0 +1,278 @@
 /*******************************************************************************
 * Copyright (c) 2019 Konduit K.K.
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 // Created by Abdelrauf 2020
 #include <ops/declarable/PlatformHelper.h>
 #include <ops/declarable/OpRegistrator.h>
 #include <system/platform_boilerplate.h> 
 #include <ops/declarable/helpers/convolutions.h>
 #include <cstdint>
 #include <helpers/LoopsCoordsHelper.h>
 #include "armcomputeUtils.h"
 namespace sd      {
 namespace ops       {
 namespace platforms {
 // Translates an sd::DataType into the corresponding arm_compute::DataType.
 // Any type without an explicit mapping below yields Arm_DataType::UNKNOWN.
 Arm_DataType getArmType ( const DataType &dType){
     switch (dType) {
         case HALF:     return Arm_DataType::F16;
         case FLOAT32:  return Arm_DataType::F32;
         case DOUBLE:   return Arm_DataType::F64;
         case INT8:     return Arm_DataType::S8;
         case INT16:    return Arm_DataType::S16;
         case INT32:    return Arm_DataType::S32;
         case INT64:    return Arm_DataType::S64;
         case UINT8:    return Arm_DataType::U8;
         case UINT16:   return Arm_DataType::U16;
         case UINT32:   return Arm_DataType::U32;
         case UINT64:   return Arm_DataType::U64;
         case BFLOAT16: return Arm_DataType::BFLOAT16;
         default:       return Arm_DataType::UNKNOWN;
     }
 }
 // Tells whether `arr` satisfies every precondition this file places on arrays that
 // are wrapped as ARM Compute tensors: a mappable data type, rank within
 // arm_compute::MAX_DIMS, 'c' ordering, ews() == 1 and the stride-order check.
 bool isArmcomputeFriendly(const NDArray& arr) {
     if (getArmType(arr.dataType()) == Arm_DataType::UNKNOWN) {
         return false;
     }
     const int rank = (int)(arr.rankOf());
     if (!(rank<=arm_compute::MAX_DIMS)) {
         return false;
     }
     if (arr.ordering() != 'c') {
         return false;
     }
     if (arr.ews() != 1) {
         return false;
     }
     return shape::strideDescendingCAscendingF(arr.shapeInfo()) == true;
 }
 // Builds a single-channel TensorInfo from a raw shape.
 //   rank        - number of entries read from `bases`
 //   bases       - dimension sizes in nd4j order
 //   ndArrayType - element type, converted through getArmType()
 //   layout      - data layout forwarded to the TensorInfo constructor
 Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases,sd::DataType ndArrayType, arm_compute::DataLayout layout) {
    constexpr int channelCount = 1;
    const auto armType = getArmType(ndArrayType);
    Arm_TensorShape armShape;
    armShape.set_num_dimensions(rank);
    // the shape is stored in reverse order: the last nd4j dimension lands at index 0
    for (int dim = 0; dim < rank; ++dim) {
        armShape[dim] = static_cast<uint32_t>(bases[rank - 1 - dim]);
    }
    // fill the rest unused with 1
    for (int dim = rank; dim < arm_compute::MAX_DIMS; ++dim) {
        armShape[dim] = 1;
    }
    return Arm_TensorInfo(armShape, channelCount, armType, layout);
 }
 // Builds a single-channel TensorInfo describing `arr` in place: shape and byte
 // strides are taken from the array (in reverse dimension order) so that the
 // array's own buffer can later be imported as the tensor's backing memory.
 //   arr    - source array; its shape, strides and data type are read
 //   layout - data layout stored on the resulting info
 Arm_TensorInfo getArmTensorInfo(const NDArray& arr,
                                arm_compute::DataLayout layout) {
  auto dType = getArmType(arr.dataType());
  //
  constexpr int numChannels = 1;
  int rank = (int)(arr.rankOf());
  auto bases = arr.shapeOf();
  auto arrStrides = arr.stridesOf();
  // https://arm-software.github.io/ComputeLibrary/v20.05/_dimensions_8h_source.xhtml
  // note: underhood it is stored as std::array<T, num_max_dimensions> _id;
  // TensorShape is derived from Dimensions<uint32_t>
  // as well as Strides : public Dimensions<uint32_t>
  Arm_TensorShape shape;
  Arm_Strides strides;
  shape.set_num_dimensions(rank);
  strides.set_num_dimensions(rank);
  size_t element_size = arm_compute::data_size_from_type(dType);
  for (int i = 0, j = rank - 1; i < rank; i++, j--) {
    shape[i] = static_cast<uint32_t>(bases[j]);
    // nd4j strides are counted in elements, ARM Compute strides in bytes
    strides[i] = static_cast<uint32_t>(arrStrides[j]) * element_size;
  }
  // fill the rest unused with 1
  for (int i = rank; i < arm_compute::MAX_DIMS; i++) {
    shape[i] = 1;
  }
  // Total byte size = extent of the outermost dimension times its byte stride.
  // A rank-0 array has no outermost dimension: indexing with rank - 1 would wrap
  // around to SIZE_MAX, so it is treated as a single element instead.
  size_t total_size = element_size;
  if (rank > 0) {
    const size_t size_ind = static_cast<size_t>(rank - 1);
    // widen before multiplying so the product is not computed in 32 bits
    total_size = static_cast<size_t>(shape[size_ind]) * static_cast<size_t>(strides[size_ind]);
  }
  Arm_TensorInfo info;
  info.init(shape, numChannels, dType, strides, 0, total_size);
  info.set_data_layout(layout);
  return info;
 }
 // Wraps the buffer of `arr` in an ARM Compute tensor without copying.
 // Notes carried over from the import_memory contract:
 //  - ownership of the backing memory is NOT transferred to the tensor;
 //  - the tensor mustn't be memory managed;
 //  - padding requirements are the caller's responsibility: if the tensor needs
 //    padding after the function configuration step, the imported memory has to
 //    account for it (check through the TensorInfo::padding() interface).
 Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout) {
  Arm_Tensor wrapped;
  wrapped.allocator()->init(getArmTensorInfo(arr, layout));
  // Import existing pointer as backing memory
  wrapped.allocator()->import_memory((void*)arr.buffer());
  return wrapped;
 }
 // Copies the contents of an ARM Compute tensor into `output`, one innermost row
 // per step.
 //   inTensor - source tensor; may carry padding, hence the row-wise copy
 //   output   - destination array; rows are written back to back into its buffer
 // Only 'c' ordered outputs are handled; for any other ordering the function
 // returns without copying anything.
 // TODO: support outputs with ews() != 1. The pooling helpers in this directory
 // only reach this point after isArmcomputeFriendly() verified ews() == 1.
 void copyFromTensor(const Arm_Tensor& inTensor, NDArray& output) {
    //only for C order
    if (output.ordering() != 'c') return;
    auto shapeInfo = output.shapeInfo();
    auto bases = &(shapeInfo[1]);
    Nd4jLong rank = shapeInfo[0];
    // number of elements in the innermost (last) dimension
    int width = bases[rank - 1];
    uint8_t* outputBuffer = (uint8_t*)output.buffer();
    int element_size = inTensor.info()->element_size();
    // Configure the window BEFORE handing it to the iterator, so the iterator is
    // initialised from the window that the loop actually walks. Iteration starts
    // at DimY: every step of the loop addresses one whole row along DimX.
    arm_compute::Window window;
    window.use_tensor_dimensions(inTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
    arm_compute::Iterator tensor_it(&inTensor, window);
    // bytes per row, computed in size_t to avoid an int overflow on wide rows
    const size_t copySize = static_cast<size_t>(width) * static_cast<size_t>(element_size);
    auto dest = outputBuffer;
    arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
        {
            auto src = tensor_it.ptr();
            memcpy(dest, src, copySize);
            dest += copySize;
        },
        tensor_it);
 }
 // Copies the contents of `input` into an ARM Compute tensor, one innermost row
 // per step.
 //   input     - source array; rows are read back to back from its buffer
 //   outTensor - destination tensor; may carry padding, hence the row-wise copy
 // Only 'c' ordered inputs are handled; for any other ordering the function
 // returns without copying anything.
 // TODO: support inputs with ews() != 1. The pooling helpers in this directory
 // only reach this point after isArmcomputeFriendly() verified ews() == 1.
 void copyToTensor(const NDArray& input, Arm_Tensor& outTensor) {
    //only for C order
    if (input.ordering() != 'c') return;
    auto shapeInfo = input.shapeInfo();
    auto bases = &(shapeInfo[1]);
    Nd4jLong rank = shapeInfo[0];
    uint8_t *inputBuffer = (uint8_t*)input.buffer();
    // number of elements in the innermost (last) dimension
    int width = bases[rank - 1];
    int element_size = outTensor.info()->element_size();
    // Configure the window BEFORE handing it to the iterator, so the iterator is
    // initialised from the window that the loop actually walks. Iteration starts
    // at DimY: every step of the loop addresses one whole row along DimX.
    arm_compute::Window window;
    window.use_tensor_dimensions(outTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
    arm_compute::Iterator tensor_it(&outTensor, window);
    // bytes per row, computed in size_t to avoid an int overflow on wide rows
    const size_t copySize = static_cast<size_t>(width) * static_cast<size_t>(element_size);
    auto src = inputBuffer;
    arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
        {
            auto dest = tensor_it.ptr();
            memcpy(dest, src, copySize);
            src += copySize;
        },
        tensor_it);
 }
 // armcompute should be built with debug option
 // Debug helper: writes `msg`, the total size, every dimension, every byte stride
 // and the padding of `tensor` to std::cout. The tensor contents are printed as
 // well, but only when ARM_COMPUTE_ASSERTS_ENABLED is defined.
 void print_tensor(Arm_ITensor& tensor, const char* msg) {
  const auto tensorInfo = tensor.info();
  const auto pad = tensorInfo->padding();
  std::cout << msg << "\ntotal: " << tensorInfo->total_size() << "\n";
  for (int dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
    std::cout << tensorInfo->dimension(dim) << ",";
  }
  std::cout << std::endl;
  const auto& byteStrides = tensorInfo->strides_in_bytes();
  for (int dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
    std::cout << byteStrides[dim] << ",";
  }
  std::cout << "\npadding: l " << pad.left
            << ", r " << pad.right
            << ", t " << pad.top
            << ", b " << pad.bottom << std::endl;
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
  // note: the contents were observed not to print correctly for NHWC
  std::cout << msg << ":\n";
  tensor.print(std::cout);
  std::cout << std::endl;
 #endif
 }
 }
 }
 }
--- a/libnd4j/include/ops/declarable/platform/armcompute/armcomputeUtils.h
+++ b/libnd4j/include/ops/declarable/platform/armcompute/armcomputeUtils.h
@ -0,0 +1,133 @@
 /*******************************************************************************
 * Copyright (c) 2019 Konduit K.K.
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 #ifndef DEV_TESTSARMCOMPUTEUTILS_H
 #define DEV_TESTSARMCOMPUTEUTILS_H
 #include <legacy/NativeOps.h>
 #include <array/NDArray.h> 
 #include <graph/Context.h>
 #include <ops/declarable/PlatformHelper.h>
 #include <system/platform_boilerplate.h>
 #include <arm_compute/runtime/NEON/NEFunctions.h>
 #include <arm_compute/core/Types.h>
 #include <arm_compute/core/TensorInfo.h>
 #include <arm_compute/core/TensorShape.h>
 #include <arm_compute/core/Strides.h>
 #include <arm_compute/core/Helpers.h>
 #include <arm_compute/core/ITensor.h>
 #include <arm_compute/core/Types.h>
 #include <arm_compute/core/Validate.h>
 #include <arm_compute/core/Window.h>
 #include <arm_compute/runtime/Tensor.h>
 #include <arm_compute/runtime/TensorAllocator.h> 
 #include <iostream>
 using namespace samediff;
 namespace sd {
    namespace ops {
        namespace platforms {
             // Short aliases for the ARM Compute types used by the platform helpers.
             using Arm_DataType = arm_compute::DataType;
             using Arm_Tensor = arm_compute::Tensor;
             using Arm_ITensor = arm_compute::ITensor;            
             using Arm_TensorInfo = arm_compute::TensorInfo;
             using Arm_TensorShape = arm_compute::TensorShape;
             using Arm_Strides = arm_compute::Strides;
             /**
              * Here we actually declare our platform helpers
              */
             DECLARE_PLATFORM(maxpool2d, ENGINE_CPU);
             DECLARE_PLATFORM(avgpool2d, ENGINE_CPU);
             //utils
             // Maps an sd::DataType to the ARM Compute data type; unmapped types give UNKNOWN.
             Arm_DataType getArmType(const sd::DataType& dType);
             // Builds a TensorInfo from a raw shape (`bases` holds `rank` dimension sizes).
             Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases, sd::DataType ndArrayType, arm_compute::DataLayout layout = arm_compute::DataLayout::UNKNOWN);
             // Builds a TensorInfo from an NDArray, including its strides converted to bytes.
             Arm_TensorInfo getArmTensorInfo(const NDArray& arr, arm_compute::DataLayout layout = arm_compute::DataLayout::UNKNOWN);
             // Creates a tensor that imports the NDArray buffer as backing memory (no copy).
             Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout = arm_compute::DataLayout::UNKNOWN);
             // Row-wise copy between a tensor and an NDArray; both return without copying
             // when the NDArray is not 'c' ordered.
             void copyFromTensor(const Arm_Tensor& inTensor, NDArray& output);
             void copyToTensor(const NDArray& input, Arm_Tensor& outTensor);
             // Debug dump of tensor metadata to std::cout.
             void print_tensor(Arm_ITensor& tensor, const char* msg);
             // True when the array meets the type/rank/ordering/ews/stride checks used
             // before handing it to ARM Compute.
             bool isArmcomputeFriendly(const NDArray& arr);
            template<typename F>
            class ArmFunction {
            public:
               template<typename ...Args>
               void configure(NDArray *input , NDArray *output, arm_compute::DataLayout layout, Args&& ...args) {
                   auto inInfo = getArmTensorInfo(*input, layout);
                   auto outInfo = getArmTensorInfo(*output, layout);  
                   in.allocator()->init(inInfo);
                   out.allocator()->init(outInfo);
                   armFunction.configure(&in,&out,std::forward<Args>(args) ...);
                   if (in.info()->has_padding()) {
                       //allocate and copy
                       in.allocator()->allocate();
                       //copy 
                       copyToTensor(*input, in);
                   }
                   else {
                       //import buffer
                       void* buff = input->buffer();
                       in.allocator()->import_memory(buff);
                   } 
                   if (out.info()->has_padding()) {
                       //store pointer to our array to copy after run
                       out.allocator()->allocate();
                       outNd = output;
                   }
                   else {
                       //import
                       void* buff = output->buffer();
                       out.allocator()->import_memory(buff);
                   }
               }
               void run() {
                   armFunction.run();
                   if (outNd) {
                       copyFromTensor(out, *outNd);
                   }
               }
               private:
                   Arm_Tensor in;
                   Arm_Tensor out;
                   NDArray *outNd=nullptr;
                   F armFunction{};
            };          
        }
    }
 }
 #endif //DEV_TESTSARMCOMPUTEUTILS_H
--- a/libnd4j/include/ops/declarable/platform/armcompute/avgpooling2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/armcompute/avgpooling2d.cpp
@ -0,0 +1,106 @@
 /*******************************************************************************
 * Copyright (c) 2019 Konduit K.K.
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 // Created by Abdelrauf (rauf@konduit.ai) 2020
 #include <ops/declarable/PlatformHelper.h>
 #include <ops/declarable/OpRegistrator.h>
 #include <system/platform_boilerplate.h> 
 #include <ops/declarable/helpers/convolutions.h>
 #include "armcomputeUtils.h"
 namespace sd      {
 namespace ops       {
 namespace platforms {
 //////////////////////////////////////////////////////////////////////////
 // ARM Compute (NEON) implementation of avgpool2d.
 // Integer arguments: 0,1 kernel H/W; 2,3 stride H/W; 4,5 pad H/W; 6,7 dilation H/W;
 // 8 padding mode; 9 extraParam0 (0 selects exclude-padding averaging);
 // 10 (optional) layout flag, 0-NCHW, 1-NHWC.
 PLATFORM_IMPL(avgpool2d, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(0);
    auto output = OUTPUT_VARIABLE(0);
    // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode;
    const auto kH = INT_ARG(0);
    const auto kW = INT_ARG(1);
    const auto sH = INT_ARG(2);
    const auto sW = INT_ARG(3);
          auto pH = INT_ARG(4);
          auto pW = INT_ARG(5);
    const auto dH = INT_ARG(6);
    const auto dW = INT_ARG(7);
    const auto paddingMode = INT_ARG(8);
    const auto extraParam0 = INT_ARG(9);
    const int isNCHW  = block.getIArguments()->size() > 10 ? !INT_ARG(10) : 1;       // INT_ARG(10): 0-NCHW, 1-NHWC
    REQUIRE_TRUE(input->rankOf() == 4, 0, "AVGPOOL2D ARMCOMPUTE op: input should have rank of 4, but got %i instead", input->rankOf());
    REQUIRE_TRUE(dH != 0 && dW != 0, 0, "AVGPOOL2D ARMCOMPUTE op: dilation must not be zero, but got instead {%i, %i}", dH, dW);
    const bool exclude_padding = (extraParam0 == 0);
    auto dataLayout = isNCHW ? arm_compute::DataLayout::NCHW : arm_compute::DataLayout::NHWC;
    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, 0, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
    if(paddingMode){ 
        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); 
    }
    // Calculate individual paddings.
    // The trailing padding is whatever is still needed to reach the last window;
    // it is negative when the input is not fully consumed (e.g. iW=5, kW=2, sW=2
    // gives -1). PadStrideInfo takes unsigned values, so a negative amount would
    // wrap to a huge padding - clamp to zero instead (the rounding type is FLOOR,
    // so leftover input is simply ignored).
    // NOTE(review): a negative leading padding (pH/pW) is clamped as well - confirm
    // the result matches the generic implementation for stride > kernel in SAME mode.
    auto toPadding = [](int amount) -> unsigned int {
        return amount > 0 ? static_cast<unsigned int>(amount) : 0u;
    };
    const unsigned int pad_left   = toPadding(pW);
    const unsigned int pad_top    = toPadding(pH);
    const unsigned int pad_right  = toPadding((oW - 1) * sW - iW + kW - pW);
    const unsigned int pad_bottom = toPadding((oH - 1) * sH - iH + kH - pH);
 #if 0
    nd4j_printf("avgpool kH = %d, kW = %d, sH = %d, sW = %d  , pH = %d  , pW = %d, dH = %d, dW = %d, paddingMode = %d , isNCHW %d exclude pad %d \n" , kH , kW , sH , sW  , pH 
     , pW , dH , dW , paddingMode,isNCHW?1:0 ,exclude_padding?1:0);
 #endif
    auto poolPad = arm_compute::PadStrideInfo(sW, sH, pad_left,pad_right, pad_top, pad_bottom, arm_compute::DimensionRoundingType::FLOOR);
    auto poolInfo = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, arm_compute::Size2D(kW, kH), dataLayout, poolPad, exclude_padding);
    ArmFunction<arm_compute::NEPoolingLayer> pool;    
    pool.configure(input,output, dataLayout, poolInfo);
    pool.run(); // run function
    return Status::OK();
 }
 //////////////////////////////////////////////////////////////////////////
 // Decides whether the ARM Compute avgpool2d helper can be used for this call:
 // no dilation, both arrays ARM Compute friendly, and float32 on both sides.
 PLATFORM_CHECK(avgpool2d, ENGINE_CPU) { 
    auto input = INPUT_VARIABLE(0);
    auto output = OUTPUT_VARIABLE(0);
    const int dH = INT_ARG(6);
    const int dW = INT_ARG(7);
    // dilated pooling is not handled by this helper
    if (dH != 1 || dW != 1) {
        return false;
    }
    if (!isArmcomputeFriendly(*input) || !isArmcomputeFriendly(*output)) {
        return false;
    }
    // Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - only F32 is accepted here
    const bool inputIsF32  = getArmType(input->dataType()) == Arm_DataType::F32;
    const bool outputIsF32 = getArmType(output->dataType()) == Arm_DataType::F32;
    return inputIsF32 && outputIsF32;
 }
 }
 }
 }
--- a/libnd4j/include/ops/declarable/platform/armcompute/maxpooling2d.cpp
+++ b/libnd4j/include/ops/declarable/platform/armcompute/maxpooling2d.cpp
@ -0,0 +1,106 @@
 /*******************************************************************************
 * Copyright (c) 2019 Konduit K.K.
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 // Created by Abdelrauf 2020
 #include <ops/declarable/PlatformHelper.h>
 #include <ops/declarable/OpRegistrator.h>
 #include <system/platform_boilerplate.h> 
 #include <ops/declarable/helpers/convolutions.h>
 #include "armcomputeUtils.h"
 namespace sd      {
 namespace ops       {
 namespace platforms {
 //////////////////////////////////////////////////////////////////////////
 // ARM Compute (NEON) implementation of maxpool2d.
 // Integer arguments: 0,1 kernel H/W; 2,3 stride H/W; 4,5 pad H/W; 6,7 dilation H/W;
 // 8 padding mode; 10 (optional) layout flag, 0-NCHW, 1-NHWC.
 PLATFORM_IMPL(maxpool2d, ENGINE_CPU) {
    auto input = INPUT_VARIABLE(0);
    auto output = OUTPUT_VARIABLE(0);
    REQUIRE_TRUE(input->rankOf() == 4, 0, "MAXPOOL2D ARMCOMPUTE  OP: input array should have rank of 4, but got %i instead", input->rankOf());
    // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode;
    const int kH = INT_ARG(0);
    const int kW = INT_ARG(1);
    const int sH = INT_ARG(2);
    const int sW = INT_ARG(3);
          int pH = INT_ARG(4);
          int pW = INT_ARG(5);
    const int dH = INT_ARG(6);
    const int dW = INT_ARG(7);
    const int paddingMode = INT_ARG(8);
    // const int extraParam0 = INT_ARG(9);
    const int isNCHW  = block.getIArguments()->size() > 10 ? !INT_ARG(10) : 1;       // INT_ARG(10): 1-NHWC, 0-NCHW
    // the message names this (ARM Compute) helper so failures are attributed correctly
    REQUIRE_TRUE(dH != 0 && dW != 0, 0, "MAXPOOL2D ARMCOMPUTE op: dilation must not be zero, but got instead {%i, %i}", dH, dW);
    auto dataLayout = isNCHW ? arm_compute::DataLayout::NCHW : arm_compute::DataLayout::NHWC;
    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, 0, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
    if(paddingMode){ 
        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW); 
    }
    // Calculate individual paddings.
    // The trailing padding is whatever is still needed to reach the last window;
    // it is negative when the input is not fully consumed (e.g. iW=5, kW=2, sW=2
    // gives -1). PadStrideInfo takes unsigned values, so a negative amount would
    // wrap to a huge padding - clamp to zero instead (the rounding type is FLOOR,
    // so leftover input is simply ignored).
    // NOTE(review): a negative leading padding (pH/pW) is clamped as well - confirm
    // the result matches the generic implementation for stride > kernel in SAME mode.
    auto toPadding = [](int amount) -> unsigned int {
        return amount > 0 ? static_cast<unsigned int>(amount) : 0u;
    };
    const unsigned int pad_left   = toPadding(pW);
    const unsigned int pad_top    = toPadding(pH);
    const unsigned int pad_right  = toPadding((oW - 1) * sW - iW + kW - pW);
    const unsigned int pad_bottom = toPadding((oH - 1) * sH - iH + kH - pH);
 #if 0
    nd4j_printf("maxpool kH = %d, kW = %d, sH = %d, sW = %d  , pH = %d  , pW = %d, dH = %d, dW = %d, paddingMode = %d , isNCHW %d \n" , kH , kW , sH , sW  , pH 
     , pW , dH , dW , paddingMode,isNCHW?1:0);
 #endif
    auto poolPad = arm_compute::PadStrideInfo(sW, sH, pad_left,pad_right, pad_top, pad_bottom, arm_compute::DimensionRoundingType::FLOOR);
    auto poolInfo = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, arm_compute::Size2D(kW, kH), dataLayout, poolPad);
    ArmFunction<arm_compute::NEPoolingLayer> pool;
    pool.configure(input,output, dataLayout, poolInfo);
    pool.run(); // run function
    return Status::OK();
 }
 //////////////////////////////////////////////////////////////////////////
 // Decides whether the ARM Compute maxpool2d helper can be used for this call:
 // no dilation, both arrays ARM Compute friendly, and float32 on both sides.
 PLATFORM_CHECK(maxpool2d, ENGINE_CPU) { 
    auto input = INPUT_VARIABLE(0);
    auto output = OUTPUT_VARIABLE(0);
    const int dH = INT_ARG(6);
    const int dW = INT_ARG(7);
    // dilated pooling is not handled by this helper
    if (dH != 1 || dW != 1) {
        return false;
    }
    if (!isArmcomputeFriendly(*input) || !isArmcomputeFriendly(*output)) {
        return false;
    }
    // Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - only F32 is accepted here
    const bool inputIsF32  = getArmType(input->dataType()) == Arm_DataType::F32;
    const bool outputIsF32 = getArmType(output->dataType()) == Arm_DataType::F32;
    return inputIsF32 && outputIsF32;
 }
 }
 }
 }
--- a/libnd4j/pi_build.sh
+++ b/libnd4j/pi_build.sh
@ -0,0 +1,185 @@
 #!/bin/bash
 # Architecture names, one per tool that consumes them further down:
 # TARGET is passed to CMake as SD_ARCH, BLAS_TARGET_NAME to the OpenBLAS
 # make as TARGET, and ARMCOMPUTE_TARGET to scons as arch.
 TARGET=armv7-a
 BLAS_TARGET_NAME=ARMV7
 ARMCOMPUTE_TARGET=armv7a
 #BASE_DIR=${HOME}/pi
 # Resolve the directory that contains this script, following symlinks. Recipe from:
 #https://stackoverflow.com/questions/59895/how-to-get-the-source-directory-of-a-bash-script-from-within-the-script-itself
 SOURCE="${BASH_SOURCE[0]}"
 # Passed to the ARM Compute scons build as debug=<value>.
 ARMCOMPUTE_DEBUG=1
 # Passed to CMake as CMAKE_BUILD_TYPE.
 LIBND4J_BUILD_MODE=Release
 while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
  DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
  SOURCE="$(readlink "$SOURCE")"
  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
 done
 # Everything below (downloads, third-party builds, build output) lives under the script's own directory.
 BASE_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
 # cmake executable used for the final configure step.
 CMAKE=cmake #/snap/bin/cmake
 # helper_bin holds the downloaded tools (cross compiler, scons-local) and their cached archives.
 mkdir -p ${BASE_DIR}/helper_bin/
 CROSS_COMPILER_URL=https://sourceforge.net/projects/raspberry-pi-cross-compilers/files/Raspberry%20Pi%20GCC%20Cross-Compiler%20Toolchains/Buster/GCC%208.3.0/Raspberry%20Pi%203A%2B%2C%203B%2B%2C%204/cross-gcc-8.3.0-pi_3%2B.tar.gz/download
 CROSS_COMPILER_DIR=${BASE_DIR}/helper_bin/cross_compiler
 SCONS_LOCAL_URL=http://prdownloads.sourceforge.net/scons/scons-local-3.1.1.tar.gz
 SCONS_LOCAL_DIR=${BASE_DIR}/helper_bin/scons_local
 # third_party_libs holds the cloned sources; it is also the OpenBLAS install prefix.
 THIRD_PARTY=${BASE_DIR}/third_party_libs
 ARMCOMPUTE_GIT_URL=https://github.com/ARM-software/ComputeLibrary.git
 ARMCOMPUTE_TAG=v20.05
 ARMCOMPUTE_DIR=${THIRD_PARTY}/arm_compute_dir
 OPENBLAS_GIT_URL="https://github.com/xianyi/OpenBLAS.git"
 OPENBLAS_DIR=${THIRD_PARTY}/OpenBLAS
 LIBND4J_SRC_DIR=${BASE_DIR}
 LIBND4J_BUILD_DIR=${BASE_DIR}/build_pi
 #for some downloads
 # Extra tar argument handed to download_extract for the cross-compiler archive.
 XRTACT_STRIP="--strip-components=1"
 # Forwarded to CMake as HELPERS_armcompute.
 HAS_ARMCOMPUTE=1
 # NOTE(review): BASE_DIR was obtained with cd/pwd above, so it normally exists already;
 # only THIRD_PARTY may actually need creating here.
 mkdir -p ${BASE_DIR}
 mkdir -p ${THIRD_PARTY}
 #change directory to base
 cd $BASE_DIR
 # Print one log line tagged with the "BUILDER::::" prefix.
 # All arguments are joined with single spaces, exactly as echo would print them.
 function message {
 	local text="$*"
 	printf '%s\n' "BUILDER:::: ${text}"
 }
 # Abort the script unless every path passed as an argument exists.
 # The first missing path is reported through `message`, after which the shell
 # terminates with `exit -2` (reported by bash as status 254).
 function check_requirements {
 	local required
 	for required; do
 		# Existing paths need no further attention.
 		[ -e "${required}" ] && continue
 		message "missing: ${required}"
 		exit -2
 	done
 }
 # Download a .tar.gz archive (unless it is already cached) and unpack it.
 #   $1  URL of the archive
 #   $2  destination directory; the archive itself is cached as "${2}_file"
 #   $3  optional extra argument(s) for tar, e.g. --strip-components=1
 # Terminates the script with `exit -2` (the status check_requirements uses)
 # when the download or the extraction fails.
 function download_extract {
 	local url="${1}"
 	local dest="${2}"
 	local archive="${dest}_file"
 	local partial="${archive}.part"
 	local created_dest=0
 	local -a tar_args extra_args

 	if [ ! -f "${archive}" ]; then
 		message "download"
 		# wget -O creates its output file even when the transfer fails, so
 		# fetch into a temporary name and promote it only after success.
 		# Otherwise a truncated archive would be reused as a valid cache
 		# on every later run.
 		if ! wget --quiet --show-progress -O "${partial}" "${url}"; then
 			rm -f "${partial}"
 			message "download failed: ${url}"
 			exit -2
 		fi
 		mv "${partial}" "${archive}"
 	fi

 	message "extract"
 	# Remember whether the directory is ours, so cleanup never deletes
 	# something that existed before this call.
 	[ -d "${dest}" ] || created_dest=1
 	mkdir -p "${dest}"
 	# $3 is split on whitespace on purpose so it may carry several options.
 	read -r -a extra_args <<< "${3:-}"
 	tar_args=(-xzf "${archive}" "--directory=${dest}" "${extra_args[@]}")
 	message "tar ${tar_args[*]}"
 	if ! tar "${tar_args[@]}"; then
 		# Callers skip this function when the directory already exists, so a
 		# half-extracted directory must not survive a failed run.
 		if [ "${created_dest}" -eq 1 ]; then
 			rm -rf "${dest:?}"
 		fi
 		message "extraction failed: ${archive} (delete it to force a fresh download)"
 		exit -2
 	fi
 	check_requirements "${dest}"
 }
 # Clone a git repository and optionally check out a tag or branch.
 #   $1  repository URL
 #   $2  directory to clone into
 #   $3  optional tag or branch to check out after cloning
 # Terminates the script with `exit -2` (the status check_requirements uses)
 # when the clone or the checkout fails. The caller's working directory is
 # left untouched.
 function git_check {
 	local url="${1}"
 	local dest="${2}"
 	local ref="${3:-}"

 	message "git clone --quiet ${url} ${dest}"
 	if ! git clone --quiet "${url}" "${dest}"; then
 		message "clone failed: ${url}"
 		exit -2
 	fi

 	if [ -n "${ref}" ]; then
 		message "git checkout ${ref}"
 		# The checkout runs in a subshell and only after a successful cd.
 		# This script itself sits inside a git working tree, so a checkout
 		# issued from the wrong directory would act on the wrong repository.
 		if ! ( cd "${dest}" && git checkout "${ref}" ); then
 			# Callers skip cloning when the directory exists; drop the fresh
 			# clone so a later run does not build the wrong revision.
 			rm -rf "${dest:?}"
 			message "checkout failed: ${ref}"
 			exit -2
 		fi
 	fi
 	check_requirements "${dest}"
 }
 # Fetch and unpack the Raspberry Pi cross toolchain once; an existing directory is taken as "already done".
 if [ ! -d ${CROSS_COMPILER_DIR} ]; then
 	#out file
 	message "download CROSS_COMPILER"
 	download_extract ${CROSS_COMPILER_URL} ${CROSS_COMPILER_DIR} ${XRTACT_STRIP}
 fi
 #useful exports
 # Toolchain root, tool-name prefix and sysroot derived from the unpacked cross compiler.
 export PI_FOLDER=${CROSS_COMPILER_DIR}
 export RPI_BIN=${PI_FOLDER}/bin/arm-linux-gnueabihf
 export PI_SYS_ROOT=${PI_FOLDER}/arm-linux-gnueabihf/libc
 export LD_LIBRARY_PATH=${PI_FOLDER}/lib:$LD_LIBRARY_PATH
 # Point the standard build-tool variables at the cross tools. The ARM Compute
 # build below overrides CC/CXX with the host gcc/g++ for its own command.
 export CC=${RPI_BIN}-gcc
 export FC=${RPI_BIN}-gfortran
 export CXX=${RPI_BIN}-g++
 export CPP=${RPI_BIN}-cpp
 export RANLIB=${RPI_BIN}-gcc-ranlib
 export LD="${RPI_BIN}-ld"
 export AR="${RPI_BIN}-ar"
 # Build OpenBLAS: clone the sources once, then cross-compile and install them into THIRD_PARTY.
 if [ ! -d "${OPENBLAS_DIR}" ]; then 
 	message "download OpenBLAS"
 	git_check "${OPENBLAS_GIT_URL}" "${OPENBLAS_DIR}"
 fi
 # The installed shared library doubles as the "already built" marker.
 if [ ! -f "${THIRD_PARTY}/lib/libopenblas.so" ]; then
 	message "build and install OpenBLAS" 
 	cd ${OPENBLAS_DIR}
 	# Build output is discarded (&>/dev/null). The command is kept as a string and
 	# run through eval so the quoted LDFLAGS value and the redirection are honoured.
 	command="make TARGET=${BLAS_TARGET_NAME} HOSTCC=gcc CC=${CC} USE_THREAD=0 NOFORTRAN=1 CFLAGS=--sysroot=${PI_SYS_ROOT} LDFLAGS=\"-L${PI_SYS_ROOT}/../lib/ -lm\"  &>/dev/null"
 	message $command
 	eval $command 
    message "install it"
 	command="make PREFIX=${THIRD_PARTY} install"
 	message $command
 	$command
 	cd $BASE_DIR
 fi
 # Neither make invocation is checked individually; this is the only failure gate for the OpenBLAS step.
 check_requirements ${THIRD_PARTY}/lib/libopenblas.so
 # Fetch scons-local, which is used below to drive the ARM Compute build.
 if [ ! -d ${SCONS_LOCAL_DIR} ]; then
 	#out file
 	message "download Scons local"
 	download_extract ${SCONS_LOCAL_URL} ${SCONS_LOCAL_DIR}
 fi
 check_requirements ${SCONS_LOCAL_DIR}/scons.py
 # Clone the ARM Compute Library sources at the pinned tag (ARMCOMPUTE_TAG).
 if [ ! -d "${ARMCOMPUTE_DIR}" ]; then 
 	message "download ArmCompute Source" 
 	git_check ${ARMCOMPUTE_GIT_URL} "${ARMCOMPUTE_DIR}" "tags/${ARMCOMPUTE_TAG}" 
 fi
 #build armcompute
 # The static library is the "already built" marker.
 if [ ! -f "${ARMCOMPUTE_DIR}/build/libarm_compute-static.a" ]; then
 message "build arm compute"
 cd ${ARMCOMPUTE_DIR}
 # CC/CXX are set to the host compilers for this one command; the cross tools are
 # selected through toolchain_prefix. Build output is discarded (&>/dev/null).
 command="CC=gcc CXX=g++ python3 ${SCONS_LOCAL_DIR}/scons.py Werror=1 -j$(nproc) toolchain_prefix=${RPI_BIN}- debug=${ARMCOMPUTE_DEBUG}  neon=1 opencl=0 extra_cxx_flags=-fPIC os=linux build=cross_compile arch=${ARMCOMPUTE_TARGET} &>/dev/null"
 message $command
 eval $command
 cd ${BASE_DIR} 
 fi
 # The scons exit status is not inspected; the presence of both static libraries is the failure gate.
 check_requirements "${ARMCOMPUTE_DIR}/build/libarm_compute-static.a" "${ARMCOMPUTE_DIR}/build/libarm_compute_core-static.a"
 # Configure libnd4j with the Raspberry Pi toolchain file, then build it.
 message "build cmake for LIBND4J. output: ${LIBND4J_BUILD_DIR}"
 TOOLCHAIN=${LIBND4J_SRC_DIR}/cmake/rpi.cmake
 # Kept as a string and run through eval so the quoted generator name survives.
 cmake_cmd="${CMAKE}  -G \"Unix Makefiles\"  -B${LIBND4J_BUILD_DIR} -S${LIBND4J_SRC_DIR}  -DCMAKE_BUILD_TYPE=${LIBND4J_BUILD_MODE} -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN} -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DSD_ALL_OPS=true  -DSD_CPU=true -DSD_LIBRARY_NAME=nd4jcpu -DSD_BUILD_TESTS=ON -DSD_ARM_BUILD=true -DOPENBLAS_PATH=${THIRD_PARTY} -DSD_ARCH=${TARGET} -DARMCOMPUTE_ROOT=${ARMCOMPUTE_DIR} -DHELPERS_armcompute=${HAS_ARMCOMPUTE}"
 message $cmake_cmd
 # Stop on a failed configure instead of building a stale or missing build tree.
 if ! eval "$cmake_cmd"; then
 	message "cmake configuration failed"
 	exit -2
 fi
 #build
 message "lets build"
 # If the build directory cannot be entered, make must not run from the current directory.
 cd "${LIBND4J_BUILD_DIR}" || exit -2
 make -j "$(nproc)"
--- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt
@ -52,14 +52,19 @@ elseif(WIN32)
 		set(CMAKE_CXX_FLAGS  " -fPIC")
 	endif()
 else()
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
    set(CMAKE_CXX_FLAGS  " -fPIC")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
    IF(${SD_ARCH} MATCHES "arm*")
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${SD_ARCH}")
    else()
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*")
        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native")
    else()
        set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
    endif()
-
+    endif()
    if (SD_CPU AND SD_SANITIZE)
        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
    else()
@ -130,7 +135,7 @@ if (SD_CPU)
    endif()
 	add_executable(runtests ${TEST_SOURCES})
-	target_link_libraries(runtests samediff_obj ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main)
+	target_link_libraries(runtests samediff_obj ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES}  ${ARMCOMPUTE_LIBRARIES}  gtest gtest_main)
 elseif(SD_CUDA)
 	add_executable(runtests ${TEST_SOURCES})
--- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp
@ -1113,7 +1113,10 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_6) {
    ASSERT_EQ(ND4J_STATUS_OK, result.status());
    auto z = result.at(0);
-
+#if 0    
    exp.printIndexedBuffer("Expected");
    z->printIndexedBuffer("Z");
 #endif    
    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));
@ -1132,7 +1135,10 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_7) {
    ASSERT_EQ(ND4J_STATUS_OK, result.status());
    auto z = result.at(0);
-
+#if 0    
    exp.printIndexedBuffer("Expected");
    z->printIndexedBuffer("Z");
 #endif    
    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));
@ -1151,7 +1157,10 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_8) {
    ASSERT_EQ(ND4J_STATUS_OK, result.status());
    auto z = result.at(0);
-
+#if 0    
    exp.printIndexedBuffer("Expected");
    z->printIndexedBuffer("Z");
 #endif    
    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));
 }
@ -1204,7 +1213,10 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_10) {
    auto* output = results.at(0);
    ASSERT_EQ(Status::OK(), results.status());
-
+#if 0    
    expOutput.printIndexedBuffer("expOutput");
    output->printIndexedBuffer("output");
 #endif    
    ASSERT_TRUE(expOutput.isSameShape(output));
    ASSERT_TRUE(expOutput.equalsTo(output));
 }
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp
@ -244,7 +244,8 @@ TEST_F(DeclarableOpsTests19, test_threshold_encode_decode) {
 #ifdef _RELEASE
 TEST_F(DeclarableOpsTests19, test_threshold_encode_decode_2) {
  // [2,1,135079944,1,1,8192,1,99]
-  auto initial = NDArrayFactory::create<float>('c', {1, 135079944});
+  constexpr int sizeX= 10*1000*1000;
  auto initial = NDArrayFactory::create<float>('c', {1, sizeX});
  initial = 1.0f;
  auto exp = initial.dup();
  auto neg = initial.like();
@ -254,7 +255,7 @@ TEST_F(DeclarableOpsTests19, test_threshold_encode_decode_2) {
  auto enc_result = enc.evaluate({&initial}, {0.5f});
  auto encoded = enc_result.at(1);
-  ASSERT_EQ(135079944 + 4, encoded->lengthOf());
+  ASSERT_EQ(sizeX + 4, encoded->lengthOf());
  ASSERT_NE(exp, initial);
 /*
  for (int e = 0; e < initial.lengthOf(); e++) {
--- a/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp
@ -1,93 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/
 //
 // @author raver119@gmail.com
 //
 #ifndef LIBND4J_SESSIONLOCALTESTS_H
 #define LIBND4J_SESSIONLOCALTESTS_H
 #include "testlayers.h"
 #include <array/NDArrayFactory.h>
 #include <graph/SessionLocalStorage.h>
 using namespace sd::graph;
 // Empty gtest fixture; it only provides the suite name for the TEST_F cases below.
 class SessionLocalTests : public testing::Test {
 public:
 };
 // Starts four sessions on one SessionLocalStorage, expects the session count to be 4,
 // then ends them and expects the count to drop back to 0.
 TEST_F(SessionLocalTests, BasicTests_1) {
    VariableSpace variableSpace;
    SessionLocalStorage storage(&variableSpace, nullptr);
    // Nothing is checked when OpenMP reports at most one available thread.
    if (omp_get_max_threads() <= 1)
        return;
    // NOTE(review): PRAGMA_OMP_PARALLEL_FOR_THREADS is defined elsewhere; presumably it
    // runs the following loop as an OpenMP parallel-for on 4 threads -- confirm.
    PRAGMA_OMP_PARALLEL_FOR_THREADS(4)
    for (int e = 0; e < 4; e++) {
        storage.startSession();
    }
    ASSERT_EQ(4, storage.numberOfSessions());
    PRAGMA_OMP_PARALLEL_FOR_THREADS(4)
    for (int e = 0; e < 4; e++) {
        storage.endSession();
    }
    ASSERT_EQ(0, storage.numberOfSessions());
 }
 // Registers one zero-filled 5x5 float array as variable -1, lets four loop iterations each
 // start a session and add (e+1) to the array obtained from their local variable space,
 // then checks that consecutive sessions hold different first elements.
 TEST_F(SessionLocalTests, BasicTests_2) {
    VariableSpace variableSpace;
    SessionLocalStorage storage(&variableSpace, nullptr);
    // Nothing is checked when OpenMP reports at most one available thread.
    if (omp_get_max_threads() <= 1)
        return;
    auto alpha = sd::NDArrayFactory::create_<float>('c',{5,5});
    alpha->assign(0.0);
    variableSpace.putVariable(-1, alpha);
    // NOTE(review): macro defined elsewhere; presumably an OpenMP parallel-for on 4 threads -- confirm.
    PRAGMA_OMP_PARALLEL_FOR_THREADS(4)
    for (int e = 0; e < 4; e++) {
        storage.startSession();
        auto varSpace = storage.localVariableSpace();
        auto arr = varSpace->getVariable(-1)->getNDArray();
        // In-place update: the array is both the input and the target of applyScalar.
        arr->applyScalar(sd::scalar::Add, (float) e+1, *arr);
    }
    float lastValue = 0.0f;
    // NOTE(review): assumes the sessions started above are addressable as ids 1..4 -- confirm.
    for (int e = 1; e <= 4; e++) {
        auto varSpace = storage.localVariableSpace((Nd4jLong) e);
        auto arr = varSpace->getVariable(-1)->getNDArray();
        //nd4j_printf("Last value: %f; Current value: %f\n", lastValue, arr->e(0));
        // Each session's first element must differ from the previous session's.
        ASSERT_NE(lastValue, arr->e<float>(0));
        lastValue = arr->e<float>(0);
    }
 }
 #endif //LIBND4J_SESSIONLOCALTESTS_H
--- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
+++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
@ -45,6 +45,21 @@ if ("${BUILD_MKLDNN}")
    set(MKLDNN dnnl)
 endif()
 # Optional ARM Compute Library support, switched on by HELPERS_armcompute.
 if (${HELPERS_armcompute})
     find_package(ARMCOMPUTE REQUIRED)
     if(ARMCOMPUTE_FOUND)
         message("Found ARMCOMPUTE: ${ARMCOMPUTE_LIBRARIES}")

         # Make the library headers visible to every target in this directory.
         include_directories(${ARMCOMPUTE_INCLUDE})

         # Compile everything with NEON enabled.
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")

         # Preprocessor switch the sources test for the NEON backend.
         add_definitions(-DARMCOMPUTENEON_ENABLED)

         # Flag consulted later in this file when collecting the armcompute platform sources.
         set(HAVE_ARMCOMPUTE 1)
     endif()
 endif()
 # Download and unpack flatbuffers at configure time
 configure_file(../../CMakeLists.txt.in flatbuffers-download/CMakeLists.txt)
 execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
@ -217,6 +232,10 @@ if ("${BUILD_MKLDNN}")
    file(GLOB_RECURSE CUSTOMOPS_PLATFORM_SOURCES false ../../include/ops/declarable/platform/mkldnn/*.cpp)
 endif()
 # Collect the ARM Compute platform helper sources for the test executable.
 if(HAVE_ARMCOMPUTE)
    # The include tree is reached through ../../include from this directory, the same
    # prefix the mkldnn glob in this file uses; with ../include the glob matched nothing.
    file(GLOB_RECURSE CUSTOMOPS_ARMCOMPUTE_SOURCES false ../../include/ops/declarable/platform/armcompute/*.cpp ../../include/ops/declarable/platform/armcompute/armcomputeUtils.h)
 endif()
 message("CPU backend")
 add_definitions(-D__CPUBLAS__=true)
@ -276,8 +295,9 @@ endforeach(TMP_PATH)
 add_executable(runtests ${LOOPS_SOURCES} ${LEGACY_SOURCES} ${EXEC_SOURCES} ${HELPERS_SOURCES}  ${ARRAY_SOURCES} ${TYPES_SOURCES}
-    ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
+    ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} 
    ${CUSTOMOPS_ARMCOMPUTE_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
    ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES})
-target_link_libraries(runtests gtest ${MKLDNN} gtest_main ${BLAS_LIBRARIES})
+target_link_libraries(runtests gtest ${MKLDNN} ${ARMCOMPUTE_LIBRARIES} gtest_main ${BLAS_LIBRARIES})