// cavis/libnd4j/include/ops/declarable/platform/armcompute/armcomputeUtils.cpp

/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  * See the NOTICE file distributed with this work for additional
 *  * information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

 // Created by Abdelrauf 2020


#include <ops/declarable/PlatformHelper.h>
#include <ops/declarable/OpRegistrator.h>
#include <system/platform_boilerplate.h> 
#include <ops/declarable/helpers/convolutions.h>
#include <cstdint>
#include <helpers/LoopsCoordsHelper.h>

#include "armcomputeUtils.h"


namespace sd      {
namespace ops       {
namespace platforms {


/**
 * Translates an sd::DataType value into the matching Arm Compute data type.
 *
 * @param dType  data type to translate
 * @return the corresponding Arm_DataType, or Arm_DataType::UNKNOWN for any
 *         value that is not one of the cases listed below
 */
Arm_DataType getArmType ( const DataType &dType){
    switch (dType) {
        // floating point
        case HALF:     return Arm_DataType::F16;
        case FLOAT32:  return Arm_DataType::F32;
        case DOUBLE:   return Arm_DataType::F64;
        case BFLOAT16: return Arm_DataType::BFLOAT16;
        // signed integers
        case INT8:     return Arm_DataType::S8;
        case INT16:    return Arm_DataType::S16;
        case INT32:    return Arm_DataType::S32;
        case INT64:    return Arm_DataType::S64;
        // unsigned integers
        case UINT8:    return Arm_DataType::U8;
        case UINT16:   return Arm_DataType::U16;
        case UINT32:   return Arm_DataType::U32;
        case UINT64:   return Arm_DataType::U64;
        // everything else has no mapping
        default:       return Arm_DataType::UNKNOWN;
    }
}

/**
 * Tells whether an NDArray satisfies the conditions this file requires before
 * it is handed to the Arm Compute Library.
 *
 * All of the following must hold:
 *  - the array is in 'c' order,
 *  - its data type maps to something other than Arm_DataType::UNKNOWN,
 *  - its rank is between 1 and arm_compute::MAX_DIMS (inclusive),
 *  - the stride of its last dimension is 1.
 *
 * @param arr  array to inspect
 * @return true when every condition above is met, false otherwise
 */
bool isArmcomputeFriendly(const NDArray& arr) {
  if (arr.ordering() != 'c') return false;
  if (getArmType(arr.dataType()) == Arm_DataType::UNKNOWN) return false;

  const int rank = static_cast<int>(arr.rankOf());
  // A rank-0 array has no last dimension, so there is no stride to inspect;
  // rejecting it here also keeps the index below from becoming -1.
  if (rank < 1) return false;
  if (static_cast<size_t>(rank) > arm_compute::MAX_DIMS) return false;

  // 'c' order is guaranteed at this point, so the innermost stride is the last one.
  return arr.stridesOf()[rank - 1] == 1;
}

/**
 * Builds an Arm tensor descriptor from a raw list of dimension sizes.
 *
 * The dimensions are written in reverse: Arm index i receives bases[rank - 1 - i].
 * Every remaining slot up to arm_compute::MAX_DIMS is set to 1.
 *
 * @param rank         number of entries read from bases
 * @param bases        dimension sizes; at least rank entries are read
 * @param ndArrayType  element type, translated with getArmType
 * @param layout       data layout passed through to the descriptor
 * @return descriptor built from the shape, a single channel, the translated type and layout
 */
Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases,sd::DataType ndArrayType, arm_compute::DataLayout layout) {
    constexpr int channelCount = 1;
    const auto armType = getArmType(ndArrayType);

    Arm_TensorShape armShape;
    armShape.set_num_dimensions(rank);

    // reversed copy of the dimension sizes
    for (int armDim = 0; armDim < rank; ++armDim) {
        const int ndDim = rank - 1 - armDim;
        armShape[armDim] = static_cast<uint32_t>(bases[ndDim]);
    }

    // slots beyond the rank are padded with 1
    for (int armDim = rank; armDim < arm_compute::MAX_DIMS; ++armDim) {
        armShape[armDim] = 1;
    }

    return Arm_TensorInfo(armShape, channelCount, armType, layout);
}

/**
 * Builds an Arm tensor descriptor (shape, byte strides, offset, total size)
 * that describes the memory of an existing NDArray.
 *
 * Shape and strides are written in reverse dimension order, and strides are
 * converted from element counts to bytes. Shape slots beyond the rank are set to 1.
 *
 * Offset and total size depend on arr.hasPaddedBuffer():
 *  - padded:     total size is the byte length of the whole data buffer and the
 *                offset is bufferOffset() * element size;
 *  - not padded: total size is lengthOf() * element size and the offset is 0.
 *
 * @param arr     array to describe
 * @param layout  data layout stored on the descriptor
 * @return the initialised descriptor
 */
Arm_TensorInfo getArmTensorInfo(const NDArray& arr,
                                arm_compute::DataLayout layout) {
  const auto armType = getArmType(arr.dataType());

  // debug helpers declared outside this file
  internal_print_nd_shape(arr,"shape")  ;
  internal_print_nd_array(arr,"data")  ;

  constexpr int channelCount = 1;
  const int rank = static_cast<int>(arr.rankOf());
  const auto dims = arr.shapeOf();
  const auto elemStrides = arr.stridesOf();
  const size_t elemBytes = arr.sizeOfT();

  // https://arm-software.github.io/ComputeLibrary/v20.05/_dimensions_8h_source.xhtml
  // TensorShape and Strides both derive from Dimensions<uint32_t>, which keeps
  // its values in a std::array<T, num_max_dimensions>.
  Arm_TensorShape armShape;
  Arm_Strides armStrides;
  armShape.set_num_dimensions(rank);
  armStrides.set_num_dimensions(rank);

  // reversed copy; strides go from element units to bytes
  for (int armDim = 0; armDim < rank; ++armDim) {
    const int ndDim = rank - 1 - armDim;
    armShape[armDim] = static_cast<uint32_t>(dims[ndDim]);
    armStrides[armDim] = static_cast<uint32_t>(elemStrides[ndDim] * elemBytes);
  }

  // shape slots beyond the rank are padded with 1
  for (int armDim = rank; armDim < arm_compute::MAX_DIMS; ++armDim) {
    armShape[armDim] = 1;
  }

  size_t totalBytes = arr.lengthOf() * elemBytes;
  size_t firstElementOffset = 0;
  if (arr.hasPaddedBuffer()) {
      internal_printf("---has padded buffer %d\n",0);
      totalBytes = arr.getDataBuffer()->getLenInBytes();
      firstElementOffset = arr.bufferOffset() * elemBytes;
  }
  internal_printf(":: offset %d el size %d  arr.getDataBuffer()->getLenInBytes() %d lengthof %d \n",static_cast<int>(arr.bufferOffset()), static_cast<int>(elemBytes), static_cast<int>(arr.getDataBuffer()->getLenInBytes()),  static_cast<int>(arr.lengthOf()));

  Arm_TensorInfo descriptor;
  descriptor.init(armShape, channelCount, armType, armStrides, firstElementOffset, totalBytes);
  descriptor.set_data_layout(layout);
  return descriptor;
}

/**
 * Wraps the memory of an NDArray in an Arm tensor without copying it.
 *
 * Notes carried over from the import_memory contract:
 *  - the tensor does not take ownership of the backing memory;
 *  - the tensor must not be memory managed;
 *  - padding is the caller's responsibility: if the tensor requires padding after
 *    a function's configuration step, the imported memory must already provide it
 *    (padding can be inspected through TensorInfo::padding()).
 *
 * @param arr     array whose data buffer backs the tensor
 * @param layout  data layout for the tensor descriptor
 * @return tensor initialised from getArmTensorInfo(arr, layout) with arr's buffer imported
 */
Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout) {
  Arm_Tensor wrapped;
  wrapped.allocator()->init(getArmTensorInfo(arr, layout));

  // get without offset
  void* backingMemory = arr.getDataBuffer()->primary();
  // NOTE(review): the value returned by import_memory is not inspected.
  wrapped.allocator()->import_memory(backingMemory);

  return wrapped;
}

/**
 * Copies the contents of an Arm tensor into an NDArray, one innermost row at a time.
 *
 * The tensor is walked with a window that starts at Window::DimY, so the lambda
 * runs once per row. Each run copies (last dimension of output) * (tensor element
 * size) bytes with memcpy.
 *
 * Nothing is copied, and no error is reported, when output is not in 'c' order
 * or has rank 0.
 *
 * NOTE(review): a whole row is copied with a single memcpy, which presumes the
 * innermost dimension of output is densely packed and that output's element size
 * equals the tensor's; neither is checked here — confirm at the call sites.
 *
 * @param inTensor  source tensor
 * @param output    destination array
 */
void copyFromTensor(const Arm_Tensor& inTensor, sd::NDArray& output) {
    //only for C order
    if (output.ordering() != 'c') return;

    const Nd4jLong* shapeInfo = output.shapeInfo();
    const Nd4jLong rank = shapeInfo[0];
    // a rank-0 array has no innermost dimension to take a row width from
    if (rank < 1) return;

    const Nd4jLong* bases = &(shapeInfo[1]);
    const Nd4jLong* strides = output.stridesOf();
    uint8_t* outputBuffer = static_cast<uint8_t*>(output.buffer());

    // byte count of one row, computed in size_t so large rows cannot overflow an int
    const size_t element_size = inTensor.info()->element_size();
    const size_t copySize = static_cast<size_t>(bases[rank - 1]) * element_size;

    // Configure the window before the iterator is built from it.
    arm_compute::Window window;
    window.use_tensor_dimensions(inTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
    arm_compute::Iterator tensor_it(&inTensor, window);

    if (output.ews() == 1) {
        // destination rows follow each other directly
        uint8_t* dest = outputBuffer;
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
            {
                memcpy(dest, tensor_it.ptr(), copySize);
                dest += copySize;
            },
            tensor_it);
    }
    else {
        // destination rows are located through the array's strides;
        // inc_coords is called with skip = 1 and yields the next offset in elements
        Nd4jLong coords[MAX_RANK] = {};
        size_t offset = 0;
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
            {
                memcpy(outputBuffer + offset * element_size, tensor_it.ptr(), copySize);
                offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
            },
            tensor_it);
    }
}

/**
 * Copies the contents of an NDArray into an Arm tensor, one innermost row at a time.
 *
 * The tensor is walked with a window that starts at Window::DimY, so the lambda
 * runs once per row. Each run copies (last dimension of input) * (tensor element
 * size) bytes with memcpy.
 *
 * Nothing is copied, and no error is reported, when input is not in 'c' order
 * or has rank 0.
 *
 * NOTE(review): a whole row is copied with a single memcpy, which presumes the
 * innermost dimension of input is densely packed and that input's element size
 * equals the tensor's; neither is checked here — confirm at the call sites.
 *
 * @param input      source array (only read)
 * @param outTensor  destination tensor
 */
void copyToTensor(const sd::NDArray& input, Arm_Tensor& outTensor) {
    //only for C order
    if (input.ordering() != 'c') return;

    const Nd4jLong* shapeInfo = input.shapeInfo();
    const Nd4jLong rank = shapeInfo[0];
    // a rank-0 array has no innermost dimension to take a row width from
    if (rank < 1) return;

    const Nd4jLong* bases = &(shapeInfo[1]);
    const Nd4jLong* strides = input.stridesOf();
    // the source is only read, so keep it const instead of casting constness away
    const uint8_t* inputBuffer = static_cast<const uint8_t*>(input.buffer());

    // byte count of one row, computed in size_t so large rows cannot overflow an int
    const size_t element_size = outTensor.info()->element_size();
    const size_t copySize = static_cast<size_t>(bases[rank - 1]) * element_size;

    // Configure the window before the iterator is built from it.
    arm_compute::Window window;
    window.use_tensor_dimensions(outTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
    arm_compute::Iterator tensor_it(&outTensor, window);

    if (input.ews() == 1) {
        // source rows follow each other directly
        const uint8_t* src = inputBuffer;
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
         {
             memcpy(tensor_it.ptr(), src, copySize);
             src += copySize;
         },
         tensor_it);
    }
    else {
        // source rows are located through the array's strides;
        // inc_coords is called with skip = 1 and yields the next offset in elements
        Nd4jLong coords[MAX_RANK] = {};
        size_t offset = 0;
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
         {
             memcpy(tensor_it.ptr(), inputBuffer + offset * element_size, copySize);
             offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
         },
         tensor_it);
    }
}


/**
 * Writes diagnostic information about an Arm tensor to std::cout: the message,
 * the total size, every dimension and byte stride up to arm_compute::MAX_DIMS,
 * and the four padding values.
 *
 * The tensor contents are printed only when ARM_COMPUTE_ASSERTS_ENABLED is
 * defined, i.e. armcompute should be built with the debug option for that part.
 *
 * @param tensor  tensor to describe
 * @param msg     label printed before the information
 */
void print_tensor(Arm_ITensor& tensor, const char* msg) {
  const auto meta = tensor.info();
  const auto pad = meta->padding();

  std::cout << msg << "\ntotal: " << meta->total_size() << "\n";

  // dimension sizes, comma separated
  for (int dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
    std::cout << meta->dimension(dim) << ",";
  }
  std::cout << std::endl;

  // strides in bytes, comma separated
  for (int dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
    std::cout << meta->strides_in_bytes()[dim] << ",";
  }

  std::cout << "\npadding: l " << pad.left
            << ", r " << pad.right
            << ", t " << pad.top
            << ", b " << pad.bottom
            << std::endl;

#ifdef ARM_COMPUTE_ASSERTS_ENABLED
  // note: the contents were not printed correctly for NHWC
  std::cout << msg << ":\n";
  tensor.print(std::cout);
  std::cout << std::endl;
#endif
}

}
}
}
Update copyrights remove attic and relocate elsewhere 2021-02-09 05:16:31 +01:00			`/*`
			`* ******************************************************************************`
			`* *`
			`* *`
			`* * This program and the accompanying materials are made available under the`
			`* * terms of the Apache License, Version 2.0 which is available at`
			`* * https://www.apache.org/licenses/LICENSE-2.0.`
			`* *`
			`* * See the NOTICE file distributed with this work for additional`
			`* * information regarding copyright ownership.`
			`* * Unless required by applicable law or agreed to in writing, software`
			`* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* * License for the specific language governing permissions and limitations`
			`* * under the License.`
			`* *`
			`* * SPDX-License-Identifier: Apache-2.0`
			`* *****************************************************************************`
			`*/`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00
			`// Created by Abdelrauf 2020`


			`#include <ops/declarable/PlatformHelper.h>`
			`#include <ops/declarable/OpRegistrator.h>`
			`#include <system/platform_boilerplate.h>`
			`#include <ops/declarable/helpers/convolutions.h>`
			`#include <cstdint>`
			`#include <helpers/LoopsCoordsHelper.h>`

			`#include "armcomputeUtils.h"`


			`namespace sd {`
			`namespace ops {`
			`namespace platforms {`



			`Arm_DataType getArmType ( const DataType &dType){`
			`Arm_DataType ret;`
			`switch (dType){`
			`case HALF :`
			`ret = Arm_DataType::F16;`
			`break;`
			`case FLOAT32 :`
			`ret = Arm_DataType::F32;`
			`break;`
			`case DOUBLE :`
			`ret = Arm_DataType::F64;`
			`break;`
			`case INT8 :`
			`ret = Arm_DataType::S8;`
			`break;`
			`case INT16 :`
			`ret = Arm_DataType::S16;`
			`break;`
			`case INT32 :`
			`ret = Arm_DataType::S32;`
			`break;`
			`case INT64 :`
			`ret = Arm_DataType::S64;`
			`break;`
			`case UINT8 :`
			`ret = Arm_DataType::U8;`
			`break;`
			`case UINT16 :`
			`ret = Arm_DataType::U16;`
			`break;`
			`case UINT32 :`
			`ret = Arm_DataType::U32;`
			`break;`
			`case UINT64 :`
			`ret = Arm_DataType::U64;`
			`break;`
			`case BFLOAT16 :`
			`ret = Arm_DataType::BFLOAT16;`
			`break;`
			`default:`
			`ret = Arm_DataType::UNKNOWN;`
			`};`

			`return ret;`
			`}`
Dev commits 2021-02-01 06:31:20 +01:00
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`bool isArmcomputeFriendly(const NDArray& arr) {`
Dev commits 2021-02-01 06:31:20 +01:00			`auto dType = getArmType(arr.dataType());`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`int rank = (int)(arr.rankOf());`
Dev commits 2021-02-01 06:31:20 +01:00			`int ind = arr.ordering() == 'c' ? rank-1 : 0;`
			`auto arrStrides = arr.stridesOf();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`return dType != Arm_DataType::UNKNOWN &&`
			`rank<=arm_compute::MAX_DIMS &&`
			`arr.ordering() == 'c' &&`
Dev commits 2021-02-01 06:31:20 +01:00			`arrStrides[ind] == 1 ;`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`}`

			`Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases,sd::DataType ndArrayType, arm_compute::DataLayout layout) {`
			`constexpr int numChannels = 1;`
			`auto dType = getArmType(ndArrayType);`

			`Arm_TensorShape shape;`
			`shape.set_num_dimensions(rank);`
			`for (int i = 0, j = rank - 1; i < rank; i++, j--) {`
			`shape[i] = static_cast<uint32_t>(bases[j]);`
			`}`
			`// fill the rest unused with 1`
			`for (int i = rank; i < arm_compute::MAX_DIMS; i++) {`
			`shape[i] = 1;`
			`}`

			`return Arm_TensorInfo(shape, numChannels, dType, layout);`
			`}`

			`Arm_TensorInfo getArmTensorInfo(const NDArray& arr,`
			`arm_compute::DataLayout layout) {`
			`auto dType = getArmType(arr.dataType());`
Dev commits 2021-02-01 06:31:20 +01:00

			`internal_print_nd_shape(arr,"shape") ;`
			`internal_print_nd_array(arr,"data") ;`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`//`
			`constexpr int numChannels = 1;`
			`int rank = (int)(arr.rankOf());`
			`auto bases = arr.shapeOf();`
			`auto arrStrides = arr.stridesOf();`

			`// https://arm-software.github.io/ComputeLibrary/v20.05/_dimensions_8h_source.xhtml`
			`// note: underhood it is stored as std::array<T, num_max_dimensions> _id;`
			`// TensorShape is derived from Dimensions<uint32_t>`
			`// as well as Strides : public Dimensions<uint32_t>`
			`Arm_TensorShape shape;`
			`Arm_Strides strides;`
			`shape.set_num_dimensions(rank);`
			`strides.set_num_dimensions(rank);`
Dev commits 2021-02-01 06:31:20 +01:00			`size_t element_size = arr.sizeOfT();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`for (int i = 0, j = rank - 1; i < rank; i++, j--) {`
			`shape[i] = static_cast<uint32_t>(bases[j]);`
Dev commits 2021-02-01 06:31:20 +01:00			`strides[i] = static_cast<uint32_t>(arrStrides[j] * element_size);`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`}`
			`// fill the rest unused with 1`
			`for (int i = rank; i < arm_compute::MAX_DIMS; i++) {`
			`shape[i] = 1;`
			`}`
Dev commits 2021-02-01 06:31:20 +01:00
			`size_t total_size = arr.lengthOf() * element_size;`
			`size_t offset=0;`
			`//size_t size_ind = rank - 1;`
			`//total_size = shape[size_ind] * strides[size_ind];`
			`if (arr.hasPaddedBuffer()){`
			`internal_printf("---has padded buffer %d\n",0);`
			`total_size = arr.getDataBuffer()->getLenInBytes();`
			`offset = arr.bufferOffset() * element_size;`
			`}`
			`internal_printf(":: offset %d el size %d arr.getDataBuffer()->getLenInBytes() %d lengthof %d \n",(int)arr.bufferOffset(), (int)element_size, (int)arr.getDataBuffer()->getLenInBytes(), (int)arr.lengthOf());`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`Arm_TensorInfo info;`
Dev commits 2021-02-01 06:31:20 +01:00			`info.init(shape, numChannels, dType, strides, offset, total_size);`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`info.set_data_layout(layout);`

			`return info;`
			`}`

			`Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout) {`
			`// - Ownership of the backing memory is not transferred to the tensor itself.`
			`// - The tensor mustn't be memory managed.`
			`// - Padding requirements should be accounted by the client code.`
			`// In other words, if padding is required by the tensor after the function`
			`// configuration step, then the imported backing memory should account for it.`
			`// Padding can be checked through the TensorInfo::padding() interface.`

			`// Import existing pointer as backing memory`
			`auto info = getArmTensorInfo(arr, layout);`
			`Arm_Tensor tensor;`
			`tensor.allocator()->init(info);`
Dev commits 2021-02-01 06:31:20 +01:00			`//get without offset`
			`void* buff = arr.getDataBuffer()->primary();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`tensor.allocator()->import_memory(buff);`
			`return tensor;`
			`}`

Dev commits 2021-02-01 06:31:20 +01:00			`void copyFromTensor(const Arm_Tensor& inTensor, sd::NDArray& output) {`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`//only for C order`
			`if (output.ordering() != 'c') return;`
Dev commits 2021-02-01 06:31:20 +01:00			`const Nd4jLong* shapeInfo = output.shapeInfo();`
			`const Nd4jLong* bases = &(shapeInfo[1]);`
			`const Nd4jLong rank = shapeInfo[0];`
			`const Nd4jLong* strides = output.stridesOf();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`int width = bases[rank - 1];`
			`uint8_t* outputBuffer = (uint8_t*)output.buffer();`
			`size_t offset = 0;`
			`arm_compute::Window window;`
			`arm_compute::Iterator tensor_it(&inTensor, window);`

			`int element_size = inTensor.info()->element_size();`
			`window.use_tensor_dimensions(inTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);`

Dev commits 2021-02-01 06:31:20 +01:00			`if (output.ews() == 1) {`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`auto copySize = width * element_size;`
			`auto dest = outputBuffer;`
			`arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)`
			`{`
			`auto src = tensor_it.ptr();`
			`memcpy(dest, src, copySize);`
			`dest += copySize;`
			`},`
			`tensor_it);`
Dev commits 2021-02-01 06:31:20 +01:00			`}`
			`else {`
			`Nd4jLong coords[MAX_RANK] = {};`
			`auto copySize = width * element_size;`
			`arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)`
			`{`
			`auto src = tensor_it.ptr();`
			`auto dest = outputBuffer + offset * element_size;`
			`memcpy(dest, src, copySize);`
			`offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);`
			`},`
			`tensor_it);`
			`}`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`}`

Dev commits 2021-02-01 06:31:20 +01:00			`void copyToTensor(const sd::NDArray& input, Arm_Tensor& outTensor) {`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`//only for C order`
			`if (input.ordering() != 'c') return;`
Dev commits 2021-02-01 06:31:20 +01:00			`const Nd4jLong* shapeInfo = input.shapeInfo();`
			`const Nd4jLong* bases = &(shapeInfo[1]);`
			`const Nd4jLong rank = shapeInfo[0];`
			`const Nd4jLong* strides = input.stridesOf();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`uint8_t inputBuffer = (uint8_t)input.buffer();`
			`int width = bases[rank - 1];`
			`size_t offset = 0;`
			`arm_compute::Window window;`
			`arm_compute::Iterator tensor_it(&outTensor, window);`
			`int element_size = outTensor.info()->element_size();`

			`window.use_tensor_dimensions(outTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);`

Dev commits 2021-02-01 06:31:20 +01:00			`if (input.ews() == 1) {`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00
Dev commits 2021-02-01 06:31:20 +01:00			`auto copySize = width * element_size;`
			`auto src = inputBuffer;`
			`arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`{`
			`auto dest = tensor_it.ptr();`
			`memcpy(dest,src, copySize);`
			`src += copySize;`
			`},`
			`tensor_it);`
Dev commits 2021-02-01 06:31:20 +01:00			`}`
			`else {`
			`Nd4jLong coords[MAX_RANK] = {};`
			`auto copySize = width * element_size;`
			`arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)`
			`{`
			`auto dest = tensor_it.ptr();`
			`auto src = inputBuffer + offset * element_size;`
			`memcpy(dest, src, copySize);`
			`offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);`
			`},`
			`tensor_it);`
			`}`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`}`


			`// armcompute should be built with debug option`
			`void print_tensor(Arm_ITensor& tensor, const char* msg) {`
Dev commits 2021-02-01 06:31:20 +01:00			`auto info = tensor.info();`
Pi build and initial ArmCompute library support (#494) * - raspberry Pi build and ArmCompute library support - initial ArmCompute platform implementations (Maxpool2d AvgPool2d for float32) Signed-off-by: AbdelRauf <rauf@konduit.ai> * - Build script for pi - small changes Signed-off-by: AbdelRauf <rauf@konduit.ai> 2020-06-26 09:03:46 +02:00			`auto padding = info->padding();`
			`std::cout << msg << "\ntotal: " << info->total_size() << "\n";`

			`for (int i = 0; i < arm_compute::MAX_DIMS; i++) {`
			`std::cout << info->dimension(i) << ",";`
			`}`
			`std::cout << std::endl;`
			`for (int i = 0; i < arm_compute::MAX_DIMS; i++) {`
			`std::cout << info->strides_in_bytes()[i] << ",";`
			`}`
			`std::cout << "\npadding: l " << padding.left << ", r " << padding.right`
			`<< ", t " << padding.top << ", b " << padding.bottom << std::endl;`

			`#ifdef ARM_COMPUTE_ASSERTS_ENABLED`
			`//note it did not print correctly fro NHWC`
			`std::cout << msg << ":\n";`
			`tensor.print(std::cout);`
			`std::cout << std::endl;`
			`#endif`
			`}`

			`}`
			`}`
			`}`