
291 lines
9.6 KiB

/*
 * ******************************************************************************
 * *
 * *
 * * This program and the accompanying materials are made available under the
 * * terms of the Apache License, Version 2.0 which is available at
 * * https://www.apache.org/licenses/LICENSE-2.0.
 * *
 * * See the NOTICE file distributed with this work for additional
 * * information regarding copyright ownership.
 * * Unless required by applicable law or agreed to in writing, software
 * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * * License for the specific language governing permissions and limitations
 * * under the License.
 * *
 * * SPDX-License-Identifier: Apache-2.0
 * *****************************************************************************
 */
// Created by Abdelrauf 2020
#include <ops/declarable/PlatformHelper.h>
#include <ops/declarable/OpRegistrator.h>
#include <system/platform_boilerplate.h>
#include <ops/declarable/helpers/convolutions.h>
#include <cstdint>
#include <helpers/LoopsCoordsHelper.h>
#include "armcomputeUtils.h"
namespace sd {
namespace ops {
namespace platforms {
/**
 * Translates an sd::DataType into the corresponding Arm_DataType.
 *
 * Every case returns directly, so no case can fall through into the next one
 * and clobber the result.
 *
 * @param dType data type of an NDArray
 * @return the matching Arm_DataType, or Arm_DataType::UNKNOWN when dType has
 *         no mapping listed below
 */
Arm_DataType getArmType ( const DataType &dType){
    switch (dType) {
        case HALF:
            return Arm_DataType::F16;
        case FLOAT32:
            return Arm_DataType::F32;
        case DOUBLE:
            return Arm_DataType::F64;
        case INT8:
            return Arm_DataType::S8;
        case INT16:
            return Arm_DataType::S16;
        case INT32:
            return Arm_DataType::S32;
        case INT64:
            return Arm_DataType::S64;
        case UINT8:
            return Arm_DataType::U8;
        case UINT16:
            return Arm_DataType::U16;
        case UINT32:
            return Arm_DataType::U32;
        case UINT64:
            return Arm_DataType::U64;
        case BFLOAT16:
            return Arm_DataType::BFLOAT16;
        default:
            // anything not listed above cannot be handed to armcompute
            return Arm_DataType::UNKNOWN;
    }
}
/**
 * Checks whether an NDArray can be described to armcompute by this file's helpers.
 *
 * The array qualifies when all of the following hold:
 *  - its data type maps to something other than Arm_DataType::UNKNOWN,
 *  - its rank is between 1 and arm_compute::MAX_DIMS,
 *  - it is 'c' ordered,
 *  - the stride of its last dimension is 1.
 *
 * @param arr array to inspect
 * @return true when every condition above is satisfied
 */
bool isArmcomputeFriendly(const NDArray& arr) {
    const int rank = static_cast<int>(arr.rankOf());
    // A rank-0 array has no last dimension; indexing the strides with rank - 1
    // would read element -1, so reject it before touching the stride array.
    if (rank < 1 || rank > static_cast<int>(arm_compute::MAX_DIMS)) {
        return false;
    }
    if (arr.ordering() != 'c') {
        return false;
    }
    if (getArmType(arr.dataType()) == Arm_DataType::UNKNOWN) {
        return false;
    }
    // 'c' order is guaranteed here, so the innermost dimension is the last one
    const auto arrStrides = arr.stridesOf();
    return arrStrides[rank - 1] == 1;
}
/**
 * Builds an Arm_TensorInfo from a raw shape.
 *
 * The dimensions are written in reverse order (bases[rank - 1] becomes
 * dimension 0) and every remaining slot up to arm_compute::MAX_DIMS is set to 1.
 *
 * @param rank        number of entries in bases
 *                    (NOTE(review): not checked against arm_compute::MAX_DIMS
 *                    here - callers presumably guarantee it; confirm)
 * @param bases       shape, one extent per dimension
 * @param ndArrayType element type, converted with getArmType
 * @param layout      data layout forwarded to the Arm_TensorInfo constructor
 * @return tensor info with a single channel
 */
Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases,sd::DataType ndArrayType, arm_compute::DataLayout layout) {
    constexpr int numChannels = 1;
    const auto armType = getArmType(ndArrayType);

    Arm_TensorShape shape;
    // reverse the dimension order while copying
    for (int dim = 0; dim < rank; ++dim) {
        shape[dim] = static_cast<uint32_t>(bases[rank - 1 - dim]);
    }
    // fill the rest unused with 1
    for (int dim = rank; dim < arm_compute::MAX_DIMS; ++dim) {
        shape[dim] = 1;
    }

    return Arm_TensorInfo(shape, numChannels, armType, layout);
}
/**
 * Builds an Arm_TensorInfo describing the shape, byte strides, offset and
 * total size of an existing NDArray.
 *
 * Shape and strides are written in reverse dimension order (the last NDArray
 * dimension becomes dimension 0); strides are converted from elements to
 * bytes. Unused shape slots up to arm_compute::MAX_DIMS are set to 1.
 *
 * @param arr    array to describe
 * @param layout accepted for signature parity with the other helpers; this
 *               overload does not read it
 * @return initialized tensor info with a single channel
 */
Arm_TensorInfo getArmTensorInfo(const NDArray& arr,
    arm_compute::DataLayout layout) {
    auto dType = getArmType(arr.dataType());

    internal_print_nd_shape(arr,"shape") ;
    internal_print_nd_array(arr,"data") ;

    constexpr int numChannels = 1;
    const int rank = static_cast<int>(arr.rankOf());
    const auto bases = arr.shapeOf();
    const auto arrStrides = arr.stridesOf();

    // note: underhood it is stored as std::array<T, num_max_dimensions> _id;
    // TensorShape is derived from Dimensions<uint32_t>
    // as well as Strides : public Dimensions<uint32_t>
    Arm_TensorShape shape;
    Arm_Strides strides;
    const size_t element_size = arr.sizeOfT();

    for (int dim = 0; dim < rank; ++dim) {
        const int src = rank - 1 - dim;
        shape[dim] = static_cast<uint32_t>(bases[src]);
        // NDArray strides are counted in elements, armcompute expects bytes
        strides[dim] = static_cast<uint32_t>(arrStrides[src] * element_size);
    }
    // fill the rest unused with 1
    for (int dim = rank; dim < arm_compute::MAX_DIMS; ++dim) {
        shape[dim] = 1;
    }

    size_t total_size = arr.lengthOf() * element_size;
    size_t offset = 0;
    if (arr.hasPaddedBuffer()) {
        // a padded buffer is described by its whole byte length plus the
        // position of the first element inside it
        internal_printf("---has padded buffer %d\n",0);
        total_size = arr.getDataBuffer()->getLenInBytes();
        offset = arr.bufferOffset() * element_size;
    }
    internal_printf(":: offset %d el size %d arr.getDataBuffer()->getLenInBytes() %d lengthof %d \n",(int)arr.bufferOffset(), (int)element_size, (int)arr.getDataBuffer()->getLenInBytes(), (int)arr.lengthOf());

    Arm_TensorInfo info;
    info.init(shape, numChannels, dType, strides, offset, total_size);
    return info;
}
/**
 * Wraps the memory of an NDArray in an Arm_Tensor without copying it.
 *
 * The tensor is configured from getArmTensorInfo(arr, layout) and the array's
 * primary buffer is imported as its backing memory.
 *
 * @param arr    array whose buffer backs the returned tensor; it has to stay
 *               alive for as long as the tensor is used
 * @param layout forwarded to getArmTensorInfo
 * @return tensor that views the array's buffer
 */
Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout) {
    // - Ownership of the backing memory is not transferred to the tensor itself.
    // - The tensor mustn't be memory managed.
    // - Padding requirements should be accounted by the client code.
    // In other words, if padding is required by the tensor after the function
    // configuration step, then the imported backing memory should account for it.
    // Padding can be checked through the TensorInfo::padding() interface.
    auto info = getArmTensorInfo(arr, layout);
    Arm_Tensor tensor;
    // NOTE(review): assumes Arm_Tensor is arm_compute::Tensor, whose allocator
    // exposes init() and import_memory() - confirm against armcomputeUtils.h
    tensor.allocator()->init(info);
    // get without offset: the offset is already part of the tensor info
    void* buff = arr.getDataBuffer()->primary();
    // Import existing pointer as backing memory
    tensor.allocator()->import_memory(buff);
    return tensor;
}
void copyFromTensor(const Arm_Tensor& inTensor, sd::NDArray& output) {
//only for C order
if (output.ordering() != 'c') return;
const Nd4jLong* shapeInfo = output.shapeInfo();
const Nd4jLong* bases = &(shapeInfo[1]);
const Nd4jLong rank = shapeInfo[0];
const Nd4jLong* strides = output.stridesOf();
int width = bases[rank - 1];
uint8_t* outputBuffer = (uint8_t*)output.buffer();
size_t offset = 0;
arm_compute::Window window;
arm_compute::Iterator tensor_it(&inTensor, window);
int element_size =>element_size();
window.use_tensor_dimensions(>tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
if (output.ews() == 1) {
auto copySize = width * element_size;
auto dest = outputBuffer;
arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
auto src = tensor_it.ptr();
memcpy(dest, src, copySize);
dest += copySize;
else {
Nd4jLong coords[MAX_RANK] = {};
auto copySize = width * element_size;
arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
auto src = tensor_it.ptr();
auto dest = outputBuffer + offset * element_size;
memcpy(dest, src, copySize);
offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
/**
 * Copies the contents of an NDArray into an armcompute tensor.
 *
 * The tensor is walked with a window that starts at DimY, so every step of
 * the loop copies one run of `width` elements (the extent of the array's last
 * dimension) with a single memcpy. Only 'c' ordered inputs are handled;
 * any other ordering returns without copying anything.
 *
 * @param input     source array
 * @param outTensor destination tensor
 */
void copyToTensor(const sd::NDArray& input, Arm_Tensor& outTensor) {
    //only for C order
    if (input.ordering() != 'c') return;

    const Nd4jLong* shapeInfo = input.shapeInfo();
    const Nd4jLong* bases = &(shapeInfo[1]);
    const Nd4jLong rank = shapeInfo[0];
    const Nd4jLong* strides = input.stridesOf();
    uint8_t *inputBuffer = (uint8_t*)input.buffer();
    const int width = bases[rank - 1];
    size_t offset = 0;

    const int element_size = outTensor.info()->element_size();
    const auto copySize = width * element_size;

    arm_compute::Window window;
    window.use_tensor_dimensions(outTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);
    arm_compute::Iterator tensor_it(&outTensor, window);

    if (input.ews() == 1) {
        // source is dense: rows are read back to back
        auto src = inputBuffer;
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
            {
                auto dest = tensor_it.ptr();
                memcpy(dest,src, copySize);
                src += copySize;
            },
            // the iterator has to be passed so the loop advances it each step
            tensor_it);
    }
    else {
        // source is strided: the element offset of each row is tracked
        // with inc_coords (NOTE(review): last argument 1 presumably skips the
        // innermost dimension - confirm against LoopsCoordsHelper.h)
        Nd4jLong coords[MAX_RANK] = {};
        arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates&)
            {
                auto dest = tensor_it.ptr();
                auto src = inputBuffer + offset * element_size;
                memcpy(dest, src, copySize);
                offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
            },
            tensor_it);
    }
}
// armcompute should be built with debug option
/**
 * Writes a tensor's metadata to std::cout for debugging: total size, the
 * extent of each of the arm_compute::MAX_DIMS dimensions, the byte strides
 * and the padding on all four sides.
 *
 * @param tensor tensor to describe
 * @param msg    label printed before the metadata and again before the
 *               trailing section
 */
void print_tensor(Arm_ITensor& tensor, const char* msg) {
    auto info = tensor.info();
    auto padding = info->padding();
    std::cout << msg << "\ntotal: " << info->total_size() << "\n";

    for (size_t dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
        std::cout << info->dimension(dim) << ",";
    }
    std::cout << std::endl;

    for (size_t dim = 0; dim < arm_compute::MAX_DIMS; ++dim) {
        std::cout << info->strides_in_bytes()[dim] << ",";
    }
    std::cout << "\npadding: l " << padding.left << ", r " << padding.right
        << ", t " << padding.top << ", b " << padding.bottom << std::endl;

    //note it did not print correctly for NHWC
    std::cout << msg << ":\n";
    // NOTE(review): a dump of the tensor contents (e.g. tensor.print(std::cout),
    // which presumably needs the debug build mentioned above) looks like it
    // belongs here - confirm before adding it.
    std::cout << std::endl;
}