279 lines
9.2 KiB
C++
279 lines
9.2 KiB
C++
|
/*******************************************************************************
|
||
|
* Copyright (c) 2019 Konduit K.K.
|
||
|
* This program and the accompanying materials are made available under the
|
||
|
* terms of the Apache License, Version 2.0 which is available at
|
||
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||
|
* License for the specific language governing permissions and limitations
|
||
|
* under the License.
|
||
|
*
|
||
|
* SPDX-License-Identifier: Apache-2.0
|
||
|
******************************************************************************/
|
||
|
|
||
|
// Created by Abdelrauf 2020
|
||
|
|
||
|
|
||
|
#include <ops/declarable/PlatformHelper.h>
|
||
|
#include <ops/declarable/OpRegistrator.h>
|
||
|
#include <system/platform_boilerplate.h>
|
||
|
#include <ops/declarable/helpers/convolutions.h>
|
||
|
#include <cstdint>
|
||
|
#include <helpers/LoopsCoordsHelper.h>
|
||
|
|
||
|
#include "armcomputeUtils.h"
|
||
|
|
||
|
|
||
|
namespace sd {
|
||
|
namespace ops {
|
||
|
namespace platforms {
|
||
|
|
||
|
|
||
|
|
||
|
// Map an sd::DataType onto the equivalent Arm Compute Library data type.
// Types with no ACL counterpart yield Arm_DataType::UNKNOWN so callers can
// reject them (see isArmcomputeFriendly).
Arm_DataType getArmType ( const DataType &dType){
    switch (dType) {
        case HALF:     return Arm_DataType::F16;
        case FLOAT32:  return Arm_DataType::F32;
        case DOUBLE:   return Arm_DataType::F64;
        case INT8:     return Arm_DataType::S8;
        case INT16:    return Arm_DataType::S16;
        case INT32:    return Arm_DataType::S32;
        case INT64:    return Arm_DataType::S64;
        case UINT8:    return Arm_DataType::U8;
        case UINT16:   return Arm_DataType::U16;
        case UINT32:   return Arm_DataType::U32;
        case UINT64:   return Arm_DataType::U64;
        case BFLOAT16: return Arm_DataType::BFLOAT16;
        default:       return Arm_DataType::UNKNOWN;
    }
}
|
||
|
bool isArmcomputeFriendly(const NDArray& arr) {
|
||
|
auto dType = getArmType(arr.dataType());
|
||
|
int rank = (int)(arr.rankOf());
|
||
|
return dType != Arm_DataType::UNKNOWN &&
|
||
|
rank<=arm_compute::MAX_DIMS &&
|
||
|
arr.ordering() == 'c' &&
|
||
|
arr.ews()==1 &&
|
||
|
shape::strideDescendingCAscendingF(arr.shapeInfo()) == true;
|
||
|
}
|
||
|
|
||
|
// Build an ACL TensorInfo from a raw shape description.
// sd stores shapes outermost-first while ACL dimensions are innermost-first,
// hence the axis reversal below. Unused ACL dimensions are padded with 1.
Arm_TensorInfo getArmTensorInfo(int rank, Nd4jLong* bases,sd::DataType ndArrayType, arm_compute::DataLayout layout) {
    constexpr int numChannels = 1;
    const auto armType = getArmType(ndArrayType);

    Arm_TensorShape armShape;
    armShape.set_num_dimensions(rank);
    // reverse axis order: bases[rank-1] (innermost in sd) becomes armShape[0]
    for (int axis = 0; axis < rank; ++axis) {
        armShape[axis] = static_cast<uint32_t>(bases[rank - 1 - axis]);
    }
    // pad the remaining ACL dimensions with 1
    for (int axis = rank; axis < arm_compute::MAX_DIMS; ++axis) {
        armShape[axis] = 1;
    }

    return Arm_TensorInfo(armShape, numChannels, armType, layout);
}
|
||
|
|
||
|
// Build an ACL TensorInfo that mirrors both the shape AND the byte strides of
// the NDArray, so the array's existing buffer can later be imported into an
// ACL tensor without copying (see getArmTensor).
Arm_TensorInfo getArmTensorInfo(const NDArray& arr,
                                arm_compute::DataLayout layout) {
  auto dType = getArmType(arr.dataType());

  //
  constexpr int numChannels = 1;
  int rank = (int)(arr.rankOf());
  auto bases = arr.shapeOf();
  auto arrStrides = arr.stridesOf();

  // https://arm-software.github.io/ComputeLibrary/v20.05/_dimensions_8h_source.xhtml
  // note: under the hood both are stored as std::array<T, num_max_dimensions> _id;
  // TensorShape is derived from Dimensions<uint32_t>
  // as well as Strides : public Dimensions<uint32_t>
  Arm_TensorShape shape;
  Arm_Strides strides;
  shape.set_num_dimensions(rank);
  strides.set_num_dimensions(rank);
  // sd strides are in elements; ACL strides are in bytes, so scale by the
  // element size of the mapped data type.
  size_t element_size = arm_compute::data_size_from_type(dType);
  // Reverse axis order: sd shapes/strides are outermost-first, ACL
  // dimensions are innermost-first.
  for (int i = 0, j = rank - 1; i < rank; i++, j--) {
    shape[i] = static_cast<uint32_t>(bases[j]);
    strides[i] = static_cast<uint32_t>(arrStrides[j]) * element_size;
  }
  // fill the rest unused with 1
  // NOTE(review): only `shape` is padded here; `strides` entries beyond
  // `rank` are left at their default — presumably Dimensions<uint32_t>
  // zero-initializes them. Verify against arm_compute::Dimensions.
  for (int i = rank; i < arm_compute::MAX_DIMS; i++) {
    shape[i] = 1;
  }
  size_t total_size;
  size_t size_ind = rank - 1;
  // Total buffer size in bytes = outermost extent * its byte stride.
  // NOTE(review): this is valid for the dense C-ordered layouts that
  // isArmcomputeFriendly admits; it would undercount a padded/sub-array
  // layout — confirm callers only pass friendly arrays.
  total_size = shape[size_ind] * strides[size_ind];

  Arm_TensorInfo info;
  // init with explicit strides and zero offset so the imported buffer is
  // interpreted exactly like the NDArray's.
  info.init(shape, numChannels, dType, strides, 0, total_size);
  info.set_data_layout(layout);

  return info;
}
|
||
|
|
||
|
// Wrap an NDArray's existing buffer in an ACL tensor.
// - Ownership of the backing memory is NOT transferred to the tensor.
// - The tensor must not be memory managed.
// - Padding requirements must be accounted for by the client: if the tensor
//   requires padding after function configuration, the imported backing
//   memory must already include it (check TensorInfo::padding()).
Arm_Tensor getArmTensor(const NDArray& arr, arm_compute::DataLayout layout) {
    Arm_Tensor wrapped;
    wrapped.allocator()->init(getArmTensorInfo(arr, layout));
    // Import the existing pointer as backing memory — no copy is made.
    wrapped.allocator()->import_memory((void*)arr.buffer());
    return wrapped;
}
|
||
|
|
||
|
// Copy the contents of an ACL tensor into an NDArray, row by row.
// Only C-ordered outputs are supported; anything else is silently ignored.
// NOTE(review): `dest += copySize` assumes the output buffer is fully
// contiguous (ews == 1) — the ews check is commented out below, so callers
// must guarantee contiguity (see isArmcomputeFriendly). Confirm at call sites.
void copyFromTensor(const Arm_Tensor& inTensor, NDArray& output) {
  //only for C order
  if (output.ordering() != 'c') return;
  auto shapeInfo = output.shapeInfo();
  auto bases = &(shapeInfo[1]);
  Nd4jLong rank = shapeInfo[0];
  auto strides = output.stridesOf();
  // width = extent of the innermost dimension; one memcpy per row of this size
  int width = bases[rank - 1];
  uint8_t* outputBuffer = (uint8_t*)output.buffer();
  size_t offset = 0;
  arm_compute::Window window;
  // NOTE(review): the Iterator is constructed before the window dimensions
  // are configured via use_tensor_dimensions below — verify against ACL's
  // Iterator/Window contract that this ordering is intended.
  arm_compute::Iterator tensor_it(&inTensor, window);

  int element_size = inTensor.info()->element_size();
  // Iterate whole rows: collapse the innermost (X) dimension so the lambda
  // fires once per row starting from DimY.
  window.use_tensor_dimensions(inTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);

  // if (output.ews() == 1) {
  auto copySize = width * element_size;
  auto dest = outputBuffer;
  // For each row of the tensor, memcpy it to the next contiguous chunk of
  // the output buffer.
  arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
  {
  auto src = tensor_it.ptr();
  memcpy(dest, src, copySize);
  dest += copySize;
  },
  tensor_it);
  // }
  // else {
  // Retained (disabled) path for strided sub-arrays; incomplete — see TODO.
  // Nd4jLong coords[MAX_RANK] = {};
  // if(strides[rank-1]!=1){
  // throw std::runtime_error( "not implemented for subarrays whose last stride is not 1");
  // //TODO: implement to work with all subarrays properly
  // }
  // arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
  // {
  // auto src = tensor_it.ptr();
  // auto dest = outputBuffer + offset * element_size;
  // memcpy(dest, src, width * element_size);
  // offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
  // },
  // tensor_it);
  // }
}
|
||
|
|
||
|
// Copy the contents of an NDArray into an ACL tensor, row by row.
// Mirror of copyFromTensor with source and destination swapped.
// Only C-ordered inputs are supported; anything else is silently ignored.
// NOTE(review): `src += copySize` assumes the input buffer is fully
// contiguous (ews == 1) — the ews check is commented out below, so callers
// must guarantee contiguity (see isArmcomputeFriendly). Confirm at call sites.
void copyToTensor(const NDArray& input, Arm_Tensor& outTensor) {
  //only for C order
  if (input.ordering() != 'c') return;
  auto shapeInfo = input.shapeInfo();
  auto bases = &(shapeInfo[1]);
  Nd4jLong rank = shapeInfo[0];
  auto strides = input.stridesOf();
  uint8_t *inputBuffer = (uint8_t*)input.buffer();
  // width = extent of the innermost dimension; one memcpy per row of this size
  int width = bases[rank - 1];
  size_t offset = 0;
  arm_compute::Window window;
  // NOTE(review): the Iterator is constructed before the window dimensions
  // are configured via use_tensor_dimensions below — verify against ACL's
  // Iterator/Window contract that this ordering is intended.
  arm_compute::Iterator tensor_it(&outTensor, window);
  int element_size = outTensor.info()->element_size();

  // Iterate whole rows: collapse the innermost (X) dimension so the lambda
  // fires once per row starting from DimY.
  window.use_tensor_dimensions(outTensor.info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY);

  // if (input.ews() == 1) {

  auto copySize = width * element_size;
  auto src = inputBuffer;
  // For each row of the tensor, memcpy the next contiguous chunk of the
  // input buffer into it.
  arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
  {
  auto dest = tensor_it.ptr();
  memcpy(dest,src, copySize);
  src += copySize;
  },
  tensor_it);
  // }
  // else {
  // Retained (disabled) path for strided sub-arrays; incomplete — note it
  // also lacks the memcpy(dest, src, ...) call, unlike copyFromTensor's
  // disabled branch. See TODO.
  // Nd4jLong coords[MAX_RANK] = {};
  // if(strides[rank-1]!=1){
  // throw std::runtime_error( "not implemented for subarrays whose last stride is not 1");
  // //TODO: implement to work with all subarrays properly
  // }
  // arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates& id)
  // {
  // auto dest = tensor_it.ptr();
  // auto src = inputBuffer + offset * element_size;
  // offset = sd::inc_coords(bases, strides, coords, offset, rank, 1);
  // },
  // tensor_it);
  // }
}
|
||
|
|
||
|
|
||
|
// armcompute should be built with debug option
|
||
|
// Debug helper: dump a tensor's geometry (total size, per-dimension extents,
// byte strides, and padding) to stdout, prefixed by `msg`. Element values are
// printed only when ACL is built with assertions enabled.
void print_tensor(Arm_ITensor& tensor, const char* msg) {
    const auto* info = tensor.info();
    const auto padding = info->padding();
    std::cout << msg << "\ntotal: " << info->total_size() << "\n";

    for (int d = 0; d < arm_compute::MAX_DIMS; ++d) {
        std::cout << info->dimension(d) << ",";
    }
    std::cout << std::endl;

    for (int d = 0; d < arm_compute::MAX_DIMS; ++d) {
        std::cout << info->strides_in_bytes()[d] << ",";
    }
    std::cout << "\npadding: l " << padding.left << ", r " << padding.right
              << ", t " << padding.top << ", b " << padding.bottom << std::endl;

#ifdef ARM_COMPUTE_ASSERTS_ENABLED
    // note: element printing was observed to be incorrect for NHWC layouts
    std::cout << msg << ":\n";
    tensor.print(std::cout);
    std::cout << std::endl;
#endif
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
}
|