* initial commit Signed-off-by: raver119@gmail.com <raver119@gmail.com> * another initial commit Signed-off-by: raver119@gmail.com <raver119@gmail.com> * another initial commit Signed-off-by: raver119@gmail.com <raver119@gmail.com> * one more initial commit Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next step Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Refactored buffer() and shapeInfo() methods usage with NDArray class. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt Graph class methods to use const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt choose op to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt where op shape method to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt lstsq op to use constant empty shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt matrix_diag_part op shape routine to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt determinant ops to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt mean_pairwssqerr_loss ops to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt ops shape methods. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt shape methods for loss ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt log_loss op shape method. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt shape methods for ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt dilation2d ops shape methods. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted deconv2d ops shape methods. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted dynamicRNN op shape method. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted shape methods for ops. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted shape methods for lstm layer ops. Signed-off-by: shugeo <sgazeos@gmail.com> * few updates Signed-off-by: raver119@gmail.com <raver119@gmail.com> * first cuda tweak Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Adopt constant shapes for sconv2d ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt constant shapes for gru ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt constant shapes with shape methods for segment ops and so on. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted constant shapes with unsorted_segment_* ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted constant shapes with gamma op shape method. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted shape methods of reduce_stddev ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted shape methods for reduce_* ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt shape method for squeeze op. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt strided_slice shape method. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored concat op shape method to adopt constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted shape method for mirror_pad op. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted split op shape method. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted tile ops shape methods. Signed-off-by: shugeo <sgazeos@gmail.com> * Added const cast for mkldnn routines handles. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored logSoftMaxForVector_ routine to conform with proper data and shape pointer casts. Signed-off-by: shugeo <sgazeos@gmail.com> * Cosmetic changes to proper usage of constant pointers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored a couple shape comparators for strides and addBias helpers to proper use data pointers with inplace option. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored depthToSpace helpers. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored histogram helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored im2col helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored gather and gatherND helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage on percentile helper. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed gather shape with helpers and range buffer usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with space to depth helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage and constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with LUP decomposition> Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored onehot_ helper. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored pad and prefix to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactoed softmax helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed space to batch helpers to use buffers properly. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed stack and split helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with sparse to dense helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with mindistance_ helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with tile helper. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed constant shape usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed constant shape usage with legacy pairwise bool ops. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored a couple of methods to adopt constant shape usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed broadcasting with constant shape." Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const usage with inplace reverse and constant shapes with legacy reduction. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored legacy ops with const shapes. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored sort to adopt constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected sort for constant shape usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed constant shape usage with special methods. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored Context to conform with constant shape usage. Signed-off-by: shugeo <sgazeos@gmail.com> * CUDA broadcasting headers Signed-off-by: raver119@gmail.com <raver119@gmail.com> * pairwise/indexreduce/random headers Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Refactored native ops to adopt constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * legacy reduce3/scalar headers Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Corrected pullRow signature and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected routines to proper use of constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored tests to use constant shapes properly. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored legacy ops tests to use constant shapes properly. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored buffer usage with NDArray tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed native ops tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed special concat routine. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with test. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed buffer usage with a test. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored TAD.h and tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored calcStrides* routines to use constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed miscelaneous errors with constant shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * NativeOps const changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Corrected definitions for declared functions. 
Signed-off-by: shugeo <sgazeos@gmail.com> * NativeOps const changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * few more const changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Fixed const shapes with shape routines. Signed-off-by: shugeo <sgazeos@gmail.com> * few more const changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Fixed shape method for broadcastable case. Signed-off-by: shugeo <sgazeos@gmail.com> * few more const changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * xw_plus_b BP shape fn restored Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Fixed signatures with broadcasting. Signed-off-by: shugeo <sgazeos@gmail.com> * Repaired backprops shape methods for a set of operations. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored broadcast bool for cuda. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored methods for 3 args with const qualifier. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed a couple of kernel signatures for broadcasting. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed kernels signatures for const buffers and shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored pairwise methods to persistent buffers and shapes usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt const to buffers and shapes with kernels. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopt const to buffers and shapes with scalar kernels. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored indexreduce kernels signatures to use const buffers and shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored pairwise kernels to adopt cons shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored pairwise bool kernels to adopt cons shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored random special ops to conform with const shapes and buffers. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored native ops to conform with const shapes and buffers under cuda platform. Signed-off-by: shugeo <sgazeos@gmail.com> * Cosmetical changes only. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const shapes and buffers error. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected start pos routine. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored methods to conform with const shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored helpers to use proper methods instead. Signed-off-by: shugeo <sgazeos@gmail.com> * bunch of changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next bunch of changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * next bunch of changes Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Fixed execScalar declaration. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed execScalar declaration. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected const shape cases with sort and so on. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const shapes for sort. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored kernel declarations to adopt const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed kernels declarations to adopt const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected kernel declarations to adopt const shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed kernels declarations to adopt const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed segment helpers kernels declarations and so on to adopt const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const shape usage with segment and solve helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed kernel declaration with adjustWeight helper. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed cuda implementations for constant shape helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted const shape usage with kernels. 
Signed-off-by: shugeo <sgazeos@gmail.com> * Adopted top_k kernels to use const shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Corrected kernels declarations to adopt const shapes with helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored NDArray definitions to adopt const shapes and buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const shapes with image suppression helpers. Signed-off-by: shugeo <sgazeos@gmail.com> * Slight improvement with buffers. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored buffer usage. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored buffer usage with tests. Signed-off-by: shugeo <sgazeos@gmail.com> * Fixed const shape usage with definitions. Signed-off-by: shugeo <sgazeos@gmail.com> * minor updates on cpu side Signed-off-by: raver119@gmail.com <raver119@gmail.com> * Refactored const shape usage with ConstantDescritor and native ops with cuda platform. Signed-off-by: shugeo <sgazeos@gmail.com> * Refactored tear and tile kernels to adopt with const shapes. Signed-off-by: shugeo <sgazeos@gmail.com> * softmax_loop fix Signed-off-by: raver119 <raver119@gmail.com> * update missing signature Signed-off-by: raver119@gmail.com <raver119@gmail.com> * softmax again Signed-off-by: raver119@gmail.com <raver119@gmail.com> * few more missing consts Signed-off-by: raver119 <raver119@gmail.com> * new methods updated Signed-off-by: raver119@gmail.com <raver119@gmail.com> Co-authored-by: shugeo <sgazeos@gmail.com>
1719 lines
83 KiB
Plaintext
1719 lines
83 KiB
Plaintext
/*******************************************************************************
|
|
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
*
|
|
* This program and the accompanying materials are made available under the
|
|
* terms of the Apache License, Version 2.0 which is available at
|
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
* License for the specific language governing permissions and limitations
|
|
* under the License.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
******************************************************************************/
|
|
|
|
#include <legacy/NativeOpExecutioner.h>
|
|
#include <cuda.h>
|
|
#include <system/op_boilerplate.h>
|
|
#include <helpers/DebugHelper.h>
|
|
#include <array/DataTypeUtils.h>
|
|
#include <exceptions/datatype_exception.h>
|
|
#include <exceptions/cuda_exception.h>
|
|
#include <helpers/CudaLaunchHelper.h>
|
|
#include <helpers/ShapeBuilders.h>
|
|
#include <helpers/PointersManager.h>
|
|
|
|
#include <array/ConstantDataBuffer.h>
|
|
#include <array/ShapeDescriptor.h>
|
|
#include <helpers/ConstantShapeHelper.h>
|
|
|
|
#include <loops/transform_float.h>
|
|
#include <loops/transform_bool.h>
|
|
#include <loops/transform_any.h>
|
|
#include <loops/transform_same.h>
|
|
#include <loops/transform_strict.h>
|
|
#include <loops/reduce_float.h>
|
|
#include <loops/reduce_same.h>
|
|
#include <loops/reduce_bool.h>
|
|
#include <loops/reduce_long.h>
|
|
#include <loops/indexreduce.h>
|
|
#include <loops/pairwise_transform.h>
|
|
#include <loops/pairwise_bool.h>
|
|
#include <loops/pairwise_int.h>
|
|
#include <loops/broadcasting_bool.h>
|
|
#include <loops/broadcasting_int.h>
|
|
#include <loops/broadcasting.h>
|
|
#include <loops/reduce_float.h>
|
|
#include <loops/reduce3.h>
|
|
#include <loops/summarystatsreduce.h>
|
|
#include <loops/transform_same.h>
|
|
#include <loops/random.h>
|
|
#include <loops/special_kernels.h>
|
|
#include <loops/scalar.h>
|
|
#include <loops/scalar_bool.h>
|
|
#include <loops/scalar_int.h>
|
|
|
|
using namespace sd;
|
|
|
|
/**
 * This is utility kernel, that updates given special buffer with proper values in device memory
 *
 * Writes a hard-coded rank-2 shapeInfo descriptor of {rows, 1} into `specialPointer`
 * and zeroes/initializes the dimension scratch buffers. Only thread 0 does the work:
 * the buffer is tiny and fixed-size, so there is nothing to parallelize.
 *
 * @param dimension      device buffer; slot 0 is reset to 0
 * @param maxDimension   device buffer; slot 0 is set to 1
 * @param specialPointer device shapeInfo buffer (at least 8 Nd4jLong slots) to populate
 * @param rows           number of rows for the {rows, 1} shape being described
 * @param dataType       data type stamped into the shapeInfo via ArrayOptions::setDataType
 */
extern "C" __global__ void prepareShapeBuffer(int *dimension, int *maxDimension, Nd4jLong *specialPointer, int rows, sd::DataType dataType) {
    Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x;
    // single-writer guard: every thread except the first exits immediately
    if (tid > 0)
        return;

    dimension[0] = 0;
    maxDimension[0] = 1;

    // Manually laid-out shapeInfo. NOTE(review): field meanings below follow the usual
    // libnd4j shapeInfo layout (rank, shape..., strides..., extras, ews, order) — confirm
    // against shape.h if this buffer is consumed elsewhere.
    specialPointer[0] = 2;      // rank 2
    specialPointer[1] = rows;   // shape[0]
    specialPointer[2] = 1;      // shape[1]
    specialPointer[3] = 1;      // stride[0]
    specialPointer[4] = 1;      // stride[1]
    specialPointer[5] = 0;      // extras slot; overwritten by setDataType below
    specialPointer[6] = 1;      // element-wise stride
    specialPointer[7] = 99;     // order; 99 is ASCII 'c' (row-major) — presumably

    ArrayOptions::setDataType(specialPointer, dataType);

    //printf("special[0]: [%lld]\n", (long long) specialPointer[0]);
    //shape::printShapeInfoLinear("prepareShapeBuffer", specialPointer);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Executes a pairwise transform op (elementwise X op Y -> Z) on the device.
 *
 * Fix: the original dereferenced `lc` (lc->getCudaStream()) *before* checking it for
 * nullptr, which made both the `lc == nullptr` and `stream == nullptr` guards
 * unreachable when they would have mattered. Validation now happens first.
 *
 * @param lc          launch context providing the CUDA stream (must not be null)
 * @param opNum       pairwise op number to dispatch
 * @param hX/dX       host/device X buffer and shapeInfo
 * @param hY/dY       host/device Y buffer and shapeInfo
 * @param hZ/dZ       host/device Z buffer and shapeInfo (must share X's or Y's type)
 * @param extraParams op-specific extra parameters
 * @throws std::runtime_error on null context/stream or incompatible Z type
 * @throws cuda_exception if the kernel launch/synchronize fails
 */
void NativeOpExecutioner::execPairwiseTransform(sd::LaunchContext *lc,
                                                int opNum,
                                                void const* hX, Nd4jLong const* hXShapeInfo,
                                                void const* dX, Nd4jLong const* dXShapeInfo,
                                                void const* hY, Nd4jLong const* hYShapeInfo,
                                                void const* dY, Nd4jLong const* dYShapeInfo,
                                                void *hZ, Nd4jLong const* hZShapeInfo,
                                                void *dZ, Nd4jLong const* dZShapeInfo,
                                                void *extraParams) {

    // validate the context before touching it (was checked after dereference)
    if (lc == nullptr)
        throw std::runtime_error("NativeOpExecutioner::execPairwiseTransform: launch context cannot be nullptr !");

    auto stream = lc->getCudaStream();
    if (stream == nullptr)
        throw std::runtime_error("NativeOpExecutioner::execPairwiseTransform: CUDA stream cannot be nullptr !");

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (xType != zType && yType != zType)
        throw std::runtime_error("NativeOpExecutioner::execPairwiseTransform requires Z operand to have either X or Y type");

    dim3 launchDims(256, 1024, 8192);

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::pairwise_transforms::PairWiseTransform, ::executeCudaShaped(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams), LIBND4J_TYPES, LIBND4J_TYPES)
#else
    BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, ::executeCudaShaped(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams), LIBND4J_TYPES)
#endif

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execPairwiseTransform failed", res);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
/**
 * Runs a pairwise op producing a BOOL result (elementwise X op Y -> boolean Z).
 * X and Y must share one data type; Z must be boolean. Empty operands are a no-op.
 *
 * @throws sd::datatype_exception on type-contract violations
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execPairwiseBoolTransform( sd::LaunchContext *lc,
                                                     int opNum,
                                                     void const* hX, Nd4jLong const* hXShapeInfo,
                                                     void const* dX, Nd4jLong const* dXShapeInfo,
                                                     void const* hY, Nd4jLong const* hYShapeInfo,
                                                     void const* dY, Nd4jLong const* dYShapeInfo,
                                                     void *hZ, Nd4jLong const* hZShapeInfo,
                                                     void *dZ, Nd4jLong const* dZShapeInfo,
                                                     void *extraParams) {

    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    // contract: Z boolean, X and Y same type
    if (!DataTypeUtils::isB(typeZ))
        throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform wrong Z operand data type", sd::DataType::BOOL, typeZ);

    if (typeY != typeX)
        throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform both operands must have same data type", typeX, typeY);

    dim3 launchDims(256, 1024, 16384);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::pairwise_transforms::PairWiseBoolTransform, ::executeCudaShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams), LIBND4J_TYPES, BOOL_TYPES)

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execPairwiseBoolTransform failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Runs a pairwise op over integer operands (elementwise X op Y -> integer Z).
 * X, Y and Z must all share the same integer data type. Empty operands are a no-op.
 *
 * Fix: the "wrong Z operand data type" exception previously reported BOOL as the
 * expected type even though the check is `isZ` (integer family); it now reports an
 * integer type so the error message matches the actual contract.
 *
 * @throws sd::datatype_exception on type-contract violations
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execPairwiseIntTransform( sd::LaunchContext *lc,
                                                    int opNum,
                                                    void const* hX, Nd4jLong const* hXShapeInfo,
                                                    void const* dX, Nd4jLong const* dXShapeInfo,
                                                    void const* hY, Nd4jLong const* hYShapeInfo,
                                                    void const* dY, Nd4jLong const* dYShapeInfo,
                                                    void * hZ, Nd4jLong const* hZShapeInfo,
                                                    void * dZ, Nd4jLong const* dZShapeInfo,
                                                    void *extraParams) {

    auto stream = lc->getCudaStream();

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    // Z must belong to the integer family; report an integer as the expected type
    // (was sd::DataType::BOOL — a copy-paste from the Bool variant)
    if (!DataTypeUtils::isZ(zType))
        throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform wrong Z operand data type", sd::DataType::INT32, zType);

    if (yType != xType || zType != xType)
        throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform both operands must have same data type", xType, yType);

    dim3 launchDims(256, 1024, 16384);

    BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::executeCudaShaped(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams), INTEGER_TYPES)

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execPairwiseIntTransform failed", res);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Reduces X down to a single scalar via a summary-statistics op
 * (dispatched through SummaryStatsReduce::execSummaryStatsReduceScalar).
 *
 * @param biasCorrected whether the statistic uses the bias-corrected formula
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execSummaryStatsScalar(sd::LaunchContext *lc,
                                                 int opNum,
                                                 void const* hX, Nd4jLong const* hXShapeInfo,
                                                 void const* dX, Nd4jLong const* dXShapeInfo,
                                                 void *extraParams,
                                                 void *hZ, Nd4jLong const* hZShapeInfo,
                                                 void *dZ, Nd4jLong const* dZShapeInfo,
                                                 bool biasCorrected) {

    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    dim3 launchDims(256, 256, 32768);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::summarystats::SummaryStatsReduce, ::execSummaryStatsReduceScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, nullptr, biasCorrected, reductionBuffer), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execSummaryStatsScalar failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Broadcasts Y over X along the given dimensions with a boolean-result op,
 * using precomputed TAD shape/offset buffers for both input and output.
 *
 * @throws std::runtime_error if Z is not BOOL or X/Y types differ
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execBroadcastBool(sd::LaunchContext *lc,
                                            int opNum,
                                            void const* hX, Nd4jLong const* hXShapeInfo,
                                            void const* dX, Nd4jLong const* dXShapeInfo,
                                            void const* hY, Nd4jLong const* hYShapeInfo,
                                            void const* dY, Nd4jLong const* dYShapeInfo,
                                            void *hZ, Nd4jLong const* hZShapeInfo,
                                            void *dZ, Nd4jLong const* dZShapeInfo,
                                            void *extraParams,
                                            int *dimension, int dimensionLength,
                                            Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                            Nd4jLong const* tadOnlyShapeInfoZ, Nd4jLong const* tadOffsetsZ) {

    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands -> no work
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (!DataTypeUtils::isB(typeZ))
        throw std::runtime_error("NativeOpExecutioner::execBroadcastBool requires Z operand to have BOOL type");

    if (typeY != typeX)
        throw std::runtime_error("NativeOpExecutioner::execBroadcastBool requires both X & Y operands to have same type");

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("F3B opNum:[%i]\n", opNum);

    dim3 launchDims(256, 256, 1024);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::broadcast::BroadcastBool, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES)

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execBroadcastBool failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Flat (non-TAD) boolean broadcast: dispatches directly on the full shapes,
 * sizing the grid from the output length.
 *
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execBroadcastBool(sd::LaunchContext* lc, const int opNum,
                                            const void *hX, const Nd4jLong *hXShapeInfo,
                                            const void *dX, const Nd4jLong *dXShapeInfo,
                                            const void *hY, const Nd4jLong *hYShapeInfo,
                                            const void *dY, const Nd4jLong *dYShapeInfo,
                                            void *hZ, const Nd4jLong *hZShapeInfo,
                                            void *dZ, const Nd4jLong *dZShapeInfo,
                                            void *extraParams) {

    // empty operands -> no work
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // grid sized to cover the whole output: ceil(len / threadsPerBlock) blocks
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const auto blocksPerGrid = (shape::length(hZShapeInfo) + threadsPerBlock - 1) / threadsPerBlock;
    dim3 launchDims(blocksPerGrid, threadsPerBlock, 1024);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::broadcast::BroadcastBool, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execBroadcastBool failed", rc);
}
|
|
|
|
|
|
/**
 * Inverse boolean broadcast (X broadcast over Y's TADs) with precomputed
 * TAD shape/offset buffers for input and output.
 *
 * Fix: both validation error messages previously named execBroadcastBool
 * (copy-paste from the forward variant), which made failures misleading to
 * debug; they now name this function.
 *
 * @throws std::runtime_error if Z is not BOOL or X/Y types differ
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execInverseBroadcastBool(sd::LaunchContext *lc,
                                                   int opNum,
                                                   void const* hX, Nd4jLong const* hXShapeInfo,
                                                   void const* dX, Nd4jLong const* dXShapeInfo,
                                                   void const* hY, Nd4jLong const* hYShapeInfo,
                                                   void const* dY, Nd4jLong const* dYShapeInfo,
                                                   void* hZ, Nd4jLong const* hZShapeInfo,
                                                   void *dZ, Nd4jLong const* dZShapeInfo,
                                                   void *extraParams,
                                                   int *dimension, int dimensionLength,
                                                   Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                                   Nd4jLong const* tadOnlyShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    auto stream = lc->getCudaStream();

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (!DataTypeUtils::isB(zType))
        throw std::runtime_error("NativeOpExecutioner::execInverseBroadcastBool requires Z operand to have BOOL type");

    if (yType != xType)
        throw std::runtime_error("NativeOpExecutioner::execInverseBroadcastBool requires both X & Y operands to have same type");

    dim3 launchDims(256, 256, 1024);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverseBroadcast(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraParams, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES)

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execInverseBroadcastBool failed", res);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Broadcasts Y over X along the given dimensions with an integer-result op,
 * using precomputed TAD shape/offset buffers for both input and output.
 * X, Y and Z must all share the same integer data type.
 *
 * Fix: the post-synchronize failure message said "execBroadcastBool failed"
 * (copy-paste from the Bool variant); it now names this function.
 *
 * @throws std::runtime_error on type-contract violations
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execBroadcastInt(sd::LaunchContext *lc,
                                           int opNum,
                                           void const* hX, Nd4jLong const* hXShapeInfo,
                                           void const* dX, Nd4jLong const* dXShapeInfo,
                                           void const* hY, Nd4jLong const* hYShapeInfo,
                                           void const* dY, Nd4jLong const* dYShapeInfo,
                                           void *hZ, Nd4jLong const* hZShapeInfo,
                                           void *dZ, Nd4jLong const* dZShapeInfo,
                                           int *dimension, int dimensionLength,
                                           Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                           Nd4jLong const* tadOnlyShapeInfoZ,Nd4jLong const* tadOffsetsZ) {

    auto stream = lc->getCudaStream();

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (!DataTypeUtils::isZ(zType))
        throw std::runtime_error("NativeOpExecutioner::execBroadcastInt requires Z operand to have INT type");

    if (yType != xType || zType != xType)
        throw std::runtime_error("NativeOpExecutioner::execBroadcastInt requires both X & Y operands to have same type");

    dim3 launchDims(256, 256, 1024);

    BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execBroadcast(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES)

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execBroadcastInt failed", res);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
/**
 * Flat (non-TAD) integer broadcast: dispatches directly on the full shapes,
 * sizing the grid from the output length. X, Y and Z must share one integer type.
 *
 * Fix: the post-synchronize failure message said "execBroadcastBool failed"
 * (copy-paste from the Bool variant); it now names this function.
 *
 * @throws std::runtime_error on type-contract violations
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execBroadcastInt(sd::LaunchContext* lc, const int opNum,
                                           const void *hX, const Nd4jLong *hXShapeInfo,
                                           const void *dX, const Nd4jLong *dXShapeInfo,
                                           const void *hY, const Nd4jLong *hYShapeInfo,
                                           const void *dY, const Nd4jLong *dYShapeInfo,
                                           void *hZ, const Nd4jLong *hZShapeInfo,
                                           void *dZ, const Nd4jLong *dZShapeInfo) {

    auto stream = lc->getCudaStream();

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (!DataTypeUtils::isZ(zType))
        throw std::runtime_error("NativeOpExecutioner::execBroadcastInt requires Z operand to have INT type");

    if (yType != xType || zType != xType)
        throw std::runtime_error("NativeOpExecutioner::execBroadcastInt requires both X & Y operands to have same type");

    dim3 launchDims;

    launchDims.y = MAX_NUM_THREADS / 4;                                            // threadsPerBlock
    launchDims.x = (shape::length(hZShapeInfo) + launchDims.y - 1) / launchDims.y; // blocksPerGrid
    launchDims.z = 1024;                                                           // shared memory

    BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execBroadcast(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo), INTEGER_TYPES)

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execBroadcastInt failed", res);
}
|
|
|
|
/**
 * Inverse integer broadcast (X broadcast over Y's TADs) with precomputed
 * TAD shape/offset buffers for input and output. X, Y and Z must share one
 * integer data type.
 *
 * Fix: both validation error messages previously named execBroadcastInt
 * (copy-paste from the forward variant), which made failures misleading to
 * debug; they now name this function.
 *
 * @throws std::runtime_error on type-contract violations
 * @throws cuda_exception if the launch/synchronize fails
 */
void NativeOpExecutioner::execInverseBroadcastInt(sd::LaunchContext *lc,
                                                  int opNum,
                                                  void const* hX, Nd4jLong const* hXShapeInfo,
                                                  void const* dX, Nd4jLong const* dXShapeInfo,
                                                  void const* hY, Nd4jLong const* hYShapeInfo,
                                                  void const* dY, Nd4jLong const* dYShapeInfo,
                                                  void *hZ, Nd4jLong const* hZShapeInfo,
                                                  void *dZ, Nd4jLong const* dZShapeInfo,
                                                  int *dimension, int dimensionLength,
                                                  Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                                  Nd4jLong const* tadOnlyShapeInfoZ,Nd4jLong const* tadOffsetsZ) {
    auto stream = lc->getCudaStream();

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty operands: nothing to compute
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    if (!DataTypeUtils::isZ(zType))
        throw std::runtime_error("NativeOpExecutioner::execInverseBroadcastInt requires Z operand to have INT type");

    if (yType != xType || zType != xType)
        throw std::runtime_error("NativeOpExecutioner::execInverseBroadcastInt requires both X & Y operands to have same type");

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("F3BI opNum:[%i]\n", opNum);

    dim3 launchDims(256, 256, 1024);

    BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execInverseBroadcast(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES)

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execInverseBroadcastInt failed", res);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
*
|
|
* @param opNum
|
|
* @param dX
|
|
* @param dXShapeInfo
|
|
* @param dY
|
|
* @param dYShapeInfo
|
|
* @param dZ
|
|
* @param dZShapeInfo
|
|
* @param dimension
|
|
* @param dimensionLength
|
|
*/
|
|
////////////////////////////////////////////////////////////////////////
// Dispatches the TAD-based broadcast kernel for arbitrary type combinations,
// then synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execBroadcast(sd::LaunchContext *lc,
                                        int opNum,
                                        void const* hX, Nd4jLong const* hXShapeInfo,
                                        void const* dX, Nd4jLong const* dXShapeInfo,
                                        void const* hY, Nd4jLong const* hYShapeInfo,
                                        void const* dY, Nd4jLong const* dYShapeInfo,
                                        void *hZ, Nd4jLong const* hZShapeInfo,
                                        void *dZ, Nd4jLong const* dZShapeInfo,
                                        int *dimension, int dimensionLength,
                                        Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                        Nd4jLong const* tadOnlyShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty inputs produce an empty result - nothing to launch
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    dim3 launchDims(256, 256, 1024);

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(typeX, typeY, typeZ, functions::broadcast::Broadcast, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES);
#else
    BUILD_SINGLE_SELECTOR_THRICE(typeX, functions::broadcast::Broadcast, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES);
#endif

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execBroadcast failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Flat (non-TAD) broadcast dispatch; grid size is derived from Z length,
// then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execBroadcast(sd::LaunchContext *lc, const int opNum,
                                        const void *hX, const Nd4jLong *hXShapeInfo,
                                        const void *dX, const Nd4jLong *dXShapeInfo,
                                        const void *hY, const Nd4jLong *hYShapeInfo,
                                        const void *dY, const Nd4jLong *dYShapeInfo,
                                        void *hZ, const Nd4jLong *hZShapeInfo,
                                        void *dZ, const Nd4jLong *dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty inputs produce an empty result - nothing to launch
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const auto blocksPerGrid = (shape::length(hZShapeInfo) + threadsPerBlock - 1) / threadsPerBlock;
    dim3 launchDims(blocksPerGrid, threadsPerBlock, 1024); // x = blocks, y = threads, z = shared memory

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(typeX, typeY, typeZ, functions::broadcast::Broadcast, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo), LIBND4J_TYPES, LIBND4J_TYPES);
#else
    BUILD_SINGLE_SELECTOR_THRICE(typeX, functions::broadcast::Broadcast, ::execBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo), LIBND4J_TYPES);
#endif

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execBroadcast failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
// Dispatches the inverse broadcast kernel (Y broadcast against X TADs) for
// arbitrary type combinations, then synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execInverseBroadcast(sd::LaunchContext *lc,
                                               int opNum,
                                               void const* hX, Nd4jLong const* hXShapeInfo,
                                               void const* dX, Nd4jLong const* dXShapeInfo,
                                               void const* hY, Nd4jLong const* hYShapeInfo,
                                               void const* dY, Nd4jLong const* dYShapeInfo,
                                               void *hZ, Nd4jLong const* hZShapeInfo,
                                               void *dZ, Nd4jLong const* dZShapeInfo,
                                               int *dimension, int dimensionLength,
                                               Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                                               Nd4jLong const* tadOnlyShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    auto cudaStream = lc->getCudaStream();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // empty inputs produce an empty result - nothing to launch
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo))
        return;

    dim3 launchDims(256, 256, 1024);

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(typeX, typeY, typeZ, functions::broadcast::Broadcast, ::execInverseBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES);
#else
    BUILD_SINGLE_SELECTOR_THRICE(typeX, functions::broadcast::Broadcast, ::execInverseBroadcast(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES);
#endif

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execInverseBroadcast failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional same-type reduction (Z type must equal X type); one block per
// output element, then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc,
                                         int opNum,
                                         void const* hX, Nd4jLong const* hXShapeInfo,
                                         void const* dX, Nd4jLong const* dXShapeInfo,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
                                         int *dimension, int dimensionLength,
                                         Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("SF7 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);
    const auto rankX = shape::rank(hXShapeInfo);

    if (typeX != typeZ)
        throw datatype_exception::build("NativeOpExecutioner::execReduceSame requires both X & Z operands to have same type", typeX, typeZ);

    const auto zLen = shape::length(hZShapeInfo);
    dim3 launchDims(zLen > 0 ? zLen : 1, 256, 8192); // at least one block even for scalar output

    BUILD_SINGLE_SELECTOR(typeX, functions::reduce::ReduceSameFunction, ::execReduceXD(launchDims, cudaStream, opNum, rankX, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadShapeInfo, tadOffsets), LIBND4J_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceSame failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional reduction producing an INT64 result; one block per output
// element, then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc,
                                         int opNum,
                                         void const* hX, Nd4jLong const* hXShapeInfo,
                                         void const* dX, Nd4jLong const* dXShapeInfo,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
                                         int *dimension, int dimensionLength,
                                         Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("LF7 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (typeZ != sd::DataType::INT64)
        throw datatype_exception::build("NativeOpExecutioner::execReduceLong wrong Z data type", sd::DataType::INT64, typeZ);

    const auto rankX = shape::rank(hXShapeInfo);
    const auto zLen = shape::length(hZShapeInfo);
    dim3 launchDims(zLen > 0 ? zLen : 1, 256, 32768); // at least one block even for scalar output

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceLongFunction, ::execReduceXD(launchDims, cudaStream, opNum, rankX, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceLong failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional reduction producing a BOOL result; one block per output
// element, then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc,
                                         int opNum,
                                         void const* hX, Nd4jLong const* hXShapeInfo,
                                         void const* dX, Nd4jLong const* dXShapeInfo,
                                         void *extraParams,
                                         void *hZ, Nd4jLong const* hZShapeInfo,
                                         void *dZ, Nd4jLong const* dZShapeInfo,
                                         int *dimension, int dimensionLength,
                                         Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("BF7 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (typeZ != sd::DataType::BOOL)
        throw std::runtime_error("NativeOpExecutioner::execReduceBool requires Z operand to have BOOL type");

    const auto rankX = shape::rank(hXShapeInfo);
    const auto zLen = shape::length(hZShapeInfo);
    dim3 launchDims(zLen > 0 ? zLen : 1, 256, 32768); // at least one block even for scalar output

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceBoolFunction, ::execReduceXD(launchDims, cudaStream, opNum, rankX, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceBool failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
*
|
|
* @param opNum
|
|
* @param dX
|
|
* @param dXShapeInfo
|
|
* @param extraParams
|
|
* @param dZ
|
|
* @param dZShapeInfo
|
|
* @param dimension
|
|
* @param dimensionLength
|
|
*/
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional index-reduction (e.g. argmax-style ops); Z must be INT32/INT64.
// One block per output element; synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc,
                                          int opNum,
                                          void const* hX, Nd4jLong const* hXShapeInfo,
                                          void const* dX, Nd4jLong const* dXShapeInfo,
                                          void *extraParams,
                                          void *hZ, Nd4jLong const* hZShapeInfo,
                                          void *dZ, Nd4jLong const* dZShapeInfo,
                                          int *dimension, int dimensionLength,
                                          Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();
    auto allocationBuffer = lc->getAllocationPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("F2 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    const auto zLen = shape::length(hZShapeInfo);
    dim3 launchDims(zLen > 0 ? zLen : 1, 256, 32768); // at least one block even for scalar output

    if (typeZ != sd::DataType::INT64 && typeZ != sd::DataType::INT32)
        throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", typeZ);

    // kernel signature takes a typed output pointer; the templated kernel reinterprets it
    auto dz = reinterpret_cast<Nd4jLong*>(dZ);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::indexreduce::IndexReduce, ::executeIndexReduce(launchDims, cudaStream, opNum, dX, dXShapeInfo, shape::rank(hXShapeInfo), extraParams, dz, dZShapeInfo, shape::rank(hZShapeInfo), dimension, dimensionLength, 1, allocationBuffer, reductionBuffer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, INDEXING_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execIndexReduce failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
*
|
|
* @param opNum
|
|
* @param dX
|
|
* @param dXShapeInfo
|
|
* @param extraParams
|
|
* @param dZ
|
|
* @param dZShapeInfo
|
|
*/
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional reduction producing a floating-point result; one block per
// output element, then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc,
                                          int opNum,
                                          void const* hX, Nd4jLong const* hXShapeInfo,
                                          void const* dX, Nd4jLong const* dXShapeInfo,
                                          void *extraParams,
                                          void *hZ, Nd4jLong const* hZShapeInfo,
                                          void *dZ, Nd4jLong const* dZShapeInfo,
                                          int *dimension, int dimensionLength,
                                          Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("F8 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    const auto rankX = shape::rank(hXShapeInfo);
    const auto zLen = shape::length(hZShapeInfo);
    dim3 launchDims(zLen > 0 ? zLen : 1, 256, 32768); // at least one block even for scalar output

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceFloatFunction, ::execReduceXD(launchDims, cudaStream, opNum, rankX, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceFloat failed", rc);
}
|
|
|
|
|
|
/**
|
|
*
|
|
* @param opNum
|
|
* @param dX
|
|
* @param dXShapeInfo
|
|
* @param extraParams
|
|
*/
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Scalar index-reduction over the whole of X; Z must be INT32/INT64.
// Grid size is derived from X length; synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execIndexReduceScalar(sd::LaunchContext *lc,
                                                int opNum,
                                                void const* hX, Nd4jLong const* hXShapeInfo,
                                                void const* dX, Nd4jLong const* dXShapeInfo,
                                                void *extraParams,
                                                void *hZ, Nd4jLong const* hZShapeInfo,
                                                void *dZ, Nd4jLong const* dZShapeInfo) {
    if (sd::Environment::getInstance()->isDebug())
        printf("F1 opNum:[%i]\n", opNum);

    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();
    auto allocationBuffer = lc->getAllocationPointer();

    const auto xLen = shape::length(hXShapeInfo);
    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(xLen, blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    if (sd::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1)
        printf("AF1 opNum:[%i]\n", opNum);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    // FIXME: we want Z to be one of integer types
    //if (!DataTypeUtils::isZ(zType))
    //    throw sd::datatype_exception("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have one of integer types")
    if (typeZ != sd::DataType::INT64 && typeZ != sd::DataType::INT32)
        throw sd::datatype_exception::build("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have INT32/INT64 data type", typeZ);

    // kernel signature takes a typed output pointer; the templated kernel reinterprets it
    auto dz = reinterpret_cast<Nd4jLong*>(dZ);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::indexreduce::IndexReduce,
                          ::executeIndexReduceScalar(launchDims, cudaStream,
                                                     opNum,
                                                     dX, dXShapeInfo, shape::rank(hXShapeInfo),
                                                     extraParams,
                                                     dz, dZShapeInfo, 0,
                                                     nullptr, 0,
                                                     1,
                                                     allocationBuffer, reductionBuffer,
                                                     nullptr, nullptr), LIBND4J_TYPES, INDEXING_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execIndexReduceScalar failed", rc);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Whole-array float reduction to a scalar; grid size derived from X length,
// then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceFloatScalar(sd::LaunchContext *lc,
                                                int opNum,
                                                void const* hX, Nd4jLong const* hXShapeInfo,
                                                void const* dX, Nd4jLong const* dXShapeInfo,
                                                void *extraParams,
                                                void *hZ, Nd4jLong const* hZShapeInfo,
                                                void *dZ, Nd4jLong const* dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    const auto xLen = shape::length(hXShapeInfo);
    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(xLen, blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceFloatFunction, ::execReduceScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionBuffer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceFloatScalar failed", rc);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Whole-array reduction to a BOOL scalar; grid size derived from X length,
// then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceBoolScalar(sd::LaunchContext *lc,
                                               int opNum,
                                               void const* hX, Nd4jLong const* hXShapeInfo,
                                               void const* dX, Nd4jLong const* dXShapeInfo,
                                               void *extraParams,
                                               void *hZ, Nd4jLong const* hZShapeInfo,
                                               void *dZ, Nd4jLong const* dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (typeZ != sd::DataType::BOOL)
        throw std::runtime_error("NativeOpExecutioner::execReduceBoolScalar requires Z operand to have BOOL type");

    const auto xLen = shape::length(hXShapeInfo);
    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(xLen, blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceBoolFunction, ::execReduceScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionBuffer, nullptr), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceBoolScalar failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Whole-array same-type reduction to a scalar (Z type must equal X type);
// grid size derived from X length; synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execReduceSameScalar(sd::LaunchContext *lc,
                                               int opNum,
                                               void const* hX, Nd4jLong const* hXShapeInfo,
                                               void const* dX, Nd4jLong const* dXShapeInfo,
                                               void *extraParams,
                                               void *hZ, Nd4jLong const* hZShapeInfo,
                                               void *dZ, Nd4jLong const* dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (typeX != typeZ)
        throw datatype_exception::build("NativeOpExecutioner::execReduceSameScalar requires both X & Z operands to have same type", typeX, typeZ);

    const auto xLen = shape::length(hXShapeInfo);
    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(xLen, blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    BUILD_SINGLE_SELECTOR(typeX, functions::reduce::ReduceSameFunction, ::execReduceScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionBuffer, nullptr), LIBND4J_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceSameScalar failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Whole-array reduction to an INT64 scalar; grid size derived from X length,
// then the stream is synchronized (temporary, see TODO).
void NativeOpExecutioner::execReduceLongScalar(sd::LaunchContext *lc,
                                               int opNum,
                                               void const* hX, Nd4jLong const* hXShapeInfo,
                                               void const* dX, Nd4jLong const* dXShapeInfo,
                                               void *extraParams,
                                               void *hZ, Nd4jLong const* hZShapeInfo,
                                               void *dZ, Nd4jLong const* dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (typeZ != sd::DataType::INT64)
        throw datatype_exception::build("NativeOpExecutioner::execReduceLongScalar wrong Z data type", sd::DataType::INT64, typeZ);

    const auto xLen = shape::length(hXShapeInfo);
    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(xLen, blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce::ReduceLongFunction, ::execReduceScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, 0, reductionBuffer, nullptr), LIBND4J_TYPES, LONG_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduceLongScalar failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Element-wise transform where Z type must equal X type; the TAD arguments
// are unused here (nullptr is passed to the kernel). Synchronizes the stream
// (temporary, see TODO).
void NativeOpExecutioner::execTransformSame(sd::LaunchContext *lc,
                                            int opNum,
                                            void const* hX, Nd4jLong const* hXShapeInfo,
                                            void const* dX, Nd4jLong const* dXShapeInfo,
                                            void *hZ, Nd4jLong const* hZShapeInfo,
                                            void *dZ, Nd4jLong const* dZShapeInfo,
                                            void *extraParams,
                                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();

    const auto rankX = shape::rank(hXShapeInfo);
    const auto rankZ = shape::rank(hZShapeInfo);
    const auto typeX = ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = ArrayOptions::dataType(hZShapeInfo);

    // empty input -> empty output, nothing to launch
    if (shape::isEmpty(hXShapeInfo))
        return;

    if (typeX != typeZ)
        throw std::runtime_error("NativeOpExecutioner::execTransformSame requires X & Z to have same type");

    dim3 launchDims(512, 512, 16384);
    BUILD_SINGLE_SELECTOR(typeX, functions::transform::TransformSame, ::executeTransformShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, rankX, extraParams, dZ, dZShapeInfo, rankZ, nullptr, nullptr, nullptr, nullptr), LIBND4J_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execTransformSame failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Element-wise transform producing a boolean result; the TAD arguments are
// unused here (nullptr is passed to the kernel). Synchronizes the stream
// (temporary, see TODO).
void NativeOpExecutioner::execTransformBool(sd::LaunchContext *lc,
                                            int opNum,
                                            void const* hX, Nd4jLong const* hXShapeInfo,
                                            void const* dX, Nd4jLong const* dXShapeInfo,
                                            void *hZ, Nd4jLong const* hZShapeInfo,
                                            void *dZ, Nd4jLong const* dZShapeInfo,
                                            void *extraParams,
                                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();

    const auto rankX = shape::rank(hXShapeInfo);
    const auto rankZ = shape::rank(hZShapeInfo);
    const auto typeX = ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = ArrayOptions::dataType(hZShapeInfo);

    // empty input -> empty output, nothing to launch
    if (shape::isEmpty(hXShapeInfo))
        return;

    if (!DataTypeUtils::isB(typeZ))
        throw std::runtime_error("NativeOpExecutioner::execTransformBool requires Z to have same boolean type");

    dim3 launchDims(512, 512, 16384);
    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::transform::TransformBool, ::executeTransformShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, rankX, extraParams, dZ, dZShapeInfo, rankZ, nullptr, nullptr, nullptr, nullptr), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execTransformBool failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Element-wise transform for any X/Z type combination. A same-type, same-order,
// contiguous ('c', ews==1) Assign is short-circuited into a device-to-device
// memcpy instead of a kernel launch. Synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execTransformAny(sd::LaunchContext *lc,
                                           int opNum,
                                           void const* hX, Nd4jLong const* hXShapeInfo,
                                           void const* dX, Nd4jLong const* dXShapeInfo,
                                           void *hZ, Nd4jLong const* hZShapeInfo,
                                           void *dZ, Nd4jLong const* dZShapeInfo,
                                           void *extraParams,
                                           Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets, bool allowParallelism) {
    auto cudaStream = lc->getCudaStream();

    const auto rankX = shape::rank(hXShapeInfo);
    const auto rankZ = shape::rank(hZShapeInfo);
    const auto typeX = ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = ArrayOptions::dataType(hZShapeInfo);

    // empty input -> empty output, nothing to launch
    if (shape::isEmpty(hXShapeInfo))
        return;

    const bool plainCopy = opNum == sd::transform::Assign
                        && shape::order(hXShapeInfo) == shape::order(hZShapeInfo)
                        && shape::order(hXShapeInfo) == 'c'
                        && typeX == typeZ
                        && shape::elementWiseStride(hXShapeInfo) == 1
                        && shape::elementWiseStride(hZShapeInfo) == 1;

    if (plainCopy) {
        // contiguous assign: a raw async copy is sufficient
        cudaMemcpyAsync(dZ, dX, shape::length(hXShapeInfo) * sd::DataTypeUtils::sizeOfElement(typeX), cudaMemcpyDeviceToDevice, *cudaStream);
    } else {
        dim3 launchDims(512, 512, 2048);
        BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::transform::TransformAny, ::executeTransformShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, rankX, extraParams, dZ, dZShapeInfo, rankZ, nullptr, nullptr, nullptr, nullptr), LIBND4J_TYPES, LIBND4J_TYPES);
    }

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execTransformAny failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Element-wise transform requiring X and Z to share one floating-point type;
// the TAD arguments are unused here (nullptr is passed to the kernel).
// Synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execTransformStrict(sd::LaunchContext *lc,
                                              int opNum,
                                              void const* hX, Nd4jLong const* hXShapeInfo,
                                              void const* dX, Nd4jLong const* dXShapeInfo,
                                              void *hZ, Nd4jLong const* hZShapeInfo,
                                              void *dZ, Nd4jLong const* dZShapeInfo,
                                              void *extraParams,
                                              Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();

    const auto rankX = shape::rank(hXShapeInfo);
    const auto rankZ = shape::rank(hZShapeInfo);
    const auto typeX = ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = ArrayOptions::dataType(hZShapeInfo);

    // empty input -> empty output, nothing to launch
    if (shape::isEmpty(hXShapeInfo))
        return;

    if (typeX != typeZ || !DataTypeUtils::isR(typeX))
        throw datatype_exception::build("NativeOpExecutioner::execTransformStrict requires X & Z to have same floating point type", typeX, typeZ);

    dim3 launchDims(512, 512, 16384);
    BUILD_SINGLE_SELECTOR(typeX, functions::transform::TransformStrict, ::executeTransformShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, rankX, extraParams, dZ, dZShapeInfo, rankZ, nullptr, nullptr, nullptr, nullptr), FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execTransformStrict failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Element-wise transform producing a floating-point result; the TAD arguments
// are unused here (nullptr is passed to the kernel). Synchronizes the stream
// (temporary, see TODO).
void NativeOpExecutioner::execTransformFloat(sd::LaunchContext *lc,
                                             int opNum,
                                             void const* hX, Nd4jLong const* hXShapeInfo,
                                             void const* dX, Nd4jLong const* dXShapeInfo,
                                             void *hZ, Nd4jLong const* hZShapeInfo,
                                             void *dZ, Nd4jLong const* dZShapeInfo,
                                             void *extraParams,
                                             Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer(); // fetched but not passed on; kept for parity with the original

    const auto rankX = shape::rank(hXShapeInfo);
    const auto rankZ = shape::rank(hZShapeInfo);
    const auto typeX = ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = ArrayOptions::dataType(hZShapeInfo);

    // empty input -> empty output, nothing to launch
    if (shape::isEmpty(hXShapeInfo))
        return;

    if (!DataTypeUtils::isR(typeZ))
        throw datatype_exception::build("NativeOpExecutioner::execTransformFloat requires Z to have floating point type", typeZ);

    dim3 launchDims(512, 512, 2048);
    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::transform::TransformFloat, ::executeTransformShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, rankX, extraParams, dZ, dZShapeInfo, rankZ, nullptr, nullptr, nullptr, nullptr), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execTransformFloat failed", rc);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Whole-array summary-statistics reduction (scalar result); Z must be a
// floating-point type. Synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
                                           int opNum,
                                           void const* hX, Nd4jLong const* hXShapeInfo,
                                           void const* dX, Nd4jLong const* dXShapeInfo,
                                           void *extraParams,
                                           void *hZ, Nd4jLong const* hZShapeInfo,
                                           void *dZ, Nd4jLong const* dZShapeInfo,
                                           bool biasCorrected) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    dim3 launchDims(256, 256, 32768);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (!DataTypeUtils::isR(typeZ))
        throw sd::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", typeZ);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::summarystats::SummaryStatsReduce, ::execSummaryStatsReduce(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, nullptr, biasCorrected, reductionBuffer), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execSummaryStats A failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Dimensional summary-statistics reduction along the given dimensions; Z must
// be a floating-point type. Synchronizes the stream (temporary, see TODO).
void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc,
                                           int opNum,
                                           void const* hX, Nd4jLong const* hXShapeInfo,
                                           void const* dX, Nd4jLong const* dXShapeInfo,
                                           void *extraParams,
                                           void *hZ, Nd4jLong const* hZShapeInfo,
                                           void *dZ, Nd4jLong const* dZShapeInfo,
                                           int *dimension, int dimensionLength,
                                           Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets,
                                           bool biasCorrected) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();

    dim3 launchDims(256, 256, 32768);

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    if (!DataTypeUtils::isR(typeZ))
        throw sd::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", typeZ);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::summarystats::SummaryStatsReduce, ::execSummaryStatsReduce(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, biasCorrected, reductionBuffer), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execSummaryStats B failed", rc);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////
// Pairwise (X vs Y) reduction to a scalar; Y must match X's type, Z must be
// a floating-point type. Grid size is derived from X length; synchronizes the
// stream (temporary, see TODO).
void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,
                                      int opNum,
                                      void const* hX, Nd4jLong const* hXShapeInfo,
                                      void const* dX, Nd4jLong const* dXShapeInfo,
                                      void *extraParams,
                                      void const* hY, Nd4jLong const* hYShapeInfo,
                                      void const* dY, Nd4jLong const* dYShapeInfo,
                                      void *hZ, Nd4jLong const* hZShapeInfo,
                                      void *dZ, Nd4jLong const* dZShapeInfo) {
    auto cudaStream = lc->getCudaStream();
    auto reductionBuffer = lc->getReductionPointer();
    auto allocationBuffer = lc->getAllocationPointer();

    const auto typeX = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto typeY = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto typeZ = sd::ArrayOptions::dataType(hZShapeInfo);

    const auto blockWidth = 256;
    const auto blockCount = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth);
    dim3 launchDims(blockCount > 0 ? blockCount : 1, blockWidth, 32768);

    if (typeX != typeY)
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", typeX, typeY);

    if (!DataTypeUtils::isR(typeZ))
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", typeZ);

    BUILD_DOUBLE_SELECTOR(typeX, typeZ, functions::reduce3::Reduce3, ::execScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, allocationBuffer, reductionBuffer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    const auto rc = cudaStreamSynchronize(*cudaStream);
    if (rc != 0)
        throw cuda_exception::build("execReduce3 failed", rc);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void const* hY, Nd4jLong const* hYShapeInfo,
                            void const* dY, Nd4jLong const* dYShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* tadOnlyShapeInfo, Nd4jLong const* tadOffsets,
                            Nd4jLong const* yTadOnlyShapeInfo, Nd4jLong const* yTadOffsets) {
    // Dimensional reduce3: reduces X against Y along the given dimensions using
    // precomputed TAD packs. A scalar Z degenerates to the whole-array overload.
    if (shape::isScalar(hZShapeInfo)) {
        NativeOpExecutioner::execReduce3(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParams, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo);
        return;
    }

    auto cudaStream = lc->getCudaStream();
    auto allocationBuffer = lc->getAllocationPointer();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    if (xType != yType)
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType);

    if (!DataTypeUtils::isR(zType))
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", zType);

    // one block per output element; clamp to 1 so the grid is never empty
    auto gridSize = shape::length(hZShapeInfo);
    dim3 launchDims(gridSize == 0 ? 1 : gridSize, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, cudaStream, opNum,
                                                                            dX, dXShapeInfo,
                                                                            dY, dYShapeInfo,
                                                                            extraParams,
                                                                            dZ, dZShapeInfo,
                                                                            dimension, dimensionLength,
                                                                            1,
                                                                            allocationBuffer,
                                                                            tadOnlyShapeInfo, tadOffsets,
                                                                            yTadOnlyShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execReduce3 B failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execReduce3Scalar(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void const* hY, Nd4jLong const* hYShapeInfo,
                            void const* dY, Nd4jLong const* dYShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo) {
    // Scalar-output reduce3 over X and Y. X and Y must share a data type;
    // Z must be floating point. Dispatches Reduce3::execScalar.
    auto cudaStream = lc->getCudaStream();
    auto allocationBuffer = lc->getAllocationPointer();
    auto reductionBuffer = lc->getReductionPointer();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    if (xType != yType)
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType);

    if (!DataTypeUtils::isR(zType))
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Z operand to have floating point data type", zType);

    // grid size derived from X length; never launch with a zero-sized grid
    const auto threadsPerBlock = 256;
    auto gridSize = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), threadsPerBlock);
    dim3 launchDims(gridSize == 0 ? 1 : gridSize, threadsPerBlock, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execScalar(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, allocationBuffer, reductionBuffer, nullptr), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execReduce3Scalar failed", syncResult);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalar, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalar, Nd4jLong const* dScalarShapeInfo,
                            void *extraParams, bool allowParallelism) {
    // Element-wise boolean scalar transform: X combined with a single scalar into Z.
    // X and the scalar must share a data type; Z must be BOOL.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    if (xType != yType)
        throw std::runtime_error("NativeOpExecutioner::execScalarBool requires X & Y to have same type");

    if (!DataTypeUtils::isB(zType))
        throw std::runtime_error("NativeOpExecutioner::execScalarBool requires Z operand to have BOOL type");

    dim3 launchDims(256, 512, 8192);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::executeCudaShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalar, extraParams), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalarBool failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalars, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalars, Nd4jLong const* dScalarShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets,
                            Nd4jLong const* tadShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    // Boolean scalar transform applied along dimensions: one scalar per TAD of X.
    // X and the scalars must share a data type; Z must be BOOL.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    if (xType != yType)
        throw std::runtime_error("NativeOpExecutioner::execScalarBool requires X & Y to have same type");

    if (!DataTypeUtils::isB(zType))
        throw std::runtime_error("NativeOpExecutioner::execScalarBool requires Z operand to have BOOL type");

    dim3 launchDims(256, 512, 8192);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::executeCudaAlongDimension(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalarBool B failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalar, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalar, Nd4jLong const* dScalarShapeInfo,
                            void *extraParams, bool allowParallelism) {
    // Element-wise integer scalar transform: X combined with a single scalar into Z.
    // X, the scalar and Z must all share one integer data type.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    if (xType != yType || zType != xType)
        throw std::runtime_error("NativeOpExecutioner::execScalarInt requires X & Y to have same type");

    if (!DataTypeUtils::isZ(zType))
        throw std::runtime_error("NativeOpExecutioner::execScalarInt requires Z operand to have INT type");

    dim3 launchDims(256, 512, 8192);

    BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::executeCudaShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalar, extraParams), INTEGER_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalarInt failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalars, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalars, Nd4jLong const* dScalarShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets,
                            Nd4jLong const* tadShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    // Integer scalar transform applied along dimensions: one scalar per TAD of X.
    // X, the scalars and Z must all share one integer data type.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    if (xType != yType || zType != xType)
        throw std::runtime_error("NativeOpExecutioner::execScalarInt requires X & Y to have same type");

    if (!DataTypeUtils::isZ(zType))
        throw std::runtime_error("NativeOpExecutioner::execScalarInt requires Z operand to have INT type");

    dim3 launchDims(256, 512, 8192);

    BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::executeCudaAlongDimension(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), INTEGER_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalarInt B failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalar(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void* hZ, Nd4jLong const* hZShapeInfo,
                            void* dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalar, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalar, Nd4jLong const* dScalarShapeInfo,
                            void *extraParams, bool allowParallelism) {
    // Generic element-wise scalar transform: X combined with a single scalar into Z.
    // Dispatch is pairwise on (x, y, z) types in experimental builds, otherwise on
    // the X type alone.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    dim3 launchDims(256, 512, 8192);

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::executeCudaShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, dZ, dZShapeInfo, hZShapeInfo, dScalar, extraParams), LIBND4J_TYPES, LIBND4J_TYPES);
#else
    BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::executeCudaShaped(launchDims, cudaStream, opNum, dX, dXShapeInfo, hXShapeInfo, dZ, dZShapeInfo, hZShapeInfo, dScalar, extraParams), LIBND4J_TYPES);
#endif

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalar failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execScalar(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void const* hScalars, Nd4jLong const* hScalarShapeInfo,
                            void const* dScalars, Nd4jLong const* dScalarShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets,
                            Nd4jLong const* tadShapeInfoZ, Nd4jLong const* tadOffsetsZ) {
    // Generic scalar transform applied along dimensions: one scalar per TAD of X.
    // Dispatch is pairwise on (x, y, z) types in experimental builds, otherwise on
    // the X type alone.
    auto cudaStream = lc->getCudaStream();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    // nothing to do for empty operands
    if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo))
        return;

    dim3 launchDims(256, 256, 16384);

#ifdef __ND4J_EXPERIMENTAL__
    BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES);
#else
    BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, cudaStream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES);
#endif

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execScalar B failed", syncResult);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execRandom(sd::LaunchContext *lc,
                            int opNum,
                            Nd4jPointer stateHost,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void *extraArguments) {
    // Single-operand random op: fills Z using the host-side RandomGenerator state.
    // The state is copied to a temporary device buffer for the kernel, and the host
    // generator is rewound afterwards so subsequent calls advance the sequence.
    auto stream = lc->getCudaStream();
    auto sizeOf = sizeof(sd::graph::RandomGenerator);
    Nd4jPointer stateDevice;

    cudaError_t res = cudaMalloc(reinterpret_cast<void **>(&stateDevice), sizeOf);
    // FIX: the allocation result was previously ignored; a failed cudaMalloc would
    // lead to a memcpy/kernel launch on an invalid pointer
    if (res != 0)
        throw cuda_exception::build("execRandom: cudaMalloc failed", res);

    checkCudaErrors(cudaStreamSynchronize(*stream));
    checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream));

    dim3 launchDims = dim3(512, 512, 32768);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    auto rng = reinterpret_cast<sd::graph::RandomGenerator*>(stateHost);

    BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::executeCudaSingle(launchDims, stream, opNum, stateDevice, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES);

    res = cudaStreamSynchronize(*stream);
    if (res != 0) {
        // FIX: release the device-side state before throwing, to avoid a leak
        cudaFree(stateDevice);
        throw cuda_exception::build("execRandom X failed", res);
    }

    cudaFree(stateDevice);

    // advance the host RNG past the values consumed by this call
    rng->rewindH(shape::length(hZShapeInfo));
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execRandom(sd::LaunchContext *lc,
                            int opNum,
                            Nd4jPointer stateHost,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void *extraArguments) {
    // Two-operand random op: produces Z from X using the host-side RandomGenerator
    // state. The state is copied to a temporary device buffer for the kernel, and
    // the host generator is rewound afterwards.
    auto stream = lc->getCudaStream();

    auto sizeOf = sizeof(sd::graph::RandomGenerator);
    Nd4jPointer stateDevice;

    cudaError_t res = cudaMalloc(reinterpret_cast<void **>(&stateDevice), sizeOf);
    // FIX: the allocation result was previously ignored; a failed cudaMalloc would
    // lead to a memcpy/kernel launch on an invalid pointer
    if (res != 0)
        throw cuda_exception::build("execRandom: cudaMalloc failed", res);

    checkCudaErrors(cudaStreamSynchronize(*stream));
    checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream));

    auto rng = reinterpret_cast<sd::graph::RandomGenerator*>(stateHost);

    dim3 launchDims = dim3(512, 512, 32768);
    // NOTE(review): dispatch is done on the OUTPUT (Z) data type — the original
    // code read hZShapeInfo here but named the variable xType; assumes X and Z
    // types agree for random ops — TODO confirm
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::executeCudaDouble(launchDims, stream, opNum, stateDevice, dX, dXShapeInfo, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES);

    res = cudaStreamSynchronize(*stream);
    if (res != 0) {
        // FIX: release the device-side state before throwing, to avoid a leak
        cudaFree(stateDevice);
        throw cuda_exception::build("execRandom XY failed", res);
    }

    cudaFree(stateDevice);

    // advance the host RNG past the values consumed by this call
    rng->rewindH(shape::length(hZShapeInfo));
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execRandom(sd::LaunchContext *lc,
                            int opNum,
                            Nd4jPointer stateHost,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void const* hY, Nd4jLong const* hYShapeInfo,
                            void const* dY, Nd4jLong const* dYShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            void *extraArguments) {
    // Three-operand random op: produces Z from X and Y using the host-side
    // RandomGenerator state. The state is copied to a temporary device buffer for
    // the kernel, and the host generator is rewound afterwards.
    auto stream = lc->getCudaStream();
    auto sizeOf = sizeof(sd::graph::RandomGenerator);
    Nd4jPointer stateDevice;

    cudaError_t res = cudaMalloc(reinterpret_cast<void **>(&stateDevice), sizeOf);
    // FIX: the allocation result was previously ignored; a failed cudaMalloc would
    // lead to a memcpy/kernel launch on an invalid pointer
    if (res != 0)
        throw cuda_exception::build("execRandom: cudaMalloc failed", res);

    checkCudaErrors(cudaStreamSynchronize(*stream));
    checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream));

    auto rng = reinterpret_cast<sd::graph::RandomGenerator*>(stateHost);

    dim3 launchDims = dim3(512, 512, 32768);
    // NOTE(review): dispatch is done on the OUTPUT (Z) data type — the original
    // code read hZShapeInfo here but named the variable xType; assumes X/Y/Z
    // types agree for random ops — TODO confirm
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::executeCudaTriple(launchDims, stream, opNum, stateDevice, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES);

    res = cudaStreamSynchronize(*stream);
    if (res != 0) {
        // FIX: release the device-side state before throwing, to avoid a leak
        cudaFree(stateDevice);
        throw cuda_exception::build("execRandom XYZ failed", res);
    }

    cudaFree(stateDevice);

    // advance the host RNG past the values consumed by this call
    rng->rewindH(shape::length(hZShapeInfo));
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execReduce3All(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParamsVals,
                            void const* hY, Nd4jLong const* hYShapeInfo,
                            void const* dY, Nd4jLong const* dYShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* xTadShapeInfo, Nd4jLong const* xOffsets,
                            Nd4jLong const* yTadShapeInfo, Nd4jLong const* yOffsets) {
    // All-pairs reduce3: reduces every TAD of X against every TAD of Y into Z.
    // X and Y must share a data type; dispatches Reduce3::execAll on (xType, zType).
    auto stream = lc->getCudaStream();
    auto allocationPointer = lc->getAllocationPointer();
    auto reductionPointer = lc->getReductionPointer();

    if (sd::Environment::getInstance()->isDebugAndVerbose())
        printf("D119 opNum:[%i]\n", opNum);

    // FIX: guard against a zero-length Z — the sibling reduce3 overloads clamp the
    // grid dimension to 1, since a grid dimension of 0 is an invalid kernel launch
    auto numBlocks = shape::length(hZShapeInfo);
    dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768);

    if (sd::Environment::getInstance()->isVerbose() && launchDims.x == 1)
        printf("AD119 opNum:[%i]\n", opNum);

    auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    if (yType != xType)
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3All both operands must have same data type", xType, yType);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParamsVals, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    auto res = cudaStreamSynchronize(*stream);
    if (res != 0)
        throw cuda_exception::build("execReduce3All failed", res);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
void NativeOpExecutioner::execReduce3TAD(sd::LaunchContext *lc,
                            int opNum,
                            void const* hX, Nd4jLong const* hXShapeInfo,
                            void const* dX, Nd4jLong const* dXShapeInfo,
                            void *extraParams,
                            void const* hY, Nd4jLong const* hYShapeInfo,
                            void const* dY, Nd4jLong const* dYShapeInfo,
                            void *hZ, Nd4jLong const* hZShapeInfo,
                            void *dZ, Nd4jLong const* dZShapeInfo,
                            int *dimension, int dimensionLength,
                            Nd4jLong const* tadShapeInfo, Nd4jLong const* tadOffsets,
                            Nd4jLong const* yTadShapeInfo, Nd4jLong const* yTadOffsets) {
    // TAD-based reduce3 along the given dimensions. A scalar Z degenerates to the
    // whole-array reduce3 overload. X and Y must share a data type; Z must be
    // floating point.
    if (shape::isScalar(hZShapeInfo)) {
        NativeOpExecutioner::execReduce3(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParams, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo);
        return;
    }

    auto cudaStream = lc->getCudaStream();
    auto allocationBuffer = lc->getAllocationPointer();

    const auto xType = sd::ArrayOptions::dataType(hXShapeInfo);
    const auto yType = sd::ArrayOptions::dataType(hYShapeInfo);
    const auto zType = sd::ArrayOptions::dataType(hZShapeInfo);

    if (xType != yType)
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Y operand to have X type", xType, yType);

    if (!DataTypeUtils::isR(zType))
        throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType);

    // one block per output element; clamp to 1 so the grid is never empty
    auto gridSize = shape::length(hZShapeInfo);
    dim3 launchDims(gridSize == 0 ? 1 : gridSize, 256, 32768);

    BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(launchDims, cudaStream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationBuffer, tadShapeInfo, tadOffsets, yTadShapeInfo, yTadOffsets), LIBND4J_TYPES, FLOAT_TYPES);

    // TODO: remove after the release
    auto syncResult = cudaStreamSynchronize(*cudaStream);
    if (syncResult != 0)
        throw cuda_exception::build("execReduce3TAD failed", syncResult);
}
|
|
|