[WIP] More of CUDA (#63)
* less spam Signed-off-by: raver119 <raver119@gmail.com> * flatten kernel Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue/adjust_saturation tweaks Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue cuda single Signed-off-by: raver119 <raver119@gmail.com> * adjust_hue cuda batch Signed-off-by: raver119 <raver119@gmail.com> * adjust_saturation cuda Signed-off-by: raver119 <raver119@gmail.com>master
parent
cf2311859a
commit
91a8fb0d90
|
@ -1327,7 +1327,7 @@ void NativeOps::concat(
|
|||
// take into account indices for first array
|
||||
auto axisSize = shape::sizeAt(reinterpret_cast<Nd4jLong*>(inputShapeInfo[0]), axis);
|
||||
indices[0][2 * axis + 1] = axisSize;
|
||||
printf("The axe size is %lld\n", axisSize);
|
||||
//printf("The axe size is %lld\n", axisSize);
|
||||
// loop through the rest of input arrays
|
||||
for(int i = 1; i < numArrays; ++i) {
|
||||
indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
|
||||
|
|
|
@ -42,7 +42,7 @@ namespace nd4j {
|
|||
}
|
||||
|
||||
char order = (char) INT_ARG(0);
|
||||
helpers::flatten(arrays, output, order);
|
||||
helpers::flatten(block.launchContext(), arrays, output, order);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ namespace nd4j {
|
|||
namespace ops {
|
||||
namespace helpers {
|
||||
template <typename T>
|
||||
static FORCEINLINE void rgb_to_hv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* v_min, T* v_max) {
|
||||
static FORCEINLINE _CUDA_HD void rgb_to_hv(T r, T g, T b, T* h, T* v_min, T* v_max) {
|
||||
T v_mid;
|
||||
int h_category;
|
||||
// According to the figures in:
|
||||
|
@ -84,7 +84,7 @@ namespace helpers {
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
static FORCEINLINE void hv_to_rgb(nd4j::LaunchContext * context, T h, T v_min, T v_max, T* r, T* g, T* b) {
|
||||
static FORCEINLINE _CUDA_HD void hv_to_rgb(T h, T v_min, T v_max, T* r, T* g, T* b) {
|
||||
int h_category = static_cast<int>(h);
|
||||
T ratio = h - (T)h_category;
|
||||
bool increase = ((h_category & 0x1) == 0);
|
||||
|
|
|
@ -26,7 +26,7 @@ namespace nd4j {
|
|||
namespace ops {
|
||||
namespace helpers {
|
||||
template <typename T>
|
||||
static FORCEINLINE void rgb_to_hsv(nd4j::LaunchContext * context, T r, T g, T b, T* h, T* s, T* v) {
|
||||
static FORCEINLINE _CUDA_HD void rgb_to_hsv(T r, T g, T b, T* h, T* s, T* v) {
|
||||
T vv = nd4j::math::nd4j_max<T>(r, nd4j::math::nd4j_max<T>(g, b));
|
||||
T range = vv - nd4j::math::nd4j_min<T>(r, nd4j::math::nd4j_min<T>(g, b));
|
||||
if (vv > 0) {
|
||||
|
@ -54,7 +54,7 @@ namespace helpers {
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
static FORCEINLINE void hsv_to_rgb(nd4j::LaunchContext * context, T h, T s, T v, T* r, T* g, T* b) {
|
||||
static FORCEINLINE _CUDA_HD void hsv_to_rgb(T h, T s, T v, T* r, T* g, T* b) {
|
||||
T c = s * v;
|
||||
T m = v - c;
|
||||
T dh = h * 6;
|
||||
|
|
|
@ -42,7 +42,7 @@ namespace helpers {
|
|||
auto o = bOut + e * numChannels;
|
||||
|
||||
T h, v_min, v_max;
|
||||
helpers::rgb_to_hv(context, i[0], i[1], i[2], &h, &v_min, &v_max);
|
||||
helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max);
|
||||
|
||||
h += delta * kChannelRange;
|
||||
while (h < (T) 0.)
|
||||
|
@ -51,7 +51,7 @@ namespace helpers {
|
|||
while (h >= (T) kChannelRange)
|
||||
h -= (T) kChannelRange;
|
||||
|
||||
helpers::hv_to_rgb(context, h, v_min, v_max, o, o + 1, o + 2);
|
||||
helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2);
|
||||
}
|
||||
} else {
|
||||
auto tadsChannelsIn = array->allTensorsAlongDimension({0});
|
||||
|
@ -76,7 +76,7 @@ namespace helpers {
|
|||
auto _bo = outputB + e;
|
||||
|
||||
T h, v_min, v_max;
|
||||
helpers::rgb_to_hv(context, _ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
|
||||
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
|
||||
|
||||
h += delta * kChannelRange;
|
||||
while (h < (T) 0)
|
||||
|
@ -85,7 +85,7 @@ namespace helpers {
|
|||
while (h >= (T) kChannelRange)
|
||||
h -= (T) kChannelRange;
|
||||
|
||||
helpers::hv_to_rgb(context, h, v_min, v_max, _ro, _go, _bo);
|
||||
helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
|
||||
}
|
||||
|
||||
delete tadsChannelsIn;
|
||||
|
|
|
@ -43,10 +43,10 @@ namespace helpers {
|
|||
|
||||
T h, s, v;
|
||||
// Convert the RGB color to Hue/V-range.
|
||||
helpers::rgb_to_hsv(context, i[0], i[1], i[2], &h, &s, &v);
|
||||
helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);
|
||||
s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
|
||||
// Convert the hue and v-range back into RGB.
|
||||
helpers::hsv_to_rgb(context, h, s, v, o, o + 1, o + 2);
|
||||
helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
|
||||
}
|
||||
} else {
|
||||
auto tadsChannelsIn = array->allTensorsAlongDimension({0});
|
||||
|
@ -72,10 +72,10 @@ namespace helpers {
|
|||
|
||||
T h, s, v;
|
||||
// Convert the RGB color to Hue/V-range.
|
||||
helpers::rgb_to_hsv(context, _ri[0], _gi[0], _bi[0], &h, &s, &v);
|
||||
helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
|
||||
s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
|
||||
// Convert the hue and v-range back into RGB.
|
||||
helpers::hsv_to_rgb(context, h, s, v, _ro, _go, _bo);
|
||||
helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
|
||||
}
|
||||
|
||||
delete tadsChannelsIn;
|
||||
|
|
|
@ -58,7 +58,7 @@ namespace nd4j {
|
|||
}
|
||||
}
|
||||
|
||||
void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
|
||||
/**
 * Flattens all `inputs` into `output`, honouring the requested `order`.
 *
 * The element type is picked at runtime from output->dataType() and the call is
 * forwarded to the matching flatten_ instantiation. `context` is accepted for
 * signature parity but is not forwarded by this implementation.
 */
void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
    // runtime type dispatch over every type listed in LIBND4J_TYPES
    BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (inputs, output, order), LIBND4J_TYPES);
}
|
||||
}
|
||||
|
|
|
@ -19,14 +19,114 @@
|
|||
//
|
||||
|
||||
#include <ops/declarable/helpers/adjust_hue.h>
|
||||
#include <helpers/ConstantTadHelper.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_hue_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
static void _CUDA_G adjustHueSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
    // Shifts the hue of `tuples` RGB triples read from xBuffer and stores the adjusted
    // triples at the same positions of zBuffer.
    // NOTE(review): xShapeInfo / zShapeInfo are never read here, so both buffers are
    // addressed as densely packed triples - confirm callers only pass such layouts.
    constexpr int channelsPerPixel = 3;
    constexpr int hueSpan = 6;

    const auto source = reinterpret_cast<T*>(xBuffer);
    const auto target = reinterpret_cast<T*>(zBuffer);

    const auto firstPixel = threadIdx.x + blockIdx.x * blockDim.x;
    const auto pixelStep = blockDim.x * gridDim.x;

    for (Nd4jLong pixel = firstPixel; pixel < tuples; pixel += pixelStep) {
        auto rgbIn = source + pixel * channelsPerPixel;
        auto rgbOut = target + pixel * channelsPerPixel;

        T hue, vMin, vMax;
        helpers::rgb_to_hv(rgbIn[0], rgbIn[1], rgbIn[2], &hue, &vMin, &vMax);

        hue += delta * hueSpan;

        // wrap the shifted hue back into [0, hueSpan)
        while (hue < (T) 0.)
            hue += (T) hueSpan;

        while (hue >= (T) hueSpan)
            hue -= (T) hueSpan;

        helpers::hv_to_rgb(hue, vMin, vMax, rgbOut, rgbOut + 1, rgbOut + 2);
    }
}
|
||||
|
||||
template <typename T>
|
||||
static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
|
||||
int numChannels = 3;
|
||||
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
static const int kChannelRange = 6;
|
||||
|
||||
auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
|
||||
auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
|
||||
auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
|
||||
|
||||
auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
|
||||
auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
|
||||
auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
|
||||
|
||||
|
||||
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
||||
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
|
||||
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
|
||||
|
||||
T h, v_min, v_max;
|
||||
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
|
||||
|
||||
h += delta * kChannelRange;
|
||||
while (h < (T) 0)
|
||||
h += (T) kChannelRange;
|
||||
|
||||
while (h >= (T) kChannelRange)
|
||||
h -= (T) kChannelRange;
|
||||
|
||||
helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
// numChannels is always 3
|
||||
auto tuples = array->lengthOf() / 3;
|
||||
if (isNHWC) {
|
||||
adjustHueSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
|
||||
} else {
|
||||
// TODO: check this one
|
||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
|
||||
|
||||
auto tadLength = shape::length(packX.primaryShapeInfo());
|
||||
|
||||
adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
auto xType = array->dataType();
|
||||
|
||||
// numChannels is always 3
|
||||
auto tuples = array->lengthOf() / 3;
|
||||
|
||||
if (isNHWC) {
|
||||
// in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
|
||||
} else {
|
||||
// TODO: check this one
|
||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
|
||||
|
||||
auto tadLength = shape::length(packX.primaryShapeInfo());
|
||||
|
||||
adjustHueSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
|
||||
}
|
||||
}
|
||||
|
||||
void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
|
||||
|
@ -34,25 +134,11 @@ namespace helpers {
|
|||
|
||||
float d = delta->e<float>(0);
|
||||
if (array->rankOf() == 4) {
|
||||
auto tadsIn = array->allTensorsAlongDimension({0});
|
||||
auto tadsOut = output->allTensorsAlongDimension({0});
|
||||
|
||||
// FIXME: template selector should be moved out of loop
|
||||
PRAGMA_OMP_PARALLEL_FOR
|
||||
for (int e = 0; e < tadsIn->size(); e++) {
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
||||
}
|
||||
|
||||
|
||||
delete tadsIn;
|
||||
delete tadsOut;
|
||||
} else {
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (array, output, d, isNHWC);, FLOAT_TYPES);
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
|
||||
}
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void _adjust_hue_single, (NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
//
|
||||
|
||||
#include <ops/declarable/helpers/adjust_saturation.h>
|
||||
#include <helpers/ConstantTadHelper.h>
|
||||
|
||||
|
||||
namespace nd4j {
|
||||
|
@ -26,8 +27,96 @@ namespace ops {
|
|||
namespace helpers {
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_saturation_single(NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) {
    // Scales the saturation of `tuples` RGB triples read from xBuffer by `delta`
    // (clamped to [0, 1]) and stores the resulting triples at the same positions of zBuffer.
    // NOTE(review): xShapeInfo / zShapeInfo are never read here, so both buffers are
    // addressed as densely packed triples - confirm callers only pass such layouts.
    constexpr int numChannels = 3;

    const auto tid = threadIdx.x + blockIdx.x * blockDim.x;

    const auto bIn = reinterpret_cast<T*>(xBuffer);
    const auto bOut = reinterpret_cast<T*>(zBuffer);

    for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
        auto i = bIn + e * numChannels;
        auto o = bOut + e * numChannels;

        T h, s, v;
        // Convert the RGB color to hue / saturation / value.
        helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v);

        // Scale the saturation and clamp it to [0, 1].
        s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));

        // Convert hue / saturation / value back into RGB.
        helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2);
    }
}
|
||||
|
||||
template <typename T>
|
||||
static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) {
|
||||
int numChannels = 3;
|
||||
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
static const int kChannelRange = 6;
|
||||
|
||||
auto bufferR = reinterpret_cast<T *>(xBuffer) + xOffsets[0];
|
||||
auto bufferG = reinterpret_cast<T *>(xBuffer) + xOffsets[1];
|
||||
auto bufferB = reinterpret_cast<T *>(xBuffer) + xOffsets[2];
|
||||
|
||||
auto outputR = reinterpret_cast<T *>(zBuffer) + zOffsets[0];
|
||||
auto outputG = reinterpret_cast<T *>(zBuffer) + zOffsets[1];
|
||||
auto outputB = reinterpret_cast<T *>(zBuffer) + zOffsets[2];
|
||||
|
||||
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
|
||||
auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
|
||||
auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);
|
||||
|
||||
T h, s, v;
|
||||
// Convert the RGB color to Hue/V-range.
|
||||
helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v);
|
||||
s = nd4j::math::nd4j_min<T>((T) 1.0f, nd4j::math::nd4j_max<T>((T) 0.0f, s * delta));
|
||||
// Convert the hue and v-range back into RGB.
|
||||
helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
// numChannels is always 3
|
||||
auto tuples = array->lengthOf() / 3;
|
||||
|
||||
if (isNHWC) {
|
||||
adjustSaturationSingleNHWCKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta);
|
||||
} else {
|
||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2});
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2});
|
||||
|
||||
auto tadLength = shape::length(packX.primaryShapeInfo());
|
||||
|
||||
adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) {
|
||||
auto xType = array->dataType();
|
||||
|
||||
// numChannels is always 3
|
||||
auto tuples = array->lengthOf() / 3;
|
||||
|
||||
if (isNHWC) {
|
||||
// in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES);
|
||||
} else {
|
||||
// TODO: check this one
|
||||
auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3});
|
||||
auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3});
|
||||
|
||||
auto tadLength = shape::length(packX.primaryShapeInfo());
|
||||
|
||||
adjustSaturationSingleNCHWKernel<T><<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta);
|
||||
}
|
||||
}
|
||||
|
||||
void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) {
|
||||
|
@ -35,24 +124,12 @@ namespace helpers {
|
|||
|
||||
float d = delta->e<float>(0);
|
||||
if (array->rankOf() == 4) {
|
||||
auto tadsIn = array->allTensorsAlongDimension({0});
|
||||
auto tadsOut = output->allTensorsAlongDimension({0});
|
||||
|
||||
// FIXME: template selector should be moved out of loop
|
||||
PRAGMA_OMP_PARALLEL_FOR
|
||||
for (int e = 0; e < tadsIn->size(); e++) {
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES);
|
||||
}
|
||||
|
||||
|
||||
delete tadsIn;
|
||||
delete tadsOut;
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES);
|
||||
} else {
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (array, output, d, isNHWC);, FLOAT_TYPES);
|
||||
BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, d, isNHWC);, FLOAT_TYPES);
|
||||
}
|
||||
}
|
||||
|
||||
BUILD_SINGLE_TEMPLATE(template void _adjust_saturation_single, (NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,12 +19,68 @@
|
|||
//
|
||||
|
||||
#include <ops/declarable/helpers/flatten.h>
|
||||
#include <helpers/PointersManager.h>
|
||||
|
||||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order) {
|
||||
template <typename T>
|
||||
void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {
|
||||
|
||||
Nd4jLong xCoord[MAX_RANK];
|
||||
|
||||
for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
|
||||
auto z = reinterpret_cast<T*>(zBuffer) + offsets[e];
|
||||
|
||||
auto xBuffer = reinterpret_cast<T*>(xBuffers[e]);
|
||||
auto xShapeInfo = xShapeInfos[e];
|
||||
auto xShape = shape::shapeOf(xShapeInfo);
|
||||
auto xStride = shape::stride(xShapeInfo);
|
||||
auto xRank = shape::rank(xShapeInfo);
|
||||
auto xLength = shape::length(xShapeInfo);
|
||||
|
||||
for (uint i = threadIdx.x; i < xLength; i += blockDim.x) {
|
||||
shape::index2coords(xRank, xShape, i, xLength, xCoord, order);
|
||||
auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank);
|
||||
z[i] = xBuffer[xOffset];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void flatten_(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
|
||||
PointersManager pm(context, "flatten");
|
||||
|
||||
std::vector<void*> hdBuffers(inputs.size());
|
||||
std::vector<Nd4jLong> hOffsets(inputs.size());
|
||||
std::vector<Nd4jLong *> hdShapes(inputs.size());
|
||||
Nd4jLong cOffset = 0;
|
||||
|
||||
// calculating offsets in output
|
||||
for (int e = 0; e < inputs.size(); e++) {
|
||||
hOffsets[e] = cOffset;
|
||||
cOffset += inputs[e]->lengthOf();
|
||||
|
||||
hdBuffers[e] = inputs[e]->specialBuffer();
|
||||
hdShapes[e] = inputs[e]->specialShapeInfo();
|
||||
}
|
||||
|
||||
auto dBuffers = (void **) pm.replicatePointer(hdBuffers.data(), inputs.size() * sizeof(void*));
|
||||
auto dShapes = (Nd4jLong **)pm.replicatePointer(hdShapes.data(), inputs.size() * sizeof(Nd4jLong*));
|
||||
auto dOffsets = (Nd4jLong *) pm.replicatePointer(hOffsets.data(), inputs.size() * sizeof(Nd4jLong));
|
||||
|
||||
|
||||
flattenKernel<T><<<256, 512, 8192, *context->getCudaStream()>>>(dBuffers, dShapes, dOffsets, inputs.size(), output->getSpecialBuffer(), output->getSpecialShapeInfo(), order);
|
||||
|
||||
pm.synchronize();
|
||||
}
|
||||
|
||||
/**
 * Flattens all `inputs` into `output` on the device.
 *
 * Every input is synced to the device first, then the typed implementation is chosen
 * from output->dataType(), and finally `output` is registered as written on the device.
 */
void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order) {
    // make sure the device copies of all inputs are up to date before the kernel reads them
    for (auto input : inputs) {
        input->syncToDevice();
    }

    BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (context, inputs, output, order), LIBND4J_TYPES);

    NDArray::registerSpecialUse({output}, {});
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
namespace nd4j {
|
||||
namespace ops {
|
||||
namespace helpers {
|
||||
void flatten(std::vector<NDArray*> &inputs, NDArray *output, char order);
|
||||
void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue