[WIP] cuda concat (#107)
* - correct cuda concat Signed-off-by: Yurii <yurii@skymind.io> * - pooling 2d/3d : take into account possible case when input and gradI have different strides Signed-off-by: Yurii <yurii@skymind.io> * master pulled in Signed-off-by: raver119 <raver119@gmail.com> * floordiv_bp test reverted Signed-off-by: raver119 <raver119@gmail.com> * - add NDArray::printLinearBuffer method Signed-off-by: Yurii <yurii@skymind.io>master
parent
62a025439b
commit
7fa01288bb
|
@ -477,6 +477,11 @@ namespace nd4j {
|
||||||
*/
|
*/
|
||||||
void printBuffer(const char* msg = nullptr, Nd4jLong limit = -1, const bool sync = true) const;
|
void printBuffer(const char* msg = nullptr, Nd4jLong limit = -1, const bool sync = true) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* print element by element consequently in a way they (elements) are stored in physical memory
|
||||||
|
*/
|
||||||
|
void printLinearBuffer() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* prints _buffer (if host = true) or _bufferD (if host = false) as it is, that is in current state without checking buffer status
|
* prints _buffer (if host = true) or _bufferD (if host = false) as it is, that is in current state without checking buffer status
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -1137,6 +1137,39 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool sync) cons
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// print element by element consequently in a way they (elements) are stored in physical memory
|
||||||
|
void NDArray::printLinearBuffer() const {
|
||||||
|
|
||||||
|
syncToHost();
|
||||||
|
|
||||||
|
const auto ews = this->ews() > 0 ? this->ews() : 1;
|
||||||
|
const auto len = this->lengthOf();
|
||||||
|
|
||||||
|
printf("[");
|
||||||
|
|
||||||
|
if (this->dataType() == nd4j::DataType::INT32) {
|
||||||
|
for(Nd4jLong e = 0; e < len; e++)
|
||||||
|
printf("%d, ", this->bufferAsT<int>()[e * ews]);
|
||||||
|
}
|
||||||
|
else if(this->dataType() == nd4j::DataType::INT64) {
|
||||||
|
for(Nd4jLong e = 0; e < len; e++)
|
||||||
|
printf("%lld, ", this->bufferAsT<Nd4jLong>()[e * ews]);
|
||||||
|
}
|
||||||
|
else if(this->dataType() == nd4j::DataType::FLOAT32) {
|
||||||
|
for(Nd4jLong e = 0; e < len; e++)
|
||||||
|
printf("%.3f, ", this->bufferAsT<float>()[e * ews]);
|
||||||
|
}
|
||||||
|
else if(this->dataType() == nd4j::DataType::DOUBLE) {
|
||||||
|
for(Nd4jLong e = 0; e < len; e++)
|
||||||
|
printf("%.3f, ", this->bufferAsT<double>()[e * ews]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
throw std::invalid_argument("NDArray::printLinearBuffer: not implemented yet for this data type !");
|
||||||
|
|
||||||
|
printf("]\n");
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
static void printFormatted(NDArray const* arr, int depth, int limit) {
|
static void printFormatted(NDArray const* arr, int depth, int limit) {
|
||||||
|
|
||||||
|
|
|
@ -1863,18 +1863,26 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
#endif
|
#endif
|
||||||
nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);
|
nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);
|
||||||
|
|
||||||
const Nd4jLong iStride0 = gradI.stridesOf()[0];
|
const Nd4jLong iStride0 = input.stridesOf()[0];
|
||||||
const Nd4jLong iStride1 = gradI.stridesOf()[1];
|
const Nd4jLong iStride1 = input.stridesOf()[1];
|
||||||
const Nd4jLong iStride2 = gradI.stridesOf()[2];
|
const Nd4jLong iStride2 = input.stridesOf()[2];
|
||||||
const Nd4jLong iStride3 = gradI.stridesOf()[3];
|
const Nd4jLong iStride3 = input.stridesOf()[3];
|
||||||
|
const Nd4jLong gIStride0 = gradI.stridesOf()[0];
|
||||||
|
const Nd4jLong gIStride1 = gradI.stridesOf()[1];
|
||||||
|
const Nd4jLong gIStride2 = gradI.stridesOf()[2];
|
||||||
|
const Nd4jLong gIStride3 = gradI.stridesOf()[3];
|
||||||
const Nd4jLong oStride0 = gradO.stridesOf()[0];
|
const Nd4jLong oStride0 = gradO.stridesOf()[0];
|
||||||
const Nd4jLong oStride1 = gradO.stridesOf()[1];
|
const Nd4jLong oStride1 = gradO.stridesOf()[1];
|
||||||
const Nd4jLong oStride2 = gradO.stridesOf()[2];
|
const Nd4jLong oStride2 = gradO.stridesOf()[2];
|
||||||
const Nd4jLong oStride3 = gradO.stridesOf()[3];
|
const Nd4jLong oStride3 = gradO.stridesOf()[3];
|
||||||
const Nd4jLong iStep2 = dH*iStride2;
|
const Nd4jLong iStep2 = dH*iStride2;
|
||||||
const Nd4jLong iStep3 = dW*iStride3;
|
const Nd4jLong iStep3 = dW*iStride3;
|
||||||
|
const Nd4jLong gIStep2 = dH*gIStride2;
|
||||||
|
const Nd4jLong gIStep3 = dW*gIStride3;
|
||||||
const int kProd = kH*kW;
|
const int kProd = kH*kW;
|
||||||
|
|
||||||
|
const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3;
|
||||||
|
|
||||||
Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW;
|
Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW;
|
||||||
T sum, valO, *pIn, *pgI;
|
T sum, valO, *pIn, *pgI;
|
||||||
|
|
||||||
|
@ -1901,14 +1909,16 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
||||||
|
|
||||||
|
sum = -DataTypeUtils::max<T>();
|
||||||
|
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
||||||
|
|
||||||
|
if(sameStrides) {
|
||||||
|
|
||||||
hstart *= iStride2;
|
hstart *= iStride2;
|
||||||
hend *= iStride2;
|
hend *= iStride2;
|
||||||
wstart *= iStride3;
|
wstart *= iStride3;
|
||||||
wend *= iStride3;
|
wend *= iStride3;
|
||||||
|
|
||||||
sum = -DataTypeUtils::max<T>();
|
|
||||||
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
|
||||||
|
|
||||||
// we set these to default values
|
// we set these to default values
|
||||||
maxKH = hstart;
|
maxKH = hstart;
|
||||||
maxKW = wstart;
|
maxKW = wstart;
|
||||||
|
@ -1924,6 +1934,24 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
}
|
}
|
||||||
gI[pIn - in + maxKH + maxKW] += valO;
|
gI[pIn - in + maxKH + maxKW] += valO;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
// we set these to default values
|
||||||
|
maxKH = hstart;
|
||||||
|
maxKW = wstart;
|
||||||
|
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH)
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
|
||||||
|
T valIn = pIn[kh * iStride2 + kw * iStride3];
|
||||||
|
if (valIn > sum) {
|
||||||
|
sum = valIn;
|
||||||
|
maxKH = kh;
|
||||||
|
maxKW = kw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1936,7 +1964,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for(int oh = 0; oh < oH; ++oh) {
|
for(int oh = 0; oh < oH; ++oh) {
|
||||||
for(int ow = 0; ow < oW; ++ow) {
|
for(int ow = 0; ow < oW; ++ow) {
|
||||||
|
|
||||||
pgI = gI + b * iStride0 + c * iStride1;
|
pgI = gI + b * gIStride0 + c * gIStride1;
|
||||||
|
|
||||||
hstart = oh * sH - pH;
|
hstart = oh * sH - pH;
|
||||||
wstart = ow * sW - pW;
|
wstart = ow * sW - pW;
|
||||||
|
@ -1952,20 +1980,20 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
||||||
|
|
||||||
hstart *= iStride2;
|
hstart *= gIStride2;
|
||||||
hend *= iStride2;
|
hend *= gIStride2;
|
||||||
wstart *= iStride3;
|
wstart *= gIStride3;
|
||||||
wend *= iStride3;
|
wend *= gIStride3;
|
||||||
|
|
||||||
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
||||||
|
|
||||||
if ((int) extraParam0 == 0) //Exclude padding
|
if ((int) extraParam0 == 0) //Exclude padding
|
||||||
valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep3))); //Accounts for dilation
|
valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep3))); //Accounts for dilation
|
||||||
else if ((int) extraParam0 == 1) //Include padding
|
else if ((int) extraParam0 == 1) //Include padding
|
||||||
valO /= kProd;
|
valO /= kProd;
|
||||||
|
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
|
for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
|
for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3)
|
||||||
pgI[kh + kw] += valO;
|
pgI[kh + kw] += valO;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1981,7 +2009,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for(int ow = 0; ow < oW; ++ow) {
|
for(int ow = 0; ow < oW; ++ow) {
|
||||||
|
|
||||||
pIn = in + b * iStride0 + c * iStride1;
|
pIn = in + b * iStride0 + c * iStride1;
|
||||||
pgI = gI + (pIn - in);
|
pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1;
|
||||||
|
|
||||||
hstart = oh * sH - pH;
|
hstart = oh * sH - pH;
|
||||||
wstart = ow * sW - pW;
|
wstart = ow * sW - pW;
|
||||||
|
@ -1997,15 +2025,16 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
|
||||||
|
|
||||||
|
sum = static_cast<T>(0.f);
|
||||||
|
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
||||||
|
|
||||||
|
if(sameStrides) {
|
||||||
|
|
||||||
hstart *= iStride2;
|
hstart *= iStride2;
|
||||||
hend *= iStride2;
|
hend *= iStride2;
|
||||||
wstart *= iStride3;
|
wstart *= iStride3;
|
||||||
wend *= iStride3;
|
wend *= iStride3;
|
||||||
|
|
||||||
sum = static_cast<T>(0.f);
|
|
||||||
valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
|
|
||||||
|
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
|
for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
|
||||||
sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
|
sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
|
||||||
|
@ -2016,6 +2045,22 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
|
||||||
pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
|
pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH)
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW)
|
||||||
|
sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh * iStride2 + kw * iStride3]), extraParam0);
|
||||||
|
|
||||||
|
valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
|
||||||
|
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH) {
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
|
||||||
|
const auto inVal = pIn[kh * iStride2 + kw * iStride3];
|
||||||
|
pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2144,11 +2189,16 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
#endif
|
#endif
|
||||||
nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);
|
nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);
|
||||||
|
|
||||||
const Nd4jLong iStride0 = gradI.stridesOf()[0];
|
const Nd4jLong iStride0 = input.stridesOf()[0];
|
||||||
const Nd4jLong iStride1 = gradI.stridesOf()[1];
|
const Nd4jLong iStride1 = input.stridesOf()[1];
|
||||||
const Nd4jLong iStride2 = gradI.stridesOf()[2];
|
const Nd4jLong iStride2 = input.stridesOf()[2];
|
||||||
const Nd4jLong iStride3 = gradI.stridesOf()[3];
|
const Nd4jLong iStride3 = input.stridesOf()[3];
|
||||||
const Nd4jLong iStride4 = gradI.stridesOf()[4];
|
const Nd4jLong iStride4 = input.stridesOf()[4];
|
||||||
|
const Nd4jLong gIStride0 = gradI.stridesOf()[0];
|
||||||
|
const Nd4jLong gIStride1 = gradI.stridesOf()[1];
|
||||||
|
const Nd4jLong gIStride2 = gradI.stridesOf()[2];
|
||||||
|
const Nd4jLong gIStride3 = gradI.stridesOf()[3];
|
||||||
|
const Nd4jLong gIStride4 = gradI.stridesOf()[4];
|
||||||
const Nd4jLong oStride0 = gradO.stridesOf()[0];
|
const Nd4jLong oStride0 = gradO.stridesOf()[0];
|
||||||
const Nd4jLong oStride1 = gradO.stridesOf()[1];
|
const Nd4jLong oStride1 = gradO.stridesOf()[1];
|
||||||
const Nd4jLong oStride2 = gradO.stridesOf()[2];
|
const Nd4jLong oStride2 = gradO.stridesOf()[2];
|
||||||
|
@ -2157,8 +2207,13 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
const Nd4jLong iStep2 = dD*iStride2;
|
const Nd4jLong iStep2 = dD*iStride2;
|
||||||
const Nd4jLong iStep3 = dH*iStride3;
|
const Nd4jLong iStep3 = dH*iStride3;
|
||||||
const Nd4jLong iStep4 = dW*iStride4;
|
const Nd4jLong iStep4 = dW*iStride4;
|
||||||
|
const Nd4jLong gIStep2 = dD*gIStride2;
|
||||||
|
const Nd4jLong gIStep3 = dH*gIStride3;
|
||||||
|
const Nd4jLong gIStep4 = dW*gIStride4;
|
||||||
const int kProd = kD*kH*kW;
|
const int kProd = kD*kH*kW;
|
||||||
|
|
||||||
|
const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4;
|
||||||
|
|
||||||
Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
|
Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
|
||||||
T sum, valO, *pIn, *pgI;
|
T sum, valO, *pIn, *pgI;
|
||||||
|
|
||||||
|
@ -2192,6 +2247,11 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW);
|
wend -= dW * ((wend-iW + dW - 1) / dW);
|
||||||
|
|
||||||
|
sum = -DataTypeUtils::max<T>();
|
||||||
|
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
||||||
|
|
||||||
|
if(sameStrides) {
|
||||||
|
|
||||||
dstart *= iStride2;
|
dstart *= iStride2;
|
||||||
dend *= iStride2;
|
dend *= iStride2;
|
||||||
hstart *= iStride3;
|
hstart *= iStride3;
|
||||||
|
@ -2203,9 +2263,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
maxKH = hstart;
|
maxKH = hstart;
|
||||||
maxKW = wstart;
|
maxKW = wstart;
|
||||||
|
|
||||||
sum = -DataTypeUtils::max<T>();
|
|
||||||
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
|
||||||
|
|
||||||
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
|
||||||
|
@ -2219,6 +2276,27 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
}
|
}
|
||||||
gI[pIn - in + maxKD + maxKH + maxKW] += valO;
|
gI[pIn - in + maxKD + maxKH + maxKW] += valO;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
// we set these to default values
|
||||||
|
maxKH = hstart;
|
||||||
|
maxKW = wstart;
|
||||||
|
maxKD = dstart;
|
||||||
|
|
||||||
|
for (Nd4jLong kd = dstart; kd < dend; kd += dD)
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH)
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
|
||||||
|
T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
|
||||||
|
if (valIn > sum) {
|
||||||
|
sum = valIn;
|
||||||
|
maxKD = kd;
|
||||||
|
maxKH = kh;
|
||||||
|
maxKW = kw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2233,7 +2311,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for(int oh = 0; oh < oH; ++oh) {
|
for(int oh = 0; oh < oH; ++oh) {
|
||||||
for(int ow = 0; ow < oW; ++ow) {
|
for(int ow = 0; ow < oW; ++ow) {
|
||||||
|
|
||||||
pgI = gI + b * iStride0 + c * iStride1;
|
pgI = gI + b * gIStride0 + c * gIStride1;
|
||||||
|
|
||||||
dstart = od * sD - pD;
|
dstart = od * sD - pD;
|
||||||
hstart = oh * sH - pH;
|
hstart = oh * sH - pH;
|
||||||
|
@ -2255,23 +2333,23 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW);
|
wend -= dW * ((wend-iW + dW - 1) / dW);
|
||||||
|
|
||||||
dstart *= iStride2;
|
dstart *= gIStride2;
|
||||||
dend *= iStride2;
|
dend *= gIStride2;
|
||||||
hstart *= iStride3;
|
hstart *= gIStride3;
|
||||||
hend *= iStride3;
|
hend *= gIStride3;
|
||||||
wstart *= iStride4;
|
wstart *= gIStride4;
|
||||||
wend *= iStride4;
|
wend *= gIStride4;
|
||||||
|
|
||||||
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
||||||
|
|
||||||
if (extraParam0 == 0) //Exclude padding
|
if (extraParam0 == 0) //Exclude padding
|
||||||
valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(iStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep4)); //Accounts for dilation
|
valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(gIStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep4)); //Accounts for dilation
|
||||||
else if (extraParam0 == 1) //Include padding
|
else if (extraParam0 == 1) //Include padding
|
||||||
valO /= kProd;
|
valO /= kProd;
|
||||||
|
|
||||||
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2)
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4)
|
||||||
pgI[kd + kh + kw] += valO;
|
pgI[kd + kh + kw] += valO;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2311,6 +2389,11 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
if(wend > iW)
|
if(wend > iW)
|
||||||
wend -= dW * ((wend-iW + dW - 1) / dW);
|
wend -= dW * ((wend-iW + dW - 1) / dW);
|
||||||
|
|
||||||
|
sum = static_cast<T>(0.);
|
||||||
|
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
||||||
|
|
||||||
|
if(sameStrides) {
|
||||||
|
|
||||||
dstart *= iStride2;
|
dstart *= iStride2;
|
||||||
dend *= iStride2;
|
dend *= iStride2;
|
||||||
hstart *= iStride3;
|
hstart *= iStride3;
|
||||||
|
@ -2318,9 +2401,6 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
wstart *= iStride4;
|
wstart *= iStride4;
|
||||||
wend *= iStride4;
|
wend *= iStride4;
|
||||||
|
|
||||||
sum = static_cast<T>(0.);
|
|
||||||
valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
|
|
||||||
|
|
||||||
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
|
||||||
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
||||||
|
@ -2333,6 +2413,23 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
|
||||||
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
|
||||||
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
|
pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
|
||||||
|
for (Nd4jLong kd = dstart; kd < dend; kd += dD)
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH)
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW)
|
||||||
|
sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0);
|
||||||
|
|
||||||
|
valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
|
||||||
|
|
||||||
|
for (Nd4jLong kd = dstart; kd < dend; kd += dD)
|
||||||
|
for (Nd4jLong kh = hstart; kh < hend; kh += dH)
|
||||||
|
for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
|
||||||
|
const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4];
|
||||||
|
pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,98 +32,47 @@
|
||||||
namespace nd4j {
|
namespace nd4j {
|
||||||
namespace ops {
|
namespace ops {
|
||||||
namespace helpers {
|
namespace helpers {
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<typename T>
|
template<typename T>
|
||||||
__global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
|
__global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
|
||||||
|
|
||||||
__shared__ int arrIdx, blocksPerArr;
|
__shared__ int arrIdx, blocksPerArr;
|
||||||
__shared__ T *x, *z;
|
|
||||||
__shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenPerBlock, start, end;
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
|
|
||||||
blocksPerArr = (gridDim.x + numOfArrs - 1) / numOfArrs; // ceil
|
blocksPerArr = (gridDim.x + numOfArrs - 1) / numOfArrs; // ceil
|
||||||
arrIdx = blockIdx.x / blocksPerArr;
|
arrIdx = blockIdx.x / blocksPerArr;
|
||||||
|
|
||||||
x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[arrIdx]);
|
|
||||||
z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[arrIdx]);
|
|
||||||
xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[arrIdx];
|
|
||||||
zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[arrIdx];
|
|
||||||
arrLen = shape::length(xShapeInfo);
|
|
||||||
|
|
||||||
arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr; // ceil
|
|
||||||
|
|
||||||
start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
|
|
||||||
end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
for(int j = arrIdx; j < numOfArrs; j += gridDim.x) {
|
||||||
|
|
||||||
|
const auto* x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[j]);
|
||||||
|
auto* z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[j]);
|
||||||
|
const auto* xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[j];
|
||||||
|
const auto* zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[j];
|
||||||
|
|
||||||
|
const auto arrLen = shape::length(xShapeInfo);
|
||||||
|
|
||||||
|
const auto arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr; // ceil
|
||||||
|
|
||||||
|
const auto start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
|
||||||
|
const auto end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
|
||||||
|
|
||||||
for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
|
for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
|
||||||
z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
|
z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<typename T>
|
template<typename T>
|
||||||
__host__ static void concatCudaLauncher(const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
|
__host__ static void concatCudaLauncher(const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
|
||||||
|
|
||||||
concatCuda<T><<<512, 256, 1024, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
|
concatCuda<T><<<512, 512, 512, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
|
||||||
}
|
}
|
||||||
BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo), LIBND4J_TYPES);
|
BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo), LIBND4J_TYPES);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
|
|
||||||
|
|
||||||
const int numOfArrs = inArrs.size();
|
|
||||||
for(int i = 0; i < numOfArrs; ++i)
|
|
||||||
if(!inArrs[i]->isActualOnDeviceSide()) inArrs[i]->syncToDevice();
|
|
||||||
|
|
||||||
const int rank = inArrs[0]->rankOf();
|
|
||||||
const int rank2 = 2*rank;
|
|
||||||
std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
|
|
||||||
|
|
||||||
// take into account indices for first array
|
|
||||||
indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
|
|
||||||
|
|
||||||
// loop through the rest of input arrays
|
|
||||||
for(int i = 1; i < numOfArrs; ++i) {
|
|
||||||
indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
|
|
||||||
indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding)
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<NDArray*> outSubArrs(numOfArrs);
|
|
||||||
for(int i = 0; i < numOfArrs; ++i)
|
|
||||||
outSubArrs[i] = new NDArray(output(indices[i], true));
|
|
||||||
|
|
||||||
// prepare arrays of pointers on buffers and shapes
|
|
||||||
std::vector<void*> hOutBuffers(numOfArrs), hInBuffers(numOfArrs);
|
|
||||||
std::vector<Nd4jLong*> hOutShapeInfo(numOfArrs), hInShapeInfo(numOfArrs);
|
|
||||||
for(int i = 0; i < numOfArrs; ++i) {
|
|
||||||
hOutBuffers[i] = outSubArrs[i]->getSpecialBuffer();
|
|
||||||
hInBuffers[i] = inArrs[i]->getSpecialBuffer();
|
|
||||||
hOutShapeInfo[i] = outSubArrs[i]->getSpecialShapeInfo();
|
|
||||||
hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
|
|
||||||
}
|
|
||||||
|
|
||||||
// allocate and copy all buffers and shapes arrays to global memory
|
|
||||||
PointersManager manager(context, "helpers::concat");
|
|
||||||
void* dOutBuffers = manager.replicatePointer(hOutBuffers.data(), hOutBuffers.size() * sizeof(void*));
|
|
||||||
void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*));
|
|
||||||
void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*));
|
|
||||||
void* dOutShapeInfo = manager.replicatePointer(hOutShapeInfo.data(), hOutShapeInfo.size() * sizeof(Nd4jLong*));
|
|
||||||
|
|
||||||
BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (numOfArrs, context->getCudaStream(), dInBuffers, dInShapeInfo, dOutBuffers, dOutShapeInfo), LIBND4J_TYPES);
|
|
||||||
|
|
||||||
manager.synchronize();
|
|
||||||
|
|
||||||
for(int i = 0; i < numOfArrs; ++i)
|
|
||||||
delete outSubArrs[i];
|
|
||||||
|
|
||||||
for(int i = 0; i < numOfArrs; ++i)
|
|
||||||
inArrs[i]->tickReadHost();
|
|
||||||
|
|
||||||
output.tickWriteDevice();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -130,6 +130,26 @@ namespace nd4j {
|
||||||
}
|
}
|
||||||
BUILD_DOUBLE_TEMPLATE(template void padCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const int mode, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const void* vPadVal), LIBND4J_TYPES, INTEGER_TYPES);
|
BUILD_DOUBLE_TEMPLATE(template void padCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const int mode, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const void* vPadVal), LIBND4J_TYPES, INTEGER_TYPES);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
|
void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
|
||||||
|
|
||||||
|
PointersManager manager(context, "pad");
|
||||||
|
|
||||||
|
NDArray::prepareSpecialUse({&output}, {&input, &paddings, &padValue});
|
||||||
|
|
||||||
|
const int threadsPerBlock = MAX_NUM_THREADS / 4;
|
||||||
|
const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
|
const int sharedMem = 8 * threadsPerBlock * output.rankOf() + 128;
|
||||||
|
|
||||||
|
const auto xType = input.dataType();
|
||||||
|
const auto yType = paddings.dataType();
|
||||||
|
|
||||||
|
BUILD_DOUBLE_SELECTOR(xType, yType, padCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), mode, input.getSpecialBuffer(), input.getSpecialShapeInfo(), paddings.getSpecialBuffer(), paddings.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), padValue.getSpecialBuffer()), LIBND4J_TYPES, INTEGER_TYPES);
|
||||||
|
|
||||||
|
NDArray::registerSpecialUse({&output}, {&input, &paddings, &padValue});
|
||||||
|
manager.synchronize();
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
|
void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
|
||||||
|
|
||||||
|
|
|
@ -502,8 +502,8 @@ TEST_F(DeclarableOpsTests2, Test_FloorDiv_2) {
|
||||||
auto x = NDArrayFactory::create<float>('c', {1, 3}, {3.0, 6.0, -3.0});
|
auto x = NDArrayFactory::create<float>('c', {1, 3}, {3.0, 6.0, -3.0});
|
||||||
auto y = NDArrayFactory::create<float>('c', {1, 3}, {-2.0, 2.0, -2.0});
|
auto y = NDArrayFactory::create<float>('c', {1, 3}, {-2.0, 2.0, -2.0});
|
||||||
auto eps = NDArrayFactory::create<float>('c', {1, 3}, {1, 2, 3});
|
auto eps = NDArrayFactory::create<float>('c', {1, 3}, {1, 2, 3});
|
||||||
auto exp1 = NDArrayFactory::create<float>('c', {1, 3}, {1, 2., 3});
|
auto exp1 = NDArrayFactory::create<float>('c', {1, 3}, {0.f, 0.f, 0.f});
|
||||||
auto exp2 = NDArrayFactory::create<float>('c', {1, 3}, {-0, -2., 3});
|
auto exp2 = NDArrayFactory::create<float>('c', {1, 3}, {0.f, 0.f, 0.f});
|
||||||
|
|
||||||
nd4j::ops::floordiv_bp op;
|
nd4j::ops::floordiv_bp op;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue