diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h
index 1cfadf7f4..edbf264d8 100644
--- a/libnd4j/blas/NDArray.h
+++ b/libnd4j/blas/NDArray.h
@@ -477,6 +477,11 @@ namespace nd4j {
         */
         void printBuffer(const char* msg = nullptr, Nd4jLong limit = -1, const bool sync = true) const;
 
+        /**
+        *  prints elements one by one, consecutively, in the order they are stored in physical memory
+        */
+        void printLinearBuffer() const;
+
         /**
         *  prints _buffer (if host = true) or _bufferD (if host = false) as it is, that is in current state without checking buffer status
         */
diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp
index 72f117b9b..a6e81f88d 100644
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@@ -1137,6 +1137,39 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool sync) const
     fflush(stdout);
 }
 
+//////////////////////////////////////////////////////////////////////////
+// prints elements one by one, consecutively, in the order they are stored in physical memory
+void NDArray::printLinearBuffer() const {
+
+    syncToHost();
+
+    const auto ews = this->ews() > 0 ? this->ews() : 1;
+    const auto len = this->lengthOf();
+
+    printf("[");
+
+    if (this->dataType() == nd4j::DataType::INT32) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%d, ", this->bufferAsT<int>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::INT64) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%lld, ", this->bufferAsT<Nd4jLong>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::FLOAT32) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%.3f, ", this->bufferAsT<float>()[e * ews]);
+    }
+    else if(this->dataType() == nd4j::DataType::DOUBLE) {
+        for(Nd4jLong e = 0; e < len; e++)
+            printf("%.3f, ", this->bufferAsT<double>()[e * ews]);
+    }
+    else
+        throw std::invalid_argument("NDArray::printLinearBuffer: not implemented yet for this data type !");
+
+    printf("]\n");
+    fflush(stdout);
+}
+
 //////////////////////////////////////////////////////////////////////////
 static void printFormatted(NDArray const* arr, int depth, int limit) {
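A minimal usage sketch of the new printLinearBuffer() (reviewer illustration, not part of the patch): with an 'f'-ordered array, indexed printing and linear printing visit elements in different orders, which is exactly the distinction the new method exposes. The demo assumes NDArrayFactory::create stores the data vector verbatim in the buffer.

    #include <NDArray.h>
    #include <NDArrayFactory.h>

    using namespace nd4j;

    void printLinearBufferDemo() {
        // column-major ('f') view over a buffer that holds 1..6 in creation order
        auto a = NDArrayFactory::create<int>('f', {2, 3}, {1, 2, 3, 4, 5, 6});
        a.printIndexedBuffer("indexed");  // walks logical indices via shape/strides
        a.printLinearBuffer();            // walks physical memory: [1, 2, 3, 4, 5, 6, ]
    }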
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
index 22e7d4d2b..6d319d993 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
@@ -1863,17 +1863,25 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
 #endif
     nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);
 
-    const Nd4jLong iStride0 = gradI.stridesOf()[0];
-    const Nd4jLong iStride1 = gradI.stridesOf()[1];
-    const Nd4jLong iStride2 = gradI.stridesOf()[2];
-    const Nd4jLong iStride3 = gradI.stridesOf()[3];
-    const Nd4jLong oStride0 = gradO.stridesOf()[0];
-    const Nd4jLong oStride1 = gradO.stridesOf()[1];
-    const Nd4jLong oStride2 = gradO.stridesOf()[2];
-    const Nd4jLong oStride3 = gradO.stridesOf()[3];
-    const Nd4jLong iStep2 = dH*iStride2;
-    const Nd4jLong iStep3 = dW*iStride3;
-    const int kProd = kH*kW;
+    const Nd4jLong iStride0  = input.stridesOf()[0];
+    const Nd4jLong iStride1  = input.stridesOf()[1];
+    const Nd4jLong iStride2  = input.stridesOf()[2];
+    const Nd4jLong iStride3  = input.stridesOf()[3];
+    const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+    const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+    const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+    const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+    const Nd4jLong oStride0  = gradO.stridesOf()[0];
+    const Nd4jLong oStride1  = gradO.stridesOf()[1];
+    const Nd4jLong oStride2  = gradO.stridesOf()[2];
+    const Nd4jLong oStride3  = gradO.stridesOf()[3];
+    const Nd4jLong iStep2    = dH*iStride2;
+    const Nd4jLong iStep3    = dW*iStride3;
+    const Nd4jLong gIStep2   = dH*gIStride2;
+    const Nd4jLong gIStep3   = dW*gIStride3;
+    const int      kProd     = kH*kW;
+
+    const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3;
 
     Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW;
     T sum, valO, *pIn, *pgI;
@@ -1901,28 +1909,48 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                         if(wend > iW)
                             wend -= dW * ((wend-iW + dW - 1) / dW);    //(Nd4jLong)nd4j::math::nd4j_ceil<T,Nd4jLong>(static_cast<T>(wend-iW) / static_cast<T>(dW));
 
-                        hstart *= iStride2;
-                        hend   *= iStride2;
-                        wstart *= iStride3;
-                        wend   *= iStride3;
-
                         sum = -DataTypeUtils::max<T>();
                         valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
 
-                        // we set these to default values
-                        maxKH = hstart;
-                        maxKW = wstart;
+                        if(sameStrides) {
 
-                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
-                                T valIn = pIn[kh + kw];
-                                if (valIn > sum) {
-                                    sum = valIn;
-                                    maxKH = kh;
-                                    maxKW = kw;
+                            hstart *= iStride2;
+                            hend   *= iStride2;
+                            wstart *= iStride3;
+                            wend   *= iStride3;
+
+                            // we set these to default values
+                            maxKH = hstart;
+                            maxKW = wstart;
+
+                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
+                                    T valIn = pIn[kh + kw];
+                                    if (valIn > sum) {
+                                        sum = valIn;
+                                        maxKH = kh;
+                                        maxKW = kw;
+                                    }
                                 }
-                            }
-                        gI[pIn - in + maxKH + maxKW] += valO;
+                            gI[pIn - in + maxKH + maxKW] += valO;
+                        }
+                        else {
+
+                            // we set these to default values
+                            maxKH = hstart;
+                            maxKW = wstart;
+
+                            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                    T valIn = pIn[kh * iStride2 + kw * iStride3];
+                                    if (valIn > sum) {
+                                        sum = valIn;
+                                        maxKH = kh;
+                                        maxKW = kw;
+                                    }
+                                }
+                            gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO;
+                        }
                     }
                 }
             }
@@ -1936,7 +1964,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                 for(int oh = 0; oh < oH; ++oh) {
                     for(int ow = 0; ow < oW; ++ow) {
 
-                        pgI = gI + b * iStride0 + c * iStride1;
+                        pgI = gI + b * gIStride0 + c * gIStride1;
 
                         hstart = oh * sH - pH;
                         wstart = ow * sW - pW;
@@ -1952,20 +1980,20 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                         if(wend > iW)
                             wend -= dW * ((wend-iW + dW - 1) / dW);    //(Nd4jLong)nd4j::math::nd4j_ceil<T,Nd4jLong>(static_cast<T>(wend-iW) / static_cast<T>(dW));
 
-                        hstart *= iStride2;
-                        hend   *= iStride2;
-                        wstart *= iStride3;
-                        wend   *= iStride3;
+                        hstart *= gIStride2;
+                        hend   *= gIStride2;
+                        wstart *= gIStride3;
+                        wend   *= gIStride3;
 
                         valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
 
                         if ((int) extraParam0 == 0)         //Exclude padding
-                            valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep3)));   //Accounts for dilation
+                            valO /= static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep2))) * static_cast<T>(nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep3)));   //Accounts for dilation
                         else if ((int) extraParam0 == 1)    //Include padding
                             valO /= kProd;
 
-                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                        for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2)
+                            for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3)
                                 pgI[kh + kw] += valO;
                     }
                 }
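The restructuring above keeps the original fast path: when input and gradI share strides, the window bounds and steps are pre-multiplied by the strides, so one flat offset kh + kw addresses both arrays; otherwise kh/kw stay in element space and each array applies its own strides. A condensed sketch of just this addressing difference (reviewer illustration, not part of the patch; window clipping, the max/avg/pnorm math, and gI's batch/channel offsets are elided, and pgI is assumed pre-offset to the current batch/channel):

    using Nd4jLong = long long;   // stand-in for the libnd4j typedef

    // one pooling window, 2-D case; names mirror the patch
    template <typename T>
    void window2dBP(const T* pIn, T* pgI, const T valO,
                    Nd4jLong hstart, Nd4jLong hend, Nd4jLong wstart, Nd4jLong wend,
                    const Nd4jLong dH, const Nd4jLong dW,
                    const Nd4jLong iStride2, const Nd4jLong iStride3,    // input strides
                    const Nd4jLong gIStride2, const Nd4jLong gIStride3,  // gradI strides
                    const bool sameStrides) {
        if (sameStrides) {
            // fast path: bounds and steps carry the strides, so kh + kw is
            // already a flat offset valid for both pIn and pgI
            for (Nd4jLong kh = hstart * iStride2; kh < hend * iStride2; kh += dH * iStride2)
                for (Nd4jLong kw = wstart * iStride3; kw < wend * iStride3; kw += dW * iStride3)
                    pgI[kh + kw] += valO * pIn[kh + kw];
        }
        else {
            // general path: kh/kw stay in element space and each array
            // is addressed with its own strides
            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
                for (Nd4jLong kw = wstart; kw < wend; kw += dW)
                    pgI[kh * gIStride2 + kw * gIStride3] += valO * pIn[kh * iStride2 + kw * iStride3];
        }
    }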
@@ -1981,7 +2009,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                     for(int ow = 0; ow < oW; ++ow) {
 
                         pIn = in + b * iStride0 + c * iStride1;
-                        pgI = gI + (pIn - in);
+                        pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1;
 
                         hstart = oh * sH - pH;
                         wstart = ow * sW - pW;
@@ -1997,24 +2025,41 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                         if(wend > iW)
                             wend -= dW * ((wend-iW + dW - 1) / dW);    //(Nd4jLong)nd4j::math::nd4j_ceil<T,Nd4jLong>(static_cast<T>(wend-iW) / static_cast<T>(dW));
-
-                        hstart *= iStride2;
-                        hend   *= iStride2;
-                        wstart *= iStride3;
-                        wend   *= iStride3;
 
                         sum = static_cast<T>(0.f);
                         valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3];
 
-                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
+                        if(sameStrides) {
 
-                        valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+                            hstart *= iStride2;
+                            hend   *= iStride2;
+                            wstart *= iStride3;
+                            wend   *= iStride3;
 
-                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
+                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                    sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
+
+                            valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+
+                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                    pgI[kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(pIn[kh + kw]);
+                        }
+                        else {
+
+                            for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                    sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kh * iStride2 + kw * iStride3]), extraParam0);
+
+                            valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1. - extraParam0) / extraParam0);
+
+                            for (Nd4jLong kh = hstart; kh < hend; kh += dH) {
+                                for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                    const auto inVal = pIn[kh * iStride2 + kw * iStride3];
+                                    pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
+                                }
+                            }
+                        }
                     }
                 }
             }
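For reference, both pnorm branches above implement the same chain rule. With p = extraParam0, window sum s, and pooled output o:

    s = \sum_i |x_i|^p, \qquad o = s^{1/p}

    \frac{\partial o}{\partial x_i} = \frac{1}{p}\, s^{\frac{1}{p}-1} \cdot p\, |x_i|^{p-1} \operatorname{sgn}(x_i) = s^{\frac{1-p}{p}}\, |x_i|^{p-1} \operatorname{sgn}(x_i)

which is exactly the once-per-window factor valO *= nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0), followed by the per-element factor |x_i|^{p-1} sgn(x_i); the upstream gradient enters through valO.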
@@ -2144,11 +2189,16 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
 #endif
     nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);
 
-    const Nd4jLong iStride0 = gradI.stridesOf()[0];
-    const Nd4jLong iStride1 = gradI.stridesOf()[1];
-    const Nd4jLong iStride2 = gradI.stridesOf()[2];
-    const Nd4jLong iStride3 = gradI.stridesOf()[3];
-    const Nd4jLong iStride4 = gradI.stridesOf()[4];
+    const Nd4jLong iStride0  = input.stridesOf()[0];
+    const Nd4jLong iStride1  = input.stridesOf()[1];
+    const Nd4jLong iStride2  = input.stridesOf()[2];
+    const Nd4jLong iStride3  = input.stridesOf()[3];
+    const Nd4jLong iStride4  = input.stridesOf()[4];
+    const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+    const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+    const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+    const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+    const Nd4jLong gIStride4 = gradI.stridesOf()[4];
     const Nd4jLong oStride0 = gradO.stridesOf()[0];
     const Nd4jLong oStride1 = gradO.stridesOf()[1];
     const Nd4jLong oStride2 = gradO.stridesOf()[2];
@@ -2157,8 +2207,13 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
     const Nd4jLong iStep2 = dD*iStride2;
     const Nd4jLong iStep3 = dH*iStride3;
     const Nd4jLong iStep4 = dW*iStride4;
+    const Nd4jLong gIStep2 = dD*gIStride2;
+    const Nd4jLong gIStep3 = dH*gIStride3;
+    const Nd4jLong gIStep4 = dW*gIStride4;
     const int kProd = kD*kH*kW;
 
+    const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4;
+
     Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
     T sum, valO, *pIn, *pgI;
 
@@ -2192,32 +2247,55 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                             if(wend > iW)
                                 wend -= dW * ((wend-iW + dW - 1) / dW);
 
-                            dstart *= iStride2;
-                            dend   *= iStride2;
-                            hstart *= iStride3;
-                            hend   *= iStride3;
-                            wstart *= iStride4;
-                            wend   *= iStride4;
-
-                            maxKD = dstart;
-                            maxKH = hstart;
-                            maxKW = wstart;
-
                             sum = -DataTypeUtils::max<T>();
                             valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
 
-                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
-                                        T valIn = pIn[kd + kh + kw];
-                                        if (valIn > sum) {
-                                            sum = valIn;
-                                            maxKD = kd;
-                                            maxKH = kh;
-                                            maxKW = kw;
+                            if(sameStrides) {
+
+                                dstart *= iStride2;
+                                dend   *= iStride2;
+                                hstart *= iStride3;
+                                hend   *= iStride3;
+                                wstart *= iStride4;
+                                wend   *= iStride4;
+
+                                maxKD = dstart;
+                                maxKH = hstart;
+                                maxKW = wstart;
+
+                                for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
+                                            T valIn = pIn[kd + kh + kw];
+                                            if (valIn > sum) {
+                                                sum = valIn;
+                                                maxKD = kd;
+                                                maxKH = kh;
+                                                maxKW = kw;
+                                            }
                                         }
-                                    }
-                            gI[pIn - in + maxKD + maxKH + maxKW] += valO;
+                                gI[pIn - in + maxKD + maxKH + maxKW] += valO;
+                            }
+                            else {
+
+                                // we set these to default values
+                                maxKH = hstart;
+                                maxKW = wstart;
+                                maxKD = dstart;
+
+                                for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                            T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
+                                            if (valIn > sum) {
+                                                sum = valIn;
+                                                maxKD = kd;
+                                                maxKH = kh;
+                                                maxKW = kw;
+                                            }
+                                        }
+                                gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO;
+                            }
                         }
                     }
                 }
@@ -2233,7 +2311,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                     for(int oh = 0; oh < oH; ++oh) {
                         for(int ow = 0; ow < oW; ++ow) {
 
-                            pgI = gI + b * iStride0 + c * iStride1;
+                            pgI = gI + b * gIStride0 + c * gIStride1;
 
                             dstart = od * sD - pD;
                             hstart = oh * sH - pH;
@@ -2255,23 +2333,23 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                             if(wend > iW)
                                 wend -= dW * ((wend-iW + dW - 1) / dW);
 
-                            dstart *= iStride2;
-                            dend   *= iStride2;
-                            hstart *= iStride3;
-                            hend   *= iStride3;
-                            wstart *= iStride4;
-                            wend   *= iStride4;
+                            dstart *= gIStride2;
+                            dend   *= gIStride2;
+                            hstart *= gIStride3;
+                            hend   *= gIStride3;
+                            wstart *= gIStride4;
+                            wend   *= gIStride4;
 
                             valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
 
                             if (extraParam0 == 0)         //Exclude padding
-                                valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(iStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(iStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(iStep4));   //Accounts for dilation
+                                valO /= nd4j::math::nd4j_ceil<double,T>(static_cast<double>(dend-dstart) / static_cast<double>(gIStep2)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(hend-hstart) / static_cast<double>(gIStep3)) * nd4j::math::nd4j_ceil<double,T>(static_cast<double>(wend-wstart) / static_cast<double>(gIStep4));   //Accounts for dilation
                             else if (extraParam0 == 1)    //Include padding
                                 valO /= kProd;
 
-                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                            for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2)
+                                for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3)
+                                    for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4)
                                         pgI[kd + kh + kw] += valO;
                         }
                     }
@@ -2311,27 +2389,46 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d(
                             if(wend > iW)
                                 wend -= dW * ((wend-iW + dW - 1) / dW);
 
-                            dstart *= iStride2;
-                            dend   *= iStride2;
-                            hstart *= iStride3;
-                            hend   *= iStride3;
-                            wstart *= iStride4;
-                            wend   *= iStride4;
-
                             sum = static_cast<T>(0.);
                             valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4];
 
-                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                        sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+                            if(sameStrides) {
 
-                            valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+                                dstart *= iStride2;
+                                dend   *= iStride2;
+                                hstart *= iStride3;
+                                hend   *= iStride3;
+                                wstart *= iStride4;
+                                wend   *= iStride4;
 
-                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                        pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                                for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                            sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+
+                                valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+                                for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                            pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0 - (T)1.f);
+                            }
+                            else {
+
+                                for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                            sum += nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0);
+
+                                valO *= nd4j::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+                                for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                            const auto inVal = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
+                                            pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow<T,T,T>(nd4j::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn<T,T>(inVal);
+                                        }
+                            }
                         }
                     }
                 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
index 1a1730efc..8f4a49905 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@@ -32,98 +32,47 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
 
-    ///////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////
 template<typename T>
 __global__ static void concatCuda(const int numOfArrs, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
 
     __shared__ int arrIdx, blocksPerArr;
-    __shared__ T *x, *z;
-    __shared__ Nd4jLong *zShapeInfo, *xShapeInfo, arrLen, arrLenPerBlock, start, end;
 
     if (threadIdx.x == 0) {
 
         blocksPerArr = (gridDim.x + numOfArrs - 1) / numOfArrs;     // ceil
         arrIdx = blockIdx.x / blocksPerArr;
-
-        x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[arrIdx]);
-        z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[arrIdx]);
-        xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[arrIdx];
-        zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[arrIdx];
-        arrLen = shape::length(xShapeInfo);
-
-        arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr;  // ceil
-
-        start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
-        end = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
     }
     __syncthreads();
 
-    for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
-        z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
+    for(int j = arrIdx; j < numOfArrs; j += gridDim.x) {
+
+        const auto* x = reinterpret_cast<T*>(reinterpret_cast<void**>(pVx)[j]);
+        auto* z = reinterpret_cast<T*>(reinterpret_cast<void**>(pVz)[j]);
+        const auto* xShapeInfo = reinterpret_cast<Nd4jLong**>(pxShapeInfo)[j];
+        const auto* zShapeInfo = reinterpret_cast<Nd4jLong**>(pzShapeInfo)[j];
+
+        const auto arrLen = shape::length(xShapeInfo);
+
+        const auto arrLenPerBlock = (arrLen + blocksPerArr - 1) / blocksPerArr;  // ceil
+
+        const auto start = (blockIdx.x % blocksPerArr) * arrLenPerBlock;
+        const auto end   = (start + arrLenPerBlock) > arrLen ? arrLen : (start + arrLenPerBlock);
+
+        for (Nd4jLong i = start + threadIdx.x; i < end; i += blockDim.x)
+            z[shape::getIndexOffset(i, zShapeInfo, arrLen)] = x[shape::getIndexOffset(i, xShapeInfo, arrLen)];
+    }
 }
 
 ///////////////////////////////////////////////////////////////////
 template<typename T>
 __host__ static void concatCudaLauncher(const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo) {
 
-    concatCuda<T><<<512, 256, 1024, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
+    concatCuda<T><<<512, 512, 512, *stream>>>(numOfArrs, pVx, pxShapeInfo, pVz, pzShapeInfo);
 }
 BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int numOfArrs, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* pVz, void* pzShapeInfo), LIBND4J_TYPES);
-
-    //////////////////////////////////////////////////////////////////////////
-    void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-
-        const int numOfArrs = inArrs.size();
-        for(int i = 0; i < numOfArrs; ++i)
-            if(!inArrs[i]->isActualOnDeviceSide()) inArrs[i]->syncToDevice();
-
-        const int rank  = inArrs[0]->rankOf();
-        const int rank2 = 2*rank;
-        std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
-
-        // take into account indices for first array
-        indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
-
-        // loop through the rest of input arrays
-        for(int i = 1; i < numOfArrs; ++i) {
-            indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
-            indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis);      // index end with (excluding)
-        }
-
-        std::vector<NDArray*> outSubArrs(numOfArrs);
-        for(int i = 0; i < numOfArrs; ++i)
-            outSubArrs[i] = new NDArray(output(indices[i], true));
-
-        // prepare arrays of pointers on buffers and shapes
-        std::vector<void*> hOutBuffers(numOfArrs), hInBuffers(numOfArrs);
-        std::vector<Nd4jLong*> hOutShapeInfo(numOfArrs), hInShapeInfo(numOfArrs);
-        for(int i = 0; i < numOfArrs; ++i) {
-            hOutBuffers[i]   = outSubArrs[i]->getSpecialBuffer();
-            hInBuffers[i]    = inArrs[i]->getSpecialBuffer();
-            hOutShapeInfo[i] = outSubArrs[i]->getSpecialShapeInfo();
-            hInShapeInfo[i]  = inArrs[i]->getSpecialShapeInfo();
-        }
-
-        // allocate and copy all buffers and shapes arrays to global memory
-        PointersManager manager(context, "helpers::concat");
-        void* dOutBuffers   = manager.replicatePointer(hOutBuffers.data(), hOutBuffers.size() * sizeof(void*));
-        void* dInBuffers    = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*));
-        void* dInShapeInfo  = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*));
-        void* dOutShapeInfo = manager.replicatePointer(hOutShapeInfo.data(), hOutShapeInfo.size() * sizeof(Nd4jLong*));
-
-        BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (numOfArrs, context->getCudaStream(), dInBuffers, dInShapeInfo, dOutBuffers, dOutShapeInfo), LIBND4J_TYPES);
-
-        manager.synchronize();
-
-        for(int i = 0; i < numOfArrs; ++i)
-            delete outSubArrs[i];
-
-        for(int i = 0; i < numOfArrs; ++i)
-            inArrs[i]->tickReadHost();
-
-        output.tickWriteDevice();
-    }
 }
 }
 }
\ No newline at end of file
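The kernel rewrite above replaces the one-shot mapping of blocks to arrays with a grid-stride loop: each block starts at its own arrIdx and then strides over the remaining arrays by gridDim.x, so the fixed <<<512, 512, 512>>> launch covers any numOfArrs instead of silently skipping arrays beyond the grid. A stripped-down sketch of the pattern (reviewer illustration, not the patch's kernel; the blocksPerArr range splitting and shape-info indexing are replaced by plain dense copies):

    // each block strides over work items (arrays); threads stride within an item
    __global__ void copyManyArrays(const int numOfArrs, float** xs, float** zs, const long long* lens) {
        for (int j = blockIdx.x; j < numOfArrs; j += gridDim.x)            // block-level stride
            for (long long i = threadIdx.x; i < lens[j]; i += blockDim.x)  // thread-level stride
                zs[j][i] = xs[j][i];
    }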
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
index c3cc284ce..f6b9d27fa 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu
@@ -32,7 +32,7 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
 
-    ///////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////
 // x - input, y - paddings, z - output
 template<typename X, typename Y>
 __global__ static void padCuda(const int mode,
@@ -130,6 +130,26 @@ namespace nd4j {
 }
 BUILD_DOUBLE_TEMPLATE(template void padCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const int mode, const void *vx, const Nd4jLong *xShapeInfo, const void *vy, const Nd4jLong *yShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const void* vPadVal), LIBND4J_TYPES, INTEGER_TYPES);
 
+///////////////////////////////////////////////////////////////////
+    void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
+
+        PointersManager manager(context, "pad");
+
+        NDArray::prepareSpecialUse({&output}, {&input, &paddings, &padValue});
+
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
+        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+        const int sharedMem = 8 * threadsPerBlock * output.rankOf() + 128;
+
+        const auto xType = input.dataType();
+        const auto yType = paddings.dataType();
+
+        BUILD_DOUBLE_SELECTOR(xType, yType, padCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), mode, input.getSpecialBuffer(), input.getSpecialShapeInfo(), paddings.getSpecialBuffer(), paddings.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), padValue.getSpecialBuffer()), LIBND4J_TYPES, INTEGER_TYPES);
+
+        NDArray::registerSpecialUse({&output}, {&input, &paddings, &padValue});
+        manager.synchronize();
+    }
+
 ///////////////////////////////////////////////////////////////////
 void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) {
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
index 302e2a08c..c2af3cef4 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
@@ -502,8 +502,8 @@ TEST_F(DeclarableOpsTests2, Test_FloorDiv_2) {
     auto x = NDArrayFactory::create('c', {1, 3}, {3.0, 6.0, -3.0});
     auto y = NDArrayFactory::create('c', {1, 3}, {-2.0, 2.0, -2.0});
     auto eps = NDArrayFactory::create('c', {1, 3}, {1, 2, 3});
-    auto exp1 = NDArrayFactory::create('c', {1, 3}, {1, 2., 3});
-    auto exp2 = NDArrayFactory::create('c', {1, 3}, {-0, -2., 3});
+    auto exp1 = NDArrayFactory::create('c', {1, 3}, {0.f, 0.f, 0.f});
+    auto exp2 = NDArrayFactory::create('c', {1, 3}, {0.f, 0.f, 0.f});
 
     nd4j::ops::floordiv_bp op;
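The corrected expectations follow from floor(x/y) being piecewise constant: its partial derivatives with respect to both x and y are zero almost everywhere, so floordiv_bp returns zeros regardless of the incoming eps. A quick numeric check (illustration only, not part of the test suite):

    #include <cmath>
    #include <cstdio>

    int main() {
        const double x = 3.0, y = -2.0, h = 1e-6;
        // central difference of floor(x/y) w.r.t. x, away from a jump of the step function
        const double dfdx = (std::floor((x + h) / y) - std::floor((x - h) / y)) / (2 * h);
        std::printf("d/dx floor(x/y) ~= %g\n", dfdx);   // prints 0
        return 0;
    }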