2019-06-06 14:21:15 +02:00
|
|
|
|
/*******************************************************************************
|
|
|
|
|
* Copyright (c) 2015-2018 Skymind, Inc.
|
|
|
|
|
*
|
|
|
|
|
* This program and the accompanying materials are made available under the
|
|
|
|
|
* terms of the Apache License, Version 2.0 which is available at
|
|
|
|
|
* https://www.apache.org/licenses/LICENSE-2.0.
|
|
|
|
|
*
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
|
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
|
|
|
* License for the specific language governing permissions and limitations
|
|
|
|
|
* under the License.
|
|
|
|
|
*
|
|
|
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
******************************************************************************/
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// implementation of operations for Simple Recurrent Unit: arXiv:1709.02755v2 [cs.CL] 12 Sep 2017
|
|
|
|
|
//
|
|
|
|
|
// @author Yurii Shyrma, created on 05.12.2017
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
#include<ops/declarable/helpers/sru.h>
|
|
|
|
|
#include <NDArrayFactory.h>
|
|
|
|
|
|
|
|
|
|
namespace nd4j {
|
|
|
|
|
namespace ops {
|
|
|
|
|
namespace helpers {
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
static FORCEINLINE NDArray activation(const NDArray& arr) {
    // Cell activation used by the SRU helpers: runs the Tanh transform over
    // `arr` and hands back the output as a separate array.
    //
    // applyTransform is called through a non-const reference, hence the
    // const_cast; the transform output is directed into `output`.
    NDArray& source = const_cast<NDArray&>(arr);

    // Target array is built from `arr` (second ctor argument `false`) on the
    // same launch context as the input.
    NDArray output(&arr, false, arr.getContext());

    source.applyTransform(transform::Tanh, &output);
    return output;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
static FORCEINLINE NDArray sigmoid(const NDArray& arr) {
    // Gate non-linearity for the SRU helpers: returns the result of the
    // Sigmoid transform applied to `arr`.
    //
    // transform() is invoked through a non-const reference, so constness is
    // cast away here.
    auto& source = const_cast<NDArray&>(arr);
    return source.transform(transform::Sigmoid);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) {
    // Single time step of a Simple Recurrent Unit cell.
    //
    // Inputs:
    //   x   input                               [bS x inSize], bS - batch size, inSize - number of features
    //   c0  cell state at previous step t-1     [bS x inSize]
    //   w   weights                             [inSize x 3*inSize]
    //   b   biases                              [2*inSize]
    //
    // Outputs (written in place):
    //   h   cell output at current step t       [bS x inSize]
    //   c   cell state at current step t        [bS x inSize]
    //
    // `context` is not used by this implementation.

    const int nIn = x->sizeAt(1);                       // number of features

    // One matrix product yields all three projections side by side: [bS x 3*inSize]
    auto xw = mmul(*x, *w);

    // Column layout of xw: [0, nIn) -> x*Wc, [nIn, 2*nIn) -> x*Wf, [2*nIn, 3*nIn) -> x*Wr
    auto forgetInput = xw({0,0, nIn, 2*nIn}) + (*b)({0, nIn});
    auto forgetGate  = sigmoid(forgetInput);            // f = sigmoid(x*Wf + bf)

    auto resetInput  = xw({0,0, 2*nIn, 3*nIn}) + (*b)({nIn, 2*nIn});
    auto resetGate   = sigmoid(resetInput);             // r = sigmoid(x*Wr + br)

    // Below, the products are element-wise (Hadamard) products.

    // current cell state: c = f◦c0 + (1 - f)◦(x*Wc)
    auto keptState = forgetGate * (*c0);
    auto newState  = (1.f - forgetGate) * xw({0, 0 ,0, nIn});
    c->assign(keptState + newState);

    // current cell output: h = r◦activation(c) + (1 - r)◦x
    // (must run after c has been assigned, since it reads the new state)
    auto gatedState = resetGate * activation(*c);
    auto gatedInput = (1.f - resetGate) * (*x);
    h->assign(gatedState + gatedInput);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
void sruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) {
|
|
|
|
|
|
|
|
|
|
// x input [bS x inSize x time]
|
|
|
|
|
// c0 initial cell state (at time step = 0) [bS x inSize],
|
|
|
|
|
// w weights, [3*inSize x inSize]
|
|
|
|
|
// b biases, [2*inSize]
|
|
|
|
|
|
|
|
|
|
// h cell outputs [bS x inSize x time]
|
|
|
|
|
// c cell states [bS x inSize x time]
|
|
|
|
|
|
Merge master to upstream (#7945)
* Shugeo strided slice zeros (#14)
* Modified strided_slice op to properly work with empty-like shapes.
* Fixed test for reduce_mean with empty-like input.
* [WIP] Last merge (#15)
* correct logsoftmax looss (#2)
* Small SameDiff listener fix (#4)
* Various fixes (#6)
* #7839 Fix for asXMatrix and tests
* #7866 EmbeddingSequenceLayer dtype fix + test
* #7856 SameDiff save/load stream methods
* #7859 RegressionEvaluation rank 4 fix + tests + axis configuration
* EvaluationBinary 3d/4d
* More evaluation 3d/4d tests
* #7847 Evaluation empty checks
* Small test ifx
* #7848 Fix median edge case
* Improve DL4J samediff layer tests
* [WIP] FastText wrapper implemented (#8)
* FastText implemented
* Some fixes
* Fix shapes for wordsNearest
* Validation of input vectors
* Fixes
* Fixed test
* Thread tagged
* Some tweaks
* setContextClassLoader for DeallocatorServiceThread
* Numpy format tests (#1)
* Various fixes (#11)
* #7852 SameDiff gather fix
* #7892 SameDiff placeholder to constant conversion
* #7890 validate input rank for MLN/CG init methods
* Fix broken permute shape calculation
* Permute and gather fixes
* Tests
* #7850 LogSumExp fix + test
* Handful of test fixes
* Empty arrays with non-scalar shapes (#10)
* minor rearrangements for lambdas
* empty tensors with non-scalar shapes
* numpy empty tensors with non-scalar shapes
* few more empty tweaks
* Small fixes
* conv3d signature update
* micro fix in batchnorm mkldnn
* Import fixes
* Fix
* MKL-DNN update
* Small fill fix
* fill with empty input + test
* Fixes
* Small error improvement
* Fix
* one special test
* couple of fixes for lstm
* Rewrite TFGraphMapper.getNDArrayFromTensor to be maintainable and less error prone
* Fixes
* FP16
* Unsigned
* BFloat16
* Fill op - empty tweaks
* - couple of fixes for empty arrays construction
- stack updated
* strided slice fix
* one transform test
* provide method for reducing shapeInfo in case of input array is empty
* Fixed reduceAlongDimensions to use empty input properly.
* couple of broadcast tests
* couple of tests broadcast tests + tweak to make them pass
* add check of non-empty to methods producing sub-arrays
* Fixed reshapeC with zeros in shape.
* complete empty check in reduce_... legacy ops
* Concat and cumsum/prod
* Tweak to empty shape inference on import
* add empty check to the rest of reduce legacy ops
* one more test
* correct typo in evalReduceShapeInfoEmpty
* Added tests for reduce_* ops to tests with zero shapes.
* few more tests for empty reductions
* Fixed strided_slice op with empty case and tests.
* one more empty reduction test
* Fixed strided_slice test.
* add empty check to NDArray::reshapei
* infOrMax
* empty min/max with infinity tests
* made unstack working correctly with empty arrays
* few IndexReduce tests + tweaks for empty shapes
* add test for empty concat
* few tests fixed
* Validation fix for reductions on empty shapes
* Reverse fix
* Reduction shape calc fixes
* SameDiff.generateOutputVariable: don't use shape function to determine number of outputs
* Range fix
* - NDArray constructor updated for scalars/empty arrays
- few tests fixed
* More fixes
* Empty creator fixes
* concat fix
* concat fix
* TF import tests: allow 'both all NaN' and 'both all inf' to pass
* Slice, zero fraction, and reshape fixes
* transpose, gather
* Zero fraction
* scalar cast fix
* Empty reduction axis support
* few more tests fixed
* Fixed input checks conforming with TF for concat op and tests.
* few tests fixed
* matmul scalar shape fix
* Fixed checkout for data type and scalarity with concat to allow non-empty scalars with vector concats.
* broadcast bool fix
* few more tests
* few more tests
* correct evalReduceShapeInfoEmpty
* argmax/argmin + tests
* one more empty edge case + one more test
* argmax/argmin/realdiv_bp tweaks
* empty reshape test + fix
* Helper fixes
* Small fixes
* Gather test fix
* Gather test fix
* Small fixes
* reduce scalar zero values
* scalar mean workaround
* Remove debug code
* along dim mean workaround
* one more test
* - equalsTo() tweak for empty arrays
- one more test
* broadcast tweaks
* [WIP] Fixing outstanding issues for NLP (#9)
* Avoid using not-inited objects
* Test fixed.
* Redundant method avoided for models like FastText
* KMeans++ implementation
* KMeans++ implementation
* Disable parallel execution
* KMeans++
* Tests
* Dev branch merge (#16)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Fix some issues on master (#17)
* Fix DataVec test issue
* Fix issue with dl4j SameDiff output layer
* Dtype fix for lambda layers
* #7912 BertIterator dtype fix (use float32 not global default)
* [WIP] Next set of CUDA stuff (#7)
New CUDA implementations and improvements
* bad file
* Dev branch master merge (#23)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* SameDiff ops, TF import and fixes (#24)
* CheckNumerics tests + fixes + misc fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fake quant
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* FakeQuantWithMinMaxArgs
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* CheckNumerics fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix libnd4j ALL_INTS and ALL_FLOATS declaration (uint and bfloat types)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Small fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Javadoc
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Exception tweak
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix for out of scope stack allocated var use
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignores
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignore for known failing test (already logged issue)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Merge upstream to fork (#25)
* Add thousand-separator commas to TotalParams (#7915)
* Add thousand-separator commas to TotalParams
The number of parameters can be quite large, and it would help the reading of the summary printout to have the TotalParams column & values at the bottom have thousand-separator-commas in them.
* Add thousand-separator commas to MultiLayerNetwork
Corresponding change to MultiLayerNetwork
Signed-off-by: Jxtps Jxtps <jxtps435@gmail.com>
* Update contributing and issue/PR templates (#7934)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix link to AdaDelta paper (#7942)
Fix link to AdaDelta paper hosted on matthewzeiler.com
Signed-off-by: Jxtps
* Fixes, and ignores for known/logged failing issues (#7943)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* SameDiff + DL4J/SameDiff: Multiple fixes (#28)
* #7919 HDF5 attribute buffer length fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7909 Arbiter constructor exception ux improvements
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7925 RNN output layer length checks
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Add listener for validating inputs are not incorrectly modified
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Integrate NonInplaceValidationListener into tests
* #7844 DL4J SameDiff fixes for variable minibatch size
* DL4J SameDiff fixes - ensure gradient for input placeholder is available
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Tweaks to ExternalErrorsFunction - use placeholders, make more robust
* Another fix
* More fixes
* More SameDiff/DL4J fixes
* Scope out scalar array creation in BaseScalarOp
* Remove debug code
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] Final dev branch merge (#29)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* [WIP] Multiple dataset iterators (#27)
* Splitting dataset into arbitrary number
* Fixes
* Multiple split of iterator
* Test
* Test
* Some fixes
* signature change
* one more tweak
Signed-off-by: raver119 <raver119@gmail.com>
* one more test for sequential use of DataSetIteratorSplitter
Signed-off-by: raver119 <raver119@gmail.com>
* Fixes
* Fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* minor test fix
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* couple of assertions tweaked
Signed-off-by: raver119 <raver119@gmail.com>
* MDS splitter test :/
Signed-off-by: raver119 <raver119@gmail.com>
* Minor refactoring
* Multi dataset
* Some fixes
* More tests
* Small number of test fixes/improvements (failures on CI) (#31)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] More CUDA stuff (#26)
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* LRN BP CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* less memory
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed bug with crop_and_resize op helper.
* get rid of unnecessary index-calculation dunction
Signed-off-by: Yurii <yurii@skymind.io>
* Fixed sort with nth_element cuda-based helper.
* Refactored nth_element.
* Refactored nth_element op and tests.
* Modified usage of dim array with sortTad routine.
* Refactored main routine of helper for non_max_image_suppression op.
* non_max_image_suppression op helper with cuda kernel implementation. Initial revision.
* fix vol2col cuda kernel
* meh
Signed-off-by: raver119 <raver119@gmail.com>
* topK concept
Signed-off-by: raver119 <raver119@gmail.com>
* unsorted topK with scanWitdh of 1
Signed-off-by: raver119 <raver119@gmail.com>
* correct vol2col tests
* sorted/unsorted topK
Signed-off-by: raver119 <raver119@gmail.com>
* implementation and fixing col2im/col2vol
* Corrected usage flags with input/output with reverse op.
* dup is const now
Signed-off-by: raver119 <raver119@gmail.com>
* percentile op
Signed-off-by: raver119 <raver119@gmail.com>
* group tests for mapool2d
Signed-off-by: Yurii <yurii@skymind.io>
* special test for george
Signed-off-by: raver119 <raver119@gmail.com>
* less threads for sortTad
Signed-off-by: raver119 <raver119@gmail.com>
* provide conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* remove auther in sort tad kernel code
Signed-off-by: Yurii <yurii@skymind.io>
* provide depthwise_conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* - max_pooling_with_argmax
- null check for special use
Signed-off-by: raver119 <raver119@gmail.com>
* dts cuda
Signed-off-by: raver119 <raver119@gmail.com>
* provide sconv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* std cuda
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op to conform TF implementation.
* Improved suppression helper.
* provide pooling3d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* more of minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* (bi)dynamic_rnn
Signed-off-by: raver119 <raver119@gmail.com>
* templates init order
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op.
* Added cuda kernel for non_max_suppression.
* CPU sort by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value tests
Signed-off-by: raver119 <raver119@gmail.com>
* Eliminate compiler error with cuda implementation.
* - repaired gradCheck in cuda
- provide conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* missed signature
Signed-off-by: raver119 <raver119@gmail.com>
* provide depthwise_conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* Implementation of lup helper with cuda kernel. Initial commit.
* further work on backprops for convolutions
Signed-off-by: Yurii <yurii@skymind.io>
* CUDA linear sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* CUDA tad sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* start providing of backprop for pooling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* Added atomicAdd for bool datatype.
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition scalar CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* important comment
Signed-off-by: raver119 <raver119@gmail.com>
* fix pooling2d/3d backprop helpers
Signed-off-by: Yurii <yurii@skymind.io>
* Added non-linear test with dynamic_partition.
* Improved test for dynamic_partition.
* dynamic_partition TAD concept
Signed-off-by: raver119 <raver119@gmail.com>
* - dynamic_partition TAD CUDA impl
- dynamic_partition TAD CPU fix
Signed-off-by: raver119 <raver119@gmail.com>
* - rewrite cpu code for usampling2d/3d
- write cuda code for usampling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* dynamic_stitch CUDA vector case
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case impl
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for dynamic_stitch 3D-4D cases.
* minor tests tweaks
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed type check for dynamic stitch.
* min/max bp
Signed-off-by: raver119 <raver119@gmail.com>
* rewrite code for upsampling2d/3d cpu
Signed-off-by: Yurii <yurii@skymind.io>
* reduce min/max/norm_max bp
Signed-off-by: raver119 <raver119@gmail.com>
* lup implementation. Additional enhancements.
* provide code for upsamling2d/3d backprop
Signed-off-by: Yurii <yurii@skymind.io>
* weightedCrossEntropyWithLogits
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed template math atomicMul for 64bit ints.
* Refactored dynamic_partition_bp op.
* inverseBroadcast fix
Signed-off-by: raver119 <raver119@gmail.com>
* DynamicPartitionBP test datatype fixed.
* - nd4j_atomicMul Windows fix
- cpu/NDArrayLambda.hpp excluded from CUDA
Signed-off-by: raver119 <raver119@gmail.com>
2019-06-27 17:37:04 +02:00
|
|
|
|
auto wT = w->transpose(); // [3*inSize x inSize] -> [inSize x 3*inSize]
|
2019-06-06 14:21:15 +02:00
|
|
|
|
|
|
|
|
|
const int time = x->sizeAt(2);
|
|
|
|
|
|
|
|
|
|
NDArray ct_1(*c0);
|
|
|
|
|
|
|
|
|
|
// loop through time steps
|
|
|
|
|
for (int t = 0; t < time; ++t) {
|
|
|
|
|
|
|
|
|
|
auto xt = (*x)({0,0, 0,0, t,t+1});
|
|
|
|
|
auto ht = (*h)({0,0, 0,0, t,t+1});
|
|
|
|
|
auto ct = (*c)({0,0, 0,0, t,t+1});
|
|
|
|
|
|
Merge master to upstream (#7945)
* Shugeo strided slice zeros (#14)
* Modified strided_slice op to properly work with empty-like shapes.
* Fixed test for reduce_mean with empty-like input.
* [WIP] Last merge (#15)
* correct logsoftmax looss (#2)
* Small SameDiff listener fix (#4)
* Various fixes (#6)
* #7839 Fix for asXMatrix and tests
* #7866 EmbeddingSequenceLayer dtype fix + test
* #7856 SameDiff save/load stream methods
* #7859 RegressionEvaluation rank 4 fix + tests + axis configuration
* EvaluationBinary 3d/4d
* More evaluation 3d/4d tests
* #7847 Evaluation empty checks
* Small test ifx
* #7848 Fix median edge case
* Improve DL4J samediff layer tests
* [WIP] FastText wrapper implemented (#8)
* FastText implemented
* Some fixes
* Fix shapes for wordsNearest
* Validation of input vectors
* Fixes
* Fixed test
* Thread tagged
* Some tweaks
* setContextClassLoader for DeallocatorServiceThread
* Numpy format tests (#1)
* Various fixes (#11)
* #7852 SameDiff gather fix
* #7892 SameDiff placeholder to constant conversion
* #7890 validate input rank for MLN/CG init methods
* Fix broken permute shape calculation
* Permute and gather fixes
* Tests
* #7850 LogSumExp fix + test
* Handful of test fixes
* Empty arrays with non-scalar shapes (#10)
* minor rearrangements for lambdas
* empty tensors with non-scalar shapes
* numpy empty tensors with non-scalar shapes
* few more empty tweaks
* Small fixes
* conv3d signature update
* micro fix in batchnorm mkldnn
* Import fixes
* Fix
* MKL-DNN update
* Small fill fix
* fill with empty input + test
* Fixes
* Small error improvement
* Fix
* one special test
* couple of fixes for lstm
* Rewrite TFGraphMapper.getNDArrayFromTensor to be maintainable and less error prone
* Fixes
* FP16
* Unsigned
* BFloat16
* Fill op - empty tweaks
* - couple of fixes for empty arrays construction
- stack updated
* strided slice fix
* one transform test
* provide method for reducing shapeInfo in case of input array is empty
* Fixed reduceAlongDimensions to use empty input properly.
* couple of broadcast tests
* couple of tests broadcast tests + tweak to make them pass
* add check of non-empty to methods producing sub-arrays
* Fixed reshapeC with zeros in shape.
* complete empty check in reduce_... legacy ops
* Concat and cumsum/prod
* Tweak to empty shape inference on import
* add empty check to the rest of reduce legacy ops
* one more test
* correct typo in evalReduceShapeInfoEmpty
* Added tests for reduce_* ops to tests with zero shapes.
* few more tests for empty reductions
* Fixed strided_slice op with empty case and tests.
* one more empty reduction test
* Fixed strided_slice test.
* add empty check to NDArray::reshapei
* infOrMax
* empty min/max with infinity tests
* made unstack working correctly with empty arrays
* few IndexReduce tests + tweaks for empty shapes
* add test for empty concat
* few tests fixed
* Validation fix for reductions on empty shapes
* Reverse fix
* Reduction shape calc fixes
* SameDiff.generateOutputVariable: don't use shape function to determine number of outputs
* Range fix
* - NDArray constructor updated for scalars/empty arrays
- few tests fixed
* More fixes
* Empty creator fixes
* concat fix
* concat fix
* TF import tests: allow 'both all NaN' and 'both all inf' to pass
* Slice, zero fraction, and reshape fixes
* transpose, gather
* Zero fraction
* scalar cast fix
* Empty reduction axis support
* few more tests fixed
* Fixed input checks conforming with TF for concat op and tests.
* few tests fixed
* matmul scalar shape fix
* Fixed checkout for data type and scalarity with concat to allow non-empty scalars with vector concats.
* broadcast bool fix
* few more tests
* few more tests
* correct evalReduceShapeInfoEmpty
* argmax/argmin + tests
* one more empty edge case + one more test
* argmax/argmin/realdiv_bp tweaks
* empty reshape test + fix
* Helper fixes
* Small fixes
* Gather test fix
* Gather test fix
* Small fixes
* reduce scalar zero values
* scalar mean workaround
* Remove debug code
* along dim mean workaround
* one more test
* - equalsTo() tweak for empty arrays
- one more test
* broadcast tweaks
* [WIP] Fixing outstanding issues for NLP (#9)
* Avoid using not-inited objects
* Test fixed.
* Redundant method avoided for models like FastText
* KMeans++ implementation
* KMeans++ implementation
* Disable parallel execution
* KMeans++
* Tests
* Dev branch merge (#16)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Fix some issues on master (#17)
* Fix DataVec test issue
* Fix issue with dl4j SameDiff output layer
* Dtype fix for lambda layers
* #7912 BertIterator dtype fix (use float32 not global default)
* [WIP] Next set of CUDA stuff (#7)
New CUDA implementations and improvements
* bad file
* Dev branch master merge (#23)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* SameDiff ops, TF import and fixes (#24)
* CheckNumerics tests + fixes + misc fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fake quant
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fixes
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* FakeQuantWithMinMaxArgs
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* CheckNumerics fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix libnd4j ALL_INTS and ALL_FLOATS declaration (uint and bfloat types)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Small fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Javadoc
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Exception tweak
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix for out of scope stack allocated var use
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignores
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Ignore for known failing test (already logged issue)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Merge upstream to fork (#25)
* Add thousand-separator commas to TotalParams (#7915)
* Add thousand-separator commas to TotalParams
The number of parameters can be quite large, and it would help the reading of the summary printout to have the TotalParams column & values at the bottom have thousand-separator-commas in them.
* Add thousand-separator commas to MultiLayerNetwork
Corresponding change to MultiLayerNetwork
Signed-off-by: Jxtps Jxtps <jxtps435@gmail.com>
* Update contributing and issue/PR templates (#7934)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Fix link to AdaDelta paper (#7942)
Fix link to AdaDelta paper hosted on matthewzeiler.com
Signed-off-by: Jxtps
* Fixes, and ignores for known/logged failing issues (#7943)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* SameDiff + DL4J/SameDiff: Multiple fixes (#28)
* #7919 HDF5 attribute buffer length fix
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7909 Arbiter constructor exception ux improvements
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7925 RNN output layer length checks
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Add listener for validating inputs are not incorrectly modified
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* #7939 Integrate NonInplaceValidationListener into tests
* #7844 DL4J SameDiff fixes for variable minibatch size
* DL4J SameDiff fixes - ensure gradient for input placeholder is available
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* Tweaks to ExternalErrorsFunction - use placeholders, make more robust
* Another fix
* More fixes
* More SameDiff/DL4J fixes
* Scope out scalar array creation in BaseScalarOp
* Remove debug code
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] Final dev branch merge (#29)
* SameDiff: convertDataType and gradient check util improvements (#12)
* GradCheck util improvements
* StopGradient constructor + test
* SameDiff: Add datatype conversion
* Javadoc and add DataType.isNumerical()
* Small fix
* Fix SameDiff TF import test cases intermediate naming (workaround for bad default)
* TFGraphTestAllHelper: check intermediates in execution order
* Add missing debug listener
* [WIP] lstmBlock fix + other changes (#13)
- fixes lstmBlock issue
- changes NDArray method reshape(), permute(), transpose() by making them return instance instead of pointer
- CheckNumerics op
- fixes for ReduceBool IsInfOrNan & IsFinite
* Small test fix
* CheckNumerics op wrapper
* Compatibility of deserialization (#18)
Signed-off-by: Alexander Stoyakin <alexander.stoyakin@gmail.com>
* SameDiff: add activation gradient checking support for debugging (#19)
* SameDiff gradient checker: first pass on activation gradient checks
* Fixes + tests for activation gradient checking
* Javadoc
* [WIP] Some nd4j data type corrections (#20)
* Adjust data type
* Set correct Data type.
* Size of proper data type.
* fix averaged cpu load (#22)
* [WIP] Multiple dataset iterators (#27)
* Splitting dataset into arbitrary number
* Fixes
* Multiple split of iterator
* Test
* Test
* Some fixes
* signature change
* one more tweak
Signed-off-by: raver119 <raver119@gmail.com>
* one more test for sequential use of DataSetIteratorSplitter
Signed-off-by: raver119 <raver119@gmail.com>
* Fixes
* Fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* one more test for Alexander
Signed-off-by: raver119 <raver119@gmail.com>
* minor test fix
Signed-off-by: raver119 <raver119@gmail.com>
* Some fixes
* Some fixes
* couple of assertions tweaked
Signed-off-by: raver119 <raver119@gmail.com>
* MDS splitter test :/
Signed-off-by: raver119 <raver119@gmail.com>
* Minor refactoring
* Multi dataset
* Some fixes
* More tests
* Small number of test fixes/improvements (failures on CI) (#31)
Signed-off-by: AlexDBlack <blacka101@gmail.com>
* [WIP] More CUDA stuff (#26)
* initial commit
Signed-off-by: raver119 <raver119@gmail.com>
* LRN BP CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* less memory
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed bug with crop_and_resize op helper.
* get rid of unnecessary index-calculation function
Signed-off-by: Yurii <yurii@skymind.io>
* Fixed sort with nth_element cuda-based helper.
* Refactored nth_element.
* Refactored nth_element op and tests.
* Modified usage of dim array with sortTad routine.
* Refactored main routine of helper for non_max_image_suppression op.
* non_max_image_suppression op helper with cuda kernel implementation. Initial revision.
* fix vol2col cuda kernel
* meh
Signed-off-by: raver119 <raver119@gmail.com>
* topK concept
Signed-off-by: raver119 <raver119@gmail.com>
* unsorted topK with scanWitdh of 1
Signed-off-by: raver119 <raver119@gmail.com>
* correct vol2col tests
* sorted/unsorted topK
Signed-off-by: raver119 <raver119@gmail.com>
* implementation and fixing col2im/col2vol
* Corrected usage flags with input/output with reverse op.
* dup is const now
Signed-off-by: raver119 <raver119@gmail.com>
* percentile op
Signed-off-by: raver119 <raver119@gmail.com>
* group tests for mapool2d
Signed-off-by: Yurii <yurii@skymind.io>
* special test for george
Signed-off-by: raver119 <raver119@gmail.com>
* less threads for sortTad
Signed-off-by: raver119 <raver119@gmail.com>
* provide conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* remove author in sort tad kernel code
Signed-off-by: Yurii <yurii@skymind.io>
* provide depthwise_conv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* - max_pooling_with_argmax
- null check for special use
Signed-off-by: raver119 <raver119@gmail.com>
* dts cuda
Signed-off-by: raver119 <raver119@gmail.com>
* provide sconv2d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* std cuda
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op to conform TF implementation.
* Improved suppression helper.
* provide pooling3d for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* more of minor lstm rearrangements
Signed-off-by: raver119 <raver119@gmail.com>
* (bi)dynamic_rnn
Signed-off-by: raver119 <raver119@gmail.com>
* templates init order
Signed-off-by: raver119 <raver119@gmail.com>
* Refactored non_max_suppression op.
* Added cuda kernel for non_max_suppression.
* CPU sort by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value
Signed-off-by: raver119 <raver119@gmail.com>
* CPU sort TAD by key/value tests
Signed-off-by: raver119 <raver119@gmail.com>
* Eliminate compiler error with cuda implementation.
* - repaired gradCheck in cuda
- provide conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* missed signature
Signed-off-by: raver119 <raver119@gmail.com>
* provide depthwise_conv2d_bp for cuda
Signed-off-by: Yurii <yurii@skymind.io>
* Implementation of lup helper with cuda kernel. Initial commit.
* further work on backprops for convolutions
Signed-off-by: Yurii <yurii@skymind.io>
* CUDA linear sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* CUDA tad sort by key/val
Signed-off-by: raver119 <raver119@gmail.com>
* start providing of backprop for pooling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* Added atomicAdd for bool datatype.
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic partition scalar CUDA
Signed-off-by: raver119 <raver119@gmail.com>
* important comment
Signed-off-by: raver119 <raver119@gmail.com>
* fix pooling2d/3d backprop helpers
Signed-off-by: Yurii <yurii@skymind.io>
* Added non-linear test with dynamic_partition.
* Improved test for dynamic_partition.
* dynamic_partition TAD concept
Signed-off-by: raver119 <raver119@gmail.com>
* - dynamic_partition TAD CUDA impl
- dynamic_partition TAD CPU fix
Signed-off-by: raver119 <raver119@gmail.com>
* - rewrite cpu code for upsampling2d/3d
- write cuda code for upsampling2d/3d
Signed-off-by: Yurii <yurii@skymind.io>
* dynamic_stitch CUDA vector case
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case concept
Signed-off-by: raver119 <raver119@gmail.com>
* dynamic_stitch CUDA TAD case impl
Signed-off-by: raver119 <raver119@gmail.com>
* Added tests for dynamic_stitch 3D-4D cases.
* minor tests tweaks
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed type check for dynamic stitch.
* min/max bp
Signed-off-by: raver119 <raver119@gmail.com>
* rewrite code for upsampling2d/3d cpu
Signed-off-by: Yurii <yurii@skymind.io>
* reduce min/max/norm_max bp
Signed-off-by: raver119 <raver119@gmail.com>
* lup implementation. Additional enhancements.
* provide code for upsampling2d/3d backprop
Signed-off-by: Yurii <yurii@skymind.io>
* weightedCrossEntropyWithLogits
Signed-off-by: raver119 <raver119@gmail.com>
* Fixed template math atomicMul for 64bit ints.
* Refactored dynamic_partition_bp op.
* inverseBroadcast fix
Signed-off-by: raver119 <raver119@gmail.com>
* DynamicPartitionBP test datatype fixed.
* - nd4j_atomicMul Windows fix
- cpu/NDArrayLambda.hpp excluded from CUDA
Signed-off-by: raver119 <raver119@gmail.com>
2019-06-27 17:37:04 +02:00
|
|
|
|
helpers::sruCell(context, &xt, &ct_1, &wT, b, &ht, &ct);
|
2019-06-06 14:21:15 +02:00
|
|
|
|
ct_1.assign(ct);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
template <typename T>
|
|
|
|
|
static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) {
|
|
|
|
|
|
|
|
|
|
// x input 3d tensor [time x bS x 2*inSize], time - number of time steps, bS - batch size, inSize - number of features
|
|
|
|
|
// w 2d tensor of weights [2*inSize x 6*inSize]
|
|
|
|
|
// b row of biases with twice length [1 × 4*inSize]
|
|
|
|
|
// c0 2d tensor of initial state [bS x 2*inSize] at time t=0
|
|
|
|
|
// mask optional, 2d tensor of dropout mask [bS x 2*inSize]
|
|
|
|
|
|
|
|
|
|
// ht [time x bS x 2*inSize]
|
|
|
|
|
// ct [time x bS x 2*inSize]
|
|
|
|
|
|
|
|
|
|
const Nd4jLong time = x->sizeAt(0); // time - number of time steps
|
|
|
|
|
const Nd4jLong bS = x->sizeAt(1); // bS - batch size
|
|
|
|
|
const Nd4jLong inSize = x->sizeAt(2) / 2; // inSize - number of features
|
|
|
|
|
|
|
|
|
|
// x = x * mask
|
|
|
|
|
if(mask)
|
|
|
|
|
x->applyBroadcast(broadcast::Multiply, {1, 2}, mask, x, nullptr); // apply mask
|
|
|
|
|
|
|
|
|
|
// U = x * w
|
|
|
|
|
NDArray wi = mmul(*x, *w); // U [time x bS x 6*inSize]
|
|
|
|
|
|
|
|
|
|
const Nd4jLong d2 = 2*inSize;
|
|
|
|
|
const Nd4jLong ncols = bS*d2;
|
|
|
|
|
const Nd4jLong ncolsWi = 3*ncols;
|
|
|
|
|
|
|
|
|
|
T* pI = x->bufferAsT<T>();
|
|
|
|
|
T* pWi = wi.bufferAsT<T>();
|
|
|
|
|
T* pBias = const_cast<NDArray*>(b)->bufferAsT<T>();
|
|
|
|
|
T* pInit = const_cast<NDArray*>(c0)->bufferAsT<T>();
|
|
|
|
|
T* pMask = mask ? const_cast<NDArray*>(mask)->bufferAsT<T>() : nullptr;
|
|
|
|
|
T* pHt = ht->bufferAsT<T>();
|
|
|
|
|
T* pCt = ct->bufferAsT<T>();
|
|
|
|
|
|
|
|
|
|
Nd4jLong ncolsRev, ncolsWiRev; // for reverse direction
|
|
|
|
|
T maskVal, cur, bF, bR, ft, rt, val;
|
|
|
|
|
T *pIVal(nullptr), *pWiVal(nullptr), *pHtVal(nullptr), *pCtVal(nullptr);
|
|
|
|
|
bool flip = false;
|
|
|
|
|
|
|
|
|
|
for (Nd4jLong col = 0; col < ncols; ++col) {
|
|
|
|
|
|
|
|
|
|
const auto colNum = col % d2;
|
|
|
|
|
flip = colNum >= inSize;
|
|
|
|
|
maskVal = mask ? *(pMask + col) : T(1);
|
|
|
|
|
cur = *(pInit + col);
|
|
|
|
|
bF = *(pBias + colNum);
|
|
|
|
|
bR = *(pBias + colNum + d2);
|
|
|
|
|
pWiVal = pWi + 3*col;
|
|
|
|
|
pIVal = pI + col;
|
|
|
|
|
pHtVal = pHt + col;
|
|
|
|
|
pCtVal = pCt + col;
|
|
|
|
|
|
|
|
|
|
if (flip) {
|
|
|
|
|
pIVal += (time-1)*ncols;
|
|
|
|
|
pWiVal += (time-1)*ncolsWi;
|
|
|
|
|
pHtVal += (time-1)*ncols;
|
|
|
|
|
pCtVal += (time-1)*ncols;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ncolsRev = flip ? -ncols : ncols;
|
|
|
|
|
ncolsWiRev = flip ? -ncolsWi : ncolsWi;
|
|
|
|
|
|
|
|
|
|
for (Nd4jLong t = 0; t < time; ++t) {
|
|
|
|
|
// evaluate sigmoids
|
|
|
|
|
ft = (1.)/(1. + nd4j::math::nd4j_exp<T, T>(-(*(pWiVal + 1) + bF)));
|
|
|
|
|
rt = (1.)/(1. + nd4j::math::nd4j_exp<T, T>(-(*(pWiVal + 2) + bR)));
|
|
|
|
|
|
|
|
|
|
cur = (cur - *pWiVal)*ft + *pWiVal;
|
|
|
|
|
*pCtVal = cur;
|
|
|
|
|
val = nd4j::math::nd4j_tanh<T, T>(cur);
|
|
|
|
|
*pHtVal = (val*maskVal - *pIVal)*rt + *pIVal;
|
|
|
|
|
|
|
|
|
|
pIVal += ncolsRev;
|
|
|
|
|
pWiVal += ncolsWiRev;
|
|
|
|
|
pCtVal += ncolsRev;
|
|
|
|
|
pHtVal += ncolsRev;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
template <typename T>
|
|
|
|
|
static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradHt, const NDArray* mask,
|
|
|
|
|
NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0) {
|
|
|
|
|
|
|
|
|
|
// x input 3d tensor [time x bS x 2*inSize], time - number of time steps, bS - batch size, inSize - number of features
|
|
|
|
|
// w 2d tensor of weights [2*inSize x 6*inSize]
|
|
|
|
|
// b row of biases with twice length [1 × 4*inSize]
|
|
|
|
|
// c0 2d tensor of initial state [bS x 2*inSize] at time t=0
|
|
|
|
|
// ct [time x bS x 2*inSize]
|
|
|
|
|
// inGradC0 [bS x 2*inSize]
|
|
|
|
|
// inGradHt [time x bS x 2*inSize]
|
|
|
|
|
// mask optional, 2d tensor of dropout mask [bS x 2*inSize]
|
|
|
|
|
|
|
|
|
|
// gradI [time x bS x 2*inSize]
|
|
|
|
|
// gradW [time x 2*inSize x 6*inSize]
|
|
|
|
|
// gradB [1 x 4*inSize]
|
|
|
|
|
// gradC0 [bS x 2*inSize]
|
|
|
|
|
|
|
|
|
|
const Nd4jLong time = x->sizeAt(0); // time - number of time steps
|
|
|
|
|
const Nd4jLong bS = x->sizeAt(1);
|
|
|
|
|
const Nd4jLong inSize = x->sizeAt(2) / 2;
|
|
|
|
|
|
|
|
|
|
// x = x * mask
|
|
|
|
|
if(mask)
|
|
|
|
|
x->applyBroadcast(broadcast::Multiply, {1, 2}, mask, x, nullptr); // apply mask
|
|
|
|
|
|
|
|
|
|
// U = x * w
|
|
|
|
|
NDArray wi = mmul(*x, *w); // [time x bS x 2*inSize] * [2*inSize x 6*inSize] = [time x bS x 6*inSize]
|
|
|
|
|
NDArray gradBias(x->ordering(), {bS, 4*inSize}, x->dataType(), x->getContext());
|
|
|
|
|
NDArray gradWi (x->ordering(), {time, bS, 6*inSize}, x->dataType(), x->getContext());
|
|
|
|
|
|
|
|
|
|
const Nd4jLong d2 = 2*inSize;
|
|
|
|
|
const Nd4jLong ncols = bS*d2;
|
|
|
|
|
const Nd4jLong ncolsWi = 3*ncols;
|
|
|
|
|
T* pInput = x->bufferAsT<T>();
|
|
|
|
|
T* pWi = wi.bufferAsT<T>();
|
|
|
|
|
T* pBias = const_cast<NDArray*>(b)->bufferAsT<T>();
|
|
|
|
|
T* pInit = const_cast<NDArray*>(c0)->bufferAsT<T>();
|
|
|
|
|
T* pMask = mask ? const_cast<NDArray*>(mask)->bufferAsT<T>() : nullptr;
|
|
|
|
|
T* pState = const_cast<NDArray*>(ct)->bufferAsT<T>();
|
|
|
|
|
T* pInGradCt = const_cast<NDArray*>(inGradC0)->bufferAsT<T>();
|
|
|
|
|
T* pInGradHt = const_cast<NDArray*>(inGradHt)->bufferAsT<T>();
|
|
|
|
|
T* pGradWi = gradWi.bufferAsT<T>();
|
|
|
|
|
T* pGradInput = gradI->bufferAsT<T>();
|
|
|
|
|
T* pGradBias = gradBias.bufferAsT<T>();
|
|
|
|
|
T* pGradInit = gradC0->bufferAsT<T>();
|
|
|
|
|
|
|
|
|
|
Nd4jLong ncolsRev, ncolsWiRev; // for reverse direction
|
|
|
|
|
T gbF, gbR, cur, maskVal, bF, bR, ft, rt, val, prevVal, gft, grt, gradSateVal;
|
|
|
|
|
bool flip = false;
|
|
|
|
|
T *pInputVal(nullptr), *pWiVal(nullptr), *pStateVal(nullptr), *pInGradHtVal(nullptr), *pGradWiVal(nullptr), *pGradInputVal(nullptr);
|
|
|
|
|
|
|
|
|
|
for (Nd4jLong col = 0; col < ncols; ++col) {
|
|
|
|
|
gbF = gbR = (T)0.;
|
|
|
|
|
const auto colNum = col % d2;
|
|
|
|
|
flip = colNum >= inSize;
|
|
|
|
|
maskVal = mask ? *(pMask + col) : T(1.);
|
|
|
|
|
cur = *(pInGradCt + col);
|
|
|
|
|
bF = *(pBias + colNum);
|
|
|
|
|
bR = *(pBias + colNum + d2);
|
|
|
|
|
pWiVal = pWi + 3*col;
|
|
|
|
|
pInputVal = pInput + col;
|
|
|
|
|
pStateVal = pState + col;
|
|
|
|
|
pInGradHtVal = pInGradHt + col;
|
|
|
|
|
pGradWiVal = pGradWi + 3*col;
|
|
|
|
|
pGradInputVal = pGradInput + col;
|
|
|
|
|
if (!flip) {
|
|
|
|
|
pInputVal += (time-1)*ncols;
|
|
|
|
|
pWiVal += (time-1)*ncolsWi;
|
|
|
|
|
pStateVal += (time-1)*ncols;
|
|
|
|
|
pInGradHtVal += (time-1)*ncols;
|
|
|
|
|
pGradWiVal += (time-1)*ncolsWi;
|
|
|
|
|
pGradInputVal += (time-1)*ncols;
|
|
|
|
|
}
|
|
|
|
|
ncolsRev = flip ? -ncols : ncols;
|
|
|
|
|
ncolsWiRev = flip ? -ncolsWi : ncolsWi;
|
|
|
|
|
|
|
|
|
|
for (Nd4jLong t = 0; t < time; ++t) {
|
|
|
|
|
// evaluate sigmoids
|
|
|
|
|
ft = ((T)1.)/((T)1. + nd4j::math::nd4j_exp<T,T>(-(*(pWiVal + 1) + bF)));
|
|
|
|
|
rt = ((T)1.)/((T)1. + nd4j::math::nd4j_exp<T,T>(-(*(pWiVal + 2) + bR)));
|
|
|
|
|
|
|
|
|
|
val = nd4j::math::nd4j_tanh<T,T>(*pStateVal);
|
|
|
|
|
prevVal = (t < time-1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col));
|
|
|
|
|
// grad wrt input
|
|
|
|
|
*pGradInputVal = *pInGradHtVal - (*pInGradHtVal)*rt ;
|
|
|
|
|
// grad wrt rt, wiR and bR
|
|
|
|
|
grt = (*pInGradHtVal) * (val*maskVal - *pInputVal) * (rt - rt*rt);
|
|
|
|
|
*(pGradWiVal + 2) = grt;
|
|
|
|
|
gbR += grt;
|
|
|
|
|
// grad wrt state
|
|
|
|
|
gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt*val*val) + cur;
|
|
|
|
|
// grad wrt wi0
|
|
|
|
|
*pGradWiVal = gradSateVal - gradSateVal*ft;
|
|
|
|
|
// grad wrt ft, wi1, and bF
|
|
|
|
|
gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft*ft);
|
|
|
|
|
*(pGradWiVal + 1) = gft;
|
|
|
|
|
gbF += gft;
|
|
|
|
|
// grad wrt c_previous
|
|
|
|
|
cur = gradSateVal * ft;
|
|
|
|
|
pInputVal -= ncolsRev;
|
|
|
|
|
pWiVal -= ncolsWiRev;
|
|
|
|
|
pStateVal -= ncolsRev;
|
|
|
|
|
pGradWiVal -= ncolsWiRev;
|
|
|
|
|
pGradInputVal -= ncolsRev;
|
|
|
|
|
pInGradHtVal -= ncolsRev;
|
|
|
|
|
}
|
|
|
|
|
*(pGradBias + col) = gbF;
|
|
|
|
|
*(pGradBias + col + ncols) = gbR;
|
|
|
|
|
*(pGradInit + col) = cur;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// gradB
|
|
|
|
|
gradBias.reduceAlongDimension(reduce::Sum, gradB, {0}, false, true); // [1 x 4*inSize]
|
|
|
|
|
|
|
|
|
|
// gradW
|
|
|
|
|
x->permutei({0, 2, 1}); // [time x bS x 2*inSize] -> [time x 2*inSize x bS]
|
|
|
|
|
*gradW = mmul(*x, gradWi); // [time x 2*inSize x bS ] * [time x bS x 6*inSize] = [time x 2*inSize x 6*inSize]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Public entry point for the bidirectional SRU forward pass: forwards its array arguments
// to sruBI_ through BUILD_SINGLE_SELECTOR, keyed on x->dataType() over FLOAT_TYPES.
// The context argument is not passed on to sruBI_.
void sruBI(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) {

    BUILD_SINGLE_SELECTOR(x->dataType(), sruBI_,
                          (x, w, b, c0, mask, ht, ct),
                          FLOAT_TYPES);
}
|
|
|
|
|
// Public entry point for the bidirectional SRU backward pass: forwards its array arguments
// to sruBIBP_ through BUILD_SINGLE_SELECTOR, keyed on x->dataType() over FLOAT_TYPES.
// The context argument is not passed on to sruBIBP_.
void sruBIBP(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0) {

    BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBP_,
                          (x, w, b, c0, ct, inGradC0, inGradH, mask, gradI, gradW, gradB, gradC0),
                          FLOAT_TYPES);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// instantiations of the typed helpers for every type listed in FLOAT_TYPES
BUILD_SINGLE_TEMPLATE(template void sruBI_,
                      (NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct),
                      FLOAT_TYPES);
BUILD_SINGLE_TEMPLATE(template void sruBIBP_,
                      (NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0),
                      FLOAT_TYPES);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// template <typename T>
|
|
|
|
|
// void sruCellBP(const std::vector<NDArray<T>*>& inArrs, const std::vector<NDArray<T>*>& outArrs) {
|
|
|
|
|
|
|
|
|
|
// NDArray<T>* x = inArrs[0]; // input [bS x inSize], bS - batch size, inSize - number of features
|
|
|
|
|
// NDArray<T>* c0 = inArrs[1]; // previous cell state c [bS x inSize], that is at previous time step t-1
|
|
|
|
|
// NDArray<T>* w = inArrs[2]; // weights [inSize x 3*inSize]
|
|
|
|
|
// NDArray<T>* b = inArrs[3]; // biases [2*inSize]
|
|
|
|
|
// NDArray<T>* dLdC = inArrs[4]; // gradient of the loss func with respect to cell state [bS x inSize]
|
|
|
|
|
// NDArray<T>* dLdH = inArrs[5]; // gradient of the loss func with respect to cell output [bS x inSize]
|
|
|
|
|
|
|
|
|
|
// NDArray<T>* dLdX = outArrs[0]; // gradient of the loss func with respect to input [bS x inSize], so called epsilon
|
|
|
|
|
// NDArray<T>* dLdW = outArrs[1]; // gradient of the loss func with respect to weights [inSize x 3*inSize]
|
|
|
|
|
// NDArray<T>* dLdB = outArrs[2]; // gradient of the loss func with respect to biases [2*inSize]
|
|
|
|
|
// NDArray<T>* dLdC0 = outArrs[3]; // gradient of the loss func with respect to previous cell state [bS, inSize]
|
|
|
|
|
|
|
|
|
|
// const int inSize = x->sizeAt(1); // inSize - number of features
|
|
|
|
|
|
|
|
|
|
// //*********** feed forward ***********//
|
|
|
|
|
// NDArray<T> z = mmul(*x, *w); // [bS x 3*inSize]
|
|
|
|
|
|
|
|
|
|
// // forget gate = sigmoid(x*Wf + bf)
|
|
|
|
|
// NDArray<T> f = sigmoid<T>(z({{},{inSize, 2*inSize}}) + (*b)({{0, inSize}})); // [bS, inSize]
|
|
|
|
|
// NDArray<T> oneMinusF = 1. - f;
|
|
|
|
|
|
|
|
|
|
// // reset gate = sigmoid(x*Wr + br)
|
|
|
|
|
// NDArray<T> r = sigmoid<T>(z({{},{2*inSize, 3*inSize}}) + (*b)({{inSize, 2*inSize}})); // [bS, inSize]
|
|
|
|
|
// NDArray<T> oneMinusR = 1. - r;
|
|
|
|
|
|
|
|
|
|
// // current cell state = f◦c0 + (1 - f)◦(x*Wc) ---> c->assign( f*(*c0) + ((T)1. - f) * z({{},{0, inSize}}) );
|
|
|
|
|
// // current cell output = r◦activation(c) + (1 - r)◦x ---> h->assign( r*activation<T>(*c) + ((T)1. - r) * (*x) );
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// //*********** back propagation ***********//
|
|
|
|
|
// // dCdC0 = f;
|
|
|
|
|
// // dFdX = Wf
|
|
|
|
|
// // dRdX = Wr
|
|
|
|
|
|
|
|
|
|
// NDArray<T> tanh = activation<T>(*c);
|
|
|
|
|
// NDArray<T> dFdBf = f * oneMinusF;
|
|
|
|
|
// NDArray<T> dRdBr = r * oneMinusR;
|
|
|
|
|
// NDArray<T> dHdR = tanh - *x;
|
|
|
|
|
// // dCdF = c0 - x*Wc;
|
|
|
|
|
// NDArray<T> dCdF = *c0 - z({{},{0, inSize}});
|
|
|
|
|
// // dHdC = r * (1 - tanh*tanh)
|
|
|
|
|
// NDArray<T> dHdC = r * (1. - tanh * tanh);
|
|
|
|
|
// // dCdX = dCdX + dCdF*dFdX = (1-f)*Wc + dCdF*Wf
|
|
|
|
|
// NDArray<T> dCdX = oneMinusF * (*w)({{},{0, inSize}}) + dCdF * (*w)({{},{inSize, 2*inSize}});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// // dLdC0 = dLdC * dCdC0 = dLdC * f
|
|
|
|
|
// dLdC0->assign((*dLdC) * f);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// // dLdBf = dLdH*dHdBf + dLdC*dCdBf = dLdH*dHdC*dCdBf + dLdC*dCdF*dFdBf = dLdH*dHdC*dCdF*dFdBf + dLdC*dCdF*dFdBf = (dLdH*dHdC + dLdC)*dCdF*dFdBf
|
|
|
|
|
// (*dLdB)({{0, inSize}}).assign(((*dLdH) * dHdC + *dLdC) * dCdF * dFdBf);
|
|
|
|
|
// // dLdBr = dLdH * dHdR * dRdBr
|
|
|
|
|
// (*dLdB)({{inSize, 2*inSize}}).assign((*dLdH) * dHdR * dRdBr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// // dLdWc = dLdH*dHdWc + dLdC*dCdWc = dLdH*dHdC*dCdWc + dLdC*dCdWc = (dLdH*dHdC + dLdC) * dCdWc = (dLdH*dHdC + dLdC) * (1-f)*x
|
|
|
|
|
// (*dLdW)({{}, {0, inSize}}).assign(((*dLdH) * dHdC + *dLdC) * oneMinusF * (*x));
|
|
|
|
|
// // dLdWf = dLdBf * x
|
|
|
|
|
// (*dLdW)({{}, {inSize, 2*inSize}}).assign((*dLdB)({{0, inSize}}) * (*x));
|
|
|
|
|
// // dLdWr = dLdBr * x
|
|
|
|
|
// (*dLdW)({{}, {2*inSize, 3*inSize}}).assign((*dLdB)({{inSize, 2*inSize}}) * (*x));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// // dLdX = dLdH*dHdX + dLdC*dCdX = dLdH*(dHdX + dHdR*dRdX + dHdC*dCdX) + dLdC*dCdF*dFdX = dLdH*(1 - r + dHdR*dRdX + dHdC*dCdX) + dLdC*dCdX
|
|
|
|
|
// dLdX->assign((*dLdH) * (oneMinusR + dHdR * (*w)({{},{2*inSize, 3*inSize}}) + dHdC * dCdX) + (*dLdC) * dCdX);
|
|
|
|
|
// }
|