From 8fc0e63ce7c765ec646db424e817acf927ba5194 Mon Sep 17 00:00:00 2001
From: Oleh
Date: Mon, 20 Jan 2020 11:59:12 +0200
Subject: [PATCH] Oleh powderev (#171)

* Libnd4j: Add broadcastable elementwise power derivative #7461 first step of the Pow_bp operation implementation
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 some corrections to the calculation steps
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 some bug fixes; made the PowDerivative op broadcastable; added raw tests for the op, which need refactoring to use broadcast ops

* Libnd4j: Add broadcastable elementwise power derivative #7461 fixed several bugs, added broadcast support and tests; scalar+array and array+scalar still need fixing
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 fixed bugs for scalar inputs, fixed multinomial tests, added tests
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 fixed bugs in support for different shapes; tests updated

* Libnd4j: Add broadcastable elementwise power derivative #7461 covered all possible variants via tiled arrays; added broadcast support for the Pow and PowDerivative ops, covered by tests; the tiled implementation has to be replaced by applyTrueBroadcast before review
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 replaced the tile-based implementation with broadcast, fixed an issue with negative x input, corrected tests; needs additional testing
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 added and corrected test cases, corrected the implementation; needs review
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 code clean up

* Libnd4j: Add broadcastable elementwise power derivative #7461 code clean up, removed some tests, added tests with scalar
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 code improvement and clean up, split tests
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative #7461 some code clean up
Signed-off-by: Oleg

* Libnd4j: Add broadcastable elementwise power derivative: replaced __isnanf with an internal implementation
Signed-off-by: Oleg

* pow_bp wrapper

* Fixed PowBp wrapper

* Tests added

* Test fixed

* Fixed return type

* Disabled powBp usage

* Pow backprop changed

Co-authored-by: Alexander Stoyakin
---
 .../loops/BroadcastPairwiseConverter.h        |   1 +
 libnd4j/include/loops/legacy_ops.h            |   3 +-
 libnd4j/include/ops/BroadcastOpsTuple.h       |   3 +
 .../declarable/generic/broadcastable/pow.cpp  |  74 +++-
 .../ops/declarable/headers/broadcastable.h    |   1 +
 .../include/ops/impl/BroadcastOpsTuple.cpp    |   8 +
 .../layers_tests/DeclarableOpsTests15.cpp     | 333 +++++++++++++++++-
 libnd4j/tests_cpu/layers_tests/RNGTests.cpp   |  24 +-
 .../DifferentialFunctionFactory.java          |  19 +-
 .../converters/ImportClassMapping.java        |   1 +
 .../linalg/api/ops/impl/reduce/bp/PowBp.java  |  45 +++
 .../nd4j/linalg/api/ops/impl/scalar/Pow.java  |   6 +-
 .../api/ops/impl/transforms/custom/Pow.java   |   7 +-
 .../opvalidation/ReductionBpOpValidation.java |  56 +++
 14 files changed, 547 insertions(+), 34 deletions(-)
 create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce/bp/PowBp.java

diff --git a/libnd4j/include/loops/BroadcastPairwiseConverter.h b/libnd4j/include/loops/BroadcastPairwiseConverter.h
index fb5acf19b..f1fda4a9a 100644
--- a/libnd4j/include/loops/BroadcastPairwiseConverter.h
+++
b/libnd4j/include/loops/BroadcastPairwiseConverter.h
@@ -53,6 +53,7 @@ inline pairwise::Ops fromBroadcastToPairwise(broadcast::Ops op) {
         case broadcast::LogicalXor: return pairwise::LogicalXor;
         case broadcast::LogicalNot: return pairwise::LogicalNot;
         case broadcast::LogicalAnd: return pairwise::LogicalAnd;
+        case broadcast::PowDerivative: return pairwise::PowDerivative;
         default:
             throw std::runtime_error("fromBroadcastToPairwise: Not convertible operation");
     }
diff --git a/libnd4j/include/loops/legacy_ops.h b/libnd4j/include/loops/legacy_ops.h
index 7de54a858..ea32b154c 100644
--- a/libnd4j/include/loops/legacy_ops.h
+++ b/libnd4j/include/loops/legacy_ops.h
@@ -80,7 +80,8 @@
         (30, LogicalAnd), \
         (31, DivideNoNan), \
         (32, IGamma), \
-        (33, IGammac)
+        (33, IGammac),\
+        (34, PowDerivative)

 // these ops return same data type as input
 #define TRANSFORM_SAME_OPS \
diff --git a/libnd4j/include/ops/BroadcastOpsTuple.h b/libnd4j/include/ops/BroadcastOpsTuple.h
index 256e37341..1bcd2df8b 100644
--- a/libnd4j/include/ops/BroadcastOpsTuple.h
+++ b/libnd4j/include/ops/BroadcastOpsTuple.h
@@ -52,6 +52,9 @@ namespace nd4j {
         static BroadcastOpsTuple Subtract();
         static BroadcastOpsTuple IGamma();
         static BroadcastOpsTuple IGammac();
+
+        static BroadcastOpsTuple Pow();
+        static BroadcastOpsTuple PowDerivative();
     };
 }
diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp
index 7f7efd80c..56f77737d 100644
--- a/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp
+++ b/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp
@@ -16,6 +16,7 @@
 //
 // @author raver119@gmail.com
+// @author Oleh Semeniv (oleg.semeniv@gmail.com)
 //

 #include
@@ -25,7 +26,7 @@
 #include

 namespace nd4j {
-    namespace ops {
+namespace ops {
     BROADCASTABLE_OP_IMPL(Pow, 0, 0) {
         auto x = INPUT_VARIABLE(0);
         auto y = INPUT_VARIABLE(1);
@@ -51,7 +52,76 @@ namespace nd4j {
                 ->setAllowedInputTypes(1, {ALL_FLOATS, ALL_INTS})
                 ->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS});
     }
-    }
+
+    CUSTOM_OP_IMPL(Pow_bp, 3, 2, false, 0, 0) {
+
+        auto x = INPUT_VARIABLE(0);
+        auto y = INPUT_VARIABLE(1);
+        auto dLdz = INPUT_VARIABLE(2);
+
+        auto dLdx = OUTPUT_VARIABLE(0);
+        auto dLdy = OUTPUT_VARIABLE(1);
+
+        Nd4jLong* dLdzShapeInfo = nullptr;
+        const bool areShapesBroadcastable = ShapeUtils::evalBroadcastShapeInfo(x->getShapeInfo(), y->getShapeInfo(), true, dLdzShapeInfo, block.getWorkspace());
+        REQUIRE_TRUE(areShapesBroadcastable, 0, "POW_BP OP: the shapes of x %s"
+                     " and y %s are not suitable for broadcast !",
+                     ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str());
+        REQUIRE_TRUE(shape::equalsSoft(dLdz->shapeInfo(), dLdzShapeInfo), 0,
+                     "POW_BP OP: wrong shape of next epsilon array (dLdOut),"
+                     " expected is %s, but got %s instead !",
+                     ShapeUtils::shapeAsString(dLdzShapeInfo).c_str(), ShapeUtils::shapeAsString(dLdz).c_str());
+
+        // dL/dy = x^y * log(x) * dL/dz
+        auto temp = x->applyTrueBroadcast(BroadcastOpsTuple::Pow(), *y);    // a = x^y
+        x->applyTransform(transform::Log, *dLdx);                           // b = log(x)
+        dLdx->applyScalar(nd4j::scalar::ReplaceNans, 0, *dLdx);
+        temp *= *dLdx;                                                      // c = b*a
+        temp *= *dLdz;                                                      // dL/dy = c * dL/dz
+        if (dLdy->isSameShape(*dLdz)) {
+            dLdy->assign(temp);
+        }
+        else {
+            std::vector<int> axesForY = ShapeUtils::evalBroadcastBackwardAxis(y->getShapeInfo(), dLdz->getShapeInfo());
+            dLdy->assign(temp.reduceAlongDimension(reduce::Sum, axesForY)); // dL/dy = sum(c * dL/dz)
+        }
+
+        // dL/dx = y*x^(y-1) * dL/dz
+        x->applyTrueBroadcast(BroadcastOpsTuple::PowDerivative(), *y, temp); // a = y*x^(y-1)
+        temp *= *dLdz;                                                        // dLdx = a*dL/dz
+
+        if (dLdx->isSameShape(*dLdz)) {
+            dLdx->assign(temp);                                               // dLdx = a*dL/dz
+        }
+        else {
+            std::vector<int> axesForX = ShapeUtils::evalBroadcastBackwardAxis(x->getShapeInfo(), dLdz->getShapeInfo());
+            dLdx->assign(temp.reduceAlongDimension(reduce::Sum, axesForX));   // dLdx = sum(a*dL/dz)
+        }
+
+        return Status::OK();
+    }
+
+    DECLARE_SHAPE_FN(Pow_bp) {
+
+        auto xShapeInfo = inputShape->at(0);
+        auto yShapeInfo = inputShape->at(1);
+
+        Nd4jLong* dLdxShapeInfo = nullptr;
+        Nd4jLong* dLdyShapeInfo = nullptr;
+
+        COPY_SHAPE(xShapeInfo, dLdxShapeInfo);
+        COPY_SHAPE(yShapeInfo, dLdyShapeInfo);
+
+        return SHAPELIST(CONSTANT(dLdxShapeInfo), CONSTANT(dLdyShapeInfo));
+    }
+
+    DECLARE_TYPES(Pow_bp) {
+        getOpDescriptor()
+            ->setAllowedInputTypes({ ALL_FLOATS, ALL_INTS })
+            ->setAllowedOutputTypes({ ALL_FLOATS }); // TODO: maybe worth adding ALL_INTS
+    }
+
+}
 }
 #endif
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/headers/broadcastable.h b/libnd4j/include/ops/declarable/headers/broadcastable.h
index 7ee53b52a..9a2dc9f62 100644
--- a/libnd4j/include/ops/declarable/headers/broadcastable.h
+++ b/libnd4j/include/ops/declarable/headers/broadcastable.h
@@ -356,6 +356,7 @@ namespace nd4j {
          */
         #if NOT_EXCLUDED(OP_Pow)
         DECLARE_BROADCASTABLE_OP(Pow, 0, 0);
+        DECLARE_CUSTOM_OP(Pow_bp, 3, 2, false, 0, 0);
         #endif

         /**
diff --git a/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp b/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp
index 0e9c99636..26cda74a4 100644
--- a/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp
+++ b/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp
@@ -55,4 +55,12 @@ namespace nd4j {
         return custom(nd4j::scalar::IGammac, nd4j::pairwise::IGammac, nd4j::broadcast::IGammac);
     }
+
+    BroadcastOpsTuple BroadcastOpsTuple::Pow() {
+        return custom(nd4j::scalar::Pow, nd4j::pairwise::Pow, nd4j::broadcast::Pow);
+    }
+    BroadcastOpsTuple BroadcastOpsTuple::PowDerivative() {
+        return custom(nd4j::scalar::PowDerivative, nd4j::pairwise::PowDerivative, nd4j::broadcast::PowDerivative);
+    }
+
 }
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp
index 0d32daebd..96234f41d 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp
@@ -1279,4 +1279,335 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_7) {
     ASSERT_TRUE(expected.equalsTo(output));
     delete result;
-}
\ No newline at end of file
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test1) {
+
+    // same shape
+    NDArray x('c', { 2,2,2 }, { 4,3,2,5,7,8,-9,-12 }, nd4j::DataType::FLOAT32);
+    NDArray y('c', { 2,2,2 }, { 2,3,-2,4,-1,-4,10,8 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdz('c', { 2,2,2 }, nd4j::DataType::FLOAT32);
+    NDArray dLdxExp('c', { 2,2,2 }, { 8, 27, -0.25, 500, -0.0204082, -0.000122, -3.87420e+09, -2.86654e+08 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExp('c', { 2,2,2 }, { 22.18071, 29.66253, 0.17329, 1005.89874, 0.27799, 0.00051, 0, 0 }, nd4j::DataType::FLOAT32);
+
+    dLdz.assign(1.0);
+
+    nd4j::ops::Pow_bp op;
+    auto results = op.execute({ &x, &y, &dLdz }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto* dLdx = results->at(0);
+    auto* dLdy = results->at(1);
+
+    ASSERT_TRUE(dLdxExp.isSameShape(dLdx));
+    ASSERT_TRUE(dLdxExp.equalsTo(dLdx));
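+    // Note: since dLdz is all ones here, the expected values are exactly the analytic gradients
+    // the op implements: dL/dx = y*x^(y-1), e.g. 2*4^1 = 8, 3*3^2 = 27, -2*2^(-3) = -0.25, 4*5^3 = 500;
+    // dL/dy = x^y*log(x), e.g. 4^2*log(4) = 22.18071. Entries with x < 0 expect dL/dy = 0 because
+    // log(x) is NaN there and the op replaces those NaNs with 0.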
+    ASSERT_TRUE(dLdyExp.isSameShape(dLdy));
+    ASSERT_TRUE(dLdyExp.equalsTo(dLdy));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test2) {
+
+    NDArray x('c', { 1,2,3 }, nd4j::DataType::FLOAT32);
+    NDArray y('c', { 3,2,1 }, nd4j::DataType::FLOAT32);
+    NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdxExp('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24., 26.4, 28.8 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExp('c', { 3,2,1 }, { 13.30843, 33.27106, 53.2337, 73.19634, 93.15898, 113.12162 }, nd4j::DataType::FLOAT32);
+
+    x.assign(4.0);
+    y.assign(2.0);
+    dLdz.linspace(0.1, 0.1);
+
+    nd4j::ops::Pow_bp op;
+    auto results = op.execute({ &x, &y, &dLdz }, {}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto* dLdx = results->at(0);
+    auto* dLdy = results->at(1);
+
+    ASSERT_TRUE(dLdxExp.isSameShape(dLdx));
+    ASSERT_TRUE(dLdxExp.equalsTo(dLdx));
+    ASSERT_TRUE(dLdyExp.isSameShape(dLdy));
+    ASSERT_TRUE(dLdyExp.equalsTo(dLdy));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test3) {
+
+    // y - same shape as dLdz
+    NDArray xY('c', { 1,2,3 }, nd4j::DataType::FLOAT32);
+    NDArray yY('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdxExpY('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24., 26.4, 28.8 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExpY('c', { 3,2,3 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843, 15.5265, 17.74457, 19.96264, 22.18071, 24.39878, 26.61685, 28.83492, 31.05299, 33.27106, 35.48914, 37.70721, 39.92528 }, nd4j::DataType::FLOAT32);
+    NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+
+    xY.assign(4.0);
+    yY.assign(2.0);
+    dLdz.linspace(0.1, 0.1);
+
+    nd4j::ops::Pow_bp op;
+    auto resultsY = op.execute({ &xY, &yY, &dLdz }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultsY->status());
+
+    auto* dLdxY = resultsY->at(0);
+    auto* dLdyY = resultsY->at(1);
+
+    ASSERT_TRUE(dLdxExpY.isSameShape(dLdxY));
+    ASSERT_TRUE(dLdxExpY.equalsTo(dLdxY));
+    ASSERT_TRUE(dLdyExpY.isSameShape(dLdyY));
+    ASSERT_TRUE(dLdyExpY.equalsTo(dLdyY));
+
+    delete resultsY;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test4) {
+
+    // x - same shape as dLdz
+    NDArray yX('c', { 1,2,3 }, nd4j::DataType::FLOAT32);
+    NDArray xX('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdxExpX('c', { 3,2,3 }, { 3.2, 6.4, 9.6, 12.8, 16., 19.2, 22.4, 25.6, 28.8, 32.,
+            35.2, 38.4, 41.6, 44.8, 48., 51.2, 54.4, 57.6 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExpX('c', { 1,2,3 }, { 23.28975, 26.61685, 29.94396, 33.27106, 36.59817, 39.92528 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+    dLdz.linspace(0.1, 0.1);
+
+    nd4j::ops::Pow_bp op;
+
+    xX.assign(2.0);
+    yX.assign(4.0);
+
+    auto resultsX = op.execute({ &xX, &yX, &dLdz }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultsX->status());
+
+    auto* dLdxX = resultsX->at(0);
+    auto* dLdyX = resultsX->at(1);
+
+    ASSERT_TRUE(dLdxExpX.isSameShape(dLdxX));
+    ASSERT_TRUE(dLdxExpX.equalsTo(dLdxX));
+    ASSERT_TRUE(dLdyExpX.isSameShape(dLdyX));
+    ASSERT_TRUE(dLdyExpX.equalsTo(dLdyX));
+
+    delete resultsX;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test5) {
+
+    // both single array
+    NDArray xConst('c', { 1 }, nd4j::DataType::FLOAT32);
+    NDArray yConst('c', { 1 }, nd4j::DataType::FLOAT32);
+    NDArray dLdz('c', { 1 }, nd4j::DataType::FLOAT32);
+    NDArray dLdxExp('c', { 1 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExp('c', { 1 }, nd4j::DataType::FLOAT32);
+
+    xConst.assign(3.0);
+    yConst.assign(4.0);
+    dLdz.assign(1.0);
+
+    dLdxExp.assign(4.0 * pow(3, 3));
+    dLdyExp.assign(pow(3, 4) * log(3));
+
+    nd4j::ops::Pow_bp op;
+    auto results = op.execute({ &xConst, &yConst, &dLdz }, {}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto* dLdx = results->at(0);
+    auto* dLdy = results->at(1);
+
+    ASSERT_TRUE(dLdxExp.isSameShape(dLdx));
+    ASSERT_TRUE(dLdxExp.equalsTo(dLdx));
+
+    ASSERT_TRUE(dLdyExp.isSameShape(dLdy));
+    ASSERT_TRUE(dLdyExp.equalsTo(dLdy));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test6) {
+
+    // x single array
+    NDArray xConst('c', { 1 }, nd4j::DataType::FLOAT32);
+    NDArray y('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32);
+    NDArray dLdzC('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32);
+
+    xConst.assign(2.0);
+    y.assign(4.0);
+    dLdzC.linspace(0.1, 0.1);
+
+    NDArray dLdxExpXC('c', { 1 }, { 115.2 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExpXC('c', { 2, 2, 2 }, { 1.10904, 2.21807, 3.32711, 4.43614, 5.54518, 6.65421, 7.76325, 8.87228 }, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::Pow_bp op;
+    auto resultsXC = op.execute({ &xConst, &y, &dLdzC }, {}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, resultsXC->status());
+
+    auto* dLdxXC = resultsXC->at(0);
+    auto* dLdyXC = resultsXC->at(1);
+
+    ASSERT_TRUE(dLdxExpXC.isSameShape(dLdxXC));
+    ASSERT_TRUE(dLdxExpXC.equalsTo(dLdxXC));
+    ASSERT_TRUE(dLdyExpXC.isSameShape(dLdyXC));
+    ASSERT_TRUE(dLdyExpXC.equalsTo(dLdyXC));
+
+    delete resultsXC;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test7) {
+
+    // Y - scalar
+    auto Y = NDArrayFactory::create(2.f);
+    NDArray x('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32);
+    NDArray dLdzC('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32);
+
+    dLdzC.linspace(0.1, 0.1);
+    x = 4.f;
+
+    NDArray dLdxExpYs('c', { 2, 2, 2 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8, 5.6, 6.4 }, nd4j::DataType::FLOAT32);
+
+    auto dLdyExpYs = NDArrayFactory::create(79.85056f);
+
+    nd4j::ops::Pow_bp op;
+    auto resultsYs = op.execute({ &x, &Y, &dLdzC }, {}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, resultsYs->status());
+
+    auto* dLdxY = resultsYs->at(0);
+    auto* dLdyY = resultsYs->at(1);
+
+    ASSERT_TRUE(dLdxExpYs.isSameShape(dLdxY));
+    ASSERT_TRUE(dLdxExpYs.equalsTo(dLdxY));
+    ASSERT_TRUE(dLdyExpYs.isSameShape(dLdyY));
+    ASSERT_TRUE(dLdyExpYs.equalsTo(dLdyY));
+
+    delete resultsYs;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test8) {
+    // both scalars
+
+    auto X = NDArrayFactory::create(4.f);
+    auto Y = NDArrayFactory::create(2.f);
+    NDArray dLdz =
+        NDArrayFactory::create(0.1f);
+
+    NDArray dLdxExp = NDArrayFactory::create(2.f * 4.f * 0.1f);
+
+    NDArray dLdyExp = NDArrayFactory::create(pow(4.f, 2.f) * log(4.f) * 0.1f);
+
+    nd4j::ops::Pow_bp op;
+    auto results = op.execute({ &X, &Y, &dLdz }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto* dLdx = results->at(0);
+    auto* dLdy = results->at(1);
+
+    ASSERT_TRUE(dLdxExp.isSameShape(dLdx));
+    ASSERT_TRUE(dLdxExp.equalsTo(dLdx));
+    ASSERT_TRUE(dLdyExp.isSameShape(dLdy));
+    ASSERT_TRUE(dLdyExp.equalsTo(dLdy));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test9) {
+
+    nd4j::ops::Pow_bp op;
+    // diff shapes
+    NDArray x('c', { 3,2,1 }, nd4j::DataType::FLOAT32);
+    NDArray y('c', { 1,2,3 }, nd4j::DataType::FLOAT32);
+    NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdxExp('c', { 3,2,1 }, { 4.8, 12., 19.2, 26.4, 33.6, 40.8 }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExp('c', { 1,2,3 }, { 46.57949, 53.2337, 59.88792, 66.54213, 73.19634, 79.85056 }, nd4j::DataType::FLOAT32);
+
+    x.assign(4.0);
+    y.assign(2.0);
+    dLdz.linspace(0.1, 0.1);
+
+    auto results = op.execute({ &x, &y, &dLdz }, {}, {});
+    ASSERT_EQ(ND4J_STATUS_OK, results->status());
+
+    auto* dLdx = results->at(0);
+    auto* dLdy = results->at(1);
+
+    ASSERT_TRUE(dLdxExp.isSameShape(dLdx));
+    ASSERT_TRUE(dLdxExp.equalsTo(dLdx));
+    ASSERT_TRUE(dLdyExp.isSameShape(dLdy));
+    ASSERT_TRUE(dLdyExp.equalsTo(dLdy));
+
+    delete results;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test10) {
+
+    // diff shapes broadcastable
+    NDArray yB('c', { 1,2,3,1 }, nd4j::DataType::FLOAT32);
+    NDArray xB('c', { 2,3,1 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdyExpB('c', { 1,2,3,1 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843 }, nd4j::DataType::FLOAT32);
+    NDArray dLdxExpB('c', { 2,3,1 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8 }, nd4j::DataType::FLOAT32);
+    NDArray dLdzB('c', { 1,2,3,1 }, nd4j::DataType::FLOAT32);
+
+    dLdzB.linspace(0.1, 0.1);
+    xB.assign(4.0);
+    yB.assign(2.0);
+
+    nd4j::ops::Pow_bp op;
+    auto resultsB = op.execute({ &xB, &yB, &dLdzB }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultsB->status());
+
+    auto* dLdxB = resultsB->at(0);
+    auto* dLdyB = resultsB->at(1);
+
+    ASSERT_TRUE(dLdxExpB.isSameShape(dLdxB));
+    ASSERT_TRUE(dLdxExpB.equalsTo(dLdxB));
+
+    ASSERT_TRUE(dLdyExpB.isSameShape(dLdyB));
+    ASSERT_TRUE(dLdyExpB.equalsTo(dLdyB));
+
+    delete resultsB;
+}
+
+TEST_F(DeclarableOpsTests15, Pow_BP_Test11) {
+
+    NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32);
+    NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdxExpB('c', { 3,2,1 }, { -5.994056, 39366.191406, 7.508829, -2.223537, -std::numeric_limits<float>::quiet_NaN(), -std::numeric_limits<float>::quiet_NaN() }, nd4j::DataType::FLOAT32);
+    NDArray dLdyExpB('c', { 1,2,3 }, { 20.11211, -1.119612, -std::numeric_limits<float>::quiet_NaN(), -0.1076, 12974.389648, -std::numeric_limits<float>::quiet_NaN() }, nd4j::DataType::FLOAT32);
+
+    NDArray dLdzB('c', { 3,2,3 }, { .1,.2,.3, .1,.2,.3, .1,.4,.1, .2,.1,.1, .3,.1,.5, .1,.7,.1 }, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::Pow_bp op;
+    auto resultsB = op.execute({ &xB, &yB, &dLdzB }, {}, {});
+
+    ASSERT_EQ(ND4J_STATUS_OK, resultsB->status());
+    auto* dLdxB = resultsB->at(0);
+    auto* dLdyB = resultsB->at(1);
+
+    ASSERT_TRUE(dLdxExpB.isSameShape(dLdxB));
+    for (int i = 0; i < dLdxB->lengthOf(); ++i) {
+        if (!nd4j::math::nd4j_isnan(dLdxB->e<float>(i)) && !nd4j::math::nd4j_isnan(dLdxExpB.e<float>(i)))
+            ASSERT_NEAR(dLdxB->e<float>(i), dLdxExpB.e<float>(i), 0.00001);
+    }
+
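+    // Note: NaNs are expected here wherever a negative x is raised to a non-integer power, so the
+    // element-wise loops above and below skip NaN positions instead of using equalsTo, which would
+    // fail because NaN never compares equal to NaN.
+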
+    ASSERT_TRUE(dLdyExpB.isSameShape(dLdyB));
+    for (int i = 0; i < dLdyB->lengthOf(); ++i) {
+        if (!nd4j::math::nd4j_isnan(dLdyB->e<float>(i)) && !nd4j::math::nd4j_isnan(dLdyExpB.e<float>(i)))
+            ASSERT_NEAR(dLdyB->e<float>(i), dLdyExpB.e<float>(i), 0.00001);
+    }
+
+    delete resultsB;
+}
diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp
index 0e320c726..0d5572ec6 100644
--- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp
@@ -1090,7 +1090,7 @@ TEST_F(RNGTests, test_multinomial_5) {
     // multinomial as binomial if 2 classes used
     int batchValue = 1;
     int ClassValue = 2;
-    int Samples = 1000000;
+    int Samples = 100000;

     NDArray samples('c', { 1 }, { 1.*Samples }, nd4j::DataType::INT32);
@@ -1107,8 +1107,8 @@ TEST_F(RNGTests, test_multinomial_5) {
     auto mean = output.meanNumber();
     // printf("Var: %f Mean: %f \n", deviation.e(0), mean.e(0));
     // theoretical values for binomial
-    ASSERT_NEAR(0.5, deviation.e(0), 3e-3);
-    ASSERT_NEAR(0.5, mean.e(0), 3e-3);
+    ASSERT_NEAR(0.5, deviation.e(0), 4e-3); // was 3e-3 at 1000000 samples
+    ASSERT_NEAR(0.5, mean.e(0), 4e-3);      // was 3e-3 at 1000000 samples

     for (int i = 0; i < output.lengthOf(); i++) {
         auto value = output.e(i);
@@ -1122,8 +1122,8 @@ TEST_F(RNGTests, test_multinomial_5) {
     deviation = outputR->varianceNumber(variance::SummaryStatsStandardDeviation, false);
     mean = outputR->meanNumber();
     // printf("Random seed - Var: %f Mean: %f \n", deviation.e(0), mean.e(0));
-    ASSERT_NEAR(0.5, deviation.e(0), 35e-3);
-    ASSERT_NEAR(0.5, mean.e(0), 35e-3);
+    ASSERT_NEAR(0.5, deviation.e(0), 45e-3); // was 35e-3 at 1000000 samples
+    ASSERT_NEAR(0.5, mean.e(0), 45e-3);      // was 35e-3 at 1000000 samples

     for (int i = 0; i < outputR->lengthOf(); i++) {
         auto value = outputR->e(i);
@@ -1138,7 +1138,7 @@ TEST_F(RNGTests, test_multinomial_6) {

     int batchValue = 1;
     int ClassValue = 5;
-    int Samples = 1000000;
+    int Samples = 100000;

     NDArray samples('c', { 1 }, { 1. * Samples }, nd4j::DataType::INT32);
@@ -1165,14 +1165,14 @@ TEST_F(RNGTests, test_multinomial_6) {
         auto c = countsR.e(i);
         auto p = probExpect.e(i);
         // printf("Get freq : %f Expect freq: %f \n", c / Samples, p);
-        ASSERT_NEAR((c / Samples), p, 35e-3);
+        ASSERT_NEAR((c / Samples), p, 45e-3); // was 35e-3 at 1000000 samples
     }

     auto deviation = outputR->varianceNumber(variance::SummaryStatsStandardDeviation, false);
     auto mean = outputR->meanNumber();
     // printf("Var: %f Mean: %f \n", deviation.e(0), mean.e(0));
-    ASSERT_NEAR(1.2175, deviation.e(0), 35e-3);
-    ASSERT_NEAR(2.906, mean.e(0), 35e-3);
+    ASSERT_NEAR(1.2175, deviation.e(0), 45e-3); // was 35e-3 at 1000000 samples
+    ASSERT_NEAR(2.906, mean.e(0), 45e-3);       // was 35e-3 at 1000000 samples

     delete resultR;
@@ -1195,12 +1195,12 @@ TEST_F(RNGTests, test_multinomial_6) {
         auto c = counts.e(i);
         auto p = probExpect.e(i);
         // printf("Get freq : %f Expect freq: %f \n", c / Samples, p);
-        ASSERT_NEAR((c / Samples), p, 3e-3);
+        ASSERT_NEAR((c / Samples), p, 4e-3); // was 3e-3 at 1000000 samples
     }
     deviation = output.varianceNumber(variance::SummaryStatsStandardDeviation, false);
     mean = output.meanNumber();
     // printf("Var: %f Mean: %f \n", deviation.e(0), mean.e(0));
-    ASSERT_NEAR(1.2175, deviation.e(0), 3e-3);
-    ASSERT_NEAR(2.906, mean.e(0), 3e-3);
+    ASSERT_NEAR(1.2175, deviation.e(0), 5e-3); // was 3e-3 at 1000000 samples
+    ASSERT_NEAR(2.906, mean.e(0), 5e-3);       // was 3e-3 at 1000000 samples
 }
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunctionFactory.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunctionFactory.java
index 0a725786e..e38af27d4 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunctionFactory.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunctionFactory.java
@@ -92,20 +92,7 @@ import org.nd4j.linalg.api.ops.impl.reduce.TensorMmul;
 import org.nd4j.linalg.api.ops.impl.reduce.ZeroFraction;
 import org.nd4j.linalg.api.ops.impl.reduce.bool.All;
 import org.nd4j.linalg.api.ops.impl.reduce.bool.Any;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.CumProdBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.CumSumBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.DotBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.MaxBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.MeanBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.MinBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.Norm1Bp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.Norm2Bp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.NormMaxBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.ProdBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.SquaredNormBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.StandardDeviationBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.SumBp;
-import org.nd4j.linalg.api.ops.impl.reduce.bp.VarianceBp;
+import org.nd4j.linalg.api.ops.impl.reduce.bp.*;
 import org.nd4j.linalg.api.ops.impl.reduce.custom.BatchMmul;
 import org.nd4j.linalg.api.ops.impl.reduce.custom.LogSumExp;
 import org.nd4j.linalg.api.ops.impl.reduce.floating.AMean;
@@ -1420,6 +1407,10 @@ public class DifferentialFunctionFactory {
         return new PowDerivative(sameDiff(), iX, false, pow).outputVariable();
     }

+    public SDVariable[] powBp(SDVariable x, SDVariable pow, SDVariable gradient) {
+        return new PowBp(sameDiff(), x, pow, gradient).outputVariables();
+    }
+
     public SDVariable mishDerivative(SDVariable iX) {
         return new MishDerivative(sameDiff(), iX, false).outputVariable();
     }
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
index e85c472c8..ad23d1266 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/imports/converters/ImportClassMapping.java
@@ -230,6 +230,7 @@ public class ImportClassMapping {
             org.nd4j.linalg.api.ops.impl.scalar.LogX.class,
             org.nd4j.linalg.api.ops.impl.scalar.Pow.class,
             org.nd4j.linalg.api.ops.impl.scalar.PowDerivative.class,
+            org.nd4j.linalg.api.ops.impl.reduce.bp.PowBp.class,
             org.nd4j.linalg.api.ops.impl.scalar.RectifiedLinear.class,
             org.nd4j.linalg.api.ops.impl.scalar.RectifiedLinearDerivative.class,
             org.nd4j.linalg.api.ops.impl.transforms.custom.ThresholdRelu.class,
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce/bp/PowBp.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce/bp/PowBp.java
new file mode 100644
index 000000000..c46414f79
--- /dev/null
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce/bp/PowBp.java
@@ -0,0 +1,45 @@
+package org.nd4j.linalg.api.ops.impl.reduce.bp;
+
+import lombok.NoArgsConstructor;
+import org.nd4j.autodiff.samediff.SDVariable;
+import org.nd4j.autodiff.samediff.SameDiff;
+import org.nd4j.base.Preconditions;
+import org.nd4j.linalg.api.buffer.DataType;
+import org.nd4j.linalg.api.ndarray.INDArray;
+import org.nd4j.linalg.api.ops.impl.transforms.BaseDynamicTransformOp;
+import org.nd4j.linalg.api.ops.impl.transforms.pairwise.arithmetic.bp.BaseArithmeticBackpropOp;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+@NoArgsConstructor
+public class PowBp extends BaseDynamicTransformOp {
+
+    public PowBp(SameDiff sameDiff, SDVariable x, SDVariable y, SDVariable dLdz) {
+        super(sameDiff, new SDVariable[]{x, y, dLdz}, false);
+    }
+
+    public PowBp(INDArray x, INDArray y, INDArray dLdz,
+                 INDArray dLdx, INDArray dLdy) {
+        super(new INDArray[]{x, y, dLdz}, new INDArray[]{dLdx, dLdy});
+    }
+
+    @Override
+    public String opName() {
+        return "Pow_bp";
+    }
+
+    @Override
+    public boolean isInplaceCall() {
+        return false;
+    }
+
+    @Override
+    public List<DataType> calculateOutputDataTypes(List<DataType> dataTypes) {
+        Preconditions.checkState(dataTypes != null && dataTypes.size() == 3, "Expected exactly 3 input datatypes for %s, got input %s", getClass(), dataTypes);
+        //Gradient types: same as input
+        return Arrays.asList(arg(0).dataType(), arg(1).dataType());
+    }
+}
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/Pow.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/Pow.java
index 8aafce3d1..08ead2683 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/Pow.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/Pow.java
@@ -19,7 +19,9 @@ package org.nd4j.linalg.api.ops.impl.scalar;
 import lombok.val;
 import org.nd4j.autodiff.samediff.SDVariable;
 import org.nd4j.autodiff.samediff.SameDiff;
+import org.nd4j.base.Preconditions;
 import org.nd4j.imports.NoOpNameFoundException;
+import org.nd4j.linalg.api.buffer.DataType;
 import org.nd4j.linalg.api.ndarray.INDArray;
 import org.nd4j.linalg.api.ops.BaseScalarOp;
 import org.nd4j.linalg.api.ops.BaseTransformOp;
@@ -29,6 +31,7 @@ import org.tensorflow.framework.GraphDef;
 import org.tensorflow.framework.NodeDef;

 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;

@@ -89,9 +92,8 @@ public class Pow extends BaseScalarOp {
     }

     @Override
-    public List<SDVariable> doDiff(List<SDVariable> i_v1) {
+    public List<SDVariable> doDiff(List<SDVariable> i_v1) {
         SDVariable g = f().powDerivative(arg(), this.pow).mul(i_v1.get(0));
         return Arrays.asList(g);
     }
-
 }
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/custom/Pow.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/custom/Pow.java
index df41438fe..e155a4f2a 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/custom/Pow.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/custom/Pow.java
@@ -62,11 +62,14 @@ public class Pow extends DynamicCustomOp {
         //dL/da = b*a^(b-1) * dL/dy
         //dL/db = a^b * log(a) * dL/dy
-        SDVariable a = arg(0);
+        /*SDVariable a = arg(0);
         SDVariable b = arg(1);
         SDVariable dlda = b.mul(sameDiff.math().pow(a, b.sub(1))).mul(f1.get(0));
         SDVariable dldb = outputVariable().mul(sameDiff.math().log(a)).mul(f1.get(0));
-        return Arrays.asList(dlda, dldb);
+        return Arrays.asList(dlda, dldb);*/
+
+        SDVariable[] g = f().powBp(arg(0), arg(1), f1.get(0));
+        return Arrays.asList(g);
     }

     @Override
diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/opvalidation/ReductionBpOpValidation.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/opvalidation/ReductionBpOpValidation.java
index 58afb2acb..0fbfa2671 100644
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/opvalidation/ReductionBpOpValidation.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/opvalidation/ReductionBpOpValidation.java
@@ -921,4 +921,60 @@ public class ReductionBpOpValidation extends BaseOpValidation {
             assertNull(err, err);
         }
     }
+
+    @Test
+    public void testPowBP() {
+
+        for (boolean keepDims : new boolean[]{false, true}) {
+
+            INDArray preReduceInput_1 = Nd4j.createFromArray(new double[]{
+                    4, 3, 2, 5, 7, 8, -9, -12
+            }).reshape(2, 2, 2);
+            INDArray preReduceInput_2 = Nd4j.createFromArray(new double[]{
+                    2, 3, -2, 4, -1, -4, 10, 8
+            }).reshape(2, 2, 2);
+            INDArray preReduceInput_3 = Nd4j.linspace(1, 8, 8).reshape(2, 2, 2);
+            INDArray gradOutput = Nd4j.valueArrayOf(new long[]{2, 2, 2}, 1.0);
+            INDArray dLdInExpected_1 = Nd4j.createFromArray(new double[]{
+                    8, 27, -0.25, 500, -0.0204082, -0.000122, -3.87420e+09, -2.86654e+08
+            }).reshape(2, 2, 2);
+            INDArray dLdInExpected_2 = Nd4j.createFromArray(new double[]{
+                    22.18071, 29.66253, 0.17329, 1005.89874, 0.27799, 0.00051, 0, 0
+            }).reshape(2, 2, 2);
+            INDArray output1 = Nd4j.createUninitialized(2, 2, 2);
+            INDArray output2 = Nd4j.createUninitialized(2, 2, 2);
+
+            String err = OpValidation.validate(new OpTestCase(new PowBp(preReduceInput_1, preReduceInput_2,
+                    gradOutput, output1, output2))
+                    .expectedOutput(0, dLdInExpected_1).expectedOutput(1, dLdInExpected_2));
+
+            assertNull(err);
+        }
+    }
+
+    @Test
+    public void testPowBP1() {
+
+        INDArray preReduceInput_1 = Nd4j.createFromArray(new float[]{
+                0.0714f, 0.4735f, -0.1249f, 0.4482f,
+                -0.1376f,
 0.5218f, 0.5558f, 0.2444f,
+                -0.5297f, 0.4291f, 0.4913f, -0.1178f
+        }).reshape(3, 4);
+        INDArray preReduceInput_2 = Nd4j.scalar(2.0000f);
+
+        INDArray gradOutput = Nd4j.valueArrayOf(new long[]{3, 4}, 1.0f);
+
+        INDArray output1 = Nd4j.createUninitialized(DataType.FLOAT, 3, 4);
+        INDArray output2 = Nd4j.scalar(DataType.FLOAT, 1.0); //Nd4j.createUninitialized(DataType.FLOAT, 3,4);
+
+        INDArray expected1 = Nd4j.createFromArray(new float[]{
+                0.1428f, 0.9470f, -0.2498f, 0.8964f,
+                -0.2752f, 1.0436f, 1.1116f, 0.4888f,
+                -1.0594f, 0.8582f, 0.9826f, -0.2356f
+        }).reshape(3, 4);
+        INDArray expected2 = Nd4j.scalar(DataType.FLOAT, -1.112316132);
+        String err = OpValidation.validate(new OpTestCase(new PowBp(preReduceInput_1, preReduceInput_2,
+                gradOutput, output1, output2)).expectedOutput(0, expected1).expectedOutput(1, expected2));
+        assertNull(err);
+    }
 }
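
Note: the formulas this patch implements are dL/dx = y * x^(y-1) * dL/dz and dL/dy = x^y * log(x) * dL/dz, with the per-element gradient sum-reduced along the broadcast axes whenever x or y is smaller than dL/dz. Below is a minimal standalone sanity check of the scalar case against central finite differences; it is not part of the patch, uses only the C++ standard library, and the helper name powGrad is illustrative rather than a libnd4j API.

#include <cmath>
#include <cstdio>

// Analytic gradients of z = x^y for scalar inputs, mirroring the comments in Pow_bp above.
static void powGrad(double x, double y, double dLdz, double* dLdx, double* dLdy) {
    *dLdx = y * std::pow(x, y - 1.0) * dLdz;      // dL/dx = y*x^(y-1) * dL/dz
    *dLdy = std::pow(x, y) * std::log(x) * dLdz;  // dL/dy = x^y * log(x) * dL/dz
}

int main() {
    // Same values as Pow_BP_Test8: x = 4, y = 2, dL/dz = 0.1.
    const double x = 4.0, y = 2.0, dLdz = 0.1, h = 1e-6;

    double dLdx, dLdy;
    powGrad(x, y, dLdz, &dLdx, &dLdy);

    // Central finite differences of L = x^y * dLdz with respect to x and y.
    const double numX = (std::pow(x + h, y) - std::pow(x - h, y)) / (2.0 * h) * dLdz;
    const double numY = (std::pow(x, y + h) - std::pow(x, y - h)) / (2.0 * h) * dLdz;

    // Expect dLdx = 2*4^1*0.1 = 0.8 and dLdy = 4^2*log(4)*0.1 ~= 2.21807 from both methods.
    std::printf("dLdx: analytic=%.6f numeric=%.6f\n", dLdx, numX);
    std::printf("dLdy: analytic=%.6f numeric=%.6f\n", dLdy, numY);
    return 0;
}

For x <= 0 the log(x) factor is NaN (or -inf at x = 0), which is why Pow_bp routes log(x) through scalar::ReplaceNans (so dL/dy becomes 0 there, as Pow_BP_Test1 expects) and why Pow_BP_Test11 compares element-wise while skipping NaN entries instead of using equalsTo.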