From cb56b0b06a3b47d0507da1255df694f3b26d01db Mon Sep 17 00:00:00 2001
From: shugeo
Date: Tue, 8 Oct 2019 19:00:41 +0300
Subject: [PATCH 01/15] The first approach for
 fake_quant_with_min_max_vars_per_channel op implementation.

---
 ...ke_quant_with_min_max_vars_per_channel.cpp | 73 +++++++++++++++++++
 .../ops/declarable/headers/parity_ops.h       |  3 +
 .../helpers/cpu/fake_quantization.cpp         |  4 +
 .../helpers/cuda/fake_quantization.cu         |  4 +
 .../declarable/helpers/fake_quantization.h    |  1 +
 5 files changed, 85 insertions(+)
 create mode 100644 libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp

diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp
new file mode 100644
index 000000000..ba8eb9e7b
--- /dev/null
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author George Shulinok , created on 08.10.2019
+//
+
+#include <op_boilerplate.h>
+#if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars_per_channel)
+
+#include <ops/declarable/CustomOperations.h>
+#include <ops/declarable/helpers/fake_quantization.h>
+namespace nd4j {
+    namespace ops {
+        CONFIGURABLE_OP_IMPL(fake_quant_with_min_max_vars_per_channel, 1, 1, true, 0, 0) {
+
+            auto x = INPUT_VARIABLE(0);
+
+            NDArray* min;
+            NDArray* max;
+
+            REQUIRE_TRUE(block.width() == 3 || block.getTArguments()->size() == 2, 0, "fake_quant_with_min_max_vars_per_channel: No minimum/maximum values provided by either input arrays or TArgs");
+
+            NDArray m;
+            NDArray m2;
+            if (block.width() == 3) {
+                min = INPUT_VARIABLE(1);
+                max = INPUT_VARIABLE(2);
+            } else if (block.getTArguments()->size() == 2) {
+                m  = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext());
+                m2 = NDArrayFactory::create(x->dataType(), T_ARG(1), block.launchContext());
+                min = &m;
+                max = &m2;
+            }
+            auto output = OUTPUT_VARIABLE(0);
+            int numBits = 8;
+            if (block.getIArguments() && block.getIArguments()->size())
+                numBits = INT_ARG(0);
+            bool narrowed = false;
+            //INT_ARG(1);
+            if (block.getIArguments()->size() == 2) {
+                numBits = INT_ARG(0);
+                narrowed = INT_ARG(1);
+                REQUIRE_TRUE(numBits > 1 && numBits < 17, 0, "fake_quant_with_min_max_vars_per_channel: Number of bits for quatization should be in between 2 and 16, but %i was given.", numBits);
+            }
+            helpers::fakeQuantWithMinMaxVarsPerChannel(x, min, max, numBits, narrowed, output);
+            return ND4J_STATUS_OK;
+        }
+
+        DECLARE_TYPES(fake_quant_with_min_max_vars_per_channel) {
+            getOpDescriptor()
+                    -> setAllowedOutputTypes({ALL_FLOATS})
+                    -> setAllowedInputTypes({ALL_INTS, ALL_FLOATS});
+        }
+
+        DECLARE_SYN(fake_quant_with_min_max_args_per_channel, fake_quant_with_min_max_vars_per_channel);
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h
index 8f6849ef7..e0dc55937 100644
--- a/libnd4j/include/ops/declarable/headers/parity_ops.h
+++ b/libnd4j/include/ops/declarable/headers/parity_ops.h
@@ -1747,6 +1747,9 @@ namespace nd4j {
         #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars)
         DECLARE_CONFIGURABLE_OP(fake_quant_with_min_max_vars, 3, 1, true, 0, -2);
         #endif
+        #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars_per_channel)
+        DECLARE_CONFIGURABLE_OP(fake_quant_with_min_max_vars_per_channel, 3, 1, true, 0, -2);
+        #endif
 
     /**
      * compare_and_bitpack - compare with greater and pack result with uint8
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
index df162474f..88c451ffb 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
@@ -93,6 +93,10 @@ namespace helpers {
     void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
         BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES);
     }
+    void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
+        BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES);
+    }
+
     BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVars_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES);
 
 }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
index 9bb331685..4e62aafa8 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
@@ -91,6 +91,10 @@ namespace helpers {
     void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
         BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES);
     }
+    void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
+        BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES);
+    }
+
     BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVars_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES);
 
 }
diff --git a/libnd4j/include/ops/declarable/helpers/fake_quantization.h b/libnd4j/include/ops/declarable/helpers/fake_quantization.h
index aa0941db4..7a43a15cb 100644
--- a/libnd4j/include/ops/declarable/helpers/fake_quantization.h
+++ b/libnd4j/include/ops/declarable/helpers/fake_quantization.h
@@ -27,6 +27,7 @@ namespace ops {
 namespace helpers {
 
     void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output);
+    void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output);
 }
 }
 }

From 3a89e518115b4123fb44ff014a14624b3fa84c71 Mon Sep 17 00:00:00 2001
From: shugeo
Date: Wed, 9 Oct 2019 13:38:18 +0300
Subject: [PATCH 02/15] Added tests
for fake_quant_with_min_max_vars_per_channel op. --- .../layers_tests/DeclarableOpsTests10.cpp | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 446763096..191ee8524 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -2154,6 +2154,62 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_2) { delete results; } +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { + + NDArray x = NDArrayFactory::create('c', {1,2,3,1}, {-63.80, -63.75, -63.4, -63.5, 0.0, 0.1}); + NDArray exp = NDArrayFactory::create('c', {1,2,3,1}, {-63.75, -63.75, -63.251953, -63.251953, 0.0, 0.0}); + NDArray min = NDArrayFactory::create(-63.65); + NDArray max = NDArrayFactory::create(0.1); + + nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + auto results = op.execute({&x, &min, &max}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer("Quantized2"); + ASSERT_TRUE(exp.isSameShapeStrict(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { + + NDArray x = NDArrayFactory::create('c', {2,4,5,3}); + NDArray exp = NDArrayFactory::create('c', {2,4,5,3}, + {1.0588236, 1.9607843, 3.019608, 4.0588236, 5.098039, 6.039216, 7.0588236, 8.039216, 9.058824, + 10.058824, 10.980392, 12.078432, 13.058824, 13.921569, 15.09804, 16.058825, 17.058825, 18.117647, + 19.058825, 20., 21.137257, 22.058825, 22.941177, 23.882355, 25.058825, 26.078432, 26.901962, + 28.058825, 29.019608, 29.92157, 31.058825, 31.960785, 32.941177, 34.058823, 35.09804, 35.960785, + 37.058823, 38.039215, 38.980392, 40.058823, 40.980392, 42.000004, 43.058826, 43.92157, 45.01961, + 45., 47.058823, 48.03922, 45., 50., 51.058826, 45., 50., 54.078434, + 45., 50., 57.09804, 45., 50., 60.11765, 45., 50., 62.862747, + 45., 50., 65.882355, 45., 50., 68.90196, 45., 50., 70., + 45., 50., 70., 45., 50., 70., 45., 50., 70., + 45., 50., 70., 45., 50., 70., 45., 50., 70., + 45., 50., 70., 45., 50., 70., 45., 50., 70., + 45., 50., 70., 45., 50., 70., 45., 50., 70., + 45., 50., 70., 45., 50., 70., 45., 50., 70., + 45., 50., 70.}); + NDArray min = NDArrayFactory::create({20., 20., 20.}); + NDArray max = NDArrayFactory::create({65., 70., 90.}); + + nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + auto results = op.execute({&x, &min, &max}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer("Quantized2"); + ASSERT_TRUE(exp.isSameShapeStrict(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + //////////////////////////////////////////////////////////////////// TYPED_TEST(TypedDeclarableOpsTests10, batchnorm_new_test1) { From d0cbd33b0e7cff44807a56b9514c23a1247093d5 Mon Sep 17 00:00:00 2001 From: shugeo Date: Wed, 9 Oct 2019 15:52:13 +0300 Subject: [PATCH 03/15] Added input checks for op. 
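
For reference, this is the kind of call the new checks are meant to accept. The snippet mirrors the FakeQuantWithMinMaxVars_Test_4 case added earlier in this series; the <double> template arguments and the surrounding test scaffolding are assumed here, not part of this patch:

    // x has last dimension 3, so min and max must be rank-1 arrays of length 3
    NDArray x   = NDArrayFactory::create<double>('c', {2, 4, 5, 3});
    NDArray min = NDArrayFactory::create<double>({20., 20., 20.});
    NDArray max = NDArrayFactory::create<double>({65., 70., 90.});
    x.linspace(1.);

    nd4j::ops::fake_quant_with_min_max_vars_per_channel op;
    auto results = op.execute({&x, &min, &max}, {}, {});  // passes the new rank/length checks
    delete results;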
--- ...ke_quant_with_min_max_vars_per_channel.cpp | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp index ba8eb9e7b..4d719b38b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp @@ -28,22 +28,19 @@ namespace nd4j { CONFIGURABLE_OP_IMPL(fake_quant_with_min_max_vars_per_channel, 1, 1, true, 0, 0) { auto x = INPUT_VARIABLE(0); - - NDArray* min; - NDArray* max; + auto min = INPUT_VARIABLE(1); + auto max = INPUT_VARIABLE(2); REQUIRE_TRUE(block.width() == 3 || block.getTArguments()->size() == 2, 0, "fake_quant_with_min_max_vars_per_channel: No minimum/maximum values provided by either input arrays or TArgs"); + auto depth = x->sizeAt(-1); + REQUIRE_TRUE(min->rankOf() == 1 && max->rankOf() == 1 && min->lengthOf() == max->lengthOf(), 0, + "fake_quant_with_min_max_vars_per_channel: Min and Max should be 1D tensors with the same length"); + REQUIRE_TRUE(depth == min->lengthOf(), 0, "fake_quant_with_min_max_vars_per_channel: Min length should be" + " %lld, but %lld occurs.", depth, min->lengthOf()); - NDArray m; - NDArray m2; - if(block.width() == 3){ - min = INPUT_VARIABLE(1); - max = INPUT_VARIABLE(2); - } else if(block.getTArguments()->size() == 2){ - m = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); - m2 = NDArrayFactory::create(x->dataType(), T_ARG(1), block.launchContext()); - min = &m; - max = &m2; + REQUIRE_TRUE(depth == max->lengthOf(), 0, "fake_quant_with_min_max_vars_per_channel: Max length should be" + "%lld, but %lld occurs.", depth, max->lengthOf()); + if(block.width() == 3) { } auto output = OUTPUT_VARIABLE(0); int numBits = 8; @@ -54,7 +51,9 @@ namespace nd4j { if (block.getIArguments()->size() == 2) { numBits = INT_ARG(0); narrowed = INT_ARG(1); - REQUIRE_TRUE(numBits > 1 && numBits < 17, 0, "fake_quant_with_min_max_vars_per_channel: Number of bits for quatization should be in between 2 and 16, but %i was given.", numBits); + REQUIRE_TRUE(numBits > 1 && numBits < 17, 0, "fake_quant_with_min_max_vars_per_channel: Number of bits" + " for quatization should be in between 2 and 16, but %i " + "was given.", numBits); } helpers::fakeQuantWithMinMaxVarsPerChannel(x, min, max, numBits, narrowed, output); return ND4J_STATUS_OK; From 352f1eee802475e88ba8b30a757f4b965875afa3 Mon Sep 17 00:00:00 2001 From: shugeo Date: Wed, 9 Oct 2019 21:39:59 +0300 Subject: [PATCH 04/15] Implemented fake_quant_with_min_max_per_channel helper for cpu platform. The first approach. 
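
The element-wise math the new helper implements, written out as a self-contained scalar sketch (function and variable names here are illustrative, not the helper's actual signature; the real code is templated and works on NDArrays):

    #include <cmath>

    // Nudge [min, max] onto the integer grid [quantMin, quantMax], clamp x into the
    // nudged range, then snap it to the nearest multiple of scale.
    static float fakeQuantScalar(float x, float min, float max, int quantMin, int quantMax) {
        float scale            = (max - min) / float(quantMax - quantMin);
        float zeroPointFromMin = quantMin - min / scale;
        float nudgedZeroPoint  = zeroPointFromMin < quantMin ? float(quantMin)
                               : zeroPointFromMin > quantMax ? float(quantMax)
                               : std::round(zeroPointFromMin);
        float nudgedMin = (quantMin - nudgedZeroPoint) * scale;
        float nudgedMax = (quantMax - nudgedZeroPoint) * scale;
        float clamped   = x < nudgedMin ? nudgedMin : (x > nudgedMax ? nudgedMax : x);
        return std::floor((clamped - nudgedMin) / scale + 0.5f) * scale + nudgedMin;
    }

The per-channel helper applies essentially this formula, with min/max taken per channel along the last input dimension.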
--- .../helpers/cpu/fake_quantization.cpp | 75 +++++++++++++++---- .../layers_tests/DeclarableOpsTests10.cpp | 19 +++-- 2 files changed, 73 insertions(+), 21 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index 88c451ffb..a2d0c3c59 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -25,6 +25,55 @@ namespace nd4j { namespace ops { namespace helpers { + template + static void Nudge(T min, T max, T quant_min, T quant_max, T* scale, T* nudged_min, T* nudged_max) { + *scale = (max - min) / (quant_max - quant_min); + auto zero_point_from_min = quant_min - min / *scale; + uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max] { + if (zero_point_from_min < quant_min) { + return static_cast(quant_min); + } + if (zero_point_from_min > quant_max) { + return static_cast(quant_max); + } + return nd4j::math::nd4j_round(zero_point_from_min); + }(); + *nudged_min = (quant_min - nudged_zero_point) * (*scale); + *nudged_max = (quant_max - nudged_zero_point) * (*scale); + } + + template + void fakeQuantWithMinMaxVarsPerChannel_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + int lowIntBound = narrowed ? 1 : 0; + int upperIntBound = 1 << numBits - 1; + + const float quant_min_float = static_cast(lowIntBound); + const float quant_max_float = static_cast(upperIntBound); +// auto scaleTensor(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); + auto clamped(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); + for (auto i = 0; i < min->lengthOf(); i++) { + T scale, nudged_min, nudged_max; + Nudge(min->t(i), max->t(i), quant_min_float, quant_max_float, &scale, &nudged_min, &nudged_max); + auto wiseMinMax = LAMBDA_T(x, nudged_min, nudged_max) { + if (x < nudged_min) { + return nudged_min; + } + else if (x > nudged_max) + return nudged_max; + return x; + }; +// scaleTensor.assign(scale); + input->applyLambda(wiseMinMax, &clamped); + clamped -= nudged_min; + // auto nudgedScale = scale; + clamped /= scale; + clamped += T(0.5f); + clamped.applyTransform(transform::Floor, output, nullptr); + (*output) *= scale; + (*output) += nudged_min; + } + } + template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 
1 : 0; @@ -35,15 +84,15 @@ namespace helpers { T scale = (max->t(0) - min->t(0)) / (quant_max_float - quant_min_float); const T zero_point_from_min = quant_min_float - min->e(0) / scale; const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound, - quant_min_float, upperIntBound, - quant_max_float] { - if (zero_point_from_min < quant_min_float) { - return static_cast(lowIntBound); - } - if (zero_point_from_min > quant_max_float) { - return static_cast(upperIntBound); - } - return static_cast(roundf(zero_point_from_min)); + quant_min_float, upperIntBound, + quant_max_float] { + if (zero_point_from_min < quant_min_float) { + return static_cast(lowIntBound); + } + if (zero_point_from_min > quant_max_float) { + return static_cast(upperIntBound); + } + return static_cast(roundf(zero_point_from_min)); }(); auto nudged_min = (quant_min_float - nudged_zero_point) * (scale); @@ -71,10 +120,10 @@ namespace helpers { clamped.applyLambda(wiseMax, output); // const auto clamped_shifted = clamped - nudged_min; *output -= nudged_min; - // auto nudgedScale = scale; + // auto nudgedScale = scale; (*output) /= scaleTensor; - (*output) += T(0.5f); - output->applyTransform(transform::Floor, nullptr, nullptr); +// (*output) += T(0.5f); + output->applyTransform(transform::Round, nullptr, nullptr); (*output) *= scaleTensor; (*output) += nudged_min; //output->printIndexedBuffer("FAKE QUANTED"); @@ -94,7 +143,7 @@ namespace helpers { BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVarsPerChannel_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVars_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 191ee8524..a6edb23c7 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -2159,8 +2159,8 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { NDArray x = NDArrayFactory::create('c', {1,2,3,1}, {-63.80, -63.75, -63.4, -63.5, 0.0, 0.1}); NDArray exp = NDArrayFactory::create('c', {1,2,3,1}, {-63.75, -63.75, -63.251953, -63.251953, 0.0, 0.0}); - NDArray min = NDArrayFactory::create(-63.65); - NDArray max = NDArrayFactory::create(0.1); + NDArray min = NDArrayFactory::create('c', {1},{-63.65}); + NDArray max = NDArrayFactory::create('c', {1}, {0.1}); nd4j::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.execute({&x, &min, &max}, {}, {}); @@ -2178,8 +2178,8 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { - NDArray x = NDArrayFactory::create('c', {2,4,5,3}); - NDArray exp = NDArrayFactory::create('c', {2,4,5,3}, + NDArray x = NDArrayFactory::create('c', {2,4,5,3}); + NDArray exp = NDArrayFactory::create('c', {2,4,5,3}, {1.0588236, 1.9607843, 3.019608, 4.0588236, 5.098039, 6.039216, 7.0588236, 8.039216, 
9.058824, 10.058824, 10.980392, 12.078432, 13.058824, 13.921569, 15.09804, 16.058825, 17.058825, 18.117647, 19.058825, 20., 21.137257, 22.058825, 22.941177, 23.882355, 25.058825, 26.078432, 26.901962, @@ -2194,16 +2194,19 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { 45., 50., 70., 45., 50., 70., 45., 50., 70., 45., 50., 70., 45., 50., 70., 45., 50., 70., 45., 50., 70.}); - NDArray min = NDArrayFactory::create({20., 20., 20.}); - NDArray max = NDArrayFactory::create({65., 70., 90.}); - + NDArray min = NDArrayFactory::create({20., 20., 20.}); + NDArray max = NDArrayFactory::create({65., 70., 90.}); + x.linspace(1.); nd4j::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.execute({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printIndexedBuffer("Quantized2"); + result->printBuffer("Quantized per channels 4"); + exp.printBuffer("Quantized per channest E"); + auto diff = *result - exp; + diff.printIndexedBuffer("Difference"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); From 3c0c59ab889875bd0d0d6f542569490c4e1d5ddb Mon Sep 17 00:00:00 2001 From: shugeo Date: Wed, 9 Oct 2019 22:09:33 +0300 Subject: [PATCH 05/15] Refactored fake_quant_with_min_max_vars op. --- .../helpers/cpu/fake_quantization.cpp | 74 ++++++------------- 1 file changed, 21 insertions(+), 53 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index a2d0c3c59..28437359e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -74,6 +74,20 @@ namespace helpers { } } + template + static void WiseMinMax(NDArray* input, T min, T max, NDArray* output) { + auto wiseMinMax = LAMBDA_T(x, min, max) { + if (x < min) { + return min; + } + else if (x > max) + return max; + return x; + }; + + input->applyLambda(wiseMinMax, output); + } + template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 
1 : 0; @@ -81,62 +95,16 @@ namespace helpers { const float quant_min_float = static_cast(lowIntBound); const float quant_max_float = static_cast(upperIntBound); - T scale = (max->t(0) - min->t(0)) / (quant_max_float - quant_min_float); - const T zero_point_from_min = quant_min_float - min->e(0) / scale; - const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound, - quant_min_float, upperIntBound, - quant_max_float] { - if (zero_point_from_min < quant_min_float) { - return static_cast(lowIntBound); - } - if (zero_point_from_min > quant_max_float) { - return static_cast(upperIntBound); - } - return static_cast(roundf(zero_point_from_min)); - }(); + T nudged_min, nudged_max, scale; - auto nudged_min = (quant_min_float - nudged_zero_point) * (scale); - auto nudged_max = (quant_max_float - nudged_zero_point) * (scale); - //input->applyScalar(scalar::CompareAndSet, nudged_max, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min); - //input->applyScalar(scalar::CompareAndSet, nudged_min, clamped, nullptr); //.cwiseMin(nudged_max).cwiseMax(nudged_min); - auto wiseMax = LAMBDA_T(x, nudged_min) { - if (x < nudged_min) { - return nudged_min; - } - return x; - - }; - auto wiseMin = LAMBDA_T(x, nudged_max) { - if (x > nudged_max) { - return nudged_max; - } - return x; - }; - auto scaleTensor(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); - auto clamped(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); - scaleTensor.assign(scale); - input->applyLambda(wiseMin, &clamped); -// const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); - clamped.applyLambda(wiseMax, output); -// const auto clamped_shifted = clamped - nudged_min; + Nudge(min->t(0), max->t(0), quant_min_float, quant_max_float, &scale, &nudged_min, &nudged_max); + WiseMinMax(input, nudged_min, nudged_max, output); *output -= nudged_min; - // auto nudgedScale = scale; - (*output) /= scaleTensor; -// (*output) += T(0.5f); - output->applyTransform(transform::Round, nullptr, nullptr); - (*output) *= scaleTensor; + (*output) /= scale; + (*output) += T(0.5f); + output->applyTransform(transform::Floor, nullptr, nullptr); + (*output) *= scale; (*output) += nudged_min; - //output->printIndexedBuffer("FAKE QUANTED"); - /* - const auto nudged_scale_repl = inputs.constant(nudged_scale); - - const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); - const auto clamped_shifted = clamped - nudged_min; - *output = (clamped_shifted / nudged_scale_repl + 0.5f).floor() * - nudged_scale_repl + - nudged_min; -*/ - } void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { From c13e945a96491b1d4451a5fccbc1c594d5c9c1e4 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 13:23:11 +0300 Subject: [PATCH 06/15] Fixed fake_quant_with_min_max_vars op and tests. 
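
One of the corrections is the quantization upper bound: 1 << numBits - 1 parses as 1 << (numBits - 1), because subtraction binds tighter than the shift, so 8 bits gave an upper bound of 128 instead of 255. The expected test values change accordingly (for example -63.4 now quantizes to -63.5 rather than -63.251953):

    int numBits = 8;
    int wrong   = 1 << numBits - 1;    // parsed as 1 << (numBits - 1) -> 128
    int fixed   = (1 << numBits) - 1;  // intended upper bound         -> 255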
--- .../helpers/cpu/fake_quantization.cpp | 59 +++++++-------- .../layers_tests/DeclarableOpsTests10.cpp | 75 ++++++++++++++++--- 2 files changed, 91 insertions(+), 43 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index 28437359e..622934407 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -26,51 +26,44 @@ namespace ops { namespace helpers { template - static void Nudge(T min, T max, T quant_min, T quant_max, T* scale, T* nudged_min, T* nudged_max) { - *scale = (max - min) / (quant_max - quant_min); - auto zero_point_from_min = quant_min - min / *scale; - uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max] { - if (zero_point_from_min < quant_min) { + static void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { + T quant_max_float = static_cast(quant_max); + T quant_min_float = static_cast(quant_min); + *scale = (max - min) / (quant_max_float - quant_min_float); + auto zero_point_from_min = quant_min_float - min / *scale; + uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max, quant_max_float, quant_min_float] { + if (zero_point_from_min < quant_min_float) { return static_cast(quant_min); } - if (zero_point_from_min > quant_max) { + if (zero_point_from_min > quant_max_float) { return static_cast(quant_max); } return nd4j::math::nd4j_round(zero_point_from_min); }(); - *nudged_min = (quant_min - nudged_zero_point) * (*scale); - *nudged_max = (quant_max - nudged_zero_point) * (*scale); + *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); + *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); } template void fakeQuantWithMinMaxVarsPerChannel_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 
1 : 0; - int upperIntBound = 1 << numBits - 1; + int upperIntBound = (1 << numBits) - 1; + auto channels = input->sizeAt(-1); - const float quant_min_float = static_cast(lowIntBound); - const float quant_max_float = static_cast(upperIntBound); -// auto scaleTensor(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); - auto clamped(*input); // = NDArrayFactory::create(input->ordering(), input->getShapeAsVector(), input->getWorkspace()); - for (auto i = 0; i < min->lengthOf(); i++) { + PRAGMA_OMP_PARALLEL_FOR + for (auto i = 0; i < channels; i++) { T scale, nudged_min, nudged_max; - Nudge(min->t(i), max->t(i), quant_min_float, quant_max_float, &scale, &nudged_min, &nudged_max); - auto wiseMinMax = LAMBDA_T(x, nudged_min, nudged_max) { - if (x < nudged_min) { - return nudged_min; - } - else if (x > nudged_max) - return nudged_max; - return x; - }; -// scaleTensor.assign(scale); - input->applyLambda(wiseMinMax, &clamped); - clamped -= nudged_min; - // auto nudgedScale = scale; - clamped /= scale; - clamped += T(0.5f); - clamped.applyTransform(transform::Floor, output, nullptr); - (*output) *= scale; - (*output) += nudged_min; + Nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); + + for (auto e = 0; e < input->lengthOf(); e += channels) { + T val = input->t(e + i); + if ( val <= nudged_min) + val = nudged_min; + else if (val >= nudged_max) + val = nudged_max; + + output->t(e + i) = math::nd4j_floor((val - nudged_min)/scale + T(0.5)) * scale + nudged_min; + } } } @@ -91,7 +84,7 @@ namespace helpers { template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 1 : 0; - int upperIntBound = 1 << numBits - 1; + int upperIntBound = (1 << numBits) - 1; const float quant_min_float = static_cast(lowIntBound); const float quant_max_float = static_cast(upperIntBound); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index a6edb23c7..6ae982cf8 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -2117,7 +2117,7 @@ TEST_F(DeclarableOpsTests10, Image_DrawBoundingBoxes_2) { TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_1) { NDArray x('c', {2,3}, {-63.80f, -63.75f, -63.70f, -63.5f, 0.0f, 0.1f}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,3}, {-63.75f, -63.75f, -63.75f, -63.251953f, 0.0f, 0.0f}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2,3}, {-63.75, -63.75, -63.75, -63.5, 0., 0.}, nd4j::DataType::FLOAT32); NDArray min('c', {}, {-63.65f}, nd4j::DataType::FLOAT32); NDArray max('c', {}, {0.1f}, nd4j::DataType::FLOAT32); @@ -2127,7 +2127,8 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printIndexedBuffer("Quantized"); + result->printBuffer("Quantized"); + exp.printBuffer("Expected"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -2137,7 +2138,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_1) { TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_2) { NDArray x = NDArrayFactory::create('c', {2,3}, {-63.80, -63.75, -63.4, -63.5, 0.0, 0.1}); - NDArray exp = NDArrayFactory::create('c', {2,3}, {-63.75, -63.75, -63.251953, -63.251953, 0.0, 0.0}); + NDArray exp = NDArrayFactory::create('c', 
{2,3}, {-63.75, -63.75, -63.5 , -63.5 , 0. , 0. }); NDArray min = NDArrayFactory::create(-63.65); NDArray max = NDArrayFactory::create(0.1); @@ -2158,7 +2159,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_2) { TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { NDArray x = NDArrayFactory::create('c', {1,2,3,1}, {-63.80, -63.75, -63.4, -63.5, 0.0, 0.1}); - NDArray exp = NDArrayFactory::create('c', {1,2,3,1}, {-63.75, -63.75, -63.251953, -63.251953, 0.0, 0.0}); + NDArray exp = NDArrayFactory::create('c', {1,2,3,1}, {-63.75, -63.75, -63.5 , -63.5 , 0. , 0. }); NDArray min = NDArrayFactory::create('c', {1},{-63.65}); NDArray max = NDArrayFactory::create('c', {1}, {0.1}); @@ -2179,8 +2180,8 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { NDArray x = NDArrayFactory::create('c', {2,4,5,3}); - NDArray exp = NDArrayFactory::create('c', {2,4,5,3}, - {1.0588236, 1.9607843, 3.019608, 4.0588236, 5.098039, 6.039216, 7.0588236, 8.039216, 9.058824, + NDArray exp = NDArrayFactory::create('c', {2,4,5,3},{ + 1.0588236, 1.9607843, 3.019608, 4.0588236, 5.098039, 6.039216, 7.0588236, 8.039216, 9.058824, 10.058824, 10.980392, 12.078432, 13.058824, 13.921569, 15.09804, 16.058825, 17.058825, 18.117647, 19.058825, 20., 21.137257, 22.058825, 22.941177, 23.882355, 25.058825, 26.078432, 26.901962, 28.058825, 29.019608, 29.92157, 31.058825, 31.960785, 32.941177, 34.058823, 35.09804, 35.960785, @@ -2203,10 +2204,64 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printBuffer("Quantized per channels 4"); - exp.printBuffer("Quantized per channest E"); - auto diff = *result - exp; - diff.printIndexedBuffer("Difference"); +// result->printBuffer("Quantized per channels 4"); +// exp.printBuffer("Quantized per channest E"); +// auto diff = *result - exp; +// diff.printIndexedBuffer("Difference"); + ASSERT_TRUE(exp.isSameShapeStrict(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_5) { + NDArray x = NDArrayFactory::create('c', {2, 3, 5, 4}); + NDArray exp = NDArrayFactory::create('c', {2, 3, 5, 4},{ + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -19.92157 , -18.980392 , -18.039217 , -16.941177 , + -16. , -15.058824 , -13.960785 , -13.0196085 , + -11.92157 , -10.980392 , -10.039217 , -8.941177 , + -8.000001 , -7.0588236 , -5.960785 , -5.0196085 , + -3.9215698 , -2.9803925 , -2.039217 , -0.94117737, + 0. , 0.94117737, 2.039215 , 2.9803925 , + 4.07843 , 5.0196075 , 5.960783 , 7.0588226 , + 8. , 8.941177 , 10.039215 , 10.980392 , + 12.07843 , 13.019608 , 13.960783 , 15.058823 , + 16. 
, 16.941177 , 18.039217 , 18.980392 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 , + 20.07843 , 21.019608 , 21.960783 , 23.058823 + }); + NDArray min = NDArrayFactory::create({-20., -19., -18., -17}); + NDArray max = NDArrayFactory::create({20., 21., 22., 23}); + x.linspace(-60.); + nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + auto results = op.execute({&x, &min, &max}, {}, {}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); +// result->printBuffer("Quantized per channels 5"); +// exp.printBuffer("Quantized per channest E"); +// auto diff = *result - exp; +// diff.printIndexedBuffer("Difference"); + ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); From 753565145c7c4d1cdbfe3380a26e003083c29821 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 14:00:49 +0300 Subject: [PATCH 07/15] Refactored fake_quant_with_min_max_vars op cuda implementation. --- .../helpers/cuda/fake_quantization.cu | 71 ++++++++----------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index 4e62aafa8..9def35152 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -34,58 +34,47 @@ namespace helpers { // output - output tensor // template - void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { - int lowIntBound = narrowed?1:0; - int upperIntBound = 1 << numBits - 1; - min->syncToHost(); - max->syncToHost(); - const float quant_min_float = static_cast(lowIntBound); - const float quant_max_float = static_cast(upperIntBound); - T scale = (max->t(0) - min->t(0)) / (quant_max_float - quant_min_float); - const T zero_point_from_min = quant_min_float - min->t(0) / scale; - - const uint16_t nudged_zero_point = [zero_point_from_min, lowIntBound, - quant_min_float, upperIntBound, - quant_max_float] { + static void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { + T quant_max_float = static_cast(quant_max); + T quant_min_float = static_cast(quant_min); + *scale = (max - min) / (quant_max_float - quant_min_float); + auto zero_point_from_min = quant_min_float - min / *scale; + uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max, quant_max_float, quant_min_float] { if (zero_point_from_min < quant_min_float) { - return static_cast(lowIntBound); + return static_cast(quant_min); } if (zero_point_from_min > quant_max_float) { - return static_cast(upperIntBound); + return static_cast(quant_max); } - return static_cast(roundf(zero_point_from_min)); + return nd4j::math::nd4j_round(zero_point_from_min); }(); + *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); + *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); + } - auto nudged_min = (quant_min_float - nudged_zero_point) * (scale); - auto nudged_max = (quant_max_float - nudged_zero_point) * (scale); + template + 
void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + int lowIntBound = narrowed?1:0; + int upperIntBound = (1 << numBits) - 1; + min->syncToHost(); + max->syncToHost(); + T scale, nudged_min, nudged_max; + Nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); - auto wiseMax = LAMBDA_T(x, nudged_min) { + auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) { + T val = x; if (x < nudged_min) { - return nudged_min; + val = nudged_min; } - return x; + else if (x > nudged_max) { + val = nudged_max; + } + else + val = x; + return (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); }; - auto wiseMin = LAMBDA_T(x, nudged_max) { - if (x > nudged_max) { - return nudged_max; - } - return x; - }; - - auto scaleTensor(*input); - auto clamped(*input); - scaleTensor.assign(scale); - input->applyLambda(wiseMin, &clamped); - - clamped.applyLambda(wiseMax, output); - *output -= nudged_min; - - (*output) /= scaleTensor; - (*output) += T(0.5f); - output->applyTransform(transform::Floor, nullptr, nullptr); - (*output) *= scaleTensor; - (*output) += nudged_min; + input->applyLambda(wiseMinMaxAndSoOn, output); } void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { From 3504b0cda99fbb31fb2735a90c7b1d82fc4ae25c Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 15:44:50 +0300 Subject: [PATCH 08/15] Implemented fake_quant_with_min_max_vars_per_channel fop cuda helper. The first working revision. --- .../helpers/cuda/fake_quantization.cu | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index 9def35152..d491d056a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -77,14 +77,45 @@ namespace helpers { input->applyLambda(wiseMinMaxAndSoOn, output); } + template + void fakeQuantWithMinMaxVarsPerChannel_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + int lowIntBound = narrowed?1:0; + int upperIntBound = (1 << numBits) - 1; + min->syncToHost(); + max->syncToHost(); + T scale, nudged_min, nudged_max; + auto channels = min->lengthOf(); + input->syncToHost(); + input->syncToDevice(); + output->syncToHost(); + for (auto i = 0; i < channels; i++) { + Nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); + + //auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) { + for (auto e = 0; e < input->lengthOf(); e += channels) { + T val = input->t(e + i); + if (val < nudged_min) { + val = nudged_min; + } else if (val > nudged_max) { + val = nudged_max; + } + + output->t(e + i) = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); + }; + } + output->syncToDevice(); + output->tickWriteDevice(); + } + void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), 
fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVarsPerChannel_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVars_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVarsPerChannel_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); } } From 02d86166927cc804f2e74ab6f761fc92e4e491a6 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 16:40:56 +0300 Subject: [PATCH 09/15] Implementation of cuda kernel for fake_quant_with_min_max_vars_per_channels op. --- ...ke_quant_with_min_max_vars_per_channel.cpp | 2 +- .../helpers/cpu/fake_quantization.cpp | 2 +- .../helpers/cuda/fake_quantization.cu | 54 +++++++++++-------- .../declarable/helpers/fake_quantization.h | 2 +- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp index 4d719b38b..e9bab7a1a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp @@ -55,7 +55,7 @@ namespace nd4j { " for quatization should be in between 2 and 16, but %i " "was given.", numBits); } - helpers::fakeQuantWithMinMaxVarsPerChannel(x, min, max, numBits, narrowed, output); + helpers::fakeQuantWithMinMaxVarsPerChannel(block.launchContext(), x, min, max, numBits, narrowed, output); return ND4J_STATUS_OK; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index 622934407..b09587cf7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -103,7 +103,7 @@ namespace helpers { void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } - void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + void fakeQuantWithMinMaxVarsPerChannel(LaunchContext* context, NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVarsPerChannel_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index d491d056a..75a81f75a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -34,7 +34,7 @@ namespace helpers { // output - output tensor // template - static void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { + static __host__ __device__ void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { T quant_max_float = 
static_cast(quant_max); T quant_min_float = static_cast(quant_min); *scale = (max - min) / (quant_max_float - quant_min_float); @@ -78,44 +78,54 @@ namespace helpers { } template - void fakeQuantWithMinMaxVarsPerChannel_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { - int lowIntBound = narrowed?1:0; - int upperIntBound = (1 << numBits) - 1; - min->syncToHost(); - max->syncToHost(); - T scale, nudged_min, nudged_max; - auto channels = min->lengthOf(); - input->syncToHost(); - input->syncToDevice(); - output->syncToHost(); - for (auto i = 0; i < channels; i++) { - Nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); + static __global__ void fakeQuantWithMinMaxKernel(T* input, Nd4jLong* inputShape, T* min, T* max, + int lowIntBound, int upperIntBound, Nd4jLong channels, + T* output, Nd4jLong* outputShape, Nd4jLong length) { + for (auto i = blockIdx.x; i < (int)channels; i += gridDim.x) { + T scale, nudged_min, nudged_max; + Nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); //auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) { - for (auto e = 0; e < input->lengthOf(); e += channels) { - T val = input->t(e + i); + for (auto e = threadIdx.x; e < (int)length; e += (int)channels) { + T val = input[shape::getIndexOffset(e + i, inputShape)]; if (val < nudged_min) { val = nudged_min; } else if (val > nudged_max) { val = nudged_max; } - - output->t(e + i) = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); + output[shape::getIndexOffset(e + i, outputShape)] = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); }; } - output->syncToDevice(); - output->tickWriteDevice(); + + } + + template + void fakeQuantWithMinMaxVarsPerChannel_(LaunchContext* context, NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + int lowIntBound = narrowed?1:0; + int upperIntBound = (1 << numBits) - 1; + auto channels = min->lengthOf(); + auto length = input->lengthOf(); + NDArray::prepareSpecialUse({output}, {min, max, input}); + auto stream = context->getCudaStream(); + T* inputBuf = input->dataBuffer()->specialAsT(); + T* outputBuf = output->dataBuffer()->specialAsT(); + T* minBuf = min->dataBuffer()->specialAsT(); + T* maxBuf = max->dataBuffer()->specialAsT(); + fakeQuantWithMinMaxKernel<<<1, 1, 256, *stream>>>(inputBuf, input->specialShapeInfo(), + minBuf, maxBuf, lowIntBound, upperIntBound, channels, outputBuf, output->specialShapeInfo(), length); + NDArray::registerSpecialUse({output}, {min, max, input}); + } void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVars_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); } - void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVarsPerChannel_, (input, min, max, numBits, narrowed, output), FLOAT_TYPES); + void fakeQuantWithMinMaxVarsPerChannel(LaunchContext* context, NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { + BUILD_SINGLE_SELECTOR(input->dataType(), fakeQuantWithMinMaxVarsPerChannel_, (context, input, min, max, numBits, narrowed, output), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVars_, (NDArray* 
input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); - BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVarsPerChannel_, (NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void fakeQuantWithMinMaxVarsPerChannel_, (LaunchContext* context, NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/fake_quantization.h b/libnd4j/include/ops/declarable/helpers/fake_quantization.h index 7a43a15cb..cadd8be7c 100644 --- a/libnd4j/include/ops/declarable/helpers/fake_quantization.h +++ b/libnd4j/include/ops/declarable/helpers/fake_quantization.h @@ -27,7 +27,7 @@ namespace ops { namespace helpers { void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output); - void fakeQuantWithMinMaxVarsPerChannel(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output); + void fakeQuantWithMinMaxVarsPerChannel(LaunchContext* context, NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output); } } } From d5b352273dc17aa28bceeaf9707201274c6d8518 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 16:51:29 +0300 Subject: [PATCH 10/15] Implementation of cuda kernel for fake_quant_with_min_max_vars_per_channels op. Final revision. --- .../declarable/helpers/cuda/fake_quantization.cu | 13 +++++++++---- .../tests_cpu/layers_tests/DeclarableOpsTests10.cpp | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index 75a81f75a..893e016ed 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -81,19 +81,24 @@ namespace helpers { static __global__ void fakeQuantWithMinMaxKernel(T* input, Nd4jLong* inputShape, T* min, T* max, int lowIntBound, int upperIntBound, Nd4jLong channels, T* output, Nd4jLong* outputShape, Nd4jLong length) { + __shared__ int block; + if (threadIdx.x == 0) { + block = length / channels; + } + __syncthreads(); for (auto i = blockIdx.x; i < (int)channels; i += gridDim.x) { T scale, nudged_min, nudged_max; Nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); //auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) { - for (auto e = threadIdx.x; e < (int)length; e += (int)channels) { - T val = input[shape::getIndexOffset(e + i, inputShape)]; + for (auto e = threadIdx.x; e < block; e += blockDim.x) { + T val = input[shape::getIndexOffset(e * channels + i, inputShape)]; if (val < nudged_min) { val = nudged_min; } else if (val > nudged_max) { val = nudged_max; } - output[shape::getIndexOffset(e + i, outputShape)] = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); + output[shape::getIndexOffset(e* channels + i, outputShape)] = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min); }; } @@ -111,7 +116,7 @@ namespace helpers { T* outputBuf = output->dataBuffer()->specialAsT(); T* minBuf = min->dataBuffer()->specialAsT(); T* maxBuf = max->dataBuffer()->specialAsT(); - fakeQuantWithMinMaxKernel<<<1, 1, 256, *stream>>>(inputBuf, input->specialShapeInfo(), + fakeQuantWithMinMaxKernel<<<128, 256, 256, *stream>>>(inputBuf, 
input->specialShapeInfo(), minBuf, maxBuf, lowIntBound, upperIntBound, channels, outputBuf, output->specialShapeInfo(), length); NDArray::registerSpecialUse({output}, {min, max, input}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 6ae982cf8..0652a398e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -2127,8 +2127,8 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printBuffer("Quantized"); - exp.printBuffer("Expected"); +// result->printBuffer("Quantized"); +// exp.printBuffer("Expected"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); From 92636b0b864ea877184f008813972606a32981b4 Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 17:08:59 +0300 Subject: [PATCH 11/15] Eliminated waste operator. --- .../parity_ops/fake_quant_with_min_max_vars_per_channel.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp index e9bab7a1a..e5873d9dd 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp @@ -40,8 +40,7 @@ namespace nd4j { REQUIRE_TRUE(depth == max->lengthOf(), 0, "fake_quant_with_min_max_vars_per_channel: Max length should be" "%lld, but %lld occurs.", depth, max->lengthOf()); - if(block.width() == 3) { - } + auto output = OUTPUT_VARIABLE(0); int numBits = 8; if (block.getIArguments() && block.getIArguments()->size()) From a09cb5e2be064e6aae44add1d55fbbe158dfe89e Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 17:13:33 +0300 Subject: [PATCH 12/15] Added doc for fake_quant_with_min_max_per_channel op declaration. --- .../include/ops/declarable/headers/parity_ops.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index e0dc55937..cbc7e56da 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -1747,6 +1747,22 @@ namespace nd4j { #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars) DECLARE_CONFIGURABLE_OP(fake_quant_with_min_max_vars, 3, 1, true, 0, -2); #endif + +/** + * fake_quant_with_min_max_vals_per_channel - tf.quantization.fake_quant_with_min_max_vars_per_channel + * + * input params: + * 0 - NDArray (input) - at least 2D. + * 1 - 1D Tensor - min values (min length equals to last dim of input) + * 2 - 1D Tensor - max value (length equals to min) + * + * int params (optional): + * 0 - num_bits (allowed interval [2, 16], default 8) + * 1 - narrow_range (default False) + * + * output: + * 0 - NDArray with the same shape as input + */ #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars_per_channel) DECLARE_CONFIGURABLE_OP(fake_quant_with_min_max_vars_per_channel, 3, 1, true, 0, -2); #endif From c3f755d975d884a123ca25ecfb9bbc5b30be5e8d Mon Sep 17 00:00:00 2001 From: shugeo Date: Thu, 10 Oct 2019 18:02:49 +0300 Subject: [PATCH 13/15] Refactored helpers both for cuda and cpu platforms. 
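
After this cleanup both backends evaluate the same per-element formula. For the per-channel case the structure is roughly the following plain C++ sketch (float buffers and names are illustrative; the real code is templated over FLOAT_TYPES and uses the shared nudge() helper on NDArray/CUDA buffers):

    #include <cmath>

    // For a C-ordered tensor whose last dimension is the channel axis, the elements of
    // channel i sit at flat indices i, i + channels, i + 2*channels, ...
    // nudge(...) is the range-nudging helper factored out in this series.
    void fakeQuantPerChannelSketch(const float* input, float* output, long length, long channels,
                                   const float* min, const float* max, int lowIntBound, int upperIntBound) {
        for (long i = 0; i < channels; i++) {
            float scale, nudgedMin, nudgedMax;
            nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax);
            for (long e = 0; e < length; e += channels) {
                float val = input[e + i];
                if (val < nudgedMin)      val = nudgedMin;
                else if (val > nudgedMax) val = nudgedMax;
                output[e + i] = std::floor((val - nudgedMin) / scale + 0.5f) * scale + nudgedMin;
            }
        }
    }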
--- .../helpers/cpu/fake_quantization.cpp | 43 +++++-------- .../helpers/cuda/fake_quantization.cu | 63 ++++++++++--------- 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index b09587cf7..21163d44c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -26,7 +26,7 @@ namespace ops { namespace helpers { template - static void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { + static void nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { T quant_max_float = static_cast(quant_max); T quant_min_float = static_cast(quant_min); *scale = (max - min) / (quant_max_float - quant_min_float); @@ -53,7 +53,7 @@ namespace helpers { PRAGMA_OMP_PARALLEL_FOR for (auto i = 0; i < channels; i++) { T scale, nudged_min, nudged_max; - Nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); + nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); for (auto e = 0; e < input->lengthOf(); e += channels) { T val = input->t(e + i); @@ -67,37 +67,26 @@ namespace helpers { } } - template - static void WiseMinMax(NDArray* input, T min, T max, NDArray* output) { - auto wiseMinMax = LAMBDA_T(x, min, max) { - if (x < min) { - return min; - } - else if (x > max) - return max; - return x; - }; - - input->applyLambda(wiseMinMax, output); - } - template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed ? 
1 : 0; int upperIntBound = (1 << numBits) - 1; - const float quant_min_float = static_cast(lowIntBound); - const float quant_max_float = static_cast(upperIntBound); - T nudged_min, nudged_max, scale; + T nudgedMin, nudgedMax, scale; - Nudge(min->t(0), max->t(0), quant_min_float, quant_max_float, &scale, &nudged_min, &nudged_max); - WiseMinMax(input, nudged_min, nudged_max, output); - *output -= nudged_min; - (*output) /= scale; - (*output) += T(0.5f); - output->applyTransform(transform::Floor, nullptr, nullptr); - (*output) *= scale; - (*output) += nudged_min; + nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax); + + auto fakeQuantizationWithMinMax = LAMBDA_T(x, nudgedMin, nudgedMax, scale) { + T val = x; + if (val < nudgedMin) { + val = nudgedMin; + } + else if (val > nudgedMax) + val = nudgedMax; + return (nd4j::math::nd4j_floor((val - nudgedMin)/scale + T(0.5)) * scale + nudgedMin); + }; + + input->applyLambda(fakeQuantizationWithMinMax, output); } void fakeQuantWithMinMaxVars(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index 893e016ed..70eaac67b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -34,44 +34,45 @@ namespace helpers { // output - output tensor // template - static __host__ __device__ void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) { - T quant_max_float = static_cast(quant_max); - T quant_min_float = static_cast(quant_min); - *scale = (max - min) / (quant_max_float - quant_min_float); - auto zero_point_from_min = quant_min_float - min / *scale; - uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max, quant_max_float, quant_min_float] { - if (zero_point_from_min < quant_min_float) { - return static_cast(quant_min); + static __host__ __device__ void + nudge(T min, T max, int quantMin, int quantMax, T* scale, T* nudgedMin, T* nudgedMax) { + T quantMaxF = static_cast(quantMax); + T quantMinF = static_cast(quantMin); + *scale = (max - min) / (quantMaxF - quantMinF); + auto zeroPointFromMin = quantMinF - min / *scale; + uint16_t const nudgedZeroPoint = [zeroPointFromMin, quantMin, quantMax, quantMaxF, quantMinF] { + if (zeroPointFromMin < quantMinF) { + return static_cast(quantMin); } - if (zero_point_from_min > quant_max_float) { - return static_cast(quant_max); + if (zeroPointFromMin > quantMaxF) { + return static_cast(quantMax); } - return nd4j::math::nd4j_round(zero_point_from_min); + return nd4j::math::nd4j_round(zeroPointFromMin); }(); - *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); - *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); + *nudgedMin = (quantMinF - nudgedZeroPoint) * (*scale); + *nudgedMax = (quantMaxF - nudgedZeroPoint) * (*scale); } template void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) { int lowIntBound = narrowed?1:0; int upperIntBound = (1 << numBits) - 1; - min->syncToHost(); + min->syncToHost(); // these are scalars, so nothing much happened max->syncToHost(); - T scale, nudged_min, nudged_max; - Nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max); + T scale, nudgedMin, nudgedMax; + nudge(min->t(0), max->t(0), 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
index 893e016ed..70eaac67b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
@@ -34,44 +34,45 @@ namespace helpers {
     //      output - output tensor
     //
     template <typename T>
-    static __host__ __device__ void Nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) {
-        T quant_max_float = static_cast<T>(quant_max);
-        T quant_min_float = static_cast<T>(quant_min);
-        *scale = (max - min) / (quant_max_float - quant_min_float);
-        auto zero_point_from_min = quant_min_float - min / *scale;
-        uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max, quant_max_float, quant_min_float] {
-            if (zero_point_from_min < quant_min_float) {
-                return static_cast<uint16_t>(quant_min);
+    static __host__ __device__ void
+    nudge(T min, T max, int quantMin, int quantMax, T* scale, T* nudgedMin, T* nudgedMax) {
+        T quantMaxF = static_cast<T>(quantMax);
+        T quantMinF = static_cast<T>(quantMin);
+        *scale = (max - min) / (quantMaxF - quantMinF);
+        auto zeroPointFromMin = quantMinF - min / *scale;
+        uint16_t const nudgedZeroPoint = [zeroPointFromMin, quantMin, quantMax, quantMaxF, quantMinF] {
+            if (zeroPointFromMin < quantMinF) {
+                return static_cast<uint16_t>(quantMin);
             }
-            if (zero_point_from_min > quant_max_float) {
-                return static_cast<uint16_t>(quant_max);
+            if (zeroPointFromMin > quantMaxF) {
+                return static_cast<uint16_t>(quantMax);
             }
-            return nd4j::math::nd4j_round(zero_point_from_min);
+            return nd4j::math::nd4j_round(zeroPointFromMin);
         }();
-        *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
-        *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+        *nudgedMin = (quantMinF - nudgedZeroPoint) * (*scale);
+        *nudgedMax = (quantMaxF - nudgedZeroPoint) * (*scale);
     }
 
     template <typename T>
     void fakeQuantWithMinMaxVars_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
         int lowIntBound = narrowed?1:0;
         int upperIntBound = (1 << numBits) - 1;
-        min->syncToHost();
+        min->syncToHost(); // these are scalars, so the sync is cheap
         max->syncToHost();
-        T scale, nudged_min, nudged_max;
-        Nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max);
+        T scale, nudgedMin, nudgedMax;
+        nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax);
 
-        auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) {
+        auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudgedMin, nudgedMax, scale) {
             T val = x;
-            if (x < nudged_min) {
-                val = nudged_min;
+            if (x < nudgedMin) {
+                val = nudgedMin;
             }
-            else if (x > nudged_max) {
-                val = nudged_max;
+            else if (x > nudgedMax) {
+                val = nudgedMax;
             }
             else
                 val = x;
-            return (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min);
+            return (math::nd4j_floor((val - nudgedMin) / scale + T(0.5)) * scale + nudgedMin);
         };
 
         input->applyLambda(wiseMinMaxAndSoOn, output);
@@ -88,20 +89,20 @@ namespace helpers {
         __syncthreads();
 
         for (auto i = blockIdx.x; i < (int)channels; i += gridDim.x) {
-            T scale, nudged_min, nudged_max;
-            Nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max);
-            //auto wiseMinMaxAndSoOn = LAMBDA_T(x, nudged_min, nudged_max, scale) {
+            T scale, nudgedMin, nudgedMax;
+            nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax);
+
             for (auto e = threadIdx.x; e < block; e += blockDim.x) {
                 T val = input[shape::getIndexOffset(e * channels + i, inputShape)];
-                if (val < nudged_min) {
-                    val = nudged_min;
-                } else if (val > nudged_max) {
-                    val = nudged_max;
+                if (val < nudgedMin) {
+                    val = nudgedMin;
+                } else if (val > nudgedMax) {
+                    val = nudgedMax;
                 }
-                output[shape::getIndexOffset(e* channels + i, outputShape)] = (math::nd4j_floor((val - nudged_min) / scale + T(0.5)) * scale + nudged_min);
+                output[shape::getIndexOffset(e* channels + i, outputShape)] =
+                        (math::nd4j_floor((val - nudgedMin) / scale + T(0.5)) * scale + nudgedMin);
             };
         }
-
     }
 
     template <typename T>

From c890de5a7b30925b77991c68c62c2ad611c51aae Mon Sep 17 00:00:00 2001
From: shugeo
Date: Thu, 10 Oct 2019 18:31:17 +0300
Subject: [PATCH 14/15] Added doc for fake_quant_with_min_max* op helpers
 implementations.
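The patch below documents the nudge math used throughout these helpers. As a rough numeric sanity check of those formulas (not part of the patch; std::round stands in for nd4j_round, whose tie-breaking may differ), numBits = 8 with narrowed = false, min = -1 and max = 1 gives:

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed inputs: numBits = 8, narrowed = false, min = -1, max = 1
        const float quantMinF = 0.f, quantMaxF = 255.f;
        const float min = -1.f, max = 1.f;

        float scale            = (max - min) / (quantMaxF - quantMinF);  // 2 / 255, about 0.00784
        float zeroPointFromMin = quantMinF - min / scale;                // 127.5
        float nudgedZeroPoint  = std::round(zeroPointFromMin);           // 128 (inside [0, 255], so no clamping)
        float nudgedMin        = (quantMinF - nudgedZeroPoint) * scale;  // about -1.0039
        float nudgedMax        = (quantMaxF - nudgedZeroPoint) * scale;  // about  0.9961

        printf("scale=%g nudgedMin=%g nudgedMax=%g\n", scale, nudgedMin, nudgedMax);
        return 0;
    }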
---
 .../helpers/cpu/fake_quantization.cpp | 56 +++++++++++--------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
index 21163d44c..6ea2992b9 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp
@@ -25,43 +25,54 @@ namespace nd4j {
 namespace ops {
 namespace helpers {
 
+    //
+    // nudge - compute nudged min/max and the quantization scale
+    //     scale = (Max - Min) / (quantMax - quantMin)
+    //     quantMin = 0 or 1, quantMax = 2^b - 1 == (1 << b) - 1
+    //
     template <typename T>
-    static void nudge(T min, T max, int quant_min, int quant_max, T* scale, T* nudged_min, T* nudged_max) {
-        T quant_max_float = static_cast<T>(quant_max);
-        T quant_min_float = static_cast<T>(quant_min);
-        *scale = (max - min) / (quant_max_float - quant_min_float);
-        auto zero_point_from_min = quant_min_float - min / *scale;
-        uint16_t const nudged_zero_point = [zero_point_from_min, quant_min, quant_max, quant_max_float, quant_min_float] {
-            if (zero_point_from_min < quant_min_float) {
-                return static_cast<uint16_t>(quant_min);
+    static void nudge(T min, T max, int quantMin, int quantMax, T* scale, T* nudgedMin, T* nudgedMax) {
+        // floating point versions of the integer bounds
+        T quantMaxF = static_cast<T>(quantMax);
+        T quantMinF = static_cast<T>(quantMin);
+        // compute scale
+        *scale = (max - min) / (quantMaxF - quantMinF);
+        // compute zero point from the left bound
+        auto zeroPointFromMin = quantMinF - min / *scale;
+        // bound zero point to conform with range [0 or 1, 2^b - 1]
+        uint16_t const nudged_zero_point = [zeroPointFromMin, quantMin, quantMax, quantMaxF, quantMinF] {
+            if (zeroPointFromMin < quantMinF) {
+                return static_cast<uint16_t>(quantMin);
             }
-            if (zero_point_from_min > quant_max_float) {
-                return static_cast<uint16_t>(quant_max);
+            if (zeroPointFromMin > quantMaxF) {
+                return static_cast<uint16_t>(quantMax);
             }
-            return nd4j::math::nd4j_round(zero_point_from_min);
-        }();
-        *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
-        *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+            return nd4j::math::nd4j_round(zeroPointFromMin);
+        }();
+        // compute nudged min and max from the nudged zero point
+        *nudgedMin = (quantMinF - nudged_zero_point) * (*scale);
+        *nudgedMax = (quantMaxF - nudged_zero_point) * (*scale);
     }
 
     template <typename T>
     void fakeQuantWithMinMaxVarsPerChannel_(NDArray* input, NDArray* min, NDArray* max, int numBits, bool narrowed, NDArray* output) {
-        int lowIntBound = narrowed ? 1 : 0;
-        int upperIntBound = (1 << numBits) - 1;
-        auto channels = input->sizeAt(-1);
+        int lowIntBound = narrowed ? 1 : 0; // 0 or 1
+        int upperIntBound = (1 << numBits) - 1; // 2^b - 1
+        auto channels = input->sizeAt(-1); // last dimension
 
         PRAGMA_OMP_PARALLEL_FOR
         for (auto i = 0; i < channels; i++) {
             T scale, nudged_min, nudged_max;
+            // nudge min and max first, computing the scale along the way
             nudge(min->t(i), max->t(i), lowIntBound, upperIntBound, &scale, &nudged_min, &nudged_max);
-
+            // stride by the last dimension and process all values for the given channel
             for (auto e = 0; e < input->lengthOf(); e += channels) {
                 T val = input->t(e + i);
                 if ( val <= nudged_min)
                     val = nudged_min;
                 else if (val >= nudged_max)
                     val = nudged_max;
-
+                // quantization itself
                 output->t(e + i) = math::nd4j_floor((val - nudged_min)/scale + T(0.5)) * scale + nudged_min;
             }
         }
@@ -73,16 +84,17 @@ namespace helpers {
         int upperIntBound = (1 << numBits) - 1;
 
         T nudgedMin, nudgedMax, scale;
-
+        // nudge the given min and max, computing the scale and the nudged bounds
         nudge(min->t(0), max->t(0), lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax);
-
+        // quantization as a single lambda
         auto fakeQuantizationWithMinMax = LAMBDA_T(x, nudgedMin, nudgedMax, scale) {
-            T val = x;
+            T val = x; // bound value between nudged min and max
             if (val < nudgedMin) {
                 val = nudgedMin;
             }
             else if (val > nudgedMax)
                 val = nudgedMax;
+            // convert value with scale and shift by nudged min
             return (nd4j::math::nd4j_floor((val - nudgedMin)/scale + T(0.5)) * scale + nudgedMin);
         };

From ace65355c5afb551b596bb5bc3d8d19f07ac67ac Mon Sep 17 00:00:00 2001
From: shugeo
Date: Thu, 10 Oct 2019 18:35:28 +0300
Subject: [PATCH 15/15] Added doc for fake_quant_with_min_max* op helpers cuda
 implementations.

---
 .../ops/declarable/helpers/cuda/fake_quantization.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
index 70eaac67b..292b7e1c6 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu
@@ -84,22 +84,22 @@ namespace helpers {
                T* output, Nd4jLong* outputShape, Nd4jLong length) {
         __shared__ int block;
         if (threadIdx.x == 0) {
-            block = length / channels;
+            block = length / channels; // to loop with the last dimension as a block
         }
         __syncthreads();
 
         for (auto i = blockIdx.x; i < (int)channels; i += gridDim.x) {
             T scale, nudgedMin, nudgedMax;
             nudge(min[i], max[i], lowIntBound, upperIntBound, &scale, &nudgedMin, &nudgedMax);
-
-            for (auto e = threadIdx.x; e < block; e += blockDim.x) {
-                T val = input[shape::getIndexOffset(e * channels + i, inputShape)];
+            // loop over the block to quantize between nudged min and max
+            for (auto b = threadIdx.x; b < block; b += blockDim.x) {
+                T val = input[shape::getIndexOffset(b * channels + i, inputShape)];
                 if (val < nudgedMin) {
                     val = nudgedMin;
                 } else if (val > nudgedMax) {
                     val = nudgedMax;
                 }
-                output[shape::getIndexOffset(e* channels + i, outputShape)] =
+                output[shape::getIndexOffset(b * channels + i, outputShape)] =
                         (math::nd4j_floor((val - nudgedMin) / scale + T(0.5)) * scale + nudgedMin);
             };
         }
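For completeness, a hypothetical C++ invocation of the per-channel op in the style libnd4j tests usually follow; the include paths and the execute()/ResultSet usage are assumptions and may differ between library versions:

    #include <NDArrayFactory.h>
    #include <ops/declarable/CustomOperations.h>

    using namespace nd4j;

    int main() {
        // 2 rows x 3 channels (channels last); one min/max pair per channel
        auto x   = NDArrayFactory::create<float>('c', {2, 3}, {-0.5f, 0.1f, 1.5f, 0.3f, 0.7f, 2.0f});
        auto min = NDArrayFactory::create<float>('c', {3}, {-1.f, 0.f, 0.f});
        auto max = NDArrayFactory::create<float>('c', {3}, { 1.f, 1.f, 2.f});

        nd4j::ops::fake_quant_with_min_max_vars_per_channel op;
        auto result = op.execute({&x, &min, &max}, {}, {});   // defaults: numBits = 8, narrowed = false
        if (result->status() == ND4J_STATUS_OK)
            result->at(0)->printIndexedBuffer("per-channel fake-quantized");
        delete result;
        return 0;
    }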