/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
//  @author raver119@gmail.com
//

#include <op_boilerplate.h>
#if NOT_EXCLUDED(OP_batched_gemm)

#include <ops/declarable/headers/blas.h>
#include <ops/declarable/helpers/batched_gemm.h>

namespace nd4j {
namespace ops  {

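/**
 * batched_gemm performs batchSize independent GEMM operations:
 *     C[i] = alpha[i] * op(A[i]) * op(B[i]) + beta[i] * C[i]
 *
 * (Summary derived from the checks below; alpha/beta are expected to supply
 * one coefficient per batch entry.)
 *
 * Input arrays (2 * batchSize + 2 in total, in this order):
 *     0:                                  alpha coefficients
 *     1:                                  beta coefficients
 *     2 .. batchSize + 1:                 the A matrices
 *     batchSize + 2 .. 2*batchSize + 1:   the B matrices
 *
 * Integer arguments:
 *     0: transA   1: transB   (0/1 or CBLAS 111/112 for NoTrans/Trans)
 *     2: M        3: N        4: K
 *     5: ldA      6: ldB      7: ldC
 *     8: batchSize
 *
 * Outputs: batchSize C matrices of shape [M, N] in 'f' (column-major) order.
 */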
CUSTOM_OP_IMPL(batched_gemm, -1, -1, false, 0, 9) {

    int transA = INT_ARG(0);
    int transB = INT_ARG(1);
    int M = INT_ARG(2);
    int N = INT_ARG(3);
    int K = INT_ARG(4);
    int ldA = INT_ARG(5);
    int ldB = INT_ARG(6);
    int ldC = INT_ARG(7);
    int batchSize = INT_ARG(8);

    // map the 0/1 convention onto the CBLAS_TRANSPOSE enum values:
    // CblasNoTrans == 111, CblasTrans == 112
    if (transA == 0)
        transA = 111;
    else if (transA == 1)
        transA = 112;

    if (transB == 0)
        transB = 111;
    else if (transB == 1)
        transB = 112;

    // expected inputs: the alpha and beta arrays, followed by batchSize A matrices and batchSize B matrices
    int expectedWidth = batchSize * 2 + 2;

    REQUIRE_TRUE((transA == 111 || transA == 112) && (transB == 111 || transB == 112), 0, "BatchedGemm: valid values for transA and transB are: 0/1 or 111/112, for NoTrans/Trans respectively");
    REQUIRE_TRUE(M > 0 && N > 0 && K > 0 && ldA > 0 && ldB > 0 && ldC > 0 && batchSize > 0, 0, "BatchedGemm: M, N, K, ldA, ldB, ldC and batchSize should all be positive");
    REQUIRE_TRUE(block.width() == expectedWidth, 0, "BatchedGemm: expected number of input arrays is %i, but got %i instead", expectedWidth, (int) block.width());

    auto alpha = INPUT_VARIABLE(0);
    auto beta = INPUT_VARIABLE(1);

    std::vector<NDArray*> vA(batchSize);
    std::vector<NDArray*> vB(batchSize);
    std::vector<NDArray*> vC(batchSize);

    auto firstType = INPUT_VARIABLE(0)->dataType();
    for(int e = 0; e < batchSize; e++) {
        vA[e] = INPUT_VARIABLE(e+2);
        vB[e] = INPUT_VARIABLE(e+2+batchSize);
        vC[e] = OUTPUT_VARIABLE(e);

        REQUIRE_TRUE(firstType == vA[e]->dataType() && firstType == vB[e]->dataType() && firstType == vC[e]->dataType(), 0, "BatchedGemm: all inputs and outputs must have the same data type");

        REQUIRE_TRUE(vA[e]->rankOf() == 2, 0, "BatchedGemm: batch %i, rank of A should be equal to 2", e);
        REQUIRE_TRUE(vB[e]->rankOf() == 2, 0, "BatchedGemm: batch %i, rank of B should be equal to 2", e);
        REQUIRE_TRUE(vC[e]->rankOf() == 2, 0, "BatchedGemm: batch %i, rank of C should be equal to 2", e);

        REQUIRE_TRUE(M == vA[e]->sizeAt(0), 0, "BatchedGemm: batch %i, number of A.rows() should be equal to M", e);
        REQUIRE_TRUE(N == vB[e]->sizeAt(1), 0, "BatchedGemm: batch %i, number of B.columns() should be equal to N", e);
        REQUIRE_TRUE(K == vA[e]->sizeAt(1) && K == vB[e]->sizeAt(0), 0, "BatchedGemm: batch %i, number of A.columns() and B.rows() should be equal to K", e);
    }

    REQUIRE_TRUE(vA.size() == vB.size() && vA.size() == vC.size() && vA.size() == static_cast<size_t>(batchSize), 0, "BatchedGemm: numbers of A, B and C arrays should all be equal to batchSize");

    nd4j::ops::helpers::bgemm(vA, vB, vC, alpha, beta, transA, transB, M, N, K, ldA, ldB, ldC);

    return Status::OK();
};
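
// A minimal invocation sketch, for illustration only (array shapes and contents
// are hypothetical; assumes the DeclarableOp::execute overload taking inputs,
// T-args and integer args, as used in the op tests):
//
//     auto alpha = NDArrayFactory::create<float>('c', {3}, {1.f, 1.f, 1.f});
//     auto beta  = NDArrayFactory::create<float>('c', {3}, {0.f, 0.f, 0.f});
//     auto a = NDArrayFactory::create<float>('f', {3, 3});
//     auto b = NDArrayFactory::create<float>('f', {3, 3});
//
//     nd4j::ops::batched_gemm op;
//     // iArgs: transA, transB, M, N, K, ldA, ldB, ldC, batchSize
//     auto result = op.execute({&alpha, &beta, &a, &a, &a, &b, &b, &b}, {}, {111, 111, 3, 3, 3, 3, 3, 3, 3});
//     // result holds batchSize output matrices: result->at(0), result->at(1), result->at(2)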


DECLARE_SHAPE_FN(batched_gemm) {
    int transA = INT_ARG(0);
    int transB = INT_ARG(1);
    int M = INT_ARG(2);
    int N = INT_ARG(3);
    int K = INT_ARG(4);
    int ldA = INT_ARG(5);
    int ldB = INT_ARG(6);
    int ldC = INT_ARG(7);
    int batchSize = INT_ARG(8);

    auto firstType = ArrayOptions::dataType(inputShape->at(0));
    for (int e = 1; e < block.width(); e++) {
        REQUIRE_TRUE(firstType == ArrayOptions::dataType(inputShape->at(e)), 0, "BatchedGemm: all inputs must have the same data type");
    }

    auto shapeList = SHAPELIST();

    // dimensions are not known yet, so return a [1, 1] placeholder shape
    if (!(M > 0 && N > 0 && K > 0 && ldA > 0 && ldB > 0 && ldC > 0 && batchSize > 0)) {
        std::vector<Nd4jLong> dummy({1, 1});
        shapeList->push_back(ConstantShapeHelper::getInstance()->createShapeInfo(block.dataType(), 'c', dummy));
        return shapeList;
    }

    std::vector<Nd4jLong> shape({M, N});

    for (int e = 0; e < batchSize; e++) {
        auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(block.dataType(), 'f', shape);
        shapeList->push_back(newShape);
    }

    return shapeList;
}

DECLARE_TYPES(batched_gemm) {
    getOpDescriptor()
            ->setAllowedInputTypes({ALL_FLOATS})
            ->setAllowedOutputTypes({ALL_FLOATS});
}


}  // namespace ops
}  // namespace nd4j

#endif // NOT_EXCLUDED(OP_batched_gemm)