raver119 c969b724bb [WIP] more CUDA stuff (#57)
* initial commit

Signed-off-by: raver119 <raver119@gmail.com>

* Added gradcheck test for dynamic_partition_bp op.

* - implementation of dilation op (cpu and cuda)

Signed-off-by: Yurii <yurii@skymind.io>

* Fixed broadcast_dynamic_shape 1D case and tests.

* Fixed usage of default integer arguments.

* Fixed dynamic_partition_bp op and tests.

* Eliminated test with grad check for dynamic_partition_bp op.

* start working on cuda svd - porting corresponding API from cuSOLVER library

Signed-off-by: Yurii <yurii@skymind.io>

* provide prelu_bp

Signed-off-by: Yurii <yurii@skymind.io>

* - provide gruCell_bp (old version ??)

Signed-off-by: Yurii <yurii@skymind.io>

* - polishing cumsum_bp and cumprod_bp tests

Signed-off-by: Yurii <yurii@skymind.io>

* provide sparseSoftmaxCrossEntropyWithLogits and sparseSoftmaxCrossEntropyWithLogits_grad

Signed-off-by: Yurii <yurii@skymind.io>

* Fixed atomicMul with float input/output

* implementation of cuda kernel for triu_bp operation

Signed-off-by: Yurii <yurii@skymind.io>

* Refactored lup helper to add parallel computing.

* cusolver libraries

Signed-off-by: raver119 <raver119@gmail.com>

* uncomment cuSolver APIs in svd.cu

Signed-off-by: Yurii <yurii@skymind.io>

* cusolver var

Signed-off-by: raver119 <raver119@gmail.com>

* - further work on cuSolver svd

Signed-off-by: Yurii <yurii@skymind.io>

* Implement usage of cuda solver for LUP decomposition.

* - correct names in lup functions

Signed-off-by: Yurii <yurii@skymind.io>

* correct svdQR cuda

Signed-off-by: Yurii <yurii@skymind.io>

* - provide transpositions of input matrices in case of c order in svdCudaQR

Signed-off-by: Yurii <yurii@skymind.io>

* Fixed implementation issues with LUP using cuda solver.

* Implementation of matrix_determinant helper with cuda kernels. Working revision.

* Implemented log_matrix_determinant helper with cuda kernels.

* - implementation of batched cuda svd

Signed-off-by: Yurii <yurii@skymind.io>

* Refactored cholesky helper and implementation of cuda solver cholesky batch.

* - implementation of cuda kernel for tile bp

Signed-off-by: Yurii <yurii@skymind.io>

* Implementation of cholesky and logdet with cuda kernels.

* - implementation of cuda kernel for sru_bidirectional

Signed-off-by: Yurii <yurii@skymind.io>

* Fixed cholesky helper.

* Cholesky op helper implementation. Working double-based cublas implementation.

* bad import excluded

Signed-off-by: raver119 <raver119@gmail.com>

* Finished with cuda implementation of cholesky helper and tests.

* - implementation of cuda kernel for sru_bidirectional_backprop operation

Signed-off-by: Yurii <yurii@skymind.io>

* Implementation of matrix_inverse op helper with cuda kernels. The first revision.

* - start working on gruCell_bp

Signed-off-by: Yurii <yurii@skymind.io>

* Implementation of matrix_inverse helper.

* - further work on new gruCell_bp

Signed-off-by: Yurii <yurii@skymind.io>

* cuBLAS related fixes

Signed-off-by: raver119 <raver119@gmail.com>

* calculateOutputShapes() now passes device buffers as well

Signed-off-by: raver119 <raver119@gmail.com>

* special concat/average/accumulate init host pointers now

Signed-off-by: raver119 <raver119@gmail.com>

* few more tweaks

Signed-off-by: raver119 <raver119@gmail.com>

* additional CudaDataBufferFactory signatures for certain data types

Signed-off-by: raver119 <raver119@gmail.com>

* cuSolver host buffer

Signed-off-by: raver119 <raver119@gmail.com>

* buffer to buffer memcpy host ptr allocation

Signed-off-by: raver119 <raver119@gmail.com>

/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
//
// @author GS <sgazeos@gmail.com>
//
#include <op_boilerplate.h>
#if NOT_EXCLUDED(OP_dynamic_partition)
#include <ops/declarable/CustomOperations.h>
#include <array>
#include <ops/declarable/helpers/dynamic.h>
namespace nd4j {
namespace ops {
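    // dynamic_partition follows TensorFlow semantics: it splits `input` into numPartition
    // sub-arrays according to `indices`. Illustrative example (values assumed, not from a test):
    //   input = [10, 20, 30, 40], indices = [0, 1, 0, 1], numPartition = 2
    //   -> output 0 = [10, 30], output 1 = [20, 40]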
    CUSTOM_OP_IMPL(dynamic_partition, 2, 1, false, 0, 1) {
        auto input = INPUT_VARIABLE(0);
        auto indices = INPUT_VARIABLE(1);

        REQUIRE_TRUE(input->rankOf() >= indices->rankOf(), 0,
                     "dynamic_partition: rank of the data tensor should be no less than rank of the indices tensor, but %i < %i given",
                     input->rankOf(), indices->rankOf());
        for (int dim = 0; dim < indices->rankOf(); dim++) {
            REQUIRE_TRUE(input->sizeAt(dim) == indices->sizeAt(dim), 0,
                         "dynamic_partition: dimensions should be equal for data and indices tensors, but at axis[%i] %i != %i given",
                         dim, input->sizeAt(dim), indices->sizeAt(dim));
        }

        auto numPartition = INT_ARG(0);
        std::vector<NDArray*> outputList(numPartition);
        for (int o = 0; o < numPartition; ++o) {
            outputList[o] = OUTPUT_VARIABLE(o);
        }

        helpers::dynamicPartitionFunctor(block.launchContext(), input, indices, outputList);

        return Status::OK();
    }
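    // Shape rule: output e has shape [count(indices == e)] + input.shape[indices.rank :],
    // i.e. each output keeps the trailing input dimensions that indices does not cover.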
    DECLARE_SHAPE_FN(dynamic_partition) {
        auto numPartition = INT_ARG(0);
        auto indices = INPUT_VARIABLE(1);
        std::vector<int> partitionSizes(numPartition, 0);
        auto in = inputShape->at(0);
        auto idx = inputShape->at(1);

        // count how many elements fall into each partition
        for (int i = 0; i < numPartition; i++) {
            for (int e = 0; e < indices->lengthOf(); ++e)
                if (indices->e<Nd4jLong>(e) == i)
                    partitionSizes[i]++;
        }

        auto shapes = SHAPELIST();
        int outRank = shape::rank(in) - shape::rank(idx) + 1;
        for (int e = 0; e < numPartition; e++) {
            Nd4jLong* newShape;
            ALLOCATE(newShape, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong);
            newShape[0] = outRank;
            newShape[1] = partitionSizes[e];
            // remaining output dims are the input dims that follow the indices dims
            for (int i = 1; i < outRank; ++i)
                newShape[i + 1] = shape::sizeAt(in, shape::rank(idx) + i - 1);
            shape::updateStrides(newShape, shape::order(in));
            ArrayOptions::setDataType(newShape, ArrayOptions::dataType(in));
            shapes->push_back(CONSTANT(newShape));
        }
        return shapes;
    }
    DECLARE_TYPES(dynamic_partition) {
        getOpDescriptor()
                ->setAllowedInputTypes(nd4j::DataType::ANY)
                ->setAllowedOutputTypes({ALL_FLOATS, ALL_INTS});
    }

    DECLARE_TYPES(dynamic_partition_bp) {
        getOpDescriptor()
                ->setAllowedInputTypes(nd4j::DataType::ANY)
                ->setSameMode(true);
    }
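    // Backprop sketch: partitioning is pure data movement, so the gradient is routed back by
    // partitioning the flat positions 0..N-1 exactly as the forward pass partitioned the data,
    // then using dynamic_stitch to scatter the incoming output gradients into the input layout.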
    CUSTOM_OP_IMPL(dynamic_partition_bp, 3, 2, false, 0, 1) {
        auto input = INPUT_VARIABLE(0);
        auto indices = INPUT_VARIABLE(1);
        auto numPartition = INT_ARG(0);

        std::vector<NDArray*> outputList(2); // only for output
        std::vector<NDArray*> gradOutList(numPartition);
        for (Nd4jLong e = 0; e < numPartition; e++) {
            gradOutList[e] = INPUT_VARIABLE(e + 2);
        }
        outputList[0] = OUTPUT_VARIABLE(0);
        outputList[1] = OUTPUT_VARIABLE(1);

        // partition the flat positions 0..N-1 the same way the forward pass partitioned the data
        NDArray originalIndices(*indices);
        originalIndices.linspace(0);
        ops::dynamic_partition op;
        auto res = op.execute({&originalIndices, indices}, {}, {numPartition});
        REQUIRE_TRUE(res->status() == ND4J_STATUS_OK, 0, "dynamic_partition_bp: Error with dynamic partitioning.");

        // stitch the output gradients back together using the partitioned positions as indices
        ops::dynamic_stitch stitchOp;
        std::vector<NDArray*> partitions(numPartition * 2);
        for (size_t i = 0; i < res->size(); i++) {
            partitions[i] = res->at(i);
            partitions[i + numPartition] = gradOutList[i];
        }
        auto result = stitchOp.execute(partitions, {}, {numPartition}, {}, false);
        REQUIRE_TRUE(result->status() == ND4J_STATUS_OK, 0, "dynamic_partition_bp: Error with dynamic stitch.");
        result->at(0)->reshapei(outputList[0]->getShapeAsVector());
        outputList[1]->assign(indices);
        outputList[0]->assign(result->at(0));

        delete res;
        delete result;
        return ND4J_STATUS_OK;
    }
    DECLARE_SHAPE_FN(dynamic_partition_bp) {
        auto shapes = SHAPELIST();
        // just copy shape info from input and indices to output
        for (Nd4jLong i = 0; i < 2; i++) {
            Nd4jLong* newShape;
            COPY_SHAPE(inputShape->at(i), newShape);
            shapes->push_back(CONSTANT(newShape));
        }
        return shapes;
    }
} // namespace ops
} // namespace nd4j
#endif