// File: cavis/libnd4j/include/system/buffer.h
// (page-scrape metadata: 300 lines, 6.2 KiB, C++, executable file)
/*******************************************************************************
* Copyright (c) 2015-2018 Skymind, Inc.
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
/*
* buffer.h
*
* Created on: Dec 24, 2015
* Author: agibsonccc
*/
#ifndef BUFFER_H_
#define BUFFER_H_
#ifdef __CUDACC__
#include <cuda.h>
#include <cuda_runtime.h>
#include <helpers/DebugHelper.h>
#endif
#include <system/dll.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <system/dll.h>
//Question: Should the indexes here really be int? Isn't size_t or Nd4jLong more appropriate?
namespace sd {
namespace buffer {
/**
* Represents both a cpu and gpu
* buffer - mainly used for testing
*/
/**
 * Represents both a cpu and gpu
 * buffer - mainly used for testing.
 *
 * Ownership: the destructor releases only the host-side `data`
 * (with delete[], so it must come from new[]). The device-side
 * `gData` comes from cudaMalloc and must be released through
 * freeBuffer()/cudaFree — delete[] on a cudaMalloc'd pointer is
 * undefined behavior, so the destructor does not touch it.
 */
template<typename T>
struct Buffer {
    int length = 0;           // number of elements of type T
    int allocatedOnGpu = 0;   // flag; not updated by the helpers in this file
    T *data = nullptr;        // host buffer (owned, released with delete[])
    T *gData = nullptr;       // device buffer (released via freeBuffer/cudaFree)
    T one, two;               // scratch cells written by operator= / Proxy
public:
    ~Buffer() {
        // Only the host allocation is owned here; gData (device memory)
        // must not be passed to delete[] — see class comment.
        delete []data;
    }
    // Takes ownership of an externally allocated (new[]) host array.
    void assign(T *val) {
        data = val;
    }
    T &operator=(T x) {
        one = x;
        // Fixed: returning `x` handed callers a reference to the by-value
        // parameter, which dangles as soon as operator= returns.
        return one;
    }
    // Write-proxy returned by operator[]; records the last written value
    // in `two` and stores it into data[idx].
    class Proxy {
        Buffer<T> &a;
        int idx;
    public:
        Proxy(Buffer &a, int idx) :
                a(a), idx(idx) {
        }
        T &operator=(T x) {
            a.two = x;
            a.data[idx] = x;
            return a.data[idx];
        }
    };
    Proxy operator[](int index) {
        return Proxy(*this, index);
    }
};
/**
 * Returns the size of the buffer in bytes,
 * computed as sizeof(T) * buffer->length.
 * NOTE(review): the int return can overflow for buffers >= 2 GiB —
 * see the question at the top of this file about int vs Nd4jLong.
 * @param buffer the buffer to get the size of
 * @return the size of the buffer in bytes
 */
template<typename T>
#ifdef __CUDACC__
__host__ __device__
#endif
int bufferSize(Buffer<T> *buffer);
/**
 * Copies the buffer's host data to its device allocation
 * (gData must already be allocated) and synchronizes the
 * stream before returning, so the call is blocking.
 * @param buffer pointer to the buffer pointer to upload
 * @param stream the CUDA stream the transfer is enqueued on
 */
#ifdef __CUDACC__
template<typename T>
__host__
void copyDataToGpu(Buffer<T> **buffer, cudaStream_t stream);
#endif
/**
 * Copies the buffer's device data back into its host allocation
 * and synchronizes the stream before returning (blocking call).
 * @param buffer pointer to the buffer pointer to download
 * @param stream the CUDA stream the transfer is enqueued on
 */
#ifdef __CUDACC__
template<typename T>
__host__
void copyDataFromGpu(Buffer<T> **buffer, cudaStream_t stream);
#endif
/**
 * Allocate storage of the given length (in elements) for an
 * existing Buffer: host memory always, device memory when
 * compiled under CUDA. *buffer must already point to a Buffer.
 * @param buffer pointer to the buffer pointer to populate
 * @param length number of elements of T to allocate
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
void allocBuffer(Buffer<T> **buffer, int length);
/**
 * Frees the given buffer: device memory via cudaFree (when
 * compiled under CUDA), host memory via the Buffer destructor.
 * Fixed: this was declared with a Buffer<T>** parameter, which never
 * matched the Buffer<T>* definition below — the declared overload had
 * no definition and any call to it could not link.
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
void freeBuffer(Buffer<T> *buffer);
/**
 * Creates a buffer that owns a freshly allocated host copy of the
 * first `length` elements of `data`. The device pointer is left
 * unset by this overload; under CUDA, the stream-taking overload
 * below also allocates and uploads the device side.
 * @param data source host array (copied, not adopted)
 * @param length number of elements to copy
 * @return newly allocated Buffer (caller frees with freeBuffer)
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
Buffer<T>
*
createBuffer(T *data, int length);
/**
 * Print every element of the host-side buffer with printf,
 * one "Buffer[i] was ..." line per element (debugging aid).
 * @param buff the buffer whose host data is printed
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
void printArr(Buffer<T> *buff);
/**
 * Size of the buffer's payload in bytes.
 * @param buffer the buffer to measure
 * @return sizeof(T) multiplied by the element count
 */
template<typename T>
#ifdef __CUDACC__
__host__ __device__
#endif
int bufferSize(Buffer<T> *buffer) {
    const int numElements = buffer->length;
    return numElements * static_cast<int>(sizeof(T));
}
#ifdef __CUDACC__
/**
 * Uploads the buffer's host data into its device allocation and
 * blocks on the stream until the transfer has finished.
 * @param buffer pointer to the buffer pointer to upload
 * @param stream CUDA stream the async copy is enqueued on
 */
template<typename T>
__host__ void copyDataToGpu(Buffer <T> **buffer, cudaStream_t stream) {
    Buffer <T> *target = *buffer;
    const int totalBytes = bufferSize(target);
    checkCudaErrors(cudaMemcpyAsync(target->gData, target->data, totalBytes, cudaMemcpyHostToDevice, stream));
    // Make the call blocking so the host data may be reused immediately.
    checkCudaErrors(cudaStreamSynchronize(stream));
}
/**
 * Downloads the buffer's device data into its host allocation and
 * blocks on the stream until the transfer has finished.
 * @param buffer pointer to the buffer pointer to download
 * @param stream CUDA stream the async copy is enqueued on
 */
template<typename T>
__host__ void copyDataFromGpu(Buffer <T> **buffer, cudaStream_t stream) {
    Buffer <T> *target = *buffer;
    checkCudaErrors(cudaMemcpyAsync(target->data, target->gData, bufferSize(target), cudaMemcpyDeviceToHost, stream));
    // Block until the copy lands so callers can read target->data safely.
    checkCudaErrors(cudaStreamSynchronize(stream));
}
#endif
/**
 * Allocate storage of the given length (in elements) for an
 * existing Buffer: host memory always, device memory when
 * compiled under CUDA. *buffer must already point to a Buffer.
 * @param buffer pointer to the buffer pointer to populate
 * @param length number of elements of T to allocate
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
void allocBuffer(Buffer<T> **buffer, int length) {
    Buffer<T> *bufferRef = *buffer;
    bufferRef->length = length;
    // Fixed: was malloc(), but this memory is released with delete[]
    // (Buffer's destructor) — a mismatched allocator pair is UB.
    bufferRef->data = new T[length];
    // new[] throws on failure rather than returning nullptr, so this
    // check is effectively dead; kept for project error-path consistency.
    CHECK_ALLOC(bufferRef->data, "Failed to allocate new buffer", sizeof(T) * length);
#ifdef __CUDACC__
    checkCudaErrors(cudaMalloc(&bufferRef->gData, sizeof(T) * length));
#endif
}
/**
 * Frees the given buffer: device memory via cudaFree (when
 * compiled under CUDA), host memory via the Buffer destructor.
 * Safe to call with nullptr (no-op, like delete).
 * @param buffer the buffer to destroy; invalid after this call
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
void freeBuffer(Buffer<T> *buffer) {
    if (buffer == nullptr)
        return;
#ifdef __CUDACC__
    if (buffer->gData != nullptr) {
        checkCudaErrors(cudaFree(buffer->gData));
        // Fixed: null the device pointer so no other release path
        // (e.g. the destructor) can touch the already-freed allocation.
        buffer->gData = nullptr;
    }
#endif
    // Buffer's destructor releases the host-side data.
    delete buffer;
}
/**
 * Creates a buffer that owns a freshly allocated host copy of the
 * first `length` elements of `data`. The device pointer is not set
 * by this overload.
 * @param data source host array (copied, not adopted)
 * @param length number of elements to copy
 * @return newly allocated Buffer (caller frees with freeBuffer)
 */
template<typename T>
#ifdef __CUDACC__
__host__
#endif
Buffer<T> *createBuffer(T *data, int length) {
    Buffer<T> *result = new Buffer<T>;
    result->length = length;
    T *hostCopy = new T[length];
    for (int idx = 0; idx < length; ++idx) {
        hostCopy[idx] = data[idx];
    }
    result->data = hostCopy;
    return result;
}
#ifdef __CUDACC__
/**
 * Creates a buffer from host data and uploads it to a freshly
 * allocated device buffer on the given stream.
 * NOTE: the upload is asynchronous — this function does not
 * synchronize the stream before returning.
 * @param data source host array (copied, not adopted)
 * @param length number of elements to copy
 * @param stream CUDA stream the upload is enqueued on
 * @return newly allocated Buffer with both sides populated
 */
template<typename T>
__host__
Buffer<T> *createBuffer(T *data, int length, cudaStream_t stream) {
    Buffer<T> *ret = createBuffer(data, length);
    // Allocate straight into the struct member; the extra local
    // pointer-to-pointer alias added nothing.
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&ret->gData), sizeof(T) * length));
    checkCudaErrors(cudaMemcpyAsync(ret->gData, ret->data, sizeof(T) * length, cudaMemcpyHostToDevice, stream));
    return ret;
}
#endif
}
}
#ifdef __CUDACC__
template<typename T>
__host__ void printArr(sd::buffer::Buffer <T> *buff) {
for (int i = 0; i < buff->length; i++) {
printf("Buffer[%d] was %f\n", i, buff->data[i]);
}
}
#endif
#endif /* BUFFER_H_ */