/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

/*
 * buffer.h
 *
 *  Created on: Dec 24, 2015
 *      Author: agibsonccc
 */

#ifndef BUFFER_H_
#define BUFFER_H_

#ifdef __CUDACC__
#include <cuda.h>
#include <cuda_runtime.h>
#include <helpers/DebugHelper.h>
#endif

#include <dll.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

//Question: Should the indexes here really be int? Isn't size_t or Nd4jLong more appropriate?
namespace nd4j {
    namespace buffer {
        /**
         * Represents both a cpu and gpu
         * buffer - mainly used for testing
         */
        template<typename T>
        struct Buffer {
            int length = 0;
            int allocatedOnGpu = 0;
            T *data = nullptr;
            T *gData = nullptr;
            T one, two;
        public:
            ~Buffer() {
                // gData is device memory; it is expected to be released (and nulled)
                // via freeBuffer() before the buffer is destroyed
                delete []data;
                delete []gData;
            }

            void assign(T *val) {
                data = val;
            }

            T &operator=(T x) {
                one = x;
                // return the stored member rather than the by-value parameter,
                // which would hand the caller a dangling reference
                return one;
            }

            class Proxy {
                Buffer<T> &a;
                int idx;
            public:
                Proxy(Buffer &a, int idx) :
                        a(a), idx(idx) {
                }

                T &operator=(T x) {
                    a.two = x;
                    a.data[idx] = x;
                    return a.data[idx];
                }
            };

            Proxy operator[](int index) {
                return Proxy(*this, index);
            }
        };
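
        // Illustrative sketch (not part of the API): operator[] returns a Proxy whose
        // assignment writes through to the underlying host array, e.g.
        //
        //     float hostData[] = {1.0f, 2.0f, 3.0f, 4.0f};
        //     auto *buf = nd4j::buffer::createBuffer(hostData, 4);
        //     (*buf)[2] = 9.0f;              // Proxy::operator= stores into buf->data[2]
        //     printf("%f\n", buf->data[2]);  // prints 9.000000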

        /**
         * Returns the size of the buffer
         * in bytes
         * @param buffer the buffer to get the size of
         * @return the size of the buffer in bytes
         */
        template<typename T>
#ifdef __CUDACC__
        __host__ __device__
#endif
        int bufferSize(Buffer<T> *buffer);

        /**
         * Copies data to the gpu
         * @param buffer the buffer to copy
         */
#ifdef __CUDACC__
        template<typename T>
        __host__
        void copyDataToGpu(Buffer<T> **buffer, cudaStream_t stream);
#endif

        /**
         * Copies data from the gpu
         * @param buffer the buffer to copy
         */
#ifdef __CUDACC__
        template<typename T>
        __host__
        void copyDataFromGpu(Buffer<T> **buffer, cudaStream_t stream);
#endif

        /**
         * Allocate buffer of the given
         * length on the cpu and gpu.
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        void allocBuffer(Buffer<T> **buffer, int length);

        /**
         * Frees the given buffer
         * (gpu and cpu)
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        void freeBuffer(Buffer<T> **buffer);

        /**
         * Creates a buffer
         * based on the data
         * and also synchronizes
         * the data on the gpu.
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        Buffer<T> *createBuffer(T *data, int length);

        /**
         * Print the buffer on the host
         * @param buff the buffer to print
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        void printArr(Buffer<T> *buff);

        /**
         * Returns the size of the given buffer in bytes.
         * @param buffer the buffer to measure
         * @return the size of the buffer in bytes
         */
        template<typename T>
#ifdef __CUDACC__
        __host__ __device__
#endif
        int bufferSize(Buffer<T> *buffer) {
            return sizeof(T) * buffer->length;
        }
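
        // For example (illustrative): a Buffer<float> with length = 4 reports
        // sizeof(float) * 4 = 16 bytes.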

#ifdef __CUDACC__
        /**
         * Copies the host data to the gpu buffer.
         * @param buffer the buffer to copy to the gpu
         * @param stream the cuda stream to perform the copy on
         */
        template<typename T>
        __host__ void copyDataToGpu(Buffer<T> **buffer, cudaStream_t stream) {
            Buffer<T> *bufferRef = *buffer;
            checkCudaErrors(cudaMemcpyAsync(bufferRef->gData, bufferRef->data, bufferSize(bufferRef), cudaMemcpyHostToDevice, stream));
            checkCudaErrors(cudaStreamSynchronize(stream));
        }

        /**
         * Copies the gpu data back to the host buffer.
         * @param buffer the buffer to copy from the gpu
         * @param stream the cuda stream to perform the copy on
         */
        template<typename T>
        __host__ void copyDataFromGpu(Buffer<T> **buffer, cudaStream_t stream) {
            Buffer<T> *bufferRef = *buffer;
            int bufferTotalSize = bufferSize(bufferRef);
            checkCudaErrors(cudaMemcpyAsync(bufferRef->data, bufferRef->gData, bufferTotalSize, cudaMemcpyDeviceToHost, stream));
            checkCudaErrors(cudaStreamSynchronize(stream));
        }
#endif

        /**
         * Allocate buffer of the given
         * length on the cpu and gpu.
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        void allocBuffer(Buffer<T> **buffer, int length) {
            Buffer<T> *bufferRef = *buffer;
            bufferRef->length = length;
            // allocate with new[] so the delete[] in ~Buffer() matches the allocation
            bufferRef->data = new T[length];

            CHECK_ALLOC(bufferRef->data, "Failed to allocate new buffer", sizeof(T) * length);
#ifdef __CUDACC__
            checkCudaErrors(cudaMalloc(&bufferRef->gData, sizeof(T) * length));
#endif
        }

        /**
         * Frees the given buffer
         * (gpu and cpu)
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        void freeBuffer(Buffer<T> **buffer) {
            Buffer<T> *bufferRef = *buffer;
#ifdef __CUDACC__
            if(bufferRef->gData != nullptr) {
                checkCudaErrors(cudaFree(bufferRef->gData));
                // clear the device pointer so ~Buffer() does not delete[] freed gpu memory
                bufferRef->gData = nullptr;
            }
#endif
            delete bufferRef;
        }

        /**
         * Creates a buffer
         * based on the data
         * and also synchronizes
         * the data on the gpu.
         */
        template<typename T>
#ifdef __CUDACC__
        __host__
#endif
        Buffer<T> *createBuffer(T *data, int length) {
            Buffer<T> *ret = new Buffer<T>;
            T *buffData = new T[length];
            for(int i = 0; i < length; i++)
                buffData[i] = data[i];
            ret->data = buffData;
            ret->length = length;
            return ret;
        }

#ifdef __CUDACC__
        template<typename T>
        __host__
        Buffer<T> *createBuffer(T *data, int length, cudaStream_t stream) {
            Buffer<T> *ret = createBuffer(data, length);

            T *gData;
            T **gDataRef = &(gData);
            checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(gDataRef), sizeof(T) * length));
            ret->gData = gData;
            checkCudaErrors(cudaMemcpyAsync(ret->gData, ret->data, sizeof(T) * length, cudaMemcpyHostToDevice, stream));
            return ret;
        }
#endif
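
        // Illustrative sketch (CUDA builds only, not part of the API): a typical
        // host/device round trip with the helpers above. Assumes a valid
        // cudaStream_t named "stream" created by the caller.
        //
        //     float hostData[] = {1.0f, 2.0f, 3.0f, 4.0f};
        //     auto *buf = nd4j::buffer::createBuffer(hostData, 4, stream); // host copy + gpu copy
        //     // ... launch kernels that read/write buf->gData on "stream" ...
        //     nd4j::buffer::copyDataFromGpu(&buf, stream);                 // device -> host
        //     nd4j::buffer::freeBuffer(&buf);                              // releases gpu and cpu memory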

    }
}

#ifdef __CUDACC__
// definition of the printArr declared in nd4j::buffer; the name is qualified so this
// matches that declaration instead of introducing a separate global function
template<typename T>
__host__ void nd4j::buffer::printArr(nd4j::buffer::Buffer<T> *buff) {
    for (int i = 0; i < buff->length; i++) {
        printf("Buffer[%d] was %f\n", i, buff->data[i]);
    }
}

#endif

#endif /* BUFFER_H_ */