#include <base/runtime.hpp>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#define NUM_CUBLAS_HANDLE 8
#define NUM_CUDA_STREAM 10

/** NVIDIA GPU device; inheritance from the Device base class (device.hpp, described at the end of this listing) is inferred from the cross-reference. */
class Nvidia : public hmlp::Device
{
  public:

    Nvidia( int device_id )
    {
      printf( "Setup device %d\n", device_id );
      if ( cudaSetDevice( device_id ) )
      {
        int device_count = 0;
        cudaGetDeviceCount( &device_count );
        printf( "cudaSetDevice(), fail to set device %d / %d\n", device_id, device_count );
      }

      /** record device properties */
      struct cudaDeviceProp prop;
      cudaGetDeviceProperties( &prop, device_id );
      this->device_id = device_id;
      this->devicetype = hmlp::DeviceType::NVIDIA_GPU;
      this->name = std::string( prop.name );
      this->total_memory = prop.totalGlobalMem;
      this->memory_left = prop.totalGlobalMem;

      /** create CUDA streams and cuBLAS handles, binding one handle per stream */
      for ( int i = 0; i < NUM_CUDA_STREAM; i ++ )
      {
        if ( cudaStreamCreate( &(stream[ i ]) ) )
          printf( "cudaStreamCreate(), fail on device %d\n", device_id );
      }
      for ( int i = 0; i < NUM_CUBLAS_HANDLE; i ++ )
      {
        if ( cublasCreate( &handle[ i ] ) )
          printf( "cublasCreate(), fail on device %d\n", device_id );
        if ( cublasSetStream( handle[ i ], stream[ i ] ) )
          printf( "cublasSetStream(), fail on device %d\n", device_id );
      }
      std::cout << name << ", " << this->total_memory / 1E+9 << "GB" << std::endl;

      /** allocate a work_size-byte workspace buffer */
      work_d = (char*)malloc( work_size );
    }
    ~Nvidia()
    {
      for ( int i = 0; i < NUM_CUBLAS_HANDLE; i ++ )
      {
        if ( cublasDestroy( handle[ i ] ) )
          printf( "cublasDestroy(), fail on device %d\n", device_id );
      }
    }
    /** asynchronous device-to-host copy on the given stream */
    void prefetchd2h( void *ptr_h, void *ptr_d, size_t size, int stream_id )
    {
      if ( cudaSetDevice( device_id ) )
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      if ( cudaMemcpyAsync( ptr_h, ptr_d, size, cudaMemcpyDeviceToHost, stream[ stream_id ] ) )
        printf( "cudaMemcpyAsync(), %lu bytes fail on device %d\n", size, device_id );
    }
    /** asynchronous host-to-device copy; prefetches via cudaMemPrefetchAsync() when ptr_h is managed memory */
    void prefetchh2d( void *ptr_d, void *ptr_h, size_t size, int stream_id )
    {
      if ( cudaSetDevice( device_id ) )
      {
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      }

      /** check whether ptr_h refers to managed (unified) memory */
      struct cudaPointerAttributes attribute;
      if ( cudaPointerGetAttributes( &attribute, ptr_h ) )
      {
        printf( "cudaPointerGetAttributes(), fail on device %d\n", device_id );
      }

      if ( attribute.isManaged )
      {
        printf( "ptr_h is managed\n" );
        if ( cudaMemPrefetchAsync( ptr_d, size, device_id, stream[ stream_id ] ) )
          printf( "cudaMemPrefetchAsync(), fail on device %d\n", device_id );
      }
      else
      {
        if ( cudaMemcpyAsync( ptr_d, ptr_h, size, cudaMemcpyHostToDevice, stream[ stream_id ] ) )
          printf( "cudaMemcpyAsync(), %lu bytes fail to device %d\n", size, device_id );
      }
    }
    /** synchronize all streams (enclosing function name not shown in the listing; waitexecute() assumed) */
    void waitexecute()
    {
      for ( int stream_id = 0; stream_id < NUM_CUDA_STREAM; stream_id ++ )
        wait( stream_id );
    }
    /** block until the given stream has drained */
    void wait( int stream_id )
    {
      if ( cudaSetDevice( device_id ) )
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      if ( cudaStreamSynchronize( stream[ stream_id ] ) )
        printf( "cudaStreamSynchronize(), fail on device %d\n", device_id );
    }
    size_t get_memory_left()
    {
      return memory_left;
    }
    /** allocate and zero-initialize device memory, keeping a 256 MB safety margin */
    void* malloc( size_t size )
    {
      void *ptr_d = NULL;
      if ( cudaSetDevice( device_id ) )
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      if ( size + 268435456 < memory_left )
      {
        if ( cudaMalloc( (void**)&ptr_d, size ) )
        {
          printf( "cudaMalloc() error\n" );
        }
        cudaMemset( ptr_d, 0, size );
        memory_left -= size;
      }
      else
      {
        printf( "not allocated, only %5.2lf GB left\n", memory_left / 1E+9 );
      }
      return ptr_d;
    }
    /** allocate and zero-initialize device memory, keeping a 1 GB safety margin;
     *  note that ptr_d is passed by value, so the address written by cudaMalloc()
     *  does not propagate back to the caller */
    void malloc( void *ptr_d, size_t size )
    {
      if ( cudaSetDevice( device_id ) )
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      if ( size + 1073741824 < memory_left )
      {
        if ( cudaMalloc( (void**)&ptr_d, size ) )
        {
          printf( "cudaMalloc() error\n" );
        }
        cudaMemset( ptr_d, 0, size );
        memory_left -= size;
      }
      else
      {
        printf( "not allocated, only %5.2lf GB left\n", memory_left / 1E+9 );
      }
    }
    /** release device memory and return it to the bookkeeping counter */
    void free( void *ptr_d, size_t size )
    {
      if ( cudaSetDevice( device_id ) )
        printf( "cudaSetDevice(), fail to set device %d\n", device_id );
      if ( ptr_d )
      {
        if ( cudaFree( ptr_d ) )
          printf( "cudaFree(), fail on device %d\n", device_id );
        memory_left += size;
      }
      else
      {
        printf( "try to free a null device pointer\n" );
      }
      printf( "free %lu memory_left %5.2lfGB\n", size, memory_left / 1E+9 );
    }
    cudaStream_t &getstream( int stream_id )
    {
      return stream[ stream_id ];
    }

    cublasHandle_t &gethandle( int stream_id )
    {
      return handle[ stream_id ];
    }
  private:

    cudaStream_t stream[ NUM_CUDA_STREAM ];
    cublasHandle_t handle[ NUM_CUBLAS_HANDLE ];
    /** workspace buffer; declaration implied by its use in the constructor */
    char *work_d = NULL;
    size_t work_size = 1073741824;
    size_t total_memory = 0;
    size_t memory_left = 0;
};

#endif // define HMLP_GPU_HPP
This class describes devices or accelerators that require a master thread for control. A device can accept tasks from multiple workers, and all received tasks are expected to execute independently in a time-sharing fashion. Whether these tasks run in parallel, sequentially, or under some built-in context-switching scheme does not matter.
Definition: device.hpp:125
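
For context, the sketch below shows how such a device object might be driven from a host thread, using only the members visible in the listing above (the constructor, malloc, prefetchh2d/prefetchd2h, wait, and free). It is a minimal usage sketch, not part of hmlp_gpu.hpp; the device id, buffer size, and stream index are illustrative, and the runtime's task/worker machinery as well as real error handling are omitted.

/** Minimal usage sketch (assumes hmlp_gpu.hpp is included and a CUDA device is present). */
#include <vector>

int main()
{
  Nvidia dev( 0 );                                   /** bind to GPU 0, create streams and cuBLAS handles */

  size_t n = 1 << 20;
  std::vector<double> x_h( n, 1.0 );                 /** host buffer */
  double *x_d = (double*)dev.malloc( n * sizeof(double) );   /** zero-initialized device buffer */

  dev.prefetchh2d( x_d, x_h.data(), n * sizeof(double), 0 ); /** async copy host-to-device on stream 0 */
  /** ... launch kernels or cuBLAS calls on dev.getstream( 0 ) / dev.gethandle( 0 ) ... */
  dev.prefetchd2h( x_h.data(), x_d, n * sizeof(double), 0 ); /** async copy device-to-host on stream 0 */
  dev.wait( 0 );                                     /** block until stream 0 has drained */

  dev.free( x_d, n * sizeof(double) );               /** release device memory and update bookkeeping */
  return 0;
}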