HMLP: High-performance Machine Learning Primitives
hmlp_gpu.hpp — NVIDIA GPU device abstraction (CUDA streams, cuBLAS handles, device memory pool).
22 #ifndef HMLP_GPU_HPP
23 #define HMLP_GPU_HPP
24 
25 #include <cassert>
26 #include <set>
27 #include <map>
28 
29 #include <base/runtime.hpp>
30 
32 #include <cuda_runtime.h>
33 #include <cublas_v2.h>
34 #include <thrust/system/cuda/experimental/pinned_allocator.h>
35 
36 #define NUM_CUBLAS_HANDLE 8
37 #define NUM_CUDA_STREAM 10
38 
39 namespace hmlp
40 {
41 namespace gpu
42 {
43 
44 
45 class Nvidia : public hmlp::Device
46 {
47  public:
48 
49  Nvidia( int device_id )
50  {
51  printf( "Setup device %d\n", device_id );
52  if ( cudaSetDevice( device_id ) )
53  {
54  int device_count = 0;
55  cudaGetDeviceCount( &device_count );
56  printf( "cudaSetDevice(), fail to set device %d / %d\n",
57  device_id, device_count );
58  exit( 1 );
59  }
60  else
61  {
62  struct cudaDeviceProp prop;
63  cudaGetDeviceProperties( &prop, device_id );
64  this->device_id = device_id;
65  this->devicetype = hmlp::DeviceType::NVIDIA_GPU;
66  this->name = std::string( prop.name );
67  this->total_memory = prop.totalGlobalMem;
68  this->memory_left = prop.totalGlobalMem;
69 
70  for ( int i = 0; i < NUM_CUDA_STREAM; i ++ )
71  {
72  if ( cudaStreamCreate( &(stream[ i ] ) ) )
73  printf( "cudaStreamCreate(), fail on device %d\n", device_id );
74  }
75 
76  for ( int i = 0; i < NUM_CUBLAS_HANDLE; i ++ )
77  {
78  if ( cublasCreate( &handle[ i ] ) )
79  printf( "cublasCreate(), fail on device %d\n", device_id );
80  if ( cublasSetStream( handle[ i ], stream[ i ] ) )
81  printf( "cublasSetStream(), fail on device %d\n", device_id );
82  }
83  std::cout << name << ", " << this->total_memory / 1E+9 << "GB" << std::endl;
84 
86  work_d = (char*)malloc( work_size );
87 
89  cache.Setup( this );
90 
91  }
92  };
93 
94  ~Nvidia()
95  {
96  for ( int i = 0; i < NUM_CUBLAS_HANDLE; i ++ )
97  {
98  if ( cublasDestroy( handle[ i ] ) )
99  printf( "cublasDestroy(), fail on device %d\n", device_id );
100  }
101  };
102 
103 
104  void prefetchd2h( void *ptr_h, void *ptr_d, size_t size, int stream_id )
105  {
106  if ( cudaSetDevice( device_id ) )
107  {
108  exit( 1 );
109  }
110  if ( cudaMemcpyAsync( ptr_h, ptr_d, size, cudaMemcpyDeviceToHost, stream[ stream_id ] ) )
111  {
112  exit( 1 );
113  }
114  };
115 
116  void prefetchh2d( void *ptr_d, void *ptr_h, size_t size, int stream_id )
117  {
118  if ( cudaSetDevice( device_id ) )
119  {
120  printf( "cudaSetDevice(), fail to set device %d\n", device_id );
121  exit( 1 );
122  }
123 
124  struct cudaPointerAttributes attribute;
125 
126  if ( cudaPointerGetAttributes ( &attribute, ptr_h ) )
127  {
128  printf( "cudaPointerGetAttributes(), fail on device %d\n", device_id );
129  exit( 1 );
130  }
131 
132  if ( attribute.isManaged )
133  {
134  printf( "ptr_h is managed\n" );
135  if ( cudaMemPrefetchAsync( ptr_d, size, device_id, stream[ stream_id ] ) )
136  {
137  printf( "cudaMemPrefetchAsync(), fail on device %d\n", device_id );
138  }
139  }
140 
141 
142 
143  if ( cudaMemcpyAsync( ptr_d, ptr_h, size, cudaMemcpyHostToDevice, stream[ stream_id ] ) )
144  //if ( cudaMemcpy( ptr_d, ptr_h, size, cudaMemcpyHostToDevice ) )
145  {
146  printf( "cudaMemcpyAsync(), %lu bytes fail to device %d\n", size, device_id );
147  exit( 1 );
148  }
149  };
150 
151  void waitexecute()
152  {
153  for ( int stream_id = 0; stream_id < NUM_CUDA_STREAM; stream_id ++ )
154  {
155  wait( stream_id );
156  }
157  };
158 
159  void wait( int stream_id )
160  {
161  if ( cudaSetDevice( device_id ) )
162  {
163  exit( 1 );
164  }
165  if ( cudaStreamSynchronize( stream[ stream_id ] ) )
166  {
167  exit( 1 );
168  }
169  };
170 
171  size_t get_memory_left()
172  {
173  return memory_left;
174  };
175 
176  void* malloc( size_t size )
177  {
178  void *ptr_d = NULL;
179  if ( cudaSetDevice( device_id ) )
180  {
181  exit( 1 );
182  }
183  if ( size + 268435456 < memory_left )
184  {
185  memory_left -= size;
186  if ( cudaMalloc( (void**)&ptr_d, size ) )
187  {
188  printf( "cudaMalloc() error\n");
189  exit( 1 );
190  }
191  cudaMemset( ptr_d, 0, size );
192  }
193  else
194  {
195  printf( "not allocated, only %5.2lf GB left\n", memory_left / 1E+9 );
196  }
197  return ptr_d;
198  };
199 
200  void malloc( void *ptr_d, size_t size )
201  {
202  if ( cudaSetDevice( device_id ) )
203  {
204  exit( 1 );
205  }
206 
207  if ( size + 1073741824 < memory_left )
208  {
209  memory_left -= size;
210  if ( cudaMalloc( (void**)&ptr_d, size ) )
211  {
212  printf( "cudaMalloc() error\n");
213  exit( 1 );
214  }
215  cudaMemset( ptr_d, 0, size );
216  }
217  else
218  {
219  printf( "not allocated, only %5.2lf GB left\n", memory_left / 1E+9 );
220  }
221  };
222 
223  char *workspace()
224  {
225  return work_d;
226  };
227 
228  void free( void *ptr_d, size_t size )
229  {
230  if ( cudaSetDevice( device_id ) )
231  {
232  exit( 1 );
233  }
234  if ( ptr_d )
235  {
236  if ( cudaFree( ptr_d ) )
237  {
238  exit( 1 );
239  }
240  memory_left += size;
241  }
242  else
243  {
244  printf( "try to free a null device pointer\n" );
245  }
246  printf( "free %lu memory_left %5.2lfGB\n", size, memory_left / 1E+9 );
247  };
248 
249  cudaStream_t &getstream( int stream_id )
250  {
251  return stream[ stream_id ];
252  }
253 
254  cublasHandle_t &gethandle( int stream_id )
255  {
256  return handle[ stream_id ];
257  };
258 
259  //Cache<8> cache;
260 
261  private:
262 
263  int device_id;
264 
266  cudaStream_t stream[ NUM_CUDA_STREAM ];
267 
269  cublasHandle_t handle[ NUM_CUBLAS_HANDLE ];
270 
271  char *work_d = NULL;
272 
273  size_t work_size = 1073741824;
274 
275  size_t total_memory = 0;
276 
277  size_t memory_left = 0;
278 
279 };
280 
281 
282 
283 }; // end namespace gpu
284 }; // end namespace hmlp
285 
286 #endif // define HMLP_GPU_HPP
Cross-references (Doxygen extraction residue):
- hmlp::gpu::Nvidia — defined at hmlp_gpu.hpp:45; constructor Nvidia(int device_id) at hmlp_gpu.hpp:49.
- hmlp::Device (device.hpp:125): describes devices or accelerators that require a master thread to control. A device can accept tasks from multiple workers; all received tasks are expected to execute independently in a time-sharing fashion. Whether these tasks run in parallel, sequentially, or with some built-in context-switching scheme does not matter.
- Cache (gofmm.hpp:83).