HMLP: High-performance Machine Learning Primitives
gofmm_gpu.hpp
#ifndef GOFMM_GPU_HPP
#define GOFMM_GPU_HPP

#include <omp.h>

#include <cassert>
#include <cstdio>
#include <set>
#include <sstream>
#include <vector>

#include <cuda_runtime.h>
#include <cublas_v2.h>

/** The HMLP device, runtime, and BLAS wrapper headers are expected to be
 *  included by the translation unit before this header. */

#define DEBUG_SPDASKIT_GPU 1

using namespace std;


namespace hmlp
{
namespace gofmm
{
namespace gpu
{

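/** assemble() scatters the m-by-n device buffer a into the output matrix A,
 *  writing row i of a to row amap[ i ] of A (leading dimension lda) on the
 *  given CUDA stream. The two overloads below are assumed to be implemented
 *  as CUDA kernels in the accompanying .cu file. */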
void assemble(
  cudaStream_t stream, int m, int n,
  const double *a,
  const size_t *amap,
  double *A, int lda );
void assemble(
  cudaStream_t stream, int m, int n,
  const float *a,
  const size_t *amap,
  float *A, int lda );


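/** Nodes-to-skeletons (N2S) upward pass for one tree node. A leaf computes
 *  w_skel = proj * w_leaf; an internal node computes
 *  w_skel = proj * [ w_lskel; w_rskel ] as two cuBLAS GEMMs against the
 *  children's skeleton weights. Operands are fetched host-to-device on
 *  demand, and the result is copied back after the call. */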
template<typename DEVICE, typename NODE>
void UpdateWeights( DEVICE *dev, NODE *node )
{
  assert( dev && node );

#ifdef DEBUG_SPDASKIT_GPU
  printf( "\n%lu UpdateWeight on GPU\n", node->treelist_id ); fflush( stdout );
#endif

  if ( !node->parent || !node->data.isskel ) return;

  double beg, flops;

  auto &w = *node->setup->w;

  auto &data = node->data;
  auto &proj = data.proj;
  auto &skels = data.skels;
  auto &w_skel = data.w_skel;
  auto &w_leaf = data.w_leaf;
  //auto *lchild = node->lchild;
  //auto *rchild = node->rchild;


  size_t s = data.skels.size();
  size_t nrhs = w.col();

  w_skel.resize( 0, 0 );
  w_skel.resize( s, nrhs );
  w_skel.CacheD( dev );


  auto m = w_skel.row();
  auto n = w_skel.col();

  //printf( "-->w_proj\n" );
  proj.FetchH2D( dev );

  assert( proj.device_data( dev ) );
  assert( w_skel.device_data( dev ) );

  if ( node->isleaf )
  {
    /** Leaf case: w_skel = proj * w_leaf in a single cuBLAS GEMM. */
    auto k = w_leaf.row();
    flops = 2.0 * m * nrhs * k;
    //printf( "-->w_leaf\n" );
    w_leaf.FetchH2D( dev );
    assert( w_leaf.device_data( dev ) );

    beg = omp_get_wtime();
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, k,
      1.0, proj.device_data( dev ), proj.row(),
           w_leaf.device_data( dev ), w_leaf.row(),
      0.0, w_skel.device_data( dev ), w_skel.row()
    );
    dev->wait( 0 );

    double gemm_time = omp_get_wtime() - beg;
    printf( "n2s cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        m, nrhs, k,
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );
  }
  else
  {
    /** Internal case: w_skel = proj * [ w_lskel; w_rskel ], split into one
     *  GEMM per child; the second accumulates with beta = 1. */
    auto *lchild = node->lchild;
    auto *rchild = node->rchild;
    auto &w_lskel = lchild->data.w_skel;
    auto &w_rskel = rchild->data.w_skel;
    flops = 2.0 * m * nrhs * ( w_lskel.row() + w_rskel.row() );

    //printf( "-->w_lskel\n" );
    w_lskel.FetchH2D( dev );
    //printf( "-->w_rskel\n" );
    w_rskel.FetchH2D( dev );
    assert( w_lskel.device_data( dev ) );
    assert( w_rskel.device_data( dev ) );

    beg = omp_get_wtime();
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, w_lskel.row(),
      1.0, proj.device_data( dev ), proj.row(),
           w_lskel.device_data( dev ), w_lskel.row(),
      0.0, w_skel.device_data( dev ), w_skel.row()
    );
    dev->wait( 0 );
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, w_rskel.row(),
      1.0, proj.device_data( dev ) + m * w_lskel.row(), proj.row(),
           w_rskel.device_data( dev ), w_rskel.row(),
      1.0, w_skel.device_data( dev ), w_skel.row()
    );
    dev->wait( 0 );

    double gemm_time = omp_get_wtime() - beg;
    printf( "n2s cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        m, nrhs, w_lskel.row() + w_rskel.row(),
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );
  }

  w_skel.Redistribute<true>( dev );
  w_skel.FetchD2H( dev );

  //printf( "finish GPU task\n\n" );
};


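/** Skeletons-to-nodes (S2N) downward pass for one tree node. A leaf computes
 *  u_leaf = proj^T * u_skel; an internal node accumulates proj^T * u_skel
 *  into the children's u_lskel and u_rskel. Transfers are issued
 *  asynchronously on stream ( treelist_id mod 8 ) so that work for different
 *  nodes can overlap. */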
template<bool NNPRUNE, typename NODE, typename T, typename DEVICE>
void SkeletonsToNodes( DEVICE *dev, NODE *node )
{
#ifdef DEBUG_SPDASKIT_GPU
  printf( "%lu GPU Skel2Node u_skel.row() %lu\n",
      node->treelist_id, node->data.u_skel.row() ); fflush( stdout );
#endif

  double beg, flops, kij_s2n_time = 0.0, u_leaf_time,
         before_writeback_time, after_writeback_time;

  int stream_id = node->treelist_id % 8;

  auto &K = *node->setup->K;
  auto &w = *node->setup->w;
  auto &u = *node->setup->u;

  auto &lids = node->lids;
  auto &data = node->data;
  auto &proj = data.proj;
  auto &skels = data.skels;
  auto &u_skel = data.u_skel;
  auto *lchild = node->lchild;
  auto *rchild = node->rchild;


  size_t s = data.skels.size();
  size_t nrhs = w.col();


  if ( node->isleaf )
  {
    std::set<NODE*> *NearNodes;
    if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
    else           NearNodes = &node->NearNodes;
    auto &amap = node->lids;
    auto &u_leaf = node->data.u_leaf[ 0 ];

    u_leaf.resize( lids.size(), nrhs );

    if ( data.isskel )
    {
      proj.CacheD( dev );
      proj.PrefetchH2D( dev, stream_id );

      u_skel.CacheD( dev );
      u_skel.PrefetchH2D( dev, stream_id );

      u_leaf.CacheD( dev );

      size_t m = u_leaf.row();
      size_t n = u_leaf.col();
      size_t k = u_skel.row();
      flops = 2.0 * m * n * k;

      beg = omp_get_wtime();
      xgemm
      (
        reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
        CUBLAS_OP_T, CUBLAS_OP_N,
        m, nrhs, k,
        1.0, proj.device_data( dev ), proj.row(),
             u_skel.device_data( dev ), u_skel.row(),
        0.0, u_leaf.device_data( dev ), u_leaf.row()
      );
      //dev->wait( 0 );
      double gemm_time = omp_get_wtime() - beg;
      printf( "s2n cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
          m, nrhs, k,
          gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );

      u_leaf.Redistribute<true>( dev );
      //u_leaf.FetchD2H( dev );
      u_leaf.PrefetchD2H( dev, stream_id );
    }
  }
  else
  {
    if ( !node->parent || !node->data.isskel ) return;

    auto &u_lskel = lchild->data.u_skel;
    auto &u_rskel = rchild->data.u_skel;
    auto &lskel = lchild->data.skels;
    auto &rskel = rchild->data.skels;

    proj.CacheD( dev );
    //proj.FetchH2D( dev );
    proj.PrefetchH2D( dev, stream_id );

    u_skel.CacheD( dev );
    //u_skel.FetchH2D( dev );
    u_skel.PrefetchH2D( dev, stream_id );

    u_lskel.CacheD( dev );
    //u_lskel.FetchH2D( dev );
    u_lskel.PrefetchH2D( dev, stream_id );

    u_rskel.CacheD( dev );
    //u_rskel.FetchH2D( dev );
    u_rskel.PrefetchH2D( dev, stream_id );

    flops  = 2.0 * u_lskel.row() * u_lskel.col() * proj.row();
    flops += 2.0 * u_rskel.row() * u_rskel.col() * proj.row();

    beg = omp_get_wtime();
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
      CUBLAS_OP_T, CUBLAS_OP_N,
      u_lskel.row(), u_lskel.col(), proj.row(),
      1.0, proj.device_data( dev ), proj.row(),
           u_skel.device_data( dev ), u_skel.row(),
      1.0, u_lskel.device_data( dev ), u_lskel.row()
    );
    //dev->wait( 0 );
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
      CUBLAS_OP_T, CUBLAS_OP_N,
      u_rskel.row(), u_rskel.col(), proj.row(),
      1.0, proj.device_data( dev ) + proj.row() * lskel.size(), proj.row(),
           u_skel.device_data( dev ), u_skel.row(),
      1.0, u_rskel.device_data( dev ), u_rskel.row()
    );
    //dev->wait( 0 );
    double gemm_time = omp_get_wtime() - beg;
    printf( "s2n cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        u_lskel.row(), u_lskel.col(), proj.row(),
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );

    u_lskel.Redistribute<true>( dev );
    u_rskel.Redistribute<true>( dev );
    //u_lskel.FetchD2H( dev );
    u_lskel.PrefetchD2H( dev, stream_id );
    u_rskel.PrefetchD2H( dev, stream_id );
  }
};


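/** Leaves-to-leaves (L2L) near-field evaluation for one leaf. On the GPU
 *  path it computes C = NearKab^T * w_leaf with cuBLAS into a device
 *  workspace, then scatters C into the global output u through the row map
 *  Nearbmap using assemble(). The CPU fallback instead loops over the near
 *  nodes and accumulates into each one's per-thread u_leaf buffer. If the
 *  kernel submatrix NearKab was not cached, it is evaluated on the fly. */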
template<bool CACHE, bool NNPRUNE, typename NODE, typename T, typename DEVICE>
void LeavesToLeaves( DEVICE *dev, NODE *node )
{
#ifdef DEBUG_SPDASKIT_GPU
  printf( "\n%lu LeavesToLeaves on GPU\n", node->treelist_id ); fflush( stdout );
#endif

  assert( node && node->isleaf );

  double beg, gemm_time = 0.0, total_time = 0.0, flops = 0.0;

  int stream_id = 0;

  auto &K = *node->setup->K;
  auto &w = *node->setup->w;
  auto &u = *node->setup->u;

  auto &lids = node->lids;
  auto &data = node->data;
  auto &Nearbmap = data.Nearbmap;
  auto &NearKab = data.NearKab;
  auto &w_leaf = data.w_leaf;


  /** Evaluate NearKab on the fly if it has not been cached. */
  if ( NearKab.size() != lids.size() * Nearbmap.size() )
  {
    vector<size_t> bmap( Nearbmap.size() );
    for ( size_t i = 0; i < bmap.size(); i ++ )
      bmap[ i ] = Nearbmap[ i ];
    assert( !CACHE );
    NearKab = K( lids, bmap );

    printf( "Not cached\n" );
  }

  if ( dev )
  {
    size_t m = NearKab.col();
    size_t k = w_leaf.row();
    size_t nrhs = w_leaf.col();

    flops += 2.0 * m * nrhs * k;

    w_leaf.FetchH2D( dev );
    NearKab.FetchH2D( dev );
    Nearbmap.FetchH2D( dev );

    //printf( "prepare cublas gemm\n" ); fflush( stdout );
    /** The result panel must fit in the preallocated device workspace. */
    assert( m * nrhs * sizeof(T) < 1200000000 );

    cublasHandle_t &handle =
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id );
    T *A = NearKab.device_data( dev );
    T *B = w_leaf.device_data( dev );
    T *C = (T*)reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->workspace();

    //dev->wait( stream_id );

    beg = omp_get_wtime();
    xgemm
    (
      handle,
      CUBLAS_OP_T, CUBLAS_OP_N,
      m, nrhs, k,
      1.0, A, NearKab.row(),
           B, w_leaf.row(),
      0.0, C, m
    );

    dev->wait( stream_id );
    gemm_time = omp_get_wtime() - beg;

    /** Scatter the workspace panel into the global output u. */
    assemble
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->getstream( stream_id ),
      m, nrhs,
      C,
      Nearbmap.device_data( dev ),
      u.device_data( dev ), u.row()
    );

    dev->wait( stream_id );

    total_time = omp_get_wtime() - beg;
    printf( "l2l cublas m %lu n %lu k %lu ldu %lu, %5.3lf (%5.2lf GFLOPS) total %5.3lf\n",
        m, nrhs, k, u.row(),
        gemm_time, flops / ( gemm_time * 1E+9 ), total_time );

    u.Redistribute<true>( dev );


    //w_leaf.FreeD( dev );
    //NearKab.FreeD( dev );
    //Nearbmap.FreeD( dev );
  }
  else
  {
    printf( "cpu gemm begin\n" ); fflush( stdout );
    std::set<NODE*> *NearNodes;
    if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
    else           NearNodes = &node->NearNodes;

    size_t n = w_leaf.col();
    size_t k = NearKab.row();
    size_t offset = 0;

    assert( NearKab.size() );
    assert( k == w_leaf.row() );

    for ( auto it = NearNodes->begin();
               it != NearNodes->end(); it ++ )
    {
      assert( omp_get_thread_num() > 0 && omp_get_thread_num() < 20 );
      auto &u_leaf = (*it)->data.u_leaf[ omp_get_thread_num() ];
      size_t m = (*it)->lids.size();

      assert( offset < NearKab.col() );
      assert( w_leaf.size() == k * n );

      if ( u_leaf.size() != m * n )
      {
        //printf( "u_leaf.size() %lu m %lu n %lu w.row() %lu\n",
        //    u_leaf.size(), m, n, w.row() );
        u_leaf.resize( 0, 0 );
        u_leaf.resize( m, n, 0.0 );
      }

      //printf( "NearKab.col() %lu m %lu offset %lu\n",
      //    NearKab.col(), m, offset ); fflush( stdout );

      /** Accumulate this near node's contribution into its u_leaf. */
      xgemm
      (
        "T", "N",
        m, n, k,
        1.0, NearKab.data() + offset * k, k,
             w_leaf.data(), w_leaf.row(),
        1.0, u_leaf.data(), u_leaf.row()
      );
      offset += m;
    }
    printf( "cpu gemm finished\n" ); fflush( stdout );
  }

  /** Release NearKab if it is not cached across evaluations. */
  if ( !CACHE ) NearKab.resize( 0, 0 );

  //printf( "Finish GPU LeavesToLeaves\n\n" );
};


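/** Runtime task wrapper that schedules the GPU leaves-to-leaves kernel. It
 *  derives from hmlp::Task: Set() records the flop and mop estimates for the
 *  scheduler, Prefetch() stages the operands host-to-device, and Execute()
 *  dispatches gpu::LeavesToLeaves. The class-declaration line was lost in
 *  this listing; the name used below is an assumption. */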
template<bool CACHE, bool NNPRUNE, typename NODE, typename T>
class LeavesToLeavesTask : public hmlp::Task
{
  public:

    NODE *arg;

    int stream_id;

    void Set( NODE *user_arg )
    {
      arg = user_arg;
      stream_id = ( arg->treelist_id % 8 ) + 1;
      name = std::string( "l2l" );
      {
        //label = std::to_string( arg->treelist_id );
        std::ostringstream ss;
        ss << arg->treelist_id;
        label = ss.str();
      }

      assert( arg->isleaf );

      /** Estimate flops and mops for the scheduler. */
      //--------------------------------------
      double flops = 0.0, mops = 0.0;
      auto *node = arg;
      auto &data = node->data;
      auto &w_leaf = data.w_leaf;
      auto *NearNodes = &node->NearNodes;
      if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
      auto &amap = node->lids;

      /** Without caching, build the near-node row map up front. */
      if ( !CACHE )
      {
        vector<size_t> bmap;
        for ( auto it = NearNodes->begin(); it != NearNodes->end(); it ++ )
        {
          bmap.insert( bmap.end(), (*it)->lids.begin(), (*it)->lids.end() );
        }
        data.Nearbmap.resize( bmap.size(), 1 );
        for ( size_t i = 0; i < bmap.size(); i ++ )
          data.Nearbmap[ i ] = bmap[ i ];
      }

      size_t m = w_leaf.col();
      size_t n = arg->data.Nearbmap.size();
      size_t k = arg->lids.size();

      flops += 2.0 * m * n * k;
      mops  += 2.0 * ( m * n + m * k + k * n );

      event.Set( name + label, flops, mops );

      cost = flops / 1E+9;

      //printf( "cost %5.2lf\n", cost );

      priority = false;

      this->stealable = false;
    };

    void Prefetch( Worker* user_worker )
    {
      hmlp::Device *device = NULL;
      if ( user_worker ) device = user_worker->GetDevice();

      //if ( !arg->data.NearKab.is_up_to_date( hmlp_get_device( 0 ) ) )
      //  device = NULL;

      if ( device )
      {
        /** Without caching, evaluate NearKab before staging it. */
        if ( !CACHE )
        {
          assert( !arg->data.NearKab.size() );
          auto &K = *arg->setup->K;
          std::vector<size_t> bmap( arg->data.Nearbmap.size() );
          for ( size_t i = 0; i < bmap.size(); i ++ )
            bmap[ i ] = arg->data.Nearbmap[ i ];
          arg->data.NearKab = K( arg->lids, bmap );

          printf( "not cached\n" );
        }
        //if ( arg->data.NearKab.size() < 4096 * 4096 * 4 )
        //  arg->data.NearKab.CacheD( device );
        arg->data.NearKab.PrefetchH2D( device, stream_id );

        arg->data.Nearbmap.CacheD( device );
        arg->data.Nearbmap.PrefetchH2D( device, stream_id );
        //arg->data.Nearbmap.PrefetchH2D( device, 2 );
        arg->data.w_leaf.CacheD( device );
        arg->data.w_leaf.PrefetchH2D( device, stream_id );
        //arg->data.w_leaf.PrefetchH2D( device, 3 );
      }
    };

    void DependencyAnalysis()
    {
      assert( arg->isleaf );
      /** Pin the task to worker queue 0. */
      //if ( arg->data.NearKab.is_up_to_date( hmlp_get_device( 0 ) ) )
      this->ForceEnqueue( 0 );
      //else
      //  this->Enqueue();
    };

    void Execute( Worker* user_worker )
    {
      hmlp::Device *device = NULL;
      if ( user_worker ) device = user_worker->GetDevice();
      gpu::LeavesToLeaves<CACHE, NNPRUNE, NODE, T>( device, arg );
    };

};
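/** Minimal usage sketch (hypothetical driver code; actual task creation
 *  lives in the GOFMM evaluation routines, not in this header):
 *
 *    auto *task = new LeavesToLeavesTask<CACHE, NNPRUNE, NODE, T>();
 *    task->Set( leaf );           // record name, label, flops/mops, cost
 *    task->DependencyAnalysis();  // ForceEnqueue( 0 ): pin to one queue
 *    // ... the runtime then invokes Prefetch() and Execute() on a worker.
 */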


}; /** end namespace gpu */
}; /** end namespace gofmm */
}; /** end namespace hmlp */

#endif /** define GOFMM_GPU_HPP */