#define DEBUG_SPDASKIT_GPU 1

/**
 *  Device scatter-add kernels used by LeavesToLeaves() below. Only
 *  ( stream, m, n ) of these declarations survived in this listing; the
 *  name and the remaining parameters are reconstructed from the call site
 *  in LeavesToLeaves().
 */
void assemble
(
  cudaStream_t stream,
  int m,
  int n,
  double *a,
  size_t *amap,
  double *A, int lda
);

void assemble
(
  cudaStream_t stream,
  int m,
  int n,
  float *a,
  size_t *amap,
  float *A, int lda
);
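/**
 *  [Hedged sketch] A minimal CUDA implementation of the scatter-add that the
 *  declarations above plausibly perform, assuming column-major storage and
 *  the call-site semantics A( amap[ i ], j ) += a( i, j ). The kernel name,
 *  launch configuration, and the assemble_sketch wrapper are illustrative,
 *  not the library's actual definitions (those live in its .cu files).
 *  Correctness of += without atomics relies on amap holding unique row
 *  indices within one launch; here bmap concatenates disjoint lids.
 */
template<typename T>
__global__ void assemble_kernel( int m, int n, const T *a, const size_t *amap, T *A, int lda )
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if ( i >= m ) return;
  /** Thread i owns row i of a and scatters it into row amap[ i ] of A. */
  for ( int j = 0; j < n; j ++ )
    A[ amap[ i ] + (size_t)j * lda ] += a[ i + (size_t)j * m ];
}

template<typename T>
void assemble_sketch( cudaStream_t stream, int m, int n, const T *a, const size_t *amap, T *A, int lda )
{
  int block = 256;
  int grid  = ( m + block - 1 ) / block;
  assemble_kernel<T><<< grid, block, 0, stream >>>( m, n, a, amap, A, lda );
}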
/** Skeleton weights (n2s): w_skel = proj * w, evaluated with cuBLAS. */
template<typename DEVICE, typename NODE>
void UpdateWeights( DEVICE *dev, NODE *node )
{
  assert( dev && node );

#ifdef DEBUG_SPDASKIT_GPU
  printf( "\n%lu UpdateWeight on GPU\n", node->treelist_id ); fflush( stdout );
#endif

  /** The root and unskeletonized nodes have no skeleton weights to update. */
  if ( !node->parent || !node->data.isskel ) return;

  /** Gather the user weights and per-node data. */
  auto &w = *node->setup->w;
  auto &data = node->data;
  auto &proj = data.proj;
  auto &skels = data.skels;
  auto &w_skel = data.w_skel;
  auto &w_leaf = data.w_leaf;

  size_t s = data.skels.size();
  size_t nrhs = w.col();

  /** Shrink-then-grow forces a clean s-by-nrhs buffer. */
  w_skel.resize( 0, 0 );
  w_skel.resize( s, nrhs );

  auto m = w_skel.row();
  auto n = w_skel.col();

  double beg = 0.0, flops = 0.0;

  assert( proj.device_data( dev ) );
  assert( w_skel.device_data( dev ) );

  if ( node->isleaf )
  {
    auto k = w_leaf.row();
    flops = 2.0 * m * nrhs * k;

    w_leaf.FetchH2D( dev );
    assert( w_leaf.device_data( dev ) );

    /** w_skel = proj * w_leaf */
    beg = omp_get_wtime();
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, k,
      1.0, proj.device_data( dev ), proj.row(),
           w_leaf.device_data( dev ), w_leaf.row(),
      0.0, w_skel.device_data( dev ), w_skel.row()
    );
    double gemm_time = omp_get_wtime() - beg;
    printf( "n2s cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        m, nrhs, k,
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );
  }
  else
  {
    auto *lchild = node->lchild;
    auto *rchild = node->rchild;
    auto &w_lskel = lchild->data.w_skel;
    auto &w_rskel = rchild->data.w_skel;
    flops = 2.0 * m * nrhs * ( w_lskel.row() + w_rskel.row() );

    w_lskel.FetchH2D( dev );
    w_rskel.FetchH2D( dev );
    assert( w_lskel.device_data( dev ) );
    assert( w_rskel.device_data( dev ) );

    beg = omp_get_wtime();
    /** w_skel = proj( :, 0:lskel ) * w_lskel */
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, w_lskel.row(),
      1.0, proj.device_data( dev ), proj.row(),
           w_lskel.device_data( dev ), w_lskel.row(),
      0.0, w_skel.device_data( dev ), w_skel.row()
    );
    /** w_skel += proj( :, lskel:end ) * w_rskel */
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( 0 ),
      CUBLAS_OP_N, CUBLAS_OP_N,
      m, nrhs, w_rskel.row(),
      1.0, proj.device_data( dev ) + m * w_lskel.row(), proj.row(),
           w_rskel.device_data( dev ), w_rskel.row(),
      1.0, w_skel.device_data( dev ), w_skel.row()
    );
    double gemm_time = omp_get_wtime() - beg;
    printf( "n2s cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        m, nrhs, w_lskel.row() + w_rskel.row(),
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );
  }

  /** Mark the device copy as the valid one, then fetch it back to the host. */
  w_skel.Redistribute<true>( dev );
  w_skel.FetchD2H( dev );
}
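/** In matrix terms, UpdateWeights() evaluates (leaf case)
 *
 *    w_skel = proj * w_leaf,
 *
 *  and (internal case), splitting proj column-wise between the children,
 *
 *    w_skel = [ proj_l  proj_r ] * [ w_lskel ; w_rskel ],
 *
 *  which is why the second GEMM reads proj at column offset w_lskel.row(),
 *  a pointer offset of m * w_lskel.row() in column-major storage, and
 *  accumulates with beta = 1. */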
/** Skeleton potentials back to nodes (s2n): the transpose of UpdateWeights(). */
template<bool NNPRUNE, typename NODE, typename T, typename DEVICE>
void SkeletonsToNodes( DEVICE *dev, NODE *node )
{
#ifdef DEBUG_SPDASKIT_GPU
  printf( "%lu GPU Skel2Node u_skel.row() %lu\n",
      node->treelist_id, node->data.u_skel.row() ); fflush( stdout );
#endif

  double beg, flops, kij_s2n_time = 0.0, u_leaf_time, before_writeback_time, after_writeback_time;

  /** Spread nodes across (up to) eight CUDA streams. */
  int stream_id = node->treelist_id % 8;

  /** Gather the SPD matrix, input weights, output potentials, and per-node data. */
  auto &K = *node->setup->K;
  auto &w = *node->setup->w;
  auto &u = *node->setup->u;
  auto &lids = node->lids;
  auto &data = node->data;
  auto &proj = data.proj;
  auto &skels = data.skels;
  auto &u_skel = data.u_skel;
  auto *lchild = node->lchild;
  auto *rchild = node->rchild;

  size_t s = data.skels.size();
  size_t nrhs = w.col();

  if ( node->isleaf )
  {
    std::set<NODE*> *NearNodes;
    if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
    else           NearNodes = &node->NearNodes;
    auto &amap = node->lids;
    auto &u_leaf = node->data.u_leaf[ 0 ];

    u_leaf.resize( lids.size(), nrhs );

    /** Cache device buffers and overlap copies on stream stream_id. */
    proj.PrefetchH2D( dev, stream_id );
    u_skel.CacheD( dev );
    u_skel.PrefetchH2D( dev, stream_id );
    u_leaf.CacheD( dev );

    size_t m = u_leaf.row();
    size_t n = u_leaf.col();
    size_t k = u_skel.row();
    flops = 2.0 * m * n * k;

    /** u_leaf = proj^T * u_skel */
    beg = omp_get_wtime();
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
      CUBLAS_OP_T, CUBLAS_OP_N,
      m, n, k,
      1.0, proj.device_data( dev ), proj.row(),
           u_skel.device_data( dev ), u_skel.row(),
      0.0, u_leaf.device_data( dev ), u_leaf.row()
    );
    double gemm_time = omp_get_wtime() - beg;
    printf( "s2n cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        m, n, k,
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );

    u_leaf.Redistribute<true>( dev );
    u_leaf.PrefetchD2H( dev, stream_id );
  }
  else
  {
    /** The root and unskeletonized nodes have nothing to propagate down. */
    if ( !node->parent || !node->data.isskel ) return;

    auto &u_lskel = lchild->data.u_skel;
    auto &u_rskel = rchild->data.u_skel;
    auto &lskel = lchild->data.skels;
    auto &rskel = rchild->data.skels;

    proj.PrefetchH2D( dev, stream_id );
    u_skel.CacheD( dev );
    u_skel.PrefetchH2D( dev, stream_id );
    u_lskel.CacheD( dev );
    u_lskel.PrefetchH2D( dev, stream_id );
    u_rskel.CacheD( dev );
    u_rskel.PrefetchH2D( dev, stream_id );

    flops  = 2.0 * u_lskel.row() * u_lskel.col() * proj.row();
    flops += 2.0 * u_rskel.row() * u_rskel.col() * proj.row();

    beg = omp_get_wtime();
    /** u_lskel += proj( :, 0:lskel )^T * u_skel */
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
      CUBLAS_OP_T, CUBLAS_OP_N,
      u_lskel.row(), u_lskel.col(), proj.row(),
      1.0, proj.device_data( dev ), proj.row(),
           u_skel.device_data( dev ), u_skel.row(),
      1.0, u_lskel.device_data( dev ), u_lskel.row()
    );
    /** u_rskel += proj( :, lskel:end )^T * u_skel */
    xgemm
    (
      reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id ),
      CUBLAS_OP_T, CUBLAS_OP_N,
      u_rskel.row(), u_rskel.col(), proj.row(),
      1.0, proj.device_data( dev ) + proj.row() * lskel.size(), proj.row(),
           u_skel.device_data( dev ), u_skel.row(),
      1.0, u_rskel.device_data( dev ), u_rskel.row()
    );
    double gemm_time = omp_get_wtime() - beg;
    printf( "s2n cublas m %lu n %lu k %lu, %5.2lf (%5.2lf GFLOPS)\n",
        u_lskel.row(), u_lskel.col(), proj.row(),
        gemm_time, flops / ( gemm_time * 1E+9 ) ); fflush( stdout );

    u_lskel.Redistribute<true>( dev );
    u_rskel.Redistribute<true>( dev );
    u_lskel.PrefetchD2H( dev, stream_id );
    u_rskel.PrefetchD2H( dev, stream_id );
  }
}
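/** s2n is the transpose of n2s: with the same column split of proj,
 *
 *    u_leaf = proj^T * u_skel                                  (leaf case)
 *    [ u_lskel ; u_rskel ] += [ proj_l  proj_r ]^T * u_skel    (internal case)
 *
 *  hence CUBLAS_OP_T on proj and the column offset lskel.size() for the
 *  right child. */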
/** Near-field (direct) evaluation: u( bmap ) += NearKab^T * w_leaf. */
template<bool CACHE, bool NNPRUNE, typename NODE, typename T, typename DEVICE>
void LeavesToLeaves( DEVICE *dev, NODE *node )
{
#ifdef DEBUG_SPDASKIT_GPU
  printf( "\n%lu LeavesToLeaves on GPU\n", node->treelist_id ); fflush( stdout );
#endif

  assert( node && node->isleaf );

  double beg, gemm_time = 0.0, total_time = 0.0, flops = 0.0;

  /** Stream assignment. This line was not captured in the listing; we mirror
   *  the ( treelist_id % 8 ) + 1 scheme used by the task class below. */
  int stream_id = ( node->treelist_id % 8 ) + 1;

  /** Gather the SPD matrix, weights, potentials, and per-node data. */
  auto &K = *node->setup->K;
  auto &w = *node->setup->w;
  auto &u = *node->setup->u;
  auto &lids = node->lids;
  auto &data = node->data;
  auto &Nearbmap = data.Nearbmap;
  auto &NearKab = data.NearKab;
  auto &w_leaf = data.w_leaf;

  /** Evaluate NearKab on demand if it was not cached. */
  if ( NearKab.size() != lids.size() * Nearbmap.size() )
  {
    vector<size_t> bmap( Nearbmap.size() );
    for ( size_t i = 0; i < bmap.size(); i ++ )
      bmap[ i ] = Nearbmap[ i ];
    NearKab = K( lids, bmap );
    printf( "Not cached\n" );
  }

  size_t m = NearKab.col();
  size_t k = w_leaf.row();
  size_t nrhs = w_leaf.col();

  flops += 2.0 * m * nrhs * k;

  w_leaf.FetchH2D( dev );
  NearKab.FetchH2D( dev );
  Nearbmap.FetchH2D( dev );

  /** The preallocated device workspace must hold the m-by-nrhs GEMM output. */
  assert( m * nrhs * sizeof(T) < 1200000000 );

  cublasHandle_t &handle =
    reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->gethandle( stream_id );
  T *A = NearKab.device_data( dev );
  T *B = w_leaf.device_data( dev );
  T *C = (T*)reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->workspace();

  /** C = NearKab^T * w_leaf, written densely into the workspace. */
  beg = omp_get_wtime();
  xgemm
  (
    handle,
    CUBLAS_OP_T, CUBLAS_OP_N,
    m, nrhs, k,
    1.0, A, NearKab.row(),
         B, w_leaf.row(),
    0.0, C, m
  );
  dev->wait( stream_id );
  gemm_time = omp_get_wtime() - beg;

  /** Scatter-add the rows of C into u through Nearbmap. */
  assemble
  (
    reinterpret_cast<hmlp::gpu::Nvidia*>( dev )->getstream( stream_id ),
    m, nrhs, C,
    Nearbmap.device_data( dev ),
    u.device_data( dev ), u.row()
  );
  dev->wait( stream_id );

  total_time = omp_get_wtime() - beg;
  printf( "l2l cublas m %lu n %lu k %lu ldu %lu, %5.3lf (%5.2lf GFLOPS) total %5.3lf\n",
      m, nrhs, k, u.row(),
      gemm_time, flops / ( gemm_time * 1E+9 ), total_time );

  u.Redistribute<true>( dev );

  /** Host reference path. The guard selecting between the GPU and CPU
   *  implementations was not captured in this listing. */
  {
    printf( "cpu gemm begin\n" ); fflush( stdout );
    std::set<NODE*> *NearNodes;
    if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
    else           NearNodes = &node->NearNodes;

    size_t n = w_leaf.col();
    size_t k = NearKab.row();

    assert( NearKab.size() );
    assert( k == w_leaf.row() );

    size_t offset = 0;
    for ( auto it = NearNodes->begin(); it != NearNodes->end(); it ++ )
    {
      assert( omp_get_thread_num() > 0 && omp_get_thread_num() < 20 );
      /** Accumulate into this thread's private copy of the target u_leaf. */
      auto &u_leaf = (*it)->data.u_leaf[ omp_get_thread_num() ];
      size_t m = (*it)->lids.size();

      assert( offset < NearKab.col() );
      assert( w_leaf.size() == k * n );

      if ( u_leaf.size() != m * n )
      {
        u_leaf.resize( 0, 0 );
        u_leaf.resize( m, n, 0.0 );
      }

      /** u_leaf += NearKab( :, offset:offset+m )^T * w_leaf */
      xgemm
      (
        "T", "N",
        m, n, k,
        1.0, NearKab.data() + offset * k, k,
             w_leaf.data(), w_leaf.row(),
        1.0, u_leaf.data(), u_leaf.row()
      );
      offset += m;
    }
    printf( "cpu gemm finished\n" ); fflush( stdout );
  }

  /** Release NearKab if it is not being cached. */
  if ( !CACHE ) NearKab.resize( 0, 0 );
}
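/** Design note: cuBLAS cannot scatter its output, so the GEMM above writes a
 *  dense m-by-nrhs block C into the device workspace, and the assemble()
 *  kernel then adds row i of C into row Nearbmap[ i ] of u. The 1200000000
 *  byte assert is simply a guard that C fits in the preallocated workspace. */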
/** Device task wrapping gpu::LeavesToLeaves(). The class name and its
 *  hmlp::Task base were not captured in this listing and are reconstructed;
 *  name, label, event, stealable, and ForceEnqueue() come from the base. */
template<bool CACHE, bool NNPRUNE, typename NODE, typename T>
class LeavesToLeavesTask : public hmlp::Task
{
  public:

    NODE *arg = NULL;

    int stream_id = 0;

    void Set( NODE *user_arg )
    {
      arg = user_arg;
      /** Cycle leaf tasks over streams 1..8, keeping stream 0 free. */
      stream_id = ( arg->treelist_id % 8 ) + 1;
      name = std::string( "l2l" );
      {
        /** Use the tree node id as the event label. */
        std::ostringstream ss;
        ss << arg->treelist_id;
        label = ss.str();
      }
      assert( arg->isleaf );

      /** Estimate flops and mops for the scheduler. */
      double flops = 0.0, mops = 0.0;
      auto *node = arg;
      auto &data = node->data;
      auto &w_leaf = data.w_leaf;
      auto *NearNodes = &node->NearNodes;
      if ( NNPRUNE ) NearNodes = &node->NNNearNodes;
      auto &amap = node->lids;

      /** Flatten the near-node lids into a single column map. */
      std::vector<size_t> bmap;
      for ( auto it = NearNodes->begin(); it != NearNodes->end(); it ++ )
      {
        bmap.insert( bmap.end(), (*it)->lids.begin(), (*it)->lids.end() );
      }
      data.Nearbmap.resize( bmap.size(), 1 );
      for ( size_t i = 0; i < bmap.size(); i ++ )
        data.Nearbmap[ i ] = bmap[ i ];

      size_t m = w_leaf.col();
      size_t n = arg->data.Nearbmap.size();
      size_t k = arg->lids.size();

      flops += 2.0 * m * n * k;
      mops  += 2.0 * ( m * n + m * k + k * n );

      event.Set( name + label, flops, mops );

      /** The task is bound to the GPU master thread; forbid work stealing. */
      this->stealable = false;
    }

    void Prefetch( Worker *user_worker )
    {
      Device *device = NULL;
      if ( user_worker ) device = user_worker->GetDevice();
      if ( device )
      {
        /** In the non-caching mode, evaluate NearKab now so that the H2D
         *  copy can overlap with other work (guard reconstructed). */
        if ( !CACHE )
        {
          assert( !arg->data.NearKab.size() );
          auto &K = *arg->setup->K;
          std::vector<size_t> bmap( arg->data.Nearbmap.size() );
          for ( size_t i = 0; i < bmap.size(); i ++ )
            bmap[ i ] = arg->data.Nearbmap[ i ];
          arg->data.NearKab = K( arg->lids, bmap );
          printf( "not cached\n" );
        }
        arg->data.NearKab.PrefetchH2D( device, stream_id );
        arg->data.Nearbmap.CacheD( device );
        arg->data.Nearbmap.PrefetchH2D( device, stream_id );
        arg->data.w_leaf.CacheD( device );
        arg->data.w_leaf.PrefetchH2D( device, stream_id );
      }
    }

    void DependencyAnalysis()
    {
      assert( arg->isleaf );
      /** Bypass dependency analysis and enqueue directly to worker 0. */
      this->ForceEnqueue( 0 );
    }

    void Execute( Worker* user_worker )
    {
      Device *device = NULL;
      if ( user_worker ) device = user_worker->GetDevice();
      gpu::LeavesToLeaves<CACHE, NNPRUNE, NODE, T>( device, arg );
    }
};
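/** Task lifecycle, a sketch inferred from the members above rather than from
 *  captured code: Set() estimates the cost and pins the task to the GPU
 *  master thread, DependencyAnalysis() force-enqueues it, the runtime calls
 *  Prefetch() to stage Nearbmap/NearKab/w_leaf ahead of execution, and
 *  Execute() finally runs gpu::LeavesToLeaves() on the worker's device. */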