HMLP: High-performance Machine Learning Primitives
nbody.hpp
#ifndef NBODY_HPP
#define NBODY_HPP

#include <assert.h>
#include <typeinfo>
#include <algorithm>

#include <hmlp.h>
#include <hmlp_internal.hpp>
#include <hmlp_base.hpp>

//#include <hmlp_tci.hpp>
#include <primitives/rank_k.hpp>

#include <packing.hpp>
#include <semiring_mrxnr.hpp>
#include <fused_mrxnr.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace nbody
{

///**
// *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
// *         configuration of the communicator, the 3rd loop may be parallelized.
// *         b_next is the prefetch pointer.
// */
//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
//void rank_k_macro_kernel
//(
//  tci::Comm &Comm3rd,
//  int ic, int jc, int pc,
//  int m, int n, int k,
//  TA *packA,
//  TB *packB,
//  TV *V, int rs_v, int cs_v,
//  SEMIRINGKERNEL semiringkernel
//)
//{
//  /** Get all block sizes */
//  const static int MR      = SEMIRINGKERNEL::mr;
//  const static int NR      = SEMIRINGKERNEL::nr;
//  const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
//  const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
//  /** Create subcommunicators for each loop. */
//  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
//  /** Compute loop ranges for each thread */
//  auto Loop3rd = Comm3rd.DistributeOver1DGangs(   0, n, NR );
//  auto Pack3rd = Comm3rd.DistributeOver1DGangs(   0, n, PACK_NR );
//  auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, m, MR );
//  auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, m, PACK_MR );
//  /** Distribute range [0,n) over Comm3rd (jr loop). */
//  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
//            j  < Loop3rd.end();
//            j += Loop3rd.inc(), jp += Pack3rd.inc() )
//  {
//    struct aux_s<TA, TB, TV, TV> aux;
//    aux.pc       = pc;
//    aux.b_next   = packB;
//    aux.do_packC = 0;
//    aux.jb       = std::min( n - j, NR );
//    /** Distribute range [0,m) over Comm2nd (ir loop). */
//    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
//              i  < Loop2nd.end();
//              i += Loop2nd.inc(), ip += Pack2nd.inc() )
//    {
//      aux.ib = std::min( m - i, MR );
//      /** Increase the b_next pointer. */
//      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
//
//      if ( aux.jb == NR && aux.ib == MR )
//      {
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//            &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
//      }
//      else // corner case: buffer the partial MR-by-NR tile in vtmp
//      {
//        TV vtmp[ MR * NR ];
//
//        if ( pc ) // initialize vtmp with the partial sums already in V
//        {
//          for ( auto jj = 0; jj < aux.jb; jj ++ )
//            for ( auto ii = 0; ii < aux.ib; ii ++ )
//              vtmp[ jj * MR + ii ] =
//                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
//        }
//
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//            vtmp, 1, MR, &aux );
//
//        /** Write the corner tile back to V. */
//        for ( auto jj = 0; jj < aux.jb; jj ++ )
//          for ( auto ii = 0; ii < aux.ib; ii ++ )
//            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v )
//              = vtmp[ jj * MR + ii ];
//      }
//    } /** end 2nd loop */
//  } /** end 3rd loop */
//}; /** end rank_k_macro_kernel() */

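/**
 *  @brief Macro kernel for the last rank-kc update: contains the 3rd (jr)
 *         and 2nd (ir) loops around the fused micro-kernel. Depending on
 *         the configuration of the communicator, the 3rd loop may be
 *         parallelized. Unlike the plain rank-k macro kernel above, the
 *         fused kernel also receives C, the global coordinates (aux.i,
 *         aux.j), and the batch id, so the element-wise post-operation can
 *         be applied while the MR-by-NR tile is still in registers.
 *         aux.b_next is the prefetch pointer.
 */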
template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV>
void fused_macro_kernel
(
  tci::Comm &Comm3rd,
  int m, int n,
  int ic, int jc, int pc,
  int mc, int nc, int kc,
  TA *packA,
  TB *packB,
  TC *C,
  TV *V, int rs_v, int cs_v,
  int batchId,
  FUSEDKERNEL fusedkernel
)
{
  /** Get all block sizes. */
  const static int MR      = FUSEDKERNEL::mr;
  const static int NR      = FUSEDKERNEL::nr;
  const static int PACK_MR = FUSEDKERNEL::pack_mr;
  const static int PACK_NR = FUSEDKERNEL::pack_nr;
  /** Create the subcommunicator for the 2nd loop. */
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
  /** Compute loop and packing ranges for each thread. */
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(   0, nc, NR );
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(   0, nc, PACK_NR );
  auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, mc, MR );
  auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, mc, PACK_MR );

  /** Distribute range [0,nc) over Comm3rd (jr loop). */
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
            j  < Loop3rd.end();
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
  {
    struct aux_s<TA, TB, TC, TV> aux;
    aux.pc       = pc;
    aux.b_next   = packB;
    aux.do_packC = 0;
    /** Distribute range [0,mc) over Comm2nd (ir loop). */
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
              i  < Loop2nd.end();
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
    {
      /** Global problem sizes and coordinates, used by the fused post-op. */
      aux.m = m;
      aux.n = n;
      aux.i = ic + i;
      aux.j = jc + j;
      aux.b = batchId;

      /** Edge-tile sizes (may be smaller than MR-by-NR). */
      aux.ib = std::min( mc - i, MR );
      aux.jb = std::min( nc - j, NR );

      /** Address of the semiring intermediate for this tile. */
      aux.V = V + i * rs_v + j * cs_v;
      aux.ldv = cs_v;

      /** Increase the b_next prefetch pointer. */
      if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc;

      //Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );

      if ( aux.jb == NR && aux.ib == MR )
      {
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
            C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
      }
      else // corner case: buffer the partial MR-by-NR tile in vtmp
      {
        TV vtmp[ MR * NR ];
        if ( pc ) // initialize vtmp with the partial sums already in V
        {
          for ( auto jj = 0; jj < aux.jb; jj ++ )
            for ( auto ii = 0; ii < aux.ib; ii ++ )
              vtmp[ jj * MR + ii ] =
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
        }
        /** Redirect aux.V to the buffer actually passed to the kernel.
         *  This must happen for pc == 0 as well, since the kernel always
         *  writes the corner tile into vtmp (with rs = 1, ld = MR). */
        aux.V = vtmp;
        aux.ldv = MR;
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
            C, vtmp, 1, MR, &aux );
      }

      //Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
    } /** end 2nd loop */
  } /** end 3rd loop */

}; /** end fused_macro_kernel() */
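
/**
 *  @brief Thread entry of the blocked n-body primitive: contains the 6th
 *         (jc), 5th (pc), and 4th (ic) loops. Each KC-by-nc panel of B and
 *         each MC-by-KC panel of A is packed into communicator-shared
 *         buffers before the macro kernels run. All but the last pc
 *         iteration accumulate into V through rank_k_macro_kernel(); the
 *         last iteration calls fused_macro_kernel() to apply the post-op.
 *         The environment variables KS_JC_NT, JS_PC_NT, and KS_IC_NT
 *         control how Comm6th is split across the loops.
 */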
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename TA, typename TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody_internal
(
  tci::Comm &Comm6th,
  int batchId, int m, int n, int k, int k_stra,
  TA& A,
  TB& B,
  TC& C,
  TV* V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  /** Create subcommunicators for the 5th, 4th, and 3rd loops. */
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
  auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) );
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
  /** Adjust nc so that the jc loop is balanced over the 6th-loop gangs. */
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
  int pack_nc = ( nc / NR ) * PACK_NR;
  /** Allocate shared packing buffers for B (per Comm4th) and A (per Comm3th). */
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
  //if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() );

  /** Distribute the jc loop [0,n) over the 6th-loop gangs. */
  auto Loop6th = Comm6th.DistributeOver1DGangs( 0,      n, nc );
  /** Distribute the pc loop over the 5th-loop gangs; [0,k_stra) is
   *  reserved for the (currently disabled) Strassen path. */
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
  /** Distribute the ic loop [0,m) over the 4th-loop gangs. */
  auto Loop4th = Comm4th.DistributeOver1DGangs( 0,      m, MC );

  for ( int jc  = Loop6th.beg();
            jc  < Loop6th.end();
            jc += Loop6th.inc() )
  {
    auto jb = std::min( n - jc, nc );
    for ( int pc  = Loop5th.beg();
              pc  < Loop5th.end();
              pc += Loop5th.inc() )
    {
      auto pb = std::min( k - pc, KC );
      auto is_the_last_pc_iteration = ( pc + KC >= k );
      /** Distribute the packing of B over the threads of Comm4th. */
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb, NR );
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
                j  < LooppkB.end();
                j += LooppkB.inc(), jp += PackpkB.inc() )
      {
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
            &packB[ jp * pb ] );
      }
      /** Wait until packB is fully packed. */
      Comm4th.Barrier();
      for ( int ic  = Loop4th.beg();
                ic  < Loop4th.end();
                ic += Loop4th.inc() )
      {
        auto ib = std::min( m - ic, MC );
        /** Distribute the packing of A over the threads of Comm3th. */
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
                  i  < LooppkA.end();
                  i += LooppkA.inc(), ip += PackpkA.inc() )
        {
          A.Pack( m, ic + i, std::min( ib - i, MR ),
              k, pc, pb, &packA[ ip * pb ] );
        }
        /** Wait until packA is fully packed. */
        Comm3th.Barrier();
        /** Fuse the post-op into the last rank-KC update only. */
        if ( is_the_last_pc_iteration )
        {
          fused_macro_kernel<KC>( Comm3th,
              m, n, ic, jc, pc, ib, jb, pb, packA, packB,
              &C, V + ic * rs_v + jc * cs_v, rs_v, cs_v,
              batchId, microkernel );
        }
        else
        {
          rank_k_macro_kernel<KC>( Comm3th,
              ic, jc, pc, ib, jb, pb, packA, packB,
              V + ic * rs_v + jc * cs_v, rs_v, cs_v,
              semiringkernel );
        }
        Comm3th.Barrier();
      } /** end 4th loop */
      Comm4th.Barrier();
    } /** end 5th loop */
    Comm5th.Barrier();
  } /** end 6th loop */
  Comm6th.Barrier();
  //Comm5th.Destroy2DLocks();
  /** Free the shared packing buffers. */
  Comm4th.FreeSharedMemory( packB );
  Comm3th.FreeSharedMemory( packA );
}; /** end nbody_internal() */
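
/**
 *  @brief User interface of the n-body primitive. Chooses where the
 *         semiring intermediate V lives: if k > KC and C is not already a
 *         packed MatrixLike buffer, a temporary m-by-n column-major buffer
 *         is allocated (and freed after the computation); otherwise V
 *         aliases C. The Strassen path is compiled out (USE_STRASSEN is
 *         false), so k_stra remains 0 and all of [0,k) goes through the
 *         rank-k/fused pipeline launched via tci::Parallelize().
 */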
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename TA, typename TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody
(
  int batchId, int m, int n, int k,
  TA& A,
  TB& B,
  TC& C,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  const static bool USE_STRASSEN = false;

  /** Early return if there is no work to do. */
  if ( m == 0 || n == 0 || k == 0 ) return;

  TV *V    = NULL;
  int rs_v = 0;
  int cs_v = 0;

  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    /** More than one rank-KC update: allocate a temporary m-by-n
     *  column-major buffer for the semiring intermediate. */
    V = hmlp_malloc<ALIGN_SIZE, TV>( m * n );
    rs_v = 1;
    cs_v = m;
  }
  else
  {
    /** Otherwise, V aliases C directly. */
    V = reinterpret_cast<TV*>( C.X );
    rs_v = C.rs;
    cs_v = C.cs;
  }

  int k_stra = 0;
  if ( USE_STRASSEN )
  {
    assert( typeid(TPACKA) == typeid(TPACKB) );
    assert( typeid(TC) == typeid(TV) );
    k_stra = k - k % KC;
    /** Leave at least one rank-KC update for the fused kernel. */
    if ( k_stra == k ) k_stra -= KC;
  }

  /** Launch nbody_internal() on all threads. */
  tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV,
      TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>,
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
      semiringkernel, microkernel );

  /** Free the temporary buffer if one was allocated. */
  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    hmlp_free( V );
  }
}; /** end nbody() */
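
///**
// *  Hedged usage sketch (illustration only, not library code). The contracts
// *  that nbody() imposes on its arguments can be read off the calls above:
// *
// *    - TA must provide A.Pack( m, ic, ib, k, pc, pb, packA );
// *    - TB must provide B.Pack( k, pc, pb, n, jc, jb, packB );
// *    - TC must expose members X, rs, and cs (V may alias C.X);
// *    - both kernels must define static mr, nr, pack_mr, pack_nr, and
// *      align_size, and be callable on packed panels.
// *
// *  Concrete wrappers and micro-kernels live in packing.hpp,
// *  semiring_mrxnr.hpp, and fused_mrxnr.hpp. With types satisfying those
// *  contracts (all names and blocking parameters below are hypothetical),
// *  a call would look like:
// *
// *    MyPackedMatrix<double> A( m, k ), B( k, n ), C( m, n );
// *    my_semiring_kernel  semiringkernel;
// *    my_fused_kernel     microkernel;
// *    nbody<104, 4096, 256, double, double, double>
// *    ( 0, m, n, k, A, B, C, semiringkernel, microkernel );
// *
// *  The commented-out generic interface below shows the intended way to
// *  build such kernels from user-provided operators (op1, op2, opkernel).
// */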
//template<
//  int MR, int NR, int MC, int NC, int KC,
//  typename TPACKA, typename TPACKB, typename TPACKC, typename TV,
//  typename TA, typename TB, typename TC,
//  typename OPKERNEL, typename OP1, typename OP2>
//void nbody
//(
//  int batchId, int m, int n, int k,
//  TA& A,
//  TB& B,
//  TC& C,
//  OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV
//)
//{
//  semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel;
//  gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel;
//
//  semiringkernel.op1   = op1;
//  semiringkernel.op2   = op2;
//  semiringkernel.initV = initV;
//
//  gkrmkernel.op1      = op1;
//  gkrmkernel.op2      = op2;
//  gkrmkernel.opkernel = opkernel;
//  gkrmkernel.initV    = initV;
//
//  nbody<MC, NC, KC, TPACKA, TPACKB, TV>
//  ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel );
//
//}; /** end nbody() */

}; /** end namespace nbody */
}; /** end namespace hmlp */

#endif /** define NBODY_HPP */