GCC Code Coverage Report
Directory: . Exec Total Coverage
File: frame/primitives/nbody.hpp Lines: 0 105 0.0 %
Date: 2019-01-14 Branches: 0 316 0.0 %

Line Exec Source
1
/**
2
 *  HMLP (High-Performance Machine Learning Primitives)
3
 *
4
 *  Copyright (C) 2014-2017, The University of Texas at Austin
5
 *
6
 *  This program is free software: you can redistribute it and/or modify
7
 *  it under the terms of the GNU General Public License as published by
8
 *  the Free Software Foundation, either version 3 of the License, or
9
 *  (at your option) any later version.
10
 *
11
 *  This program is distributed in the hope that it will be useful,
12
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
 *  GNU General Public License for more details.
15
 *
16
 *  You should have received a copy of the GNU General Public License
17
 *  along with this program. If not, see the LICENSE file.
18
 *
19
 **/
20
21
22
23
#ifndef GNBX_HPP
24
#define GNBX_HPP
25
26
#include <assert.h>
27
#include <typeinfo>
28
#include <algorithm>
29
30
#include <hmlp.h>
31
#include <hmlp_internal.hpp>
32
#include <hmlp_base.hpp>
33
34
/** Use Thread Control Interface (TCI). */
35
//#include <hmlp_tci.hpp>
36
/** Use rank-k update template. */
37
#include <primitives/rank_k.hpp>
38
39
/** reference microkernels */
40
#include <packing.hpp>
41
#include <semiring_mrxnr.hpp>
42
#include <fused_mrxnr.hpp>
43
44
using namespace std;
45
using namespace hmlp;
46
47
namespace hmlp
48
{
49
namespace nbody
50
{
51
52
///**
53
// *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
54
// *         configuration of the communicator, the 3rd loop may be parallelized.
55
// *         b_next is the prefetch pointer.
56
// */
57
//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
58
//void rank_k_macro_kernel
59
//(
60
//  tci::Comm &Comm3rd,
61
//  int ic, int jc, int pc,
62
//  int  m, int  n, int  k,
63
//  TA *packA,
64
//  TB *packB,
65
//  TV *V, int rs_v, int cs_v,
66
//  SEMIRINGKERNEL semiringkernel
67
//)
68
//{
69
//  /** Get all block sizes */
70
//  const static int MR         = SEMIRINGKERNEL::mr;
71
//  const static int NR         = SEMIRINGKERNEL::nr;
72
//  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
73
//  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
74
//  /** Create subcommunicators for each loop. */
75
//  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
76
//  /** Compute loop ranges for each thread */
77
//  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, n,      NR );
78
//  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, n, PACK_NR );
79
//  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, m,      MR );
80
//  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, m, PACK_MR );
81
//  /** Distribute range [0,n) over Comm3rd (jr loop). */
82
//  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
83
//            j  < Loop3rd.end();
84
//            j += Loop3rd.inc(), jp += Pack3rd.inc() )
85
//  {
86
//    struct aux_s<TA, TB, TV, TV> aux;
87
//    aux.pc       = pc;
88
//    aux.b_next   = packB;
89
//    aux.do_packC = 0;
90
//    aux.jb       = std::min( n - j, NR );
91
//    /** Distribute range [0,m) over Comm2nd (ir loop). */
92
//    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
93
//              i  < Loop2nd.end();
94
//              i += Loop2nd.inc(), ip += Pack2nd.inc() )
95
//    {
96
//      aux.ib = std::min( m - i, MR );
97
//      /** Increase the b_next pointer. */
98
//      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
99
//
100
//      if ( aux.jb == NR && aux.ib == MR )
101
//      {
102
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
103
//          &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
104
//      }
105
//      else
106
//      {
107
//        TV vtmp[ MR * NR ];
108
//
109
//        if ( pc ) // initilize ctmp
110
//        {
111
//          for ( auto jj = 0; jj < aux.jb; jj ++ )
112
//            for ( auto ii = 0; ii < aux.ib; ii ++ )
113
//              vtmp[ jj * MR + ii ] =
114
//                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
115
//        }
116
//
117
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
118
//          vtmp, 1, MR, &aux );
119
//
120
//        for ( auto jj = 0; jj < aux.jb; jj ++ )
121
//          for ( auto ii = 0; ii < aux.ib; ii ++ )
122
//            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
123
//              = vtmp[ jj * MR + ii ];
124
//      }
125
//    } /** end 2nd loop */
126
//  } /** end 3rd loop */
127
//}; /** end rank_k_macro_kernel() */
128
//
129
//
130
131
132
133
/**
134
 *  @brief fused_macro_kernel contains the 3rd, 2nd loops and the fused micro
135
 *         kernel. Notice that here C has type TC, which is differnet from the
136
 *         one in rank_k_macro_kernel. ctmp used in the conner case is also
137
 *         type TC.
138
 */
139
template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV>
140
void fused_macro_kernel
141
(
142
  tci::Comm &Comm3rd,
143
  int m, int n,
144
  int ic, int jc, int pc,
145
  int mc, int nc, int kc,
146
  TA *packA,
147
  TB *packB,
148
  TC *C,
149
  TV *V, int rs_v, int cs_v,
150
  int batchId,
151
  FUSEDKERNEL fusedkernel
152
)
153
{
154
  /** Get all block sizes. */
155
  const static int MR         = FUSEDKERNEL::mr;
156
  const static int NR         = FUSEDKERNEL::nr;
157
  const static int PACK_MR    = FUSEDKERNEL::pack_mr;
158
  const static int PACK_NR    = FUSEDKERNEL::pack_nr;
159
  /** Create subcommunicators for each loop. */
160
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
161
  /** Compute loop ranges for each thread */
162
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, nc,      NR );
163
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, nc, PACK_NR );
164
  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, mc,      MR );
165
  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, mc, PACK_MR );
166
167
  /** Distribute range [0,n) over Comm3rd (jr loop). */
168
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
169
            j  < Loop3rd.end();
170
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
171
  {
172
    struct aux_s<TA, TB, TC, TV> aux;
173
    aux.pc       = pc;
174
    aux.b_next   = packB;
175
    aux.do_packC = 0;
176
    /** Distribute range [0,m) over Comm2nd (ir loop). */
177
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
178
              i  < Loop2nd.end();
179
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
180
    {
181
      /**
182
       *  These auxiluary infos are used to access data in the closure of
183
       *  opkernel and opreduce.
184
       */
185
      aux.m = m;
186
      aux.n = n;
187
      aux.i = ic + i;
188
      aux.j = jc + j;
189
      aux.b = batchId;
190
191
      /** Encapsulate edge case information. */
192
      aux.ib = std::min( mc - i, MR );
193
      aux.jb = std::min( nc - j, NR );
194
195
      /** Prepare the intermediate semiring rank-k update. */
196
      aux.V = V + i * rs_v + j * cs_v;
197
      aux.ldv = cs_v;
198
199
      /** Increase the b_next pointer. */
200
      if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc;
201
202
      //Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
203
204
      if ( aux.jb == NR && aux.ib == MR )
205
      {
206
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
207
          C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
208
      }
209
      else
210
      {
211
        TV vtmp[ MR * NR ];
212
        if ( pc ) // initilize ctmp
213
        {
214
          for ( auto jj = 0; jj < aux.jb; jj ++ )
215
            for ( auto ii = 0; ii < aux.ib; ii ++ )
216
              vtmp[ jj * MR + ii ] =
217
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
218
          aux.V = vtmp;
219
          aux.ldv = MR;
220
        }
221
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
222
          C, vtmp, 1, MR, &aux );
223
      }
224
225
      //Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
226
    }
227
  }
228
229
}; /** end fused_macro_kernel() */
230
231
232
233
234
/**
235
 *  @breif This function contains the loop body of the 6th to 4th loops,
236
 *         including all packing and unpacking routines. Notice that this
237
 *         function is executed by all threads in the root communicator.
238
 *         To access each thread in different level of communicators, use
239
 *         their ids.
240
 */
241
template<
242
  int MC, int NC, int KC,
243
  typename TPACKA, typename TPACKB, typename TV,
244
  typename     TA, typename     TB, typename TC,
245
  typename SEMIRINGKERNEL, typename MICROKERNEL>
246
void nbody_internal
247
(
248
  tci::Comm &Comm6th,
249
  int batchId, int m, int n, int k, int k_stra,
250
  TA& A,
251
  TB& B,
252
  TC& C,
253
  TV* V, int rs_v, int cs_v,
254
  SEMIRINGKERNEL semiringkernel,
255
  MICROKERNEL microkernel
256
)
257
{
258
  /** Get all block sizes. */
259
  const static int MR         = SEMIRINGKERNEL::mr;
260
  const static int NR         = SEMIRINGKERNEL::nr;
261
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
262
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
263
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
264
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
265
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
266
  /** Create subcommunicators for each loop. */
267
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
268
  auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) );
269
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
270
  /** Adjuest nc and pack_nc if the 6th loop is parallelized. */
271
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
272
  int pack_nc = ( nc / NR ) * PACK_NR;
273
  /** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
274
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
275
  /** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
276
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
277
  /** If kc loop is parallelized, then create IC_NT * JR_NT locks. */
278
  //if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() );
279
280
  //printf( "%2d Allocate shared memory A %lu\n",
281
  //    omp_get_thread_num(), packA ); fflush( stdout );
282
283
  /** Distribute range [0,n) over Comm6th. */
284
  auto Loop6th = Comm6th.DistributeOver1DGangs(      0, n, nc );
285
  /** Distribute range [k_stra,k) over Comm5th. */
286
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
287
  /** Distribute range [0,m) over Comm4th. */
288
  auto Loop4th = Comm4th.DistributeOver1DGangs(      0, m, MC );
289
  /** Distribute range [0,n) over Comm6th. */
290
  for ( int jc  = Loop6th.beg();
291
            jc  < Loop6th.end();
292
            jc += Loop6th.inc() )
293
  {
294
    auto jb = std::min( n - jc, nc );
295
    /** Distribute range [k_stra,k) over Comm5th. */
296
    for ( int pc  = Loop5th.beg();
297
              pc  < Loop5th.end();
298
              pc += Loop5th.inc() )
299
    {
300
      auto pb = std::min( k - pc, KC );
301
      auto is_the_last_pc_iteration = ( pc + KC >= k );
302
      /** Distribute range [0,jb) over Comm4th. */
303
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb,      NR );
304
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
305
      /** PackB and typecast from TB to TPACKB.  */
306
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
307
                j  < LooppkB.end();
308
                j += LooppkB.inc(), jp += PackpkB.inc() )
309
      {
310
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
311
            &packB[ jp * pb ] );
312
      }
313
      /** Synchronize all threads in Comm4th. */
314
      Comm4th.Barrier();
315
      /** Distribute range [0,m) over Comm4th. */
316
      for ( int ic  = Loop4th.beg();
317
                ic  < Loop4th.end();
318
                ic += Loop4th.inc() )
319
      {
320
        auto ib = std::min( m - ic, MC );
321
        /** Distribute range [0,ib) over Comm3th. */
322
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
323
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
324
        /** packA and typecast from TA to TPACKA. */
325
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
326
                  i  < LooppkA.end();
327
                  i += LooppkA.inc(), ip += PackpkA.inc() )
328
        {
329
          A.Pack( m, ic + i, std::min( ib - i, MR ),
330
              k, pc, pb, &packA[ ip * pb ] );
331
        }
332
        /** Synchronize all threads in Comm3th. */
333
        Comm3th.Barrier();
334
        /** Invoke the fused kernel, if this is the last iteration. */
335
        if ( is_the_last_pc_iteration )
336
        {
337
          //printf( "%2d after packA\n", omp_get_thread_num() ); fflush( stdout );
338
          fused_macro_kernel<KC>( Comm3th,
339
            m, n, ic, jc, pc, ib, jb, pb, packA, packB,
340
            &C, V + ic * rs_v + jc * cs_v, rs_v, cs_v,
341
            batchId, microkernel );
342
          //printf( "%2d fused_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
343
        }
344
        /** Otherwise, invoke the semiubg rank-k kernel. */
345
        else
346
        {
347
          //printf( "%2d after packA (rank-k)\n", omp_get_thread_num() ); fflush( stdout );
348
          rank_k_macro_kernel<KC>( Comm3th,
349
            ic, jc, pc, ib, jb, pb, packA, packB,
350
            V + ic * rs_v + jc * cs_v, rs_v, cs_v,
351
            semiringkernel );
352
          //printf( "%2d rank_k_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
353
        }
354
        /** Synchronize all threads in Comm3th. */
355
        Comm3th.Barrier();
356
      } /** end 4th loop */
357
      Comm4th.Barrier();
358
    } /** end 5th loop */
359
    Comm5th.Barrier();
360
  } /** end 6th loop */
361
  //printf( "%2d End of all loops\n", omp_get_thread_num() ); fflush( stdout );
362
  Comm6th.Barrier();
363
  //Comm5th.Destroy2DLocks();
364
  /** Free packing buffer. */
365
  //printf( "%2d Free shared memory B\n", omp_get_thread_num() ); fflush( stdout );
366
  Comm4th.FreeSharedMemory( packB );
367
  //printf( "%2d Free shared memory A, %lu\n",
368
  //    omp_get_thread_num(), packA ); fflush( stdout );
369
  Comm3th.FreeSharedMemory( packA );
370
}; /** end nbody_internal() */
371
372
373
374
375
376
/**
377
 *  @breif This is the main routine of gkmx. All packing buffers are
378
 *         managed here. The communicator and the parallel section
379
 *         start here.
380
 *
381
 */
382
template<
383
  int MC, int NC, int KC,
384
  typename TPACKA, typename TPACKB, typename TV,
385
  typename     TA, typename     TB, typename TC,
386
  typename SEMIRINGKERNEL, typename MICROKERNEL>
387
void nbody
388
(
389
  int batchId, int m, int n, int k,
390
  TA& A,
391
  TB& B,
392
  TC& C,
393
  SEMIRINGKERNEL semiringkernel,
394
  MICROKERNEL microkernel
395
)
396
{
397
  const static int MR         = SEMIRINGKERNEL::mr;
398
  const static int NR         = SEMIRINGKERNEL::nr;
399
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
400
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
401
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
402
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
403
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
404
  const static bool USE_STRASSEN = false;
405
406
  /** Early return if possible. */
407
  if ( m == 0 || n == 0 || k == 0 ) return;
408
409
410
  TV *V = NULL;
411
  int rs_v = 0;
412
  int cs_v = 0;
413
414
415
  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
416
  {
417
    printf( "here m %d n %d\n", m, n );
418
    V = hmlp_malloc<ALIGN_SIZE, TV>( m * n );
419
    rs_v = 1;
420
    cs_v = m;
421
  }
422
  else
423
  {
424
    /** Directly use C for intermediate semiring rank-k update */
425
    V = reinterpret_cast<TV*>( C.X );
426
    rs_v = C.rs;
427
    cs_v = C.cs;
428
  }
429
430
431
  int k_stra = 0;
432
  if ( USE_STRASSEN )
433
  {
434
    assert( typeid(TPACKA) == typeid(TPACKB) );
435
    assert( typeid(TC) == typeid(TV) );
436
    k_stra = k - k % KC;
437
438
    if ( k_stra == k ) k_stra -= KC;
439
  }
440
441
  tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV,
442
      TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>,
443
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
444
      semiringkernel, microkernel );
445
446
  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
447
  {
448
    hmlp_free( V );
449
  }
450
}; /** end nbody() */
451
452
453
454
455
456
/**
457
 *  @beief
458
 */
459
//template<
460
//  int MR, int NR, int MC, int NC, int KC,
461
//  typename TPACKA, typename TPACKB, typename TPACKC, typename TV,
462
//  typename     TA, typename     TB, typename     TC,
463
//  typename OPKERNEL, typename OP1, typename OP2>
464
//void nbody
465
//(
466
//  int batchId, int m, int n, int k,
467
//  TA& A,
468
//  TB& B,
469
//  TC& C,
470
//  OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV
471
//)
472
//{
473
//  semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel;
474
//  gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel;
475
//
476
//  semiringkernel.op1 = op1;
477
//  semiringkernel.op2 = op2;
478
//  semiringkernel.initV = initV;
479
//
480
//  gkrmkernel.op1 = op1;
481
//  gkrmkernel.op2 = op2;
482
//  gkrmkernel.opkernel = opkernel;
483
//  gkrmkernel.initV = initV;
484
//
485
//  nbody<MC, NC, KC, TPACKA, TPACKB, TV>
486
//    ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel );
487
//
488
//}; /** end nbody() */
489
490
}; /** end namespace nbody */
491
}; /** end namespace hmlp */
492
493
#endif /** define GNBX_HPP */