GCC Code Coverage Report

Directory: .
File:      frame/primitives/rank_k.hpp
Date:      2019-01-14

             Exec  Total  Coverage
Lines:          0     32     0.0 %
Branches:       0    100     0.0 %

Source listing (no lines were executed):
/**
 *  HMLP (High-Performance Machine Learning Primitives)
 *
 *  Copyright (C) 2014-2017, The University of Texas at Austin
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see the LICENSE file.
 *
 **/

#ifndef RANK_K_HPP
#define RANK_K_HPP

#include <assert.h>
#include <typeinfo>
#include <algorithm>

#include <hmlp.h>
#include <hmlp_internal.hpp>
#include <hmlp_base.hpp>

/** reference microkernels */
#include <packing.hpp>
#include <semiring_mrxnr.hpp>
#include <fused_mrxnr.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
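
/**
 *  Illustration only (not part of the original header): the routines below
 *  require SEMIRINGKERNEL to expose its register-block sizes as static
 *  constants and to be callable on one packed MR-by-NR tile. The type name,
 *  the block sizes, and the packed-panel layout used here are hypothetical;
 *  the actual kernels are provided by semiring_mrxnr.hpp and fused_mrxnr.hpp.
 */
template<typename TA, typename TB, typename TV>
struct example_semiring_kernel
{
  const static int mr         =  8;  /** rows of the register block */
  const static int nr         =  4;  /** columns of the register block */
  const static int pack_mr    =  8;  /** leading dimension of a packed A micropanel */
  const static int pack_nr    =  4;  /** leading dimension of a packed B micropanel */
  const static int align_size = 32;  /** alignment of the packing buffers */

  /** Compute c( 0:ib, 0:jb ) += a * b over the plus-times semiring. */
  inline void operator()( int k, TA *a, TB *b, TV *c, int rs_c, int cs_c,
      aux_s<TA, TB, TV, TV> *aux ) const
  {
    for ( int j = 0; j < aux->jb; j ++ )
      for ( int i = 0; i < aux->ib; i ++ )
      {
        /** Accumulate into the previous partial result only when pc != 0. */
        TV v = aux->pc ? c[ i * rs_c + j * cs_c ] : static_cast<TV>( 0 );
        for ( int p = 0; p < k; p ++ )
          v += a[ p * pack_mr + i ] * b[ p * pack_nr + j ];
        c[ i * rs_c + j * cs_c ] = v;
      }
  }
};
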
/**
 *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
 *         configuration of the communicator, the 3rd loop may be parallelized.
 *         b_next is the prefetch pointer.
 */
template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
void rank_k_macro_kernel
(
  tci::Comm &Comm3rd,
  int ic, int jc, int pc,
  int  m, int  n, int  k,
  TA *packA,
  TB *packB,
  TV *V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  /** Create subcommunicators for each loop. */
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
  /** Compute loop ranges for each thread. */
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, n,      NR );
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, n, PACK_NR );
  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, m,      MR );
  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, m, PACK_MR );
  /** Distribute range [0,n) over Comm3rd (jr loop). */
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
            j  < Loop3rd.end();
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
  {
    struct aux_s<TA, TB, TV, TV> aux;
    aux.pc       = pc;
    aux.b_next   = packB;
    aux.do_packC = 0;
    aux.jb       = std::min( n - j, NR );
    /** Distribute range [0,m) over Comm2nd (ir loop). */
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
              i  < Loop2nd.end();
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
    {
      aux.ib = std::min( m - i, MR );
      /** Advance the b_next prefetch pointer. */
      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;

      if ( aux.jb == NR && aux.ib == MR )
      {
        /** Full MR-by-NR tile: update V in place. */
        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
          &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
      }
      else
      {
        /** Boundary tile: compute into a temporary MR-by-NR buffer. */
        TV vtmp[ MR * NR ];

        if ( pc ) // initialize vtmp with the current values of V
        {
          for ( auto jj = 0; jj < aux.jb; jj ++ )
            for ( auto ii = 0; ii < aux.ib; ii ++ )
              vtmp[ jj * MR + ii ] =
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
        }

        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
          vtmp, 1, MR, &aux );

        /** Copy the valid ib-by-jb part of vtmp back to V. */
        for ( auto jj = 0; jj < aux.jb; jj ++ )
          for ( auto ii = 0; ii < aux.ib; ii ++ )
            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
              = vtmp[ jj * MR + ii ];
      }
    } /** end 2nd loop */
  } /** end 3rd loop */
}; /** end rank_k_macro_kernel() */
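
/**
 *  For example, with MR = 8 and NR = 4, a call with m = 13 and n = 10 visits
 *  register tiles at j = 0, 4, 8 and i = 0, 8. Interior tiles (ib == MR and
 *  jb == NR) are updated directly in V using the rs_v/cs_v strides; boundary
 *  tiles (jb = 2 at j = 8, ib = 5 at i = 8) are computed in the column-major
 *  vtmp buffer (row stride 1, column stride MR) and only the valid ib-by-jb
 *  part is copied back.
 */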

/**
 *  @brief This function contains the loop body of the 6th to 4th loops,
 *         including all packing and unpacking routines. Notice that this
 *         function is executed by all threads in the root communicator.
 *         To access each thread in different levels of the communicators,
 *         use their ids.
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL>
void rank_k_internal
(
  tci::Comm &Comm6th,
  int batchId, int m, int n, int k, int k_stra,
  TA& A,
  TB& B,
  TV* V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  /** Create subcommunicators for each loop. */
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
  auto Comm4th = Comm5th.Split( 1 );
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
  /** Adjust nc and pack_nc if the 6th loop is parallelized. */
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
  int pack_nc = ( nc / NR ) * PACK_NR;
  /** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
  /** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
  /** Distribute range [0,n) over Comm6th. */
  auto Loop6th = Comm6th.DistributeOver1DGangs(      0, n, nc );
  /** Distribute range [k_stra,k) over Comm5th. */
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
  /** Distribute range [0,m) over Comm4th. */
  auto Loop4th = Comm4th.DistributeOver1DGangs(      0, m, MC );
  /** 6th loop: tile the n dimension by nc. */
  for ( int jc  = Loop6th.beg();
            jc  < Loop6th.end();
            jc += Loop6th.inc() )
  {
    auto jb = std::min( n - jc, nc );
    /** 5th loop: tile the k dimension by KC. */
    for ( int pc  = Loop5th.beg();
              pc  < Loop5th.end();
              pc += Loop5th.inc() )
    {
      auto pb = std::min( k - pc, KC );
      /** Distribute range [0,jb) over Comm4th. */
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb,      NR );
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
      /** Pack B and typecast from TB to TPACKB. */
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
                j  < LooppkB.end();
                j += LooppkB.inc(), jp += PackpkB.inc() )
      {
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
            &packB[ jp * pb ] );
      }
      /** Synchronize all threads in Comm4th. */
      Comm4th.Barrier();
      /** 4th loop: tile the m dimension by MC. */
      for ( int ic  = Loop4th.beg();
                ic  < Loop4th.end();
                ic += Loop4th.inc() )
      {
        auto ib = std::min( m - ic, MC );
        /** Distribute range [0,ib) over Comm3th. */
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
        /** Pack A and typecast from TA to TPACKA. */
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
                  i  < LooppkA.end();
                  i += LooppkA.inc(), ip += PackpkA.inc() )
        {
          A.Pack( m, ic + i, std::min( ib - i, MR ),
              k, pc, pb, &packA[ ip * pb ] );
        }
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
        /** Invoke the semiring rank-k macro kernel. */
        rank_k_macro_kernel<KC>( Comm3th,
          ic, jc, pc, ib, jb, pb, packA, packB,
          V + ic * rs_v + jc * cs_v, rs_v, cs_v,
          semiringkernel );
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
      } /** end 4th loop */
      Comm4th.Barrier();
    } /** end 5th loop */
    Comm5th.Barrier();
  } /** end 6th loop */
  Comm6th.Barrier();
  /** Free packing buffers. */
  Comm3th.FreeSharedMemory( packA );
  Comm4th.FreeSharedMemory( packB );
}; /** end rank_k_internal() */
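
/**
 *  Summary of the blocking above: the 6th loop tiles the n dimension by nc
 *  (at most NC), the 5th loop tiles the k dimension by KC, and the 4th loop
 *  tiles the m dimension by MC. For each (jc, pc) block a pb-by-jb panel of B
 *  is packed into packB (shared by the threads of Comm4th); for each (ic, pc)
 *  block an ib-by-pb panel of A is packed into packA (shared by the threads
 *  of Comm3th); rank_k_macro_kernel() then runs the 3rd and 2nd loops on the
 *  packed panels.
 */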

/**
 *  @brief This is the main routine of gkmx. All packing buffers are
 *         managed here. The communicator and the parallel section
 *         start here.
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL>
void rank_k
(
  int batchId, int m, int n, int k,
  TA& A,
  TB& B,
  TC& C,
  SEMIRINGKERNEL semiringkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  const static bool USE_STRASSEN = false;

  /** Early return if possible. */
  if ( m == 0 || n == 0 || k == 0 ) return;
  /** Type TC must be MatrixLike. */
  if ( !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    exit( 1 );
  }
  /** Get the raw pointer and the row and column strides of C. */
  auto *V = reinterpret_cast<TV*>( C.X );
  auto rs_v = C.rs;
  auto cs_v = C.cs;

  int k_stra = 0;
  if ( USE_STRASSEN )
  {
    assert( typeid(TPACKA) == typeid(TPACKB) );
    assert( typeid(TC) == typeid(TV) );
    k_stra = k - k % KC;
    if ( k_stra == k ) k_stra -= KC;
  }

  /** Enter the parallel section and run rank_k_internal() on all threads. */
  tci::Parallelize( NULL, rank_k_internal<MC, NC, KC, TPACKA, TPACKB, TV,
      TA, TB, TC, SEMIRINGKERNEL>,
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
      semiringkernel );
}; /** end rank_k() */
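
/**
 *  Note: the output type TC must be MatrixLike<PACK_MR, TV, TV>; rank_k()
 *  reads the raw pointer C.X and the strides C.rs/C.cs from it and exits
 *  otherwise. The parallel decomposition is read from the environment through
 *  hmlp_read_nway_from_env(); judging by their names, "KS_JC_NT", "KS_IC_NT",
 *  and "KS_JR_NT" set the number of ways the jc, ic, and jr loops are split.
 *  A hypothetical configuration (exact semantics depend on the tci runtime):
 *
 *      export KS_JC_NT=2 KS_IC_NT=4 KS_JR_NT=1
 */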

}; /** end namespace hmlp */

#endif /** define RANK_K_HPP */