1 |
|
/** |
2 |
|
* HMLP (High-Performance Machine Learning Primitives) |
3 |
|
* |
4 |
|
* Copyright (C) 2014-2017, The University of Texas at Austin |
5 |
|
* |
6 |
|
* This program is free software: you can redistribute it and/or modify |
7 |
|
* it under the terms of the GNU General Public License as published by |
8 |
|
* the Free Software Foundation, either version 3 of the License, or |
9 |
|
* (at your option) any later version. |
10 |
|
* |
11 |
|
* This program is distributed in the hope that it will be useful, |
12 |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 |
|
* GNU General Public License for more details. |
15 |
|
* |
16 |
|
* You should have received a copy of the GNU General Public License |
17 |
|
* along with this program. If not, see the LICENSE file. |
18 |
|
* |
19 |
|
**/ |
20 |
|
|
21 |
|
|
22 |
|
|
23 |
|
#ifndef GNBX_HPP |
24 |
|
#define GNBX_HPP |
25 |
|
|
26 |
|
#include <assert.h> |
27 |
|
#include <typeinfo> |
28 |
|
#include <algorithm> |
29 |
|
|
30 |
|
#include <hmlp.h> |
31 |
|
#include <hmlp_internal.hpp> |
32 |
|
#include <hmlp_base.hpp> |
33 |
|
|
34 |
|
/** Use Thread Control Interface (TCI). */ |
35 |
|
//#include <hmlp_tci.hpp> |
36 |
|
/** Use rank-k update template. */ |
37 |
|
#include <primitives/rank_k.hpp> |
38 |
|
|
39 |
|
/** reference microkernels */ |
40 |
|
#include <packing.hpp> |
41 |
|
#include <semiring_mrxnr.hpp> |
42 |
|
#include <fused_mrxnr.hpp> |
43 |
|
|
44 |
|
using namespace std; |
45 |
|
using namespace hmlp; |
46 |
|
|
47 |
|
namespace hmlp |
48 |
|
{ |
49 |
|
namespace nbody |
50 |
|
{ |
51 |
|
|
52 |
|
///** |
53 |
|
// * @brief Macro kernel contains the 3rd and 2nd loops. Depending on the |
54 |
|
// * configuration of the communicator, the 3rd loop may be parallelized. |
55 |
|
// * b_next is the prefetch pointer. |
56 |
|
// */ |
57 |
|
//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV> |
58 |
|
//void rank_k_macro_kernel |
59 |
|
//( |
60 |
|
// tci::Comm &Comm3rd, |
61 |
|
// int ic, int jc, int pc, |
62 |
|
// int m, int n, int k, |
63 |
|
// TA *packA, |
64 |
|
// TB *packB, |
65 |
|
// TV *V, int rs_v, int cs_v, |
66 |
|
// SEMIRINGKERNEL semiringkernel |
67 |
|
//) |
68 |
|
//{ |
69 |
|
// /** Get all block sizes */ |
70 |
|
// const static int MR = SEMIRINGKERNEL::mr; |
71 |
|
// const static int NR = SEMIRINGKERNEL::nr; |
72 |
|
// const static int PACK_MR = SEMIRINGKERNEL::pack_mr; |
73 |
|
// const static int PACK_NR = SEMIRINGKERNEL::pack_nr; |
74 |
|
// /** Create subcommunicators for each loop. */ |
75 |
|
// auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) ); |
76 |
|
// /** Compute loop ranges for each thread */ |
77 |
|
// auto Loop3rd = Comm3rd.DistributeOver1DGangs( 0, n, NR ); |
78 |
|
// auto Pack3rd = Comm3rd.DistributeOver1DGangs( 0, n, PACK_NR ); |
79 |
|
// auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, m, MR ); |
80 |
|
// auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, m, PACK_MR ); |
81 |
|
// /** Distribute range [0,n) over Comm3rd (jr loop). */ |
82 |
|
// for ( int j = Loop3rd.beg(), jp = Pack3rd.beg(); |
83 |
|
// j < Loop3rd.end(); |
84 |
|
// j += Loop3rd.inc(), jp += Pack3rd.inc() ) |
85 |
|
// { |
86 |
|
// struct aux_s<TA, TB, TV, TV> aux; |
87 |
|
// aux.pc = pc; |
88 |
|
// aux.b_next = packB; |
89 |
|
// aux.do_packC = 0; |
90 |
|
// aux.jb = std::min( n - j, NR ); |
91 |
|
// /** Distribute range [0,m) over Comm2nd (ir loop). */ |
92 |
|
// for ( int i = Loop2nd.beg(), ip = Pack2nd.beg(); |
93 |
|
// i < Loop2nd.end(); |
94 |
|
// i += Loop2nd.inc(), ip += Pack2nd.inc() ) |
95 |
|
// { |
96 |
|
// aux.ib = std::min( m - i, MR ); |
97 |
|
// /** Increase the b_next pointer. */ |
98 |
|
// if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k; |
99 |
|
// |
100 |
|
// if ( aux.jb == NR && aux.ib == MR ) |
101 |
|
// { |
102 |
|
// semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ], |
103 |
|
// &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux ); |
104 |
|
// } |
105 |
|
// else |
106 |
|
// { |
107 |
|
// TV vtmp[ MR * NR ]; |
108 |
|
// |
109 |
|
// if ( pc ) // initilize ctmp |
110 |
|
// { |
111 |
|
// for ( auto jj = 0; jj < aux.jb; jj ++ ) |
112 |
|
// for ( auto ii = 0; ii < aux.ib; ii ++ ) |
113 |
|
// vtmp[ jj * MR + ii ] = |
114 |
|
// V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]; |
115 |
|
// } |
116 |
|
// |
117 |
|
// semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ], |
118 |
|
// vtmp, 1, MR, &aux ); |
119 |
|
// |
120 |
|
// for ( auto jj = 0; jj < aux.jb; jj ++ ) |
121 |
|
// for ( auto ii = 0; ii < aux.ib; ii ++ ) |
122 |
|
// V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ] |
123 |
|
// = vtmp[ jj * MR + ii ]; |
124 |
|
// } |
125 |
|
// } /** end 2nd loop */ |
126 |
|
// } /** end 3rd loop */ |
127 |
|
//}; /** end rank_k_macro_kernel() */ |
128 |
|
// |
129 |
|
// |
130 |
|
|
131 |
|
|
132 |
|
|
133 |
|
/** |
134 |
|
* @brief fused_macro_kernel contains the 3rd, 2nd loops and the fused micro |
135 |
|
* kernel. Notice that here C has type TC, which is differnet from the |
136 |
|
* one in rank_k_macro_kernel. ctmp used in the conner case is also |
137 |
|
* type TC. |
138 |
|
*/ |
139 |
|
template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV> |
140 |
|
void fused_macro_kernel |
141 |
|
( |
142 |
|
tci::Comm &Comm3rd, |
143 |
|
int m, int n, |
144 |
|
int ic, int jc, int pc, |
145 |
|
int mc, int nc, int kc, |
146 |
|
TA *packA, |
147 |
|
TB *packB, |
148 |
|
TC *C, |
149 |
|
TV *V, int rs_v, int cs_v, |
150 |
|
int batchId, |
151 |
|
FUSEDKERNEL fusedkernel |
152 |
|
) |
153 |
|
{ |
154 |
|
/** Get all block sizes. */ |
155 |
|
const static int MR = FUSEDKERNEL::mr; |
156 |
|
const static int NR = FUSEDKERNEL::nr; |
157 |
|
const static int PACK_MR = FUSEDKERNEL::pack_mr; |
158 |
|
const static int PACK_NR = FUSEDKERNEL::pack_nr; |
159 |
|
/** Create subcommunicators for each loop. */ |
160 |
|
auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) ); |
161 |
|
/** Compute loop ranges for each thread */ |
162 |
|
auto Loop3rd = Comm3rd.DistributeOver1DGangs( 0, nc, NR ); |
163 |
|
auto Pack3rd = Comm3rd.DistributeOver1DGangs( 0, nc, PACK_NR ); |
164 |
|
auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, mc, MR ); |
165 |
|
auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, mc, PACK_MR ); |
166 |
|
|
167 |
|
/** Distribute range [0,n) over Comm3rd (jr loop). */ |
168 |
|
for ( int j = Loop3rd.beg(), jp = Pack3rd.beg(); |
169 |
|
j < Loop3rd.end(); |
170 |
|
j += Loop3rd.inc(), jp += Pack3rd.inc() ) |
171 |
|
{ |
172 |
|
struct aux_s<TA, TB, TC, TV> aux; |
173 |
|
aux.pc = pc; |
174 |
|
aux.b_next = packB; |
175 |
|
aux.do_packC = 0; |
176 |
|
/** Distribute range [0,m) over Comm2nd (ir loop). */ |
177 |
|
for ( int i = Loop2nd.beg(), ip = Pack2nd.beg(); |
178 |
|
i < Loop2nd.end(); |
179 |
|
i += Loop2nd.inc(), ip += Pack2nd.inc() ) |
180 |
|
{ |
181 |
|
/** |
182 |
|
* These auxiluary infos are used to access data in the closure of |
183 |
|
* opkernel and opreduce. |
184 |
|
*/ |
185 |
|
aux.m = m; |
186 |
|
aux.n = n; |
187 |
|
aux.i = ic + i; |
188 |
|
aux.j = jc + j; |
189 |
|
aux.b = batchId; |
190 |
|
|
191 |
|
/** Encapsulate edge case information. */ |
192 |
|
aux.ib = std::min( mc - i, MR ); |
193 |
|
aux.jb = std::min( nc - j, NR ); |
194 |
|
|
195 |
|
/** Prepare the intermediate semiring rank-k update. */ |
196 |
|
aux.V = V + i * rs_v + j * cs_v; |
197 |
|
aux.ldv = cs_v; |
198 |
|
|
199 |
|
/** Increase the b_next pointer. */ |
200 |
|
if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc; |
201 |
|
|
202 |
|
//Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() ); |
203 |
|
|
204 |
|
if ( aux.jb == NR && aux.ib == MR ) |
205 |
|
{ |
206 |
|
fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ], |
207 |
|
C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux ); |
208 |
|
} |
209 |
|
else |
210 |
|
{ |
211 |
|
TV vtmp[ MR * NR ]; |
212 |
|
if ( pc ) // initilize ctmp |
213 |
|
{ |
214 |
|
for ( auto jj = 0; jj < aux.jb; jj ++ ) |
215 |
|
for ( auto ii = 0; ii < aux.ib; ii ++ ) |
216 |
|
vtmp[ jj * MR + ii ] = |
217 |
|
V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]; |
218 |
|
aux.V = vtmp; |
219 |
|
aux.ldv = MR; |
220 |
|
} |
221 |
|
fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ], |
222 |
|
C, vtmp, 1, MR, &aux ); |
223 |
|
} |
224 |
|
|
225 |
|
//Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() ); |
226 |
|
} |
227 |
|
} |
228 |
|
|
229 |
|
}; /** end fused_macro_kernel() */ |
230 |
|
|
231 |
|
|
232 |
|
|
233 |
|
|
234 |
|
/** |
235 |
|
* @breif This function contains the loop body of the 6th to 4th loops, |
236 |
|
* including all packing and unpacking routines. Notice that this |
237 |
|
* function is executed by all threads in the root communicator. |
238 |
|
* To access each thread in different level of communicators, use |
239 |
|
* their ids. |
240 |
|
*/ |
241 |
|
template< |
242 |
|
int MC, int NC, int KC, |
243 |
|
typename TPACKA, typename TPACKB, typename TV, |
244 |
|
typename TA, typename TB, typename TC, |
245 |
|
typename SEMIRINGKERNEL, typename MICROKERNEL> |
246 |
|
void nbody_internal |
247 |
|
( |
248 |
|
tci::Comm &Comm6th, |
249 |
|
int batchId, int m, int n, int k, int k_stra, |
250 |
|
TA& A, |
251 |
|
TB& B, |
252 |
|
TC& C, |
253 |
|
TV* V, int rs_v, int cs_v, |
254 |
|
SEMIRINGKERNEL semiringkernel, |
255 |
|
MICROKERNEL microkernel |
256 |
|
) |
257 |
|
{ |
258 |
|
/** Get all block sizes. */ |
259 |
|
const static int MR = SEMIRINGKERNEL::mr; |
260 |
|
const static int NR = SEMIRINGKERNEL::nr; |
261 |
|
const static int PACK_MR = SEMIRINGKERNEL::pack_mr; |
262 |
|
const static int PACK_NR = SEMIRINGKERNEL::pack_nr; |
263 |
|
const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size; |
264 |
|
const static int PACK_MC = ( MC / MR ) * PACK_MR; |
265 |
|
const static int PACK_NC = ( NC / NR ) * PACK_NR; |
266 |
|
/** Create subcommunicators for each loop. */ |
267 |
|
auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) ); |
268 |
|
auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) ); |
269 |
|
auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) ); |
270 |
|
/** Adjuest nc and pack_nc if the 6th loop is parallelized. */ |
271 |
|
int nc = Comm6th.BalanceOver1DGangs( n, NC, NR ); |
272 |
|
int pack_nc = ( nc / NR ) * PACK_NR; |
273 |
|
/** Allocate packB (shared over Comm4th, private for each Comm5th gang). */ |
274 |
|
auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) ); |
275 |
|
/** Allocate packA (shared over Comm3th, private for each Comm4th gang). */ |
276 |
|
auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) ); |
277 |
|
/** If kc loop is parallelized, then create IC_NT * JR_NT locks. */ |
278 |
|
//if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() ); |
279 |
|
|
280 |
|
//printf( "%2d Allocate shared memory A %lu\n", |
281 |
|
// omp_get_thread_num(), packA ); fflush( stdout ); |
282 |
|
|
283 |
|
/** Distribute range [0,n) over Comm6th. */ |
284 |
|
auto Loop6th = Comm6th.DistributeOver1DGangs( 0, n, nc ); |
285 |
|
/** Distribute range [k_stra,k) over Comm5th. */ |
286 |
|
auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC ); |
287 |
|
/** Distribute range [0,m) over Comm4th. */ |
288 |
|
auto Loop4th = Comm4th.DistributeOver1DGangs( 0, m, MC ); |
289 |
|
/** Distribute range [0,n) over Comm6th. */ |
290 |
|
for ( int jc = Loop6th.beg(); |
291 |
|
jc < Loop6th.end(); |
292 |
|
jc += Loop6th.inc() ) |
293 |
|
{ |
294 |
|
auto jb = std::min( n - jc, nc ); |
295 |
|
/** Distribute range [k_stra,k) over Comm5th. */ |
296 |
|
for ( int pc = Loop5th.beg(); |
297 |
|
pc < Loop5th.end(); |
298 |
|
pc += Loop5th.inc() ) |
299 |
|
{ |
300 |
|
auto pb = std::min( k - pc, KC ); |
301 |
|
auto is_the_last_pc_iteration = ( pc + KC >= k ); |
302 |
|
/** Distribute range [0,jb) over Comm4th. */ |
303 |
|
auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb, NR ); |
304 |
|
auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR ); |
305 |
|
/** PackB and typecast from TB to TPACKB. */ |
306 |
|
for ( int j = LooppkB.beg(), jp = PackpkB.beg(); |
307 |
|
j < LooppkB.end(); |
308 |
|
j += LooppkB.inc(), jp += PackpkB.inc() ) |
309 |
|
{ |
310 |
|
B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ), |
311 |
|
&packB[ jp * pb ] ); |
312 |
|
} |
313 |
|
/** Synchronize all threads in Comm4th. */ |
314 |
|
Comm4th.Barrier(); |
315 |
|
/** Distribute range [0,m) over Comm4th. */ |
316 |
|
for ( int ic = Loop4th.beg(); |
317 |
|
ic < Loop4th.end(); |
318 |
|
ic += Loop4th.inc() ) |
319 |
|
{ |
320 |
|
auto ib = std::min( m - ic, MC ); |
321 |
|
/** Distribute range [0,ib) over Comm3th. */ |
322 |
|
auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR ); |
323 |
|
auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR ); |
324 |
|
/** packA and typecast from TA to TPACKA. */ |
325 |
|
for ( int i = LooppkA.beg(), ip = PackpkA.beg(); |
326 |
|
i < LooppkA.end(); |
327 |
|
i += LooppkA.inc(), ip += PackpkA.inc() ) |
328 |
|
{ |
329 |
|
A.Pack( m, ic + i, std::min( ib - i, MR ), |
330 |
|
k, pc, pb, &packA[ ip * pb ] ); |
331 |
|
} |
332 |
|
/** Synchronize all threads in Comm3th. */ |
333 |
|
Comm3th.Barrier(); |
334 |
|
/** Invoke the fused kernel, if this is the last iteration. */ |
335 |
|
if ( is_the_last_pc_iteration ) |
336 |
|
{ |
337 |
|
//printf( "%2d after packA\n", omp_get_thread_num() ); fflush( stdout ); |
338 |
|
fused_macro_kernel<KC>( Comm3th, |
339 |
|
m, n, ic, jc, pc, ib, jb, pb, packA, packB, |
340 |
|
&C, V + ic * rs_v + jc * cs_v, rs_v, cs_v, |
341 |
|
batchId, microkernel ); |
342 |
|
//printf( "%2d fused_macrokernel\n", omp_get_thread_num() ); fflush( stdout ); |
343 |
|
} |
344 |
|
/** Otherwise, invoke the semiubg rank-k kernel. */ |
345 |
|
else |
346 |
|
{ |
347 |
|
//printf( "%2d after packA (rank-k)\n", omp_get_thread_num() ); fflush( stdout ); |
348 |
|
rank_k_macro_kernel<KC>( Comm3th, |
349 |
|
ic, jc, pc, ib, jb, pb, packA, packB, |
350 |
|
V + ic * rs_v + jc * cs_v, rs_v, cs_v, |
351 |
|
semiringkernel ); |
352 |
|
//printf( "%2d rank_k_macrokernel\n", omp_get_thread_num() ); fflush( stdout ); |
353 |
|
} |
354 |
|
/** Synchronize all threads in Comm3th. */ |
355 |
|
Comm3th.Barrier(); |
356 |
|
} /** end 4th loop */ |
357 |
|
Comm4th.Barrier(); |
358 |
|
} /** end 5th loop */ |
359 |
|
Comm5th.Barrier(); |
360 |
|
} /** end 6th loop */ |
361 |
|
//printf( "%2d End of all loops\n", omp_get_thread_num() ); fflush( stdout ); |
362 |
|
Comm6th.Barrier(); |
363 |
|
//Comm5th.Destroy2DLocks(); |
364 |
|
/** Free packing buffer. */ |
365 |
|
//printf( "%2d Free shared memory B\n", omp_get_thread_num() ); fflush( stdout ); |
366 |
|
Comm4th.FreeSharedMemory( packB ); |
367 |
|
//printf( "%2d Free shared memory A, %lu\n", |
368 |
|
// omp_get_thread_num(), packA ); fflush( stdout ); |
369 |
|
Comm3th.FreeSharedMemory( packA ); |
370 |
|
}; /** end nbody_internal() */ |
371 |
|
|
372 |
|
|
373 |
|
|
374 |
|
|
375 |
|
|
376 |
|
/** |
377 |
|
* @breif This is the main routine of gkmx. All packing buffers are |
378 |
|
* managed here. The communicator and the parallel section |
379 |
|
* start here. |
380 |
|
* |
381 |
|
*/ |
382 |
|
template< |
383 |
|
int MC, int NC, int KC, |
384 |
|
typename TPACKA, typename TPACKB, typename TV, |
385 |
|
typename TA, typename TB, typename TC, |
386 |
|
typename SEMIRINGKERNEL, typename MICROKERNEL> |
387 |
|
void nbody |
388 |
|
( |
389 |
|
int batchId, int m, int n, int k, |
390 |
|
TA& A, |
391 |
|
TB& B, |
392 |
|
TC& C, |
393 |
|
SEMIRINGKERNEL semiringkernel, |
394 |
|
MICROKERNEL microkernel |
395 |
|
) |
396 |
|
{ |
397 |
|
const static int MR = SEMIRINGKERNEL::mr; |
398 |
|
const static int NR = SEMIRINGKERNEL::nr; |
399 |
|
const static int PACK_MR = SEMIRINGKERNEL::pack_mr; |
400 |
|
const static int PACK_NR = SEMIRINGKERNEL::pack_nr; |
401 |
|
const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size; |
402 |
|
const static int PACK_MC = ( MC / MR ) * PACK_MR; |
403 |
|
const static int PACK_NC = ( NC / NR ) * PACK_NR; |
404 |
|
const static bool USE_STRASSEN = false; |
405 |
|
|
406 |
|
/** Early return if possible. */ |
407 |
|
if ( m == 0 || n == 0 || k == 0 ) return; |
408 |
|
|
409 |
|
|
410 |
|
TV *V = NULL; |
411 |
|
int rs_v = 0; |
412 |
|
int cs_v = 0; |
413 |
|
|
414 |
|
|
415 |
|
if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value ) |
416 |
|
{ |
417 |
|
printf( "here m %d n %d\n", m, n ); |
418 |
|
V = hmlp_malloc<ALIGN_SIZE, TV>( m * n ); |
419 |
|
rs_v = 1; |
420 |
|
cs_v = m; |
421 |
|
} |
422 |
|
else |
423 |
|
{ |
424 |
|
/** Directly use C for intermediate semiring rank-k update */ |
425 |
|
V = reinterpret_cast<TV*>( C.X ); |
426 |
|
rs_v = C.rs; |
427 |
|
cs_v = C.cs; |
428 |
|
} |
429 |
|
|
430 |
|
|
431 |
|
int k_stra = 0; |
432 |
|
if ( USE_STRASSEN ) |
433 |
|
{ |
434 |
|
assert( typeid(TPACKA) == typeid(TPACKB) ); |
435 |
|
assert( typeid(TC) == typeid(TV) ); |
436 |
|
k_stra = k - k % KC; |
437 |
|
|
438 |
|
if ( k_stra == k ) k_stra -= KC; |
439 |
|
} |
440 |
|
|
441 |
|
tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV, |
442 |
|
TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>, |
443 |
|
batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v, |
444 |
|
semiringkernel, microkernel ); |
445 |
|
|
446 |
|
if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value ) |
447 |
|
{ |
448 |
|
hmlp_free( V ); |
449 |
|
} |
450 |
|
}; /** end nbody() */ |
451 |
|
|
452 |
|
|
453 |
|
|
454 |
|
|
455 |
|
|
456 |
|
/** |
457 |
|
* @beief |
458 |
|
*/ |
459 |
|
//template< |
460 |
|
// int MR, int NR, int MC, int NC, int KC, |
461 |
|
// typename TPACKA, typename TPACKB, typename TPACKC, typename TV, |
462 |
|
// typename TA, typename TB, typename TC, |
463 |
|
// typename OPKERNEL, typename OP1, typename OP2> |
464 |
|
//void nbody |
465 |
|
//( |
466 |
|
// int batchId, int m, int n, int k, |
467 |
|
// TA& A, |
468 |
|
// TB& B, |
469 |
|
// TC& C, |
470 |
|
// OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV |
471 |
|
//) |
472 |
|
//{ |
473 |
|
// semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel; |
474 |
|
// gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel; |
475 |
|
// |
476 |
|
// semiringkernel.op1 = op1; |
477 |
|
// semiringkernel.op2 = op2; |
478 |
|
// semiringkernel.initV = initV; |
479 |
|
// |
480 |
|
// gkrmkernel.op1 = op1; |
481 |
|
// gkrmkernel.op2 = op2; |
482 |
|
// gkrmkernel.opkernel = opkernel; |
483 |
|
// gkrmkernel.initV = initV; |
484 |
|
// |
485 |
|
// nbody<MC, NC, KC, TPACKA, TPACKB, TV> |
486 |
|
// ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel ); |
487 |
|
// |
488 |
|
//}; /** end nbody() */ |
489 |
|
|
490 |
|
}; /** end namespace nbody */ |
491 |
|
}; /** end namespace hmlp */ |
492 |
|
|
493 |
|
#endif /** define GNBX_HPP */ |