// Include guard for this header. The rest of the line starts a parameter list
// whose function head is not visible in this excerpt.
// NOTE(review): presumably im2col (the reference at the end of this listing
// places it at hmlp_packing.hpp:106) — confirm against the full file.
22 #ifndef HMLP_PACKING_HPP 23 #define HMLP_PACKING_HPP 110 int w0,
int h0,
int d0,
int s,
int p,
// Number of x0 start positions visited by the x loop below (stride s, padding p).
114 int nx = ( w0 - w1 + 2 * p ) / s + 1;
// Visit every ( x0, y0 ) start position from -p up to the last one for which a
// w1-by-h1 window still fits inside the p-padded image. The outer y0 loop
// carries the OpenMP parallel-for annotation.
116 #pragma omp parallel for 117 for (
auto y0 = -1 * p; y0 <= h0 - h1 + p; y0 += s )
119 for (
auto x0 = -1 * p; x0 <= w0 - w1 + p; x0 += s )
// Linear index of this start position: row-major over ( y, x ), nx positions per row.
121 auto i = ( ( y0 + p ) / s ) * nx + ( x0 + p ) / s;
// Fill the n entries packX[ i * n .. i * n + n - 1 ] that belong to position i.
// NOTE(review): how z, x, y advance and how x1, y1 are derived from them is
// not visible in this excerpt.
125 for (
auto j = 0, z = 0, x = 0, y = 0; j < n; j ++ )
// Pixels inside the w0-by-h0 image are read from X at ( y1 * w0 + x1 ) * d0 + z.
130 if ( 0 <= x1 && x1 < w0 && 0 <= y1 && y1 < h0 )
132 packX[ i * n + j ] = X[ y1 * w0 * d0 + x1 * d0 + z ];
// The other visible store to the same slot writes zero; presumably this is the
// else branch covering the padded border (the branch itself is not visible).
136 packX[ i * n + j ] = 0.0;
// Packs image pixels into packX, addressed as packX[ j * FOLD + i ].
// ZEROPAD defaults to true for this template.
// NOTE(review): the function head and several body lines are not visible in
// this excerpt; the reference at the end of this listing names the routine
// defined at hmlp_packing.hpp:162 as pack2Dimg — confirm against the full file.
160 template<
int FOLD,
bool ZEROPAD=true,
typename T>
165 int x0,
int y0,
int offset,
167 int w0,
int h0,
int d0,
int s,
int p,
// Iterate over i in [ 0, m ).
173 for (
auto i = 0; i < m; i ++ )
// Split offset / d0 into window-local coordinates: x is the position within a
// window row of width w1, y is the window row.
178 x = ( offset / d0 ) % w1,
179 y = ( offset / d0 ) / w1;
// Pixels inside the w0-by-h0 image are read from X at ( y1 * w0 + x1 ) * d0 + z.
// NOTE(review): the definitions of x1, y1, z and j are not visible here.
185 if ( 0 <= x1 && x1 < w0 && 0 <= y1 && y1 < h0 )
187 packX[ j * FOLD + i ] = X[ y1 * w0 * d0 + x1 * d0 + z ];
// The other visible store to the same slot writes zero; presumably the branch
// taken for out-of-bounds (padding) pixels.
191 packX[ j * FOLD + i ] = 0.0;
// Once the window would extend past the right padded edge, restart x0 at the
// left padding ( -p ) and move y0 down by one stride.
209 if ( ( x0 + w1 ) > ( w0 + p ) )
211 x0 = -1 * p; y0 += s;
// Packs the element-wise combination X0 + gamma * X1 into packX, gathering the
// source entries through the index map xmap. When m < FOLD, the slots in
// [ m, FOLD ) are zero-filled if ZEROPAD is set, otherwise filled from the
// entry selected by xmap[ 0 ]. ZEROPAD defaults to false.
// NOTE(review): the function name, its m / n parameters, the declarations of
// x0_pntr / x1_pntr and the condition that selects between the two layouts
// below (presumably TRANS) are not visible in this excerpt.
223 template<
bool TRANS,
int FOLD,
bool ZEROPAD=false,
typename T>
227 T *X0, T *X1,
int ldx, T gamma,
int *xmap, T *packX
// Layout 1: source i starts at offset ldx * xmap[ i ] in both X0 and X1.
236 for (
auto i = 0; i < m; i ++ )
238 x0_pntr[ i ] = X0 + ldx * xmap[ i ];
239 x1_pntr[ i ] = X1 + ldx * xmap[ i ];
// Pointers for the padding slots [ m, FOLD ) alias the entry selected by xmap[ 0 ].
241 for (
auto i = m; i < FOLD; i ++ )
243 x0_pntr[ i ] = X0 + ldx * xmap[ 0 ];
244 x1_pntr[ i ] = X1 + ldx * xmap[ 0 ];
// For each of the n steps: the m combined values, then the padding slots.
246 for (
auto j = 0; j < n; j ++ )
248 for (
auto i = 0; i < m; i ++ )
// NOTE(review): no advance of packX or the source pointers is visible on this
// statement; it may sit on lines missing from this excerpt.
252 *packX = ( *x0_pntr[ i ] ) + gamma * ( *x1_pntr[ i ] ) ;
258 for (
auto i = m; i < FOLD; i ++ )
// Padding: either an explicit zero, or the combined value read through the
// xmap[ 0 ] alias; both forms advance packX, the latter also the source pointers.
260 if ( ZEROPAD ) *packX ++ = (T)0.0;
261 else *packX ++ = (*x0_pntr[ i ] ++) + gamma * (*x1_pntr[ i ] ++) ;
// Layout 2: source i starts at offset xmap[ i ] (no ldx scaling of the map entry).
269 for (
auto i = 0; i < m; i ++ )
271 x0_pntr[ i ] = X0 + xmap[ i ];
272 x1_pntr[ i ] = X1 + xmap[ i ];
// Padding-slot pointers again alias the entry selected by xmap[ 0 ].
274 for (
auto i = m; i < FOLD; i ++ )
276 x0_pntr[ i ] = X0 + xmap[ 0 ];
277 x1_pntr[ i ] = X1 + xmap[ 0 ];
280 for (
auto j = 0; j < n; j ++ )
283 for (
auto i = 0; i < m; i ++ )
// NOTE(review): the pointer advances for this layout are not visible here.
285 *packX = *x0_pntr[ i ] + gamma * *x1_pntr[ i ];
293 for (
auto i = m; i < FOLD; i ++ )
// Padding: zero when ZEROPAD is set; the store below is presumably the
// alternative branch (the else is not visible in this excerpt).
297 if ( ZEROPAD ) *packX ++ = (T)0.0;
300 *packX = (*x0_pntr[ i ]) + gamma * (*x1_pntr[ i ]);
// Variant of the two-matrix packing routine that takes no index map: it fills
// xmap with the identity ( xmap[ i ] = i for i < FOLD ) and forwards to pack2D
// with the same template arguments.
// NOTE(review): the function name, its m / n parameters, the declaration of
// xmap and the leading call arguments are not visible in this excerpt.
317 template<
bool TRANS,
int FOLD,
bool ZEROPAD=false,
typename T>
321 T *X0, T *X1,
int ldx, T gamma, T *packX
// Identity map over all FOLD slots.
325 for (
int i = 0; i < FOLD; i ++ ) xmap[ i ] = i;
326 pack2D<TRANS, FOLD, ZEROPAD, T>
329 X0, X1, ldx, gamma, xmap, packX
// Packs entries of a single matrix X into packX, gathering them through the
// index map xmap. When m < FOLD, the slots in [ m, FOLD ) are zero-filled if
// ZEROPAD is set, otherwise filled from the entry selected by xmap[ 0 ].
// ZEROPAD defaults to false.
// NOTE(review): the function name, its m / n parameters, the declaration of
// x_pntr and the condition that selects between the two layouts below
// (presumably TRANS) are not visible in this excerpt.
338 template<
bool TRANS,
int FOLD,
bool ZEROPAD=false,
typename T>
342 T *X,
int ldx,
int *xmap, T *packX
// Layout 1: source i starts at offset ldx * xmap[ i ].
349 for (
auto i = 0; i < m; i ++ )
351 x_pntr[ i ] = X + ldx * xmap[ i ];
// Pointers for the padding slots [ m, FOLD ) alias the entry selected by xmap[ 0 ].
353 for (
auto i = m; i < FOLD; i ++ )
355 x_pntr[ i ] = X + ldx * xmap[ 0 ];
// For each of the n steps: copy one element from each of the m sources
// (advancing both the source pointer and packX), then handle the padding slots.
357 for (
auto j = 0; j < n; j ++ )
359 for (
auto i = 0; i < m; i ++ )
361 *packX ++ = *x_pntr[ i ] ++;
363 for (
auto i = m; i < FOLD; i ++ )
// Padding: an explicit zero, or a copy read through the xmap[ 0 ] alias.
365 if ( ZEROPAD ) *packX ++ = (T)0.0;
366 else *packX ++ = *x_pntr[ i ] ++;
// Layout 2: source i starts at offset xmap[ i ] (no ldx scaling of the map entry).
373 for (
auto i = 0; i < m; i ++ )
375 x_pntr[ i ] = X + xmap[ i ];
// Padding-slot pointers again alias the entry selected by xmap[ 0 ].
377 for (
auto i = m; i < FOLD; i ++ )
379 x_pntr[ i ] = X + xmap[ 0 ];
381 for (
auto j = 0; j < n; j ++ )
383 for (
auto i = 0; i < m; i ++ )
// NOTE(review): the advances of packX and x_pntr for this layout are not
// visible in this excerpt.
385 *packX = *x_pntr[ i ];
389 for (
auto i = m; i < FOLD; i ++ )
// Padding: zero when ZEROPAD is set; the store below is presumably the
// alternative branch (the else is not visible in this excerpt).
391 if ( ZEROPAD ) *packX ++ = (T)0.0;
394 *packX = *x_pntr[ i ];
// Variant of the single-matrix packing routine that takes no index map: it
// fills xmap with the identity ( xmap[ i ] = i for i < FOLD ) and then invokes
// pack2D with the same template arguments.
// NOTE(review): the function name, its m / n parameters, the declaration of
// xmap and the arguments of the forwarded call are not visible in this excerpt.
406 template<
bool TRANS,
int FOLD,
bool ZEROPAD=false,
typename T>
410 T *X,
int ldx, T *packX
// Identity map over all FOLD slots.
414 for (
int i = 0; i < FOLD; i ++ ) xmap[ i ] = i;
415 pack2D<TRANS, FOLD, ZEROPAD, T>
// Gathers entries of A, selected through amap, into the contiguous buffer
// packA: k consecutive groups of PACK_MR elements each.
// NOTE(review): m and k are used below but their declarations (presumably
// leading parameters) are not visible in this excerpt.
428 template<
int PACK_MR,
typename TA>
429 inline void packA_kcxmc(
431 TA *A,
int lda,
int *amap, TA *packA )
// One read cursor per slot of the PACK_MR-wide group.
433 TA *a_pntr[ PACK_MR ];
// Slot i < m starts at offset lda * amap[ i ] in A.
435 for (
auto i = 0; i < m; i ++ ) a_pntr[ i ] = A + lda * amap[ i ];
// Slots in [ m, PACK_MR ) alias the entry selected by amap[ 0 ], so the padding
// part of each group is a copy of that data rather than zeros.
436 for (
auto i = m; i < PACK_MR; i ++ ) a_pntr[ i ] = A + lda * amap[ 0 ];
// For each p < k, emit one element from every slot and advance its cursor.
437 for (
auto p = 0; p < k; p ++ )
439 for (
auto i = 0; i < PACK_MR; i ++ )
441 *packA ++ = *a_pntr[ i ] ++;
// Gathers entries of B, selected through bmap, into the contiguous buffer
// packB: k consecutive groups of PACK_NR elements each.
// NOTE(review): n, k and the loop variables j, p are used below but their
// declarations are not visible in this excerpt.
449 template<
int PACK_NR,
typename TB>
450 inline void packB_kcxnc(
452 TB *B,
int ldb,
int *bmap, TB *packB )
// One read cursor per slot of the PACK_NR-wide group.
455 TB *b_pntr[ PACK_NR ];
// Slot j < n starts at offset ldb * bmap[ j ] in B.
457 for ( j = 0; j < n; j ++ ) b_pntr[ j ] = B + ldb * bmap[ j ];
// Slots in [ n, PACK_NR ) alias the entry selected by bmap[ 0 ], so the padding
// part of each group is a copy of that data rather than zeros.
458 for ( j = n; j < PACK_NR; j ++ ) b_pntr[ j ] = B + ldb * bmap[ 0 ];
// For each p < k, emit one element from every slot and advance its cursor.
459 for ( p = 0; p < k; p ++ )
461 for ( j = 0; j < PACK_NR; j ++ )
463 *packB ++ = *b_pntr[ j ] ++;
// Gathers entries of w, selected through wmap, into the contiguous buffer packw,
// one group per right-hand side p < rhs.
// NOTE(review): n, rhs and the loop variables j, p are used below but their
// declarations are not visible in this excerpt.
471 template<
int PACK_NR,
typename TC>
472 inline void packw_rhsxnc(
474 TC *w,
int ldw,
int *wmap, TC *packw )
// One read cursor per slot; only the first n are initialised here.
477 TC *w_pntr[ PACK_NR ];
// Slot j < n starts at offset ldw * wmap[ j ] in w.
479 for ( j = 0; j < n; j ++ ) w_pntr[ j ] = w + ldw * wmap[ j ];
// For each p < rhs, copy one element from each of the n slots and advance its cursor.
481 for ( p = 0; p < rhs; p ++ )
483 for ( j = 0; j < n; j ++ )
485 *packw ++ = *w_pntr[ j ] ++;
// Loop over the padding slots [ n, PACK_NR ).
// NOTE(review): the body of this loop is not visible in this excerpt.
487 for ( j = n; j < PACK_NR; j ++ )
// Gathers entries of u, selected through umap, into the contiguous buffer packu,
// one group per right-hand side p < rhs.
// NOTE(review): m, rhs and the loop variables i, p are used below but their
// declarations are not visible in this excerpt.
497 template<
int PACK_MR,
typename TC>
498 inline void packu_rhsxmc(
500 TC *u,
int ldu,
int *umap, TC *packu )
// One read cursor per slot; only the first m are initialised here.
503 TC *u_pntr[ PACK_MR ];
// Slot i < m starts at offset ldu * umap[ i ] in u.
505 for ( i = 0; i < m; i ++ ) u_pntr[ i ] = u + ldu * umap[ i ];
// For each p < rhs, copy one element from each of the m slots and advance its cursor.
506 for ( p = 0; p < rhs; p ++ )
508 for ( i = 0; i < m; i ++ )
510 *packu ++ = *u_pntr[ i ] ++;
// Loop over the padding slots [ m, PACK_MR ).
// NOTE(review): the body of this loop is not visible in this excerpt.
512 for ( i = m; i < PACK_MR; i ++ )
523 #endif // define HMLP_PACKING_HPP void pack2Dimg(int m, int n, T *packX, int x0, int y0, int offset, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
Packs an image into a 2D packed buffer. Note that X is stored d-leading here, i.e. the depth index varies fastest.
Definition: hmlp_packing.hpp:162
void im2col(int m, int n, T *packX, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
Definition: hmlp_packing.hpp:106
void pack2D(int m, int n, T *X0, T *X1, int ldx, T gamma, int *xmap, T *packX)
This is the default packing routine for GKMX, GSKS, GSKNN and STRASSEN.
Definition: hmlp_packing.hpp:225