Head

GCC Code Coverage Report

Directory:	.		Exec	Total	Coverage
File:	frame/primitives/nbody.hpp	Lines:	0	105	0.0 %
Date:	2019-01-14	Branches:	0	316	0.0 %


/**
 *  HMLP (High-Performance Machine Learning Primitives)
 *
 *  Copyright (C) 2014-2017, The University of Texas at Austin
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see the LICENSE file.
 *
 **/



#ifndef GNBX_HPP
#define GNBX_HPP

#include <assert.h>
#include <typeinfo>
#include <algorithm>

#include <hmlp.h>
#include <hmlp_internal.hpp>
#include <hmlp_base.hpp>

/** Use Thread Control Interface (TCI). */
//#include <hmlp_tci.hpp>
/** Use rank-k update template. */
#include <primitives/rank_k.hpp>

/** reference microkernels */
#include <packing.hpp>
#include <semiring_mrxnr.hpp>
#include <fused_mrxnr.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace nbody
{

///**
// *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
// *         configuration of the communicator, the 3rd loop may be parallelized.
// *         b_next is the prefetch pointer.
// */
//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
//void rank_k_macro_kernel
//(
//  tci::Comm &Comm3rd,
//  int ic, int jc, int pc,
//  int  m, int  n, int  k,
//  TA *packA,
//  TB *packB,
//  TV *V, int rs_v, int cs_v,
//  SEMIRINGKERNEL semiringkernel
//)
//{
//  /** Get all block sizes */
//  const static int MR         = SEMIRINGKERNEL::mr;
//  const static int NR         = SEMIRINGKERNEL::nr;
//  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
//  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
//  /** Create subcommunicators for each loop. */
//  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
//  /** Compute loop ranges for each thread */
//  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, n,      NR );
//  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, n, PACK_NR );
//  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, m,      MR );
//  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, m, PACK_MR );
//  /** Distribute range [0,n) over Comm3rd (jr loop). */
//  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
//            j  < Loop3rd.end();
//            j += Loop3rd.inc(), jp += Pack3rd.inc() )
//  {
//    struct aux_s<TA, TB, TV, TV> aux;
//    aux.pc       = pc;
//    aux.b_next   = packB;
//    aux.do_packC = 0;
//    aux.jb       = std::min( n - j, NR );
//    /** Distribute range [0,m) over Comm2nd (ir loop). */
//    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
//              i  < Loop2nd.end();
//              i += Loop2nd.inc(), ip += Pack2nd.inc() )
//    {
//      aux.ib = std::min( m - i, MR );
//      /** Increase the b_next pointer. */
//      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
//
//      if ( aux.jb == NR && aux.ib == MR )
//      {
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//          &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
//      }
//      else
//      {
//        TV vtmp[ MR * NR ];
//
//        if ( pc ) // initilize ctmp
//        {
//          for ( auto jj = 0; jj < aux.jb; jj ++ )
//            for ( auto ii = 0; ii < aux.ib; ii ++ )
//              vtmp[ jj * MR + ii ] =
//                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
//        }
//
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//          vtmp, 1, MR, &aux );
//
//        for ( auto jj = 0; jj < aux.jb; jj ++ )
//          for ( auto ii = 0; ii < aux.ib; ii ++ )
//            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
//              = vtmp[ jj * MR + ii ];
//      }
//    } /** end 2nd loop */
//  } /** end 3rd loop */
//}; /** end rank_k_macro_kernel() */
//
//



/**
 *  @brief fused_macro_kernel contains the 3rd, 2nd loops and the fused micro
 *         kernel. Notice that here C has type TC, which is differnet from the
 *         one in rank_k_macro_kernel. ctmp used in the conner case is also
 *         type TC.
 */
template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV>
void fused_macro_kernel
(
  tci::Comm &Comm3rd,
  int m, int n,
  int ic, int jc, int pc,
  int mc, int nc, int kc,
  TA *packA,
  TB *packB,
  TC *C,
  TV *V, int rs_v, int cs_v,
  int batchId,
  FUSEDKERNEL fusedkernel
)
{
  /** Get all block sizes. */
  const static int MR         = FUSEDKERNEL::mr;
  const static int NR         = FUSEDKERNEL::nr;
  const static int PACK_MR    = FUSEDKERNEL::pack_mr;
  const static int PACK_NR    = FUSEDKERNEL::pack_nr;
  /** Create subcommunicators for each loop. */
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
  /** Compute loop ranges for each thread */
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, nc,      NR );
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, nc, PACK_NR );
  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, mc,      MR );
  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, mc, PACK_MR );

  /** Distribute range [0,n) over Comm3rd (jr loop). */
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
            j  < Loop3rd.end();
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
  {
    struct aux_s<TA, TB, TC, TV> aux;
    aux.pc       = pc;
    aux.b_next   = packB;
    aux.do_packC = 0;
    /** Distribute range [0,m) over Comm2nd (ir loop). */
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
              i  < Loop2nd.end();
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
    {
      /**
       *  These auxiluary infos are used to access data in the closure of
       *  opkernel and opreduce.
       */
      aux.m = m;
      aux.n = n;
      aux.i = ic + i;
      aux.j = jc + j;
      aux.b = batchId;

      /** Encapsulate edge case information. */
      aux.ib = std::min( mc - i, MR );
      aux.jb = std::min( nc - j, NR );

      /** Prepare the intermediate semiring rank-k update. */
      aux.V = V + i * rs_v + j * cs_v;
      aux.ldv = cs_v;

      /** Increase the b_next pointer. */
      if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc;

      //Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );

      if ( aux.jb == NR && aux.ib == MR )
      {
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
          C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
      }
      else
      {
        TV vtmp[ MR * NR ];
        if ( pc ) // initilize ctmp
        {
          for ( auto jj = 0; jj < aux.jb; jj ++ )
            for ( auto ii = 0; ii < aux.ib; ii ++ )
              vtmp[ jj * MR + ii ] =
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
          aux.V = vtmp;
          aux.ldv = MR;
        }
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
          C, vtmp, 1, MR, &aux );
      }

      //Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
    }
  }

}; /** end fused_macro_kernel() */




/**
 *  @breif This function contains the loop body of the 6th to 4th loops,
 *         including all packing and unpacking routines. Notice that this
 *         function is executed by all threads in the root communicator.
 *         To access each thread in different level of communicators, use
 *         their ids.
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody_internal
(
  tci::Comm &Comm6th,
  int batchId, int m, int n, int k, int k_stra,
  TA& A,
  TB& B,
  TC& C,
  TV* V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  /** Create subcommunicators for each loop. */
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
  auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) );
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
  /** Adjuest nc and pack_nc if the 6th loop is parallelized. */
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
  int pack_nc = ( nc / NR ) * PACK_NR;
  /** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
  /** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
  /** If kc loop is parallelized, then create IC_NT * JR_NT locks. */
  //if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() );

  //printf( "%2d Allocate shared memory A %lu\n",
  //    omp_get_thread_num(), packA ); fflush( stdout );

  /** Distribute range [0,n) over Comm6th. */
  auto Loop6th = Comm6th.DistributeOver1DGangs(      0, n, nc );
  /** Distribute range [k_stra,k) over Comm5th. */
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
  /** Distribute range [0,m) over Comm4th. */
  auto Loop4th = Comm4th.DistributeOver1DGangs(      0, m, MC );
  /** Distribute range [0,n) over Comm6th. */
  for ( int jc  = Loop6th.beg();
            jc  < Loop6th.end();
            jc += Loop6th.inc() )
  {
    auto jb = std::min( n - jc, nc );
    /** Distribute range [k_stra,k) over Comm5th. */
    for ( int pc  = Loop5th.beg();
              pc  < Loop5th.end();
              pc += Loop5th.inc() )
    {
      auto pb = std::min( k - pc, KC );
      auto is_the_last_pc_iteration = ( pc + KC >= k );
      /** Distribute range [0,jb) over Comm4th. */
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb,      NR );
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
      /** PackB and typecast from TB to TPACKB.  */
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
                j  < LooppkB.end();
                j += LooppkB.inc(), jp += PackpkB.inc() )
      {
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
            &packB[ jp * pb ] );
      }
      /** Synchronize all threads in Comm4th. */
      Comm4th.Barrier();
      /** Distribute range [0,m) over Comm4th. */
      for ( int ic  = Loop4th.beg();
                ic  < Loop4th.end();
                ic += Loop4th.inc() )
      {
        auto ib = std::min( m - ic, MC );
        /** Distribute range [0,ib) over Comm3th. */
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
        /** packA and typecast from TA to TPACKA. */
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
                  i  < LooppkA.end();
                  i += LooppkA.inc(), ip += PackpkA.inc() )
        {
          A.Pack( m, ic + i, std::min( ib - i, MR ),
              k, pc, pb, &packA[ ip * pb ] );
        }
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
        /** Invoke the fused kernel, if this is the last iteration. */
        if ( is_the_last_pc_iteration )
        {
          //printf( "%2d after packA\n", omp_get_thread_num() ); fflush( stdout );
          fused_macro_kernel<KC>( Comm3th,
            m, n, ic, jc, pc, ib, jb, pb, packA, packB,
            &C, V + ic * rs_v + jc * cs_v, rs_v, cs_v,
            batchId, microkernel );
          //printf( "%2d fused_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
        }
        /** Otherwise, invoke the semiubg rank-k kernel. */
        else
        {
          //printf( "%2d after packA (rank-k)\n", omp_get_thread_num() ); fflush( stdout );
          rank_k_macro_kernel<KC>( Comm3th,
            ic, jc, pc, ib, jb, pb, packA, packB,
            V + ic * rs_v + jc * cs_v, rs_v, cs_v,
            semiringkernel );
          //printf( "%2d rank_k_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
        }
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
      } /** end 4th loop */
      Comm4th.Barrier();
    } /** end 5th loop */
    Comm5th.Barrier();
  } /** end 6th loop */
  //printf( "%2d End of all loops\n", omp_get_thread_num() ); fflush( stdout );
  Comm6th.Barrier();
  //Comm5th.Destroy2DLocks();
  /** Free packing buffer. */
  //printf( "%2d Free shared memory B\n", omp_get_thread_num() ); fflush( stdout );
  Comm4th.FreeSharedMemory( packB );
  //printf( "%2d Free shared memory A, %lu\n",
  //    omp_get_thread_num(), packA ); fflush( stdout );
  Comm3th.FreeSharedMemory( packA );
}; /** end nbody_internal() */





/**
 *  @breif This is the main routine of gkmx. All packing buffers are
 *         managed here. The communicator and the parallel section
 *         start here.
 *
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody
(
  int batchId, int m, int n, int k,
  TA& A,
  TB& B,
  TC& C,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  const static bool USE_STRASSEN = false;

  /** Early return if possible. */
  if ( m == 0 || n == 0 || k == 0 ) return;


  TV *V = NULL;
  int rs_v = 0;
  int cs_v = 0;


  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    printf( "here m %d n %d\n", m, n );
    V = hmlp_malloc<ALIGN_SIZE, TV>( m * n );
    rs_v = 1;
    cs_v = m;
  }
  else
  {
    /** Directly use C for intermediate semiring rank-k update */
    V = reinterpret_cast<TV*>( C.X );
    rs_v = C.rs;
    cs_v = C.cs;
  }


  int k_stra = 0;
  if ( USE_STRASSEN )
  {
    assert( typeid(TPACKA) == typeid(TPACKB) );
    assert( typeid(TC) == typeid(TV) );
    k_stra = k - k % KC;

    if ( k_stra == k ) k_stra -= KC;
  }

  tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV,
      TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>,
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
      semiringkernel, microkernel );

  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    hmlp_free( V );
  }
}; /** end nbody() */





/**
 *  @beief
 */
//template<
//  int MR, int NR, int MC, int NC, int KC,
//  typename TPACKA, typename TPACKB, typename TPACKC, typename TV,
//  typename     TA, typename     TB, typename     TC,
//  typename OPKERNEL, typename OP1, typename OP2>
//void nbody
//(
//  int batchId, int m, int n, int k,
//  TA& A,
//  TB& B,
//  TC& C,
//  OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV
//)
//{
//  semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel;
//  gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel;
//
//  semiringkernel.op1 = op1;
//  semiringkernel.op2 = op2;
//  semiringkernel.initV = initV;
//
//  gkrmkernel.op1 = op1;
//  gkrmkernel.op2 = op2;
//  gkrmkernel.opkernel = opkernel;
//  gkrmkernel.initV = initV;
//
//  nbody<MC, NC, KC, TPACKA, TPACKB, TV>
//    ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel );
//
//}; /** end nbody() */

}; /** end namespace nbody */
}; /** end namespace hmlp */

#endif /** define GNBX_HPP */


Generated by: GCOVR (Version 3.2)

Line	Exec	Source
1		/**
2		* HMLP (High-Performance Machine Learning Primitives)
3		*
4		* Copyright (C) 2014-2017, The University of Texas at Austin
5		*
6		* This program is free software: you can redistribute it and/or modify
7		* it under the terms of the GNU General Public License as published by
8		* the Free Software Foundation, either version 3 of the License, or
9		* (at your option) any later version.
10		*
11		* This program is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		* GNU General Public License for more details.
15		*
16		* You should have received a copy of the GNU General Public License
17		* along with this program. If not, see the LICENSE file.
18		*
19		**/
20
21
22
23		#ifndef GNBX_HPP
24		#define GNBX_HPP
25
26		#include <assert.h>
27		#include <typeinfo>
28		#include <algorithm>
29
30		#include <hmlp.h>
31		#include <hmlp_internal.hpp>
32		#include <hmlp_base.hpp>
33
34		/** Use Thread Control Interface (TCI). */
35		//#include <hmlp_tci.hpp>
36		/** Use rank-k update template. */
37		#include <primitives/rank_k.hpp>
38
39		/** reference microkernels */
40		#include <packing.hpp>
41		#include <semiring_mrxnr.hpp>
42		#include <fused_mrxnr.hpp>
43
44		using namespace std;
45		using namespace hmlp;
46
47		namespace hmlp
48		{
49		namespace nbody
50		{
51
52		///**
53		// * @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
54		// * configuration of the communicator, the 3rd loop may be parallelized.
55		// * b_next is the prefetch pointer.
56		// */
57		//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
58		//void rank_k_macro_kernel
59		//(
60		// tci::Comm &Comm3rd,
61		// int ic, int jc, int pc,
62		// int m, int n, int k,
63		// TA *packA,
64		// TB *packB,
65		// TV *V, int rs_v, int cs_v,
66		// SEMIRINGKERNEL semiringkernel
67		//)
68		//{
69		// /** Get all block sizes */
70		// const static int MR = SEMIRINGKERNEL::mr;
71		// const static int NR = SEMIRINGKERNEL::nr;
72		// const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
73		// const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
74		// /** Create subcommunicators for each loop. */
75		// auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
76		// /** Compute loop ranges for each thread */
77		// auto Loop3rd = Comm3rd.DistributeOver1DGangs( 0, n, NR );
78		// auto Pack3rd = Comm3rd.DistributeOver1DGangs( 0, n, PACK_NR );
79		// auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, m, MR );
80		// auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, m, PACK_MR );
81		// /** Distribute range [0,n) over Comm3rd (jr loop). */
82		// for ( int j = Loop3rd.beg(), jp = Pack3rd.beg();
83		// j < Loop3rd.end();
84		// j += Loop3rd.inc(), jp += Pack3rd.inc() )
85		// {
86		// struct aux_s<TA, TB, TV, TV> aux;
87		// aux.pc = pc;
88		// aux.b_next = packB;
89		// aux.do_packC = 0;
90		// aux.jb = std::min( n - j, NR );
91		// /** Distribute range [0,m) over Comm2nd (ir loop). */
92		// for ( int i = Loop2nd.beg(), ip = Pack2nd.beg();
93		// i < Loop2nd.end();
94		// i += Loop2nd.inc(), ip += Pack2nd.inc() )
95		// {
96		// aux.ib = std::min( m - i, MR );
97		// /** Increase the b_next pointer. */
98		// if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
99		//
100		// if ( aux.jb == NR && aux.ib == MR )
101		// {
102		// semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
103		// &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
104		// }
105		// else
106		// {
107		// TV vtmp[ MR * NR ];
108		//
109		// if ( pc ) // initilize ctmp
110		// {
111		// for ( auto jj = 0; jj < aux.jb; jj ++ )
112		// for ( auto ii = 0; ii < aux.ib; ii ++ )
113		// vtmp[ jj * MR + ii ] =
114		// V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
115		// }
116		//
117		// semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
118		// vtmp, 1, MR, &aux );
119		//
120		// for ( auto jj = 0; jj < aux.jb; jj ++ )
121		// for ( auto ii = 0; ii < aux.ib; ii ++ )
122		// V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
123		// = vtmp[ jj * MR + ii ];
124		// }
125		// } /** end 2nd loop */
126		// } /** end 3rd loop */
127		//}; /** end rank_k_macro_kernel() */
128		//
129		//
130
131
132
133		/**
134		* @brief fused_macro_kernel contains the 3rd, 2nd loops and the fused micro
135		* kernel. Notice that here C has type TC, which is differnet from the
136		* one in rank_k_macro_kernel. ctmp used in the conner case is also
137		* type TC.
138		*/
139		template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV>
140		void fused_macro_kernel
141		(
142		tci::Comm &Comm3rd,
143		int m, int n,
144		int ic, int jc, int pc,
145		int mc, int nc, int kc,
146		TA *packA,
147		TB *packB,
148		TC *C,
149		TV *V, int rs_v, int cs_v,
150		int batchId,
151		FUSEDKERNEL fusedkernel
152		)
153		{
154		/** Get all block sizes. */
155		const static int MR = FUSEDKERNEL::mr;
156		const static int NR = FUSEDKERNEL::nr;
157		const static int PACK_MR = FUSEDKERNEL::pack_mr;
158		const static int PACK_NR = FUSEDKERNEL::pack_nr;
159		/** Create subcommunicators for each loop. */
160		auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
161		/** Compute loop ranges for each thread */
162		auto Loop3rd = Comm3rd.DistributeOver1DGangs( 0, nc, NR );
163		auto Pack3rd = Comm3rd.DistributeOver1DGangs( 0, nc, PACK_NR );
164		auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, mc, MR );
165		auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, mc, PACK_MR );
166
167		/** Distribute range [0,n) over Comm3rd (jr loop). */
168		for ( int j = Loop3rd.beg(), jp = Pack3rd.beg();
169		j < Loop3rd.end();
170		j += Loop3rd.inc(), jp += Pack3rd.inc() )
171		{
172		struct aux_s<TA, TB, TC, TV> aux;
173		aux.pc = pc;
174		aux.b_next = packB;
175		aux.do_packC = 0;
176		/** Distribute range [0,m) over Comm2nd (ir loop). */
177		for ( int i = Loop2nd.beg(), ip = Pack2nd.beg();
178		i < Loop2nd.end();
179		i += Loop2nd.inc(), ip += Pack2nd.inc() )
180		{
181		/**
182		* These auxiluary infos are used to access data in the closure of
183		* opkernel and opreduce.
184		*/
185		aux.m = m;
186		aux.n = n;
187		aux.i = ic + i;
188		aux.j = jc + j;
189		aux.b = batchId;
190
191		/** Encapsulate edge case information. */
192		aux.ib = std::min( mc - i, MR );
193		aux.jb = std::min( nc - j, NR );
194
195		/** Prepare the intermediate semiring rank-k update. */
196		aux.V = V + i * rs_v + j * cs_v;
197		aux.ldv = cs_v;
198
199		/** Increase the b_next pointer. */
200		if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc;
201
202		//Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
203
204		if ( aux.jb == NR && aux.ib == MR )
205		{
206		fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
207		C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
208		}
209		else
210		{
211		TV vtmp[ MR * NR ];
212		if ( pc ) // initilize ctmp
213		{
214		for ( auto jj = 0; jj < aux.jb; jj ++ )
215		for ( auto ii = 0; ii < aux.ib; ii ++ )
216		vtmp[ jj * MR + ii ] =
217		V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
218		aux.V = vtmp;
219		aux.ldv = MR;
220		}
221		fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
222		C, vtmp, 1, MR, &aux );
223		}
224
225		//Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
226		}
227		}
228
229		}; /** end fused_macro_kernel() */
230
231
232
233
234		/**
235		* @breif This function contains the loop body of the 6th to 4th loops,
236		* including all packing and unpacking routines. Notice that this
237		* function is executed by all threads in the root communicator.
238		* To access each thread in different level of communicators, use
239		* their ids.
240		*/
241		template<
242		int MC, int NC, int KC,
243		typename TPACKA, typename TPACKB, typename TV,
244		typename TA, typename TB, typename TC,
245		typename SEMIRINGKERNEL, typename MICROKERNEL>
246		void nbody_internal
247		(
248		tci::Comm &Comm6th,
249		int batchId, int m, int n, int k, int k_stra,
250		TA& A,
251		TB& B,
252		TC& C,
253		TV* V, int rs_v, int cs_v,
254		SEMIRINGKERNEL semiringkernel,
255		MICROKERNEL microkernel
256		)
257		{
258		/** Get all block sizes. */
259		const static int MR = SEMIRINGKERNEL::mr;
260		const static int NR = SEMIRINGKERNEL::nr;
261		const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
262		const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
263		const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
264		const static int PACK_MC = ( MC / MR ) * PACK_MR;
265		const static int PACK_NC = ( NC / NR ) * PACK_NR;
266		/** Create subcommunicators for each loop. */
267		auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
268		auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) );
269		auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
270		/** Adjuest nc and pack_nc if the 6th loop is parallelized. */
271		int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
272		int pack_nc = ( nc / NR ) * PACK_NR;
273		/** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
274		auto packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC ( pack_nc + 1 ) );
275		/** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
276		auto packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC ( PACK_MC + 1 ) );
277		/** If kc loop is parallelized, then create IC_NT * JR_NT locks. */
278		//if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() );
279
280		//printf( "%2d Allocate shared memory A %lu\n",
281		// omp_get_thread_num(), packA ); fflush( stdout );
282
283		/** Distribute range [0,n) over Comm6th. */
284		auto Loop6th = Comm6th.DistributeOver1DGangs( 0, n, nc );
285		/** Distribute range [k_stra,k) over Comm5th. */
286		auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
287		/** Distribute range [0,m) over Comm4th. */
288		auto Loop4th = Comm4th.DistributeOver1DGangs( 0, m, MC );
289		/** Distribute range [0,n) over Comm6th. */
290		for ( int jc = Loop6th.beg();
291		jc < Loop6th.end();
292		jc += Loop6th.inc() )
293		{
294		auto jb = std::min( n - jc, nc );
295		/** Distribute range [k_stra,k) over Comm5th. */
296		for ( int pc = Loop5th.beg();
297		pc < Loop5th.end();
298		pc += Loop5th.inc() )
299		{
300		auto pb = std::min( k - pc, KC );
301		auto is_the_last_pc_iteration = ( pc + KC >= k );
302		/** Distribute range [0,jb) over Comm4th. */
303		auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb, NR );
304		auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
305		/** PackB and typecast from TB to TPACKB. */
306		for ( int j = LooppkB.beg(), jp = PackpkB.beg();
307		j < LooppkB.end();
308		j += LooppkB.inc(), jp += PackpkB.inc() )
309		{
310		B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
311		&packB[ jp * pb ] );
312		}
313		/** Synchronize all threads in Comm4th. */
314		Comm4th.Barrier();
315		/** Distribute range [0,m) over Comm4th. */
316		for ( int ic = Loop4th.beg();
317		ic < Loop4th.end();
318		ic += Loop4th.inc() )
319		{
320		auto ib = std::min( m - ic, MC );
321		/** Distribute range [0,ib) over Comm3th. */
322		auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
323		auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
324		/** packA and typecast from TA to TPACKA. */
325		for ( int i = LooppkA.beg(), ip = PackpkA.beg();
326		i < LooppkA.end();
327		i += LooppkA.inc(), ip += PackpkA.inc() )
328		{
329		A.Pack( m, ic + i, std::min( ib - i, MR ),
330		k, pc, pb, &packA[ ip * pb ] );
331		}
332		/** Synchronize all threads in Comm3th. */
333		Comm3th.Barrier();
334		/** Invoke the fused kernel, if this is the last iteration. */
335		if ( is_the_last_pc_iteration )
336		{
337		//printf( "%2d after packA\n", omp_get_thread_num() ); fflush( stdout );
338		fused_macro_kernel<KC>( Comm3th,
339		m, n, ic, jc, pc, ib, jb, pb, packA, packB,
340		&C, V + ic * rs_v + jc * cs_v, rs_v, cs_v,
341		batchId, microkernel );
342		//printf( "%2d fused_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
343		}
344		/** Otherwise, invoke the semiubg rank-k kernel. */
345		else
346		{
347		//printf( "%2d after packA (rank-k)\n", omp_get_thread_num() ); fflush( stdout );
348		rank_k_macro_kernel<KC>( Comm3th,
349		ic, jc, pc, ib, jb, pb, packA, packB,
350		V + ic * rs_v + jc * cs_v, rs_v, cs_v,
351		semiringkernel );
352		//printf( "%2d rank_k_macrokernel\n", omp_get_thread_num() ); fflush( stdout );
353		}
354		/** Synchronize all threads in Comm3th. */
355		Comm3th.Barrier();
356		} /** end 4th loop */
357		Comm4th.Barrier();
358		} /** end 5th loop */
359		Comm5th.Barrier();
360		} /** end 6th loop */
361		//printf( "%2d End of all loops\n", omp_get_thread_num() ); fflush( stdout );
362		Comm6th.Barrier();
363		//Comm5th.Destroy2DLocks();
364		/** Free packing buffer. */
365		//printf( "%2d Free shared memory B\n", omp_get_thread_num() ); fflush( stdout );
366		Comm4th.FreeSharedMemory( packB );
367		//printf( "%2d Free shared memory A, %lu\n",
368		// omp_get_thread_num(), packA ); fflush( stdout );
369		Comm3th.FreeSharedMemory( packA );
370		}; /** end nbody_internal() */
371
372
373
374
375
376		/**
377		* @breif This is the main routine of gkmx. All packing buffers are
378		* managed here. The communicator and the parallel section
379		* start here.
380		*
381		*/
382		template<
383		int MC, int NC, int KC,
384		typename TPACKA, typename TPACKB, typename TV,
385		typename TA, typename TB, typename TC,
386		typename SEMIRINGKERNEL, typename MICROKERNEL>
387		void nbody
388		(
389		int batchId, int m, int n, int k,
390		TA& A,
391		TB& B,
392		TC& C,
393		SEMIRINGKERNEL semiringkernel,
394		MICROKERNEL microkernel
395		)
396		{
397		const static int MR = SEMIRINGKERNEL::mr;
398		const static int NR = SEMIRINGKERNEL::nr;
399		const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
400		const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
401		const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
402		const static int PACK_MC = ( MC / MR ) * PACK_MR;
403		const static int PACK_NC = ( NC / NR ) * PACK_NR;
404		const static bool USE_STRASSEN = false;
405
406		/** Early return if possible. */
407		if ( m == 0 \|\| n == 0 \|\| k == 0 ) return;
408
409
410		TV *V = NULL;
411		int rs_v = 0;
412		int cs_v = 0;
413
414
415		if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
416		{
417		printf( "here m %d n %d\n", m, n );
418		V = hmlp_malloc<ALIGN_SIZE, TV>( m * n );
419		rs_v = 1;
420		cs_v = m;
421		}
422		else
423		{
424		/** Directly use C for intermediate semiring rank-k update */
425		V = reinterpret_cast<TV*>( C.X );
426		rs_v = C.rs;
427		cs_v = C.cs;
428		}
429
430
431		int k_stra = 0;
432		if ( USE_STRASSEN )
433		{
434		assert( typeid(TPACKA) == typeid(TPACKB) );
435		assert( typeid(TC) == typeid(TV) );
436		k_stra = k - k % KC;
437
438		if ( k_stra == k ) k_stra -= KC;
439		}
440
441		tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV,
442		TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>,
443		batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
444		semiringkernel, microkernel );
445
446		if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
447		{
448		hmlp_free( V );
449		}
450		}; /** end nbody() */
451
452
453
454
455
456		/**
457		* @beief
458		*/
459		//template<
460		// int MR, int NR, int MC, int NC, int KC,
461		// typename TPACKA, typename TPACKB, typename TPACKC, typename TV,
462		// typename TA, typename TB, typename TC,
463		// typename OPKERNEL, typename OP1, typename OP2>
464		//void nbody
465		//(
466		// int batchId, int m, int n, int k,
467		// TA& A,
468		// TB& B,
469		// TC& C,
470		// OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV
471		//)
472		//{
473		// semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel;
474		// gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel;
475		//
476		// semiringkernel.op1 = op1;
477		// semiringkernel.op2 = op2;
478		// semiringkernel.initV = initV;
479		//
480		// gkrmkernel.op1 = op1;
481		// gkrmkernel.op2 = op2;
482		// gkrmkernel.opkernel = opkernel;
483		// gkrmkernel.initV = initV;
484		//
485		// nbody<MC, NC, KC, TPACKA, TPACKB, TV>
486		// ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel );
487		//
488		//}; /** end nbody() */
489
490		}; /** end namespace nbody */
491		}; /** end namespace hmlp */
492
493		#endif /** define GNBX_HPP */