HMLP: High-performance Machine Learning Primitives
nbody.hpp
#ifndef NBODY_HPP
#define NBODY_HPP

#include <assert.h>
#include <typeinfo>
#include <algorithm>

#include <hmlp.h>
#include <hmlp_internal.hpp>
#include <hmlp_base.hpp>

//#include <hmlp_tci.hpp>
#include <primitives/rank_k.hpp>

#include <packing.hpp>
#include <semiring_mrxnr.hpp>
#include <fused_mrxnr.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace nbody
{

///**
// *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
// *         configuration of the communicator, the 3rd loop may be parallelized.
// *         b_next is the prefetch pointer.
// */
//template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
//void rank_k_macro_kernel
//(
//  tci::Comm &Comm3rd,
//  int ic, int jc, int pc,
//  int m, int n, int k,
//  TA *packA,
//  TB *packB,
//  TV *V, int rs_v, int cs_v,
//  SEMIRINGKERNEL semiringkernel
//)
//{
//  /** Get all block sizes */
//  const static int MR      = SEMIRINGKERNEL::mr;
//  const static int NR      = SEMIRINGKERNEL::nr;
//  const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
//  const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
//  /** Create subcommunicators for each loop. */
//  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
//  /** Compute loop ranges for each thread */
//  auto Loop3rd = Comm3rd.DistributeOver1DGangs(   0, n, NR );
//  auto Pack3rd = Comm3rd.DistributeOver1DGangs(   0, n, PACK_NR );
//  auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, m, MR );
//  auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, m, PACK_MR );
//  /** Distribute range [0,n) over Comm3rd (jr loop). */
//  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
//            j  < Loop3rd.end();
//            j += Loop3rd.inc(), jp += Pack3rd.inc() )
//  {
//    struct aux_s<TA, TB, TV, TV> aux;
//    aux.pc       = pc;
//    aux.b_next   = packB;
//    aux.do_packC = 0;
//    aux.jb       = std::min( n - j, NR );
//    /** Distribute range [0,m) over Comm2nd (ir loop). */
//    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
//              i  < Loop2nd.end();
//              i += Loop2nd.inc(), ip += Pack2nd.inc() )
//    {
//      aux.ib = std::min( m - i, MR );
//      /** Increase the b_next pointer. */
//      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
//
//      if ( aux.jb == NR && aux.ib == MR )
//      {
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//            &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
//      }
//      else // corner case: buffer the partial MR-by-NR tile in vtmp
//      {
//        TV vtmp[ MR * NR ];
//
//        if ( pc ) // initialize vtmp with the partial sums already in V
//        {
//          for ( auto jj = 0; jj < aux.jb; jj ++ )
//            for ( auto ii = 0; ii < aux.ib; ii ++ )
//              vtmp[ jj * MR + ii ] =
//                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
//        }
//
//        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
//            vtmp, 1, MR, &aux );
//
//        /** Write the corner tile back to V. */
//        for ( auto jj = 0; jj < aux.jb; jj ++ )
//          for ( auto ii = 0; ii < aux.ib; ii ++ )
//            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v )
//              = vtmp[ jj * MR + ii ];
//      }
//    } /** end 2nd loop */
//  } /** end 3rd loop */
//}; /** end rank_k_macro_kernel() */

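/**
 *  @brief Macro kernel for the last rank-kc update: contains the 3rd (jr)
 *         and 2nd (ir) loops around the fused micro-kernel. Depending on
 *         the configuration of the communicator, the 3rd loop may be
 *         parallelized. Unlike the plain rank-k macro kernel above, the
 *         fused kernel also receives C, the global coordinates (aux.i,
 *         aux.j), and the batch id, so the element-wise post-operation can
 *         be applied while the MR-by-NR tile is still in registers.
 *         aux.b_next is the prefetch pointer.
 */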
template<int KC, typename FUSEDKERNEL, typename TA, typename TB, typename TC, typename TV>
void fused_macro_kernel
(
  tci::Comm &Comm3rd,
  int m, int n,
  int ic, int jc, int pc,
  int mc, int nc, int kc,
  TA *packA,
  TB *packB,
  TC *C,
  TV *V, int rs_v, int cs_v,
  int batchId,
  FUSEDKERNEL fusedkernel
)
{
  /** Get all block sizes. */
  const static int MR      = FUSEDKERNEL::mr;
  const static int NR      = FUSEDKERNEL::nr;
  const static int PACK_MR = FUSEDKERNEL::pack_mr;
  const static int PACK_NR = FUSEDKERNEL::pack_nr;
  /** Create the subcommunicator for the 2nd loop. */
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
  /** Compute loop and packing ranges for each thread. */
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(   0, nc, NR );
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(   0, nc, PACK_NR );
  auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, mc, MR );
  auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, mc, PACK_MR );

  /** Distribute range [0,nc) over Comm3rd (jr loop). */
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
            j  < Loop3rd.end();
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
  {
    struct aux_s<TA, TB, TC, TV> aux;
    aux.pc       = pc;
    aux.b_next   = packB;
    aux.do_packC = 0;
    /** Distribute range [0,mc) over Comm2nd (ir loop). */
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
              i  < Loop2nd.end();
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
    {
      /** Global problem sizes and coordinates, used by the fused post-op. */
      aux.m = m;
      aux.n = n;
      aux.i = ic + i;
      aux.j = jc + j;
      aux.b = batchId;

      /** Edge-tile sizes (may be smaller than MR-by-NR). */
      aux.ib = std::min( mc - i, MR );
      aux.jb = std::min( nc - j, NR );

      /** Address of the semiring intermediate for this tile. */
      aux.V = V + i * rs_v + j * cs_v;
      aux.ldv = cs_v;

      /** Increase the b_next prefetch pointer. */
      if ( i + MR >= mc ) aux.b_next += Pack3rd.inc() * kc;

      //Comm3rd.Acquire2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );

      if ( aux.jb == NR && aux.ib == MR )
      {
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
            C, &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
      }
      else // corner case: buffer the partial MR-by-NR tile in vtmp
      {
        TV vtmp[ MR * NR ];
        if ( pc ) // initialize vtmp with the partial sums already in V
        {
          for ( auto jj = 0; jj < aux.jb; jj ++ )
            for ( auto ii = 0; ii < aux.ib; ii ++ )
              vtmp[ jj * MR + ii ] =
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
        }
        /** Redirect aux.V to the buffer actually passed to the kernel.
         *  This must happen for pc == 0 as well, since the kernel always
         *  writes the corner tile into vtmp (with rs = 1, ld = MR). */
        aux.V = vtmp;
        aux.ldv = MR;
        fusedkernel( kc, &packA[ ip * kc ], &packB[ jp * kc ],
            C, vtmp, 1, MR, &aux );
      }

      //Comm3rd.Release2DLocks( Comm3rd.parent->GetGangRank(), Comm3rd.GetGangRank() );
    } /** end 2nd loop */
  } /** end 3rd loop */

}; /** end fused_macro_kernel() */
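
/**
 *  @brief Thread entry of the blocked n-body primitive: contains the 6th
 *         (jc), 5th (pc), and 4th (ic) loops. Each KC-by-nc panel of B and
 *         each MC-by-KC panel of A is packed into communicator-shared
 *         buffers before the macro kernels run. All but the last pc
 *         iteration accumulate into V through rank_k_macro_kernel(); the
 *         last iteration calls fused_macro_kernel() to apply the post-op.
 *         The environment variables KS_JC_NT, JS_PC_NT, and KS_IC_NT
 *         control how Comm6th is split across the loops.
 */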
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename TA, typename TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody_internal
(
  tci::Comm &Comm6th,
  int batchId, int m, int n, int k, int k_stra,
  TA& A,
  TB& B,
  TC& C,
  TV* V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  /** Create subcommunicators for the 5th, 4th, and 3rd loops. */
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
  auto Comm4th = Comm5th.Split( hmlp_read_nway_from_env( "JS_PC_NT" ) );
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
  /** Adjust nc so that the jc loop is balanced over the 6th-loop gangs. */
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
  int pack_nc = ( nc / NR ) * PACK_NR;
  /** Allocate shared packing buffers for B (per Comm4th) and A (per Comm3th). */
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
  //if ( Comm5th.GetGangSize() > 1 ) Comm5th.Create2DLocks( Comm4th.GetGangSize(), Comm3th.GetCommSize() );

  /** Distribute the jc loop [0,n) over the 6th-loop gangs. */
  auto Loop6th = Comm6th.DistributeOver1DGangs( 0,      n, nc );
  /** Distribute the pc loop over the 5th-loop gangs; [0,k_stra) is
   *  reserved for the (currently disabled) Strassen path. */
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
  /** Distribute the ic loop [0,m) over the 4th-loop gangs. */
  auto Loop4th = Comm4th.DistributeOver1DGangs( 0,      m, MC );

  for ( int jc  = Loop6th.beg();
            jc  < Loop6th.end();
            jc += Loop6th.inc() )
  {
    auto jb = std::min( n - jc, nc );
    for ( int pc  = Loop5th.beg();
              pc  < Loop5th.end();
              pc += Loop5th.inc() )
    {
      auto pb = std::min( k - pc, KC );
      auto is_the_last_pc_iteration = ( pc + KC >= k );
      /** Distribute the packing of B over the threads of Comm4th. */
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb, NR );
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
                j  < LooppkB.end();
                j += LooppkB.inc(), jp += PackpkB.inc() )
      {
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
            &packB[ jp * pb ] );
      }
      /** Wait until packB is fully packed. */
      Comm4th.Barrier();
      for ( int ic  = Loop4th.beg();
                ic  < Loop4th.end();
                ic += Loop4th.inc() )
      {
        auto ib = std::min( m - ic, MC );
        /** Distribute the packing of A over the threads of Comm3th. */
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
                  i  < LooppkA.end();
                  i += LooppkA.inc(), ip += PackpkA.inc() )
        {
          A.Pack( m, ic + i, std::min( ib - i, MR ),
              k, pc, pb, &packA[ ip * pb ] );
        }
        /** Wait until packA is fully packed. */
        Comm3th.Barrier();
        /** Fuse the post-op into the last rank-KC update only. */
        if ( is_the_last_pc_iteration )
        {
          fused_macro_kernel<KC>( Comm3th,
              m, n, ic, jc, pc, ib, jb, pb, packA, packB,
              &C, V + ic * rs_v + jc * cs_v, rs_v, cs_v,
              batchId, microkernel );
        }
        else
        {
          rank_k_macro_kernel<KC>( Comm3th,
              ic, jc, pc, ib, jb, pb, packA, packB,
              V + ic * rs_v + jc * cs_v, rs_v, cs_v,
              semiringkernel );
        }
        Comm3th.Barrier();
      } /** end 4th loop */
      Comm4th.Barrier();
    } /** end 5th loop */
    Comm5th.Barrier();
  } /** end 6th loop */
  Comm6th.Barrier();
  //Comm5th.Destroy2DLocks();
  /** Free the shared packing buffers. */
  Comm4th.FreeSharedMemory( packB );
  Comm3th.FreeSharedMemory( packA );
}; /** end nbody_internal() */
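
/**
 *  @brief User interface of the n-body primitive. Chooses where the
 *         semiring intermediate V lives: if k > KC and C is not already a
 *         packed MatrixLike buffer, a temporary m-by-n column-major buffer
 *         is allocated (and freed after the computation); otherwise V
 *         aliases C. The Strassen path is compiled out (USE_STRASSEN is
 *         false), so k_stra remains 0 and all of [0,k) goes through the
 *         rank-k/fused pipeline launched via tci::Parallelize().
 */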
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename TA, typename TB, typename TC,
  typename SEMIRINGKERNEL, typename MICROKERNEL>
void nbody
(
  int batchId, int m, int n, int k,
  TA& A,
  TB& B,
  TC& C,
  SEMIRINGKERNEL semiringkernel,
  MICROKERNEL microkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  const static bool USE_STRASSEN = false;

  /** Early return if there is no work to do. */
  if ( m == 0 || n == 0 || k == 0 ) return;

  TV *V    = NULL;
  int rs_v = 0;
  int cs_v = 0;

  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    /** More than one rank-KC update: allocate a temporary m-by-n
     *  column-major buffer for the semiring intermediate. */
    V = hmlp_malloc<ALIGN_SIZE, TV>( m * n );
    rs_v = 1;
    cs_v = m;
  }
  else
  {
    /** Otherwise, V aliases C directly. */
    V = reinterpret_cast<TV*>( C.X );
    rs_v = C.rs;
    cs_v = C.cs;
  }

  int k_stra = 0;
  if ( USE_STRASSEN )
  {
    assert( typeid(TPACKA) == typeid(TPACKB) );
    assert( typeid(TC) == typeid(TV) );
    k_stra = k - k % KC;
    /** Leave at least one rank-KC update for the fused kernel. */
    if ( k_stra == k ) k_stra -= KC;
  }

  /** Launch nbody_internal() on all threads. */
  tci::Parallelize( NULL, nbody_internal<MC, NC, KC, TPACKA, TPACKB, TV,
      TA, TB, TC, SEMIRINGKERNEL, MICROKERNEL>,
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
      semiringkernel, microkernel );

  /** Free the temporary buffer if one was allocated. */
  if ( k > KC && !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    hmlp_free( V );
  }
}; /** end nbody() */
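
///**
// *  Hedged usage sketch (illustration only, not library code). The contracts
// *  that nbody() imposes on its arguments can be read off the calls above:
// *
// *    - TA must provide A.Pack( m, ic, ib, k, pc, pb, packA );
// *    - TB must provide B.Pack( k, pc, pb, n, jc, jb, packB );
// *    - TC must expose members X, rs, and cs (V may alias C.X);
// *    - both kernels must define static mr, nr, pack_mr, pack_nr, and
// *      align_size, and be callable on packed panels.
// *
// *  Concrete wrappers and micro-kernels live in packing.hpp,
// *  semiring_mrxnr.hpp, and fused_mrxnr.hpp. With types satisfying those
// *  contracts (all names and blocking parameters below are hypothetical),
// *  a call would look like:
// *
// *    MyPackedMatrix<double> A( m, k ), B( k, n ), C( m, n );
// *    my_semiring_kernel  semiringkernel;
// *    my_fused_kernel     microkernel;
// *    nbody<104, 4096, 256, double, double, double>
// *    ( 0, m, n, k, A, B, C, semiringkernel, microkernel );
// *
// *  The commented-out generic interface below shows the intended way to
// *  build such kernels from user-provided operators (op1, op2, opkernel).
// */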
//template<
//  int MR, int NR, int MC, int NC, int KC,
//  typename TPACKA, typename TPACKB, typename TPACKC, typename TV,
//  typename TA, typename TB, typename TC,
//  typename OPKERNEL, typename OP1, typename OP2>
//void nbody
//(
//  int batchId, int m, int n, int k,
//  TA& A,
//  TB& B,
//  TC& C,
//  OPKERNEL opkernel, OP1 op1, OP2 op2, TV initV
//)
//{
//  semiring_mrxnr<MR, NR, OP1, OP2, TPACKA, TPACKB, TV, TV> semiringkernel;
//  gnbx_mrxnr<MR, NR, OPKERNEL, OP1, OP2, TPACKA, TPACKB, TC, TPACKC, TV> gkrmkernel;
//
//  semiringkernel.op1   = op1;
//  semiringkernel.op2   = op2;
//  semiringkernel.initV = initV;
//
//  gkrmkernel.op1      = op1;
//  gkrmkernel.op2      = op2;
//  gkrmkernel.opkernel = opkernel;
//  gkrmkernel.initV    = initV;
//
//  nbody<MC, NC, KC, TPACKA, TPACKB, TV>
//  ( batchId, m, n, k, A, B, C, semiringkernel, gkrmkernel );
//
//}; /** end nbody() */

}; /** end namespace nbody */
}; /** end namespace hmlp */

#endif /** define NBODY_HPP */