HMLP: High-performance Machine Learning Primitives
gofmm_mpi.hpp
22 #ifndef GOFMM_MPI_HPP
23 #define GOFMM_MPI_HPP
24 
26 #include <gofmm.hpp>
28 #include <tree_mpi.hpp>
29 #include <igofmm_mpi.hpp>
31 //#include <DistData.hpp>
33 using namespace std;
34 using namespace hmlp;
35 
36 
37 namespace hmlp
38 {
39 namespace mpigofmm
40 {
41 
42 
44 // * @brief This class does not have to inherit DistData, but it has to
45 // * inherit DistVirtualMatrix<T>.
46 // *
47 // */
48 //template<typename T>
49 //class DistSPDMatrix : public DistData<STAR, CBLK, T>
50 //{
51 // public:
52 //
53 // DistSPDMatrix( size_t m, size_t n, mpi::Comm comm ) :
54 // DistData<STAR, CBLK, T>( m, n, comm )
55 // {
56 // };
57 //
58 //
59 // /** ESSENTIAL: this is an abstract function */
60 // virtual T operator()( size_t i, size_t j, mpi::Comm comm )
61 // {
62 // T Kij = 0;
63 //
64 // /** MPI */
65 // int size, rank;
66 // hmlp::mpi::Comm_size( comm, &size );
67 // hmlp::mpi::Comm_rank( comm, &rank );
68 //
69 // std::vector<std::vector<size_t>> sendrids( size );
70 // std::vector<std::vector<size_t>> recvrids( size );
71 // std::vector<std::vector<size_t>> sendcids( size );
72 // std::vector<std::vector<size_t>> recvcids( size );
73 //
74 // /** request Kij from rank ( j % size ) */
75 // sendrids[ i % size ].push_back( i );
76 // sendcids[ j % size ].push_back( j );
77 //
78 // /** exchange ids */
79 // mpi::AlltoallVector( sendrids, recvrids, comm );
80 // mpi::AlltoallVector( sendcids, recvcids, comm );
81 //
82 // /** allocate buffer for data */
83 // std::vector<std::vector<T>> senddata( size );
84 // std::vector<std::vector<T>> recvdata( size );
85 //
86 // /** fetch subrows */
87 // for ( size_t p = 0; p < size; p ++ )
88 // {
89 // assert( recvrids[ p ].size() == recvcids[ p ].size() );
90 // for ( size_t j = 0; j < recvcids[ p ].size(); j ++ )
91 // {
92 // size_t rid = recvrids[ p ][ j ];
93 // size_t cid = recvcids[ p ][ j ];
94 // senddata[ p ].push_back( (*this)( rid, cid ) );
95 // }
96 // }
97 //
98 // /** exchange data */
99 // mpi::AlltoallVector( senddata, recvdata, comm );
100 //
101 // for ( size_t p = 0; p < size; p ++ )
102 // {
103 // assert( recvdata[ p ].size() <= 1 );
104 // if ( recvdata[ p ].size() ) Kij = recvdata[ p ][ 0 ];
105 // }
106 //
107 // return Kij;
108 // };
109 //
110 //
111 // /** ESSENTIAL: return a submatrix */
112 // virtual hmlp::Data<T> operator()
113 // ( std::vector<size_t> &imap, std::vector<size_t> &jmap, hmlp::mpi::Comm comm )
114 // {
115 // hmlp::Data<T> KIJ( imap.size(), jmap.size() );
116 //
117 // /** MPI */
118 // int size, rank;
119 // hmlp::mpi::Comm_size( comm, &size );
120 // hmlp::mpi::Comm_rank( comm, &rank );
121 //
122 //
123 //
124 // std::vector<std::vector<size_t>> jmapcids( size );
125 //
126 // std::vector<std::vector<size_t>> sendrids( size );
127 // std::vector<std::vector<size_t>> recvrids( size );
128 // std::vector<std::vector<size_t>> sendcids( size );
129 // std::vector<std::vector<size_t>> recvcids( size );
130 //
131 // /** request KIJ from rank ( j % size ) */
132 // for ( size_t j = 0; j < jmap.size(); j ++ )
133 // {
134 // size_t cid = jmap[ j ];
135 // sendcids[ cid % size ].push_back( cid );
136 // jmapcids[ cid % size ].push_back( j );
137 // }
138 //
139 // for ( size_t p = 0; p < size; p ++ )
140 // {
141 // if ( sendcids[ p ].size() ) sendrids[ p ] = imap;
142 // }
143 //
144 // /** exchange ids */
145 // mpi::AlltoallVector( sendrids, recvrids, comm );
146 // mpi::AlltoallVector( sendcids, recvcids, comm );
147 //
148 // /** allocate buffer for data */
149 // std::vector<hmlp::Data<T>> senddata( size );
150 // std::vector<hmlp::Data<T>> recvdata( size );
151 //
152 // /** fetch submatrix */
153 // for ( size_t p = 0; p < size; p ++ )
154 // {
155 // if ( recvcids[ p ].size() && recvrids[ p ].size() )
156 // {
157 // senddata[ p ] = (*this)( recvrids[ p ], recvcids[ p ] );
158 // }
159 // }
160 //
161 // /** exchange data */
162 // mpi::AlltoallVector( senddata, recvdata, comm );
163 //
164 // /** merging data */
165 // for ( size_t p = 0; p < size; p ++ )
166 // {
167 // assert( recvdata[ p ].size() == imap.size() * recvcids[ p ].size() );
168 // recvdata[ p ].resize( imap.size(), recvcids[ p ].size() );
169 // for ( size_t j = 0; j < recvcids[ p ].size(); j ++ )
170 // {
171 // for ( size_t i = 0; i < imap.size(); i ++ )
172 // {
173 // KIJ( i, jmapcids[ p ][ j ] ) = recvdata[ p ]( i, j );
174 // }
175 // }
176 // };
177 //
178 // return KIJ;
179 // };
180 //
181 //
182 //
183 //
184 //
185 // virtual hmlp::Data<T> operator()
186 // ( std::vector<int> &imap, std::vector<int> &jmap, hmlp::mpi::Comm comm )
187 // {
188 // printf( "operator() not implemented yet\n" );
189 // exit( 1 );
190 // };
191 //
192 //
193 //
194 // /** overload operator */
195 //
196 //
197 // private:
198 //
199 //}; /** end class DistSPDMatrix */
200 //
201 //
202 
203 
208 template<typename SPDMATRIX, typename SPLITTER, typename T>
209 class Setup : public mpitree::Setup<SPLITTER, T>,
210  public gofmm::Configuration<T>
211 {
212  public:
213 
215  void FromConfiguration( gofmm::Configuration<T> &config,
216  SPDMATRIX &K, SPLITTER &splitter,
217  DistData<STAR, CBLK, pair<T, size_t>>* NN_cblk )
218  {
219  this->CopyFrom( config );
220  this->K = &K;
221  this->splitter = splitter;
222  this->NN_cblk = NN_cblk;
223  };
224 
226  SPDMATRIX *K = NULL;
227 
229  Data<T> *w = NULL;
230  Data<T> *u = NULL;
231 
233  Data<T> *input = NULL;
234  Data<T> *output = NULL;
235 
237  T lambda = 0.0;
238 
240  //bool issymmetric = true;
241 
243  bool do_ulv_factorization = true;
244 
245 
246  private:
247 
248 };
258 template<typename NODE>
259 class DistTreeViewTask : public Task
260 {
261  public:
262 
263  NODE *arg = NULL;
264 
265  void Set( NODE *user_arg )
266  {
267  arg = user_arg;
268  name = string( "TreeView" );
269  label = to_string( arg->treelist_id );
270  cost = 1.0;
271  };
272 
274  void DependencyAnalysis() { arg->DependOnParent( this ); };
275 
276  void Execute( Worker* user_worker )
277  {
278  auto *node = arg;
279 
281  auto &w = *(node->setup->w);
282  auto &u = *(node->setup->u);
283 
285  auto &U = node->data.u_view;
286  auto &W = node->data.w_view;
287 
289  U.Set( u );
290  W.Set( w );
291 
293  if ( !node->isleaf && !node->child )
294  {
295  assert( node->lchild && node->rchild );
296  auto &UL = node->lchild->data.u_view;
297  auto &UR = node->rchild->data.u_view;
298  auto &WL = node->lchild->data.w_view;
299  auto &WR = node->rchild->data.w_view;
304  U.Partition2x1( UL,
305  UR, node->lchild->n, TOP );
306  W.Partition2x1( WL,
307  WR, node->lchild->n, TOP );
308  }
309  };
310 
311 };
322 template<typename T>
323 vector<vector<size_t>> DistMedianSplit( vector<T> &values, mpi::Comm comm )
324 {
325  int n = 0;
326  int num_points_owned = values.size();
328  mpi::Allreduce( &num_points_owned, &n, 1, MPI_SUM, comm );
329  T median = combinatorics::Select( n / 2, values, comm );
330 
331  vector<vector<size_t>> split( 2 );
332  vector<size_t> middle;
333 
334  if ( n == 0 ) return split;
335 
336  for ( size_t i = 0; i < values.size(); i ++ )
337  {
338  auto v = values[ i ];
339  if ( std::fabs( v - median ) < 1E-6 ) middle.push_back( i );
340  else if ( v < median ) split[ 0 ].push_back( i );
341  else split[ 1 ].push_back( i );
342  }
343 
344  int nmid = 0;
345  int nlhs = 0;
346  int nrhs = 0;
347  int num_mid_owned = middle.size();
348  int num_lhs_owned = split[ 0 ].size();
349  int num_rhs_owned = split[ 1 ].size();
350 
352  mpi::Allreduce( &num_mid_owned, &nmid, 1, MPI_SUM, comm );
353  mpi::Allreduce( &num_lhs_owned, &nlhs, 1, MPI_SUM, comm );
354  mpi::Allreduce( &num_rhs_owned, &nrhs, 1, MPI_SUM, comm );
355 
357  if ( nmid )
358  {
359  int nlhs_required, nrhs_required;
360 
361  if ( nlhs > nrhs )
362  {
363  nlhs_required = ( n - 1 ) / 2 + 1 - nlhs;
364  nrhs_required = nmid - nlhs_required;
365  }
366  else
367  {
368  nrhs_required = ( n - 1 ) / 2 + 1 - nrhs;
369  nlhs_required = nmid - nrhs_required;
370  }
371 
372  assert( nlhs_required >= 0 && nrhs_required >= 0 );
373 
375  double lhs_ratio = ( (double)nlhs_required ) / nmid;
376  int nlhs_required_owned = num_mid_owned * lhs_ratio;
377  int nrhs_required_owned = num_mid_owned - nlhs_required_owned;
378 
379  //printf( "rank %d [ %d %d ] [ %d %d ]\n",
380  // global_rank,
381  // nlhs_required_owned, nlhs_required,
382  // nrhs_required_owned, nrhs_required ); fflush( stdout );
383 
384  assert( nlhs_required_owned >= 0 && nrhs_required_owned >= 0 );
385 
386  for ( size_t i = 0; i < middle.size(); i ++ )
387  {
388  if ( i < nlhs_required_owned )
389  split[ 0 ].push_back( middle[ i ] );
390  else
391  split[ 1 ].push_back( middle[ i ] );
392  }
393  }
394 
395  return split;
396 };
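/**
 *  A minimal usage sketch for DistMedianSplit (the values and communicator
 *  below are illustrative, not from this file): every rank passes the values
 *  it owns locally, and the returned index sets partition those LOCAL
 *  indices according to the GLOBAL median.
 *
 *  @code
 *    std::vector<double> values = { 0.3, 1.7, 0.9 };        // locally owned
 *    auto split = DistMedianSplit( values, MPI_COMM_WORLD );
 *    // split[ 0 ]: local indices whose value falls below the global median
 *    // split[ 1 ]: local indices whose value falls above it; ties on the
 *    //             median are distributed across both sides to balance them.
 *  @endcode
 */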
407 template<typename SPDMATRIX, int N_SPLIT, typename T>
408 struct centersplit : public gofmm::centersplit<SPDMATRIX, N_SPLIT, T>
409 {
410 
412 
413  centersplit( SPDMATRIX& K ) : gofmm::centersplit<SPDMATRIX, N_SPLIT, T>( K ) {};
414 
416  inline vector<vector<size_t> > operator() ( vector<size_t>& gids ) const
417  {
418  return gofmm::centersplit<SPDMATRIX, N_SPLIT, T>::operator() ( gids );
419  };
420 
422  inline vector<vector<size_t> > operator() ( vector<size_t>& gids, mpi::Comm comm ) const
423  {
425  assert( N_SPLIT == 2 );
426  assert( this->Kptr );
427 
429  int size; mpi::Comm_size( comm, &size );
430  int rank; mpi::Comm_rank( comm, &rank );
431  auto &K = *(this->Kptr);
432 
434  vector<T> temp( gids.size(), 0.0 );
435 
437  auto column_samples = combinatorics::SampleWithoutReplacement(
438  this->n_centroid_samples, gids );
439 
441  mpi::Bcast( column_samples.data(), column_samples.size(), 0, comm );
442  K.BcastIndices( column_samples, 0, comm );
443 
445  auto DIC = K.Distances( this->metric, gids, column_samples );
446 
448  for ( auto & it : temp ) it = 0;
449 
451  for ( size_t j = 0; j < DIC.col(); j ++ )
452  for ( size_t i = 0; i < DIC.row(); i ++ )
453  temp[ i ] += DIC( i, j );
454 
456  auto idf2c = distance( temp.begin(), max_element( temp.begin(), temp.end() ) );
457 
459  mpi::NumberIntPair<T> local_max_pair, max_pair;
460  local_max_pair.val = temp[ idf2c ];
461  local_max_pair.key = rank;
462 
464  mpi::Allreduce( &local_max_pair, &max_pair, 1, MPI_MAXLOC, comm );
465 
467  int gidf2c = gids[ idf2c ];
468  mpi::Bcast( &gidf2c, 1, MPI_INT, max_pair.key, comm );
469 
470 
471  //printf( "rank %d val %E key %d; global val %E key %d\n",
472  // rank, local_max_pair.val, local_max_pair.key,
473  // max_pair.val, max_pair.key ); fflush( stdout );
474  //printf( "rank %d gidf2c %d\n", rank, gidf2c ); fflush( stdout );
475 
477  vector<size_t> P( 1, gidf2c );
478  K.BcastIndices( P, max_pair.key, comm );
479 
481  auto DIP = K.Distances( this->metric, gids, P );
482 
484  auto idf2f = distance( DIP.begin(), max_element( DIP.begin(), DIP.end() ) );
485 
487  local_max_pair.val = DIP[ idf2f ];
488  local_max_pair.key = rank;
489 
491  mpi::Allreduce( &local_max_pair, &max_pair, 1, MPI_MAXLOC, comm );
492 
494  int gidf2f = gids[ idf2f ];
495  mpi::Bcast( &gidf2f, 1, MPI_INT, max_pair.key, comm );
496 
497  //printf( "rank %d val %E key %d; global val %E key %d\n",
498  // rank, local_max_pair.val, local_max_pair.key,
499  // max_pair.val, max_pair.key ); fflush( stdout );
500  //printf( "rank %d gidf2f %d\n", rank, gidf2f ); fflush( stdout );
501 
503  vector<size_t> Q( 1, gidf2f );
504  K.BcastIndices( Q, max_pair.key, comm );
505 
507  auto DIQ = K.Distances( this->metric, gids, Q );
508 
510  for ( size_t i = 0; i < temp.size(); i ++ )
511  temp[ i ] = DIP[ i ] - DIQ[ i ];
512 
514  auto split = DistMedianSplit( temp, comm );
515 
517  mpi::Status status;
518  vector<size_t> sent_gids;
519  int partner = ( rank + size / 2 ) % size;
520  if ( rank < size / 2 )
521  {
522  for ( auto it : split[ 1 ] )
523  sent_gids.push_back( gids[ it ] );
524  K.SendIndices( sent_gids, partner, comm );
525  K.RecvIndices( partner, comm, &status );
526  }
527  else
528  {
529  for ( auto it : split[ 0 ] )
530  sent_gids.push_back( gids[ it ] );
531  K.RecvIndices( partner, comm, &status );
532  K.SendIndices( sent_gids, partner, comm );
533  }
534 
535  return split;
536  };
537 
538 
539 };
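/**
 *  A sketch of how this splitter is typically attached (MYMATRIX is a
 *  placeholder name, not from this file): the matrix type must expose the
 *  Distances(), BcastIndices(), SendIndices(), and RecvIndices() interface
 *  used above, i.e. the DistVirtualMatrix-style API.
 *
 *  @code
 *    MYMATRIX K( n, n, MPI_COMM_WORLD );
 *    mpigofmm::centersplit<MYMATRIX, 2, double> splitter( K );
 *    auto split = splitter( gids, MPI_COMM_WORLD );   // distributed split
 *  @endcode
 */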
545 template<typename SPDMATRIX, int N_SPLIT, typename T>
546 struct randomsplit : public gofmm::randomsplit<SPDMATRIX, N_SPLIT, T>
547 {
548 
550 
551  randomsplit( SPDMATRIX& K ) : gofmm::randomsplit<SPDMATRIX, N_SPLIT, T>( K ) {};
552 
554  inline vector<vector<size_t> > operator() ( vector<size_t>& gids ) const
555  {
556  return gofmm::randomsplit<SPDMATRIX, N_SPLIT, T>::operator() ( gids );
557  };
558 
560  inline vector<vector<size_t> > operator() ( vector<size_t>& gids, mpi::Comm comm ) const
561  {
563  assert( N_SPLIT == 2 );
564  assert( this->Kptr );
565 
567  int size, rank, global_rank, global_size;
568  mpi::Comm_size( comm, &size );
569  mpi::Comm_rank( comm, &rank );
570  mpi::Comm_rank( MPI_COMM_WORLD, &global_rank );
571  mpi::Comm_size( MPI_COMM_WORLD, &global_size );
572  SPDMATRIX &K = *(this->Kptr);
573  //vector<vector<size_t>> split( N_SPLIT );
574 
575  if ( size == global_size )
576  {
577  for ( size_t i = 0; i < gids.size(); i ++ )
578  assert( gids[ i ] == i * size + rank );
579  }
580 
581 
582 
583 
585  int n = 0;
586  int num_points_owned = gids.size();
587  vector<T> temp( gids.size(), 0.0 );
588 
590  mpi::Allreduce( &num_points_owned, &n, 1, MPI_INT, MPI_SUM, comm );
591 
593  //if ( n == 0 ) return split;
594 
596  size_t gidf2c = 0, gidf2f = 0;
597  if ( gids.size() )
598  {
599  gidf2c = gids[ std::rand() % gids.size() ];
600  gidf2f = gids[ std::rand() % gids.size() ];
601  }
602 
604  mpi::NumberIntPair<T> local_max_pair, max_pair;
605  local_max_pair.val = gids.size();
606  local_max_pair.key = rank;
607 
609  mpi::Allreduce( &local_max_pair, &max_pair, 1, MPI_MAXLOC, comm );
610 
612  mpi::Bcast( &gidf2c, 1, max_pair.key, comm );
613  vector<size_t> P( 1, gidf2c );
614  K.BcastIndices( P, max_pair.key, comm );
615 
617  if ( rank == max_pair.key ) local_max_pair.val = 0;
618 
620  mpi::Allreduce( &local_max_pair, &max_pair, 1, MPI_MAXLOC, comm );
621 
623  mpi::Bcast( &gidf2f, 1, max_pair.key, comm );
624  vector<size_t> Q( 1, gidf2f );
625  K.BcastIndices( Q, max_pair.key, comm );
626 
627 
628  auto DIP = K.Distances( this->metric, gids, P );
629  auto DIQ = K.Distances( this->metric, gids, Q );
630 
632  for ( size_t i = 0; i < temp.size(); i ++ )
633  temp[ i ] = DIP[ i ] - DIQ[ i ];
634 
636  auto split = DistMedianSplit( temp, comm );
637 
639  mpi::Status status;
640  vector<size_t> sent_gids;
641  int partner = ( rank + size / 2 ) % size;
642  if ( rank < size / 2 )
643  {
644  for ( auto it : split[ 1 ] )
645  sent_gids.push_back( gids[ it ] );
646  K.SendIndices( sent_gids, partner, comm );
647  K.RecvIndices( partner, comm, &status );
648  }
649  else
650  {
651  for ( auto it : split[ 0 ] )
652  sent_gids.push_back( gids[ it ] );
653  K.RecvIndices( partner, comm, &status );
654  K.SendIndices( sent_gids, partner, comm );
655  }
656 
657  return split;
658  };
659 
660 
661 };
688 template<typename NODE>
689 void DistUpdateWeights( NODE *node )
690 {
692  using T = typename NODE::T;
694  mpi::Status status;
695  auto comm = node->GetComm();
696  int size = node->GetCommSize();
697  int rank = node->GetCommRank();
698 
700  if ( !node->parent || !node->data.isskel ) return;
701 
702  if ( size < 2 )
703  {
705  gofmm::UpdateWeights( node );
706  }
707  else
708  {
710  auto &w = *node->setup->w;
711  size_t nrhs = w.col();
712 
714  auto &data = node->data;
715  auto &proj = data.proj;
716  auto &w_skel = data.w_skel;
717 
719  if ( rank == 0 )
720  {
721  size_t s = proj.row();
722  size_t sl = node->child->data.skels.size();
723  size_t sr = proj.col() - sl;
725  w_skel.resize( s, nrhs );
727  View<T> P( false, proj ), PL, PR;
728  View<T> W( false, w_skel ), WL( false, node->child->data.w_skel );
730  P.Partition1x2( PL, PR, sl, LEFT );
732  gemm::xgemm<GEMM_NB>( (T)1.0, PL, WL, (T)0.0, W );
733 
734  Data<T> w_skel_sib;
735  mpi::ExchangeVector( w_skel, size / 2, 0, w_skel_sib, size / 2, 0, comm, &status );
737  #pragma omp parallel for
738  for ( size_t i = 0; i < w_skel.size(); i ++ )
739  w_skel[ i ] += w_skel_sib[ i ];
740  }
741 
743  if ( rank == size / 2 )
744  {
745  size_t s = proj.row();
746  size_t sr = node->child->data.skels.size();
747  size_t sl = proj.col() - sr;
749  w_skel.resize( s, nrhs );
751  View<T> P( false, proj ), PL, PR;
752  View<T> W( false, w_skel ), WR( false, node->child->data.w_skel );
754  P.Partition1x2( PL, PR, sl, LEFT );
756  gemm::xgemm<GEMM_NB>( (T)1.0, PR, WR, (T)0.0, W );
757 
758 
759  Data<T> w_skel_sib;
760  mpi::ExchangeVector( w_skel, 0, 0, w_skel_sib, 0, 0, comm, &status );
761  w_skel.clear();
762  }
763  }
764 };
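/**
 *  The update computed above is, in matrix form (restating what the two
 *  ranks do cooperatively):
 *
 *  @f[
 *    \widetilde{w} = P \begin{bmatrix} \widetilde{w}_l \\ \widetilde{w}_r \end{bmatrix}
 *                  = P_l \widetilde{w}_l + P_r \widetilde{w}_r,
 *  @f]
 *
 *  where rank 0 computes the left partial product, rank size/2 computes the
 *  right one, and the two partials are exchanged and summed so that rank 0
 *  ends up holding the full skeleton weights of the distributed node.
 */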
772 template<typename NODE, typename T>
773 class DistUpdateWeightsTask : public Task
774 {
775  public:
776 
777  NODE *arg = NULL;
778 
779  void Set( NODE *user_arg )
780  {
781  arg = user_arg;
782  name = string( "DistN2S" );
783  label = to_string( arg->treelist_id );
784 
786  double flops = 0.0, mops = 0.0;
787  auto &gids = arg->gids;
788  auto &skels = arg->data.skels;
789  auto &w = *arg->setup->w;
790 
791  if ( !arg->child )
792  {
793  if ( arg->isleaf )
794  {
795  auto m = skels.size();
796  auto n = w.col();
797  auto k = gids.size();
798  flops = 2.0 * m * n * k;
799  mops = 2.0 * ( m * n + m * k + k * n );
800  }
801  else
802  {
803  auto &lskels = arg->lchild->data.skels;
804  auto &rskels = arg->rchild->data.skels;
805  auto m = skels.size();
806  auto n = w.col();
807  auto k = lskels.size() + rskels.size();
808  flops = 2.0 * m * n * k;
809  mops = 2.0 * ( m * n + m * k + k * n );
810  }
811  }
812  else
813  {
814  if ( arg->GetCommRank() == 0 )
815  {
816  auto &lskels = arg->child->data.skels;
817  auto m = skels.size();
818  auto n = w.col();
819  auto k = lskels.size();
820  flops = 2.0 * m * n * k;
821  mops = 2.0 * ( m * n + m * k + k * n );
822  }
823  if ( arg->GetCommRank() == arg->GetCommSize() / 2 )
824  {
825  auto &rskels = arg->child->data.skels;
826  auto m = skels.size();
827  auto n = w.col();
828  auto k = rskels.size();
829  flops = 2.0 * m * n * k;
830  mops = 2.0 * ( m * n + m * k + k * n );
831  }
832  }
833 
835  event.Set( label + name, flops, mops );
837  cost = flops / 1E+9;
839  priority = true;
840  };
841 
842  void DependencyAnalysis() { arg->DependOnChildren( this ); };
843 
844  void Execute( Worker* user_worker ) { DistUpdateWeights( arg ); };
845 
846 };
854 //template<bool NNPRUNE, typename NODE, typename T>
855 //class DistSkeletonsToSkeletonsTask : public Task
856 //{
857 // public:
858 //
859 // NODE *arg = NULL;
860 //
861 // void Set( NODE *user_arg )
862 // {
863 // arg = user_arg;
864 // name = string( "DistS2S" );
865 // label = to_string( arg->treelist_id );
866 // /** compute flops and mops */
867 // double flops = 0.0, mops = 0.0;
868 // auto &w = *arg->setup->w;
869 // size_t m = arg->data.skels.size();
870 // size_t n = w.col();
871 //
872 // auto *FarNodes = &arg->FarNodes;
873 // if ( NNPRUNE ) FarNodes = &arg->NNFarNodes;
874 //
875 // for ( auto it = FarNodes->begin(); it != FarNodes->end(); it ++ )
876 // {
877 // size_t k = (*it)->data.skels.size();
878 // flops += 2.0 * m * n * k;
879 // mops += m * k; // cost of Kab
880 // mops += 2.0 * ( m * n + n * k + k * n );
881 // }
882 //
883 // /** setup the event */
884 // event.Set( label + name, flops, mops );
885 //
886 // /** assume computation bound */
887 // cost = flops / 1E+9;
888 //
889 // /** "LOW" priority */
890 // priority = false;
891 // };
892 //
893 //
894 //
895 // void DependencyAnalysis()
896 // {
897 // for ( auto p : arg->data.FarDependents )
898 // hmlp_msg_dependency_analysis( 306, p, R, this );
899 //
900 // auto *FarNodes = &arg->FarNodes;
901 // if ( NNPRUNE ) FarNodes = &arg->NNFarNodes;
902 // for ( auto it : *FarNodes ) it->DependencyAnalysis( R, this );
903 //
904 // arg->DependencyAnalysis( RW, this );
905 // this->TryEnqueue();
906 // };
907 //
908 // /**
909 // * @brief Notice that S2S depends on all Far interactions, which
910 // * may include local tree nodes or let nodes.
911 // * For HSS case, the only Far interaction is the sibling.
912 // * Skeleton weight of the sibling will always be exchanged
913 // * by default in N2S. Thus, currently we do not need
914 // * a distributed S2S, because the skeleton weight is already
915 // * in place.
916 // *
917 // */
918 // void Execute( Worker* user_worker )
919 // {
920 // auto *node = arg;
921 // /** MPI Support. */
922 // auto comm = node->GetComm();
923 // auto size = node->GetCommSize();
924 // auto rank = node->GetCommRank();
925 //
926 // if ( size < 2 )
927 // {
928 // gofmm::SkeletonsToSkeletons<NNPRUNE, NODE, T>( node );
929 // }
930 // else
931 // {
932 // /** Only 0th rank (owner) will execute this task. */
933 // if ( rank == 0 ) gofmm::SkeletonsToSkeletons<NNPRUNE, NODE, T>( node );
934 // }
935 // };
936 //
937 //}; /** end class DistSkeletonsToSkeletonsTask */
938 //
939 
940 template<typename NODE, typename LETNODE, typename T>
941 class S2STask2 : public Task
942 {
943  public:
944 
945  NODE *arg = NULL;
946 
947  vector<LETNODE*> Sources;
948 
949  int p = 0;
950 
951  Lock *lock = NULL;
952 
953  int *num_arrived_subtasks = NULL;
954 
955  void Set( NODE *user_arg, vector<LETNODE*> user_src, int user_p, Lock *user_lock,
956  int *user_num_arrived_subtasks )
957  {
958  arg = user_arg;
959  Sources = user_src;
960  p = user_p;
961  lock = user_lock;
962  num_arrived_subtasks = user_num_arrived_subtasks;
963  name = string( "S2S" );
964  label = to_string( arg->treelist_id );
965 
967  double flops = 0.0, mops = 0.0;
968  size_t nrhs = arg->setup->w->col();
969  size_t m = arg->data.skels.size();
970  for ( auto src : Sources )
971  {
972  size_t k = src->data.skels.size();
973  flops += 2 * m * k * nrhs;
974  mops += 2 * ( m * k + ( m + k ) * nrhs );
975  flops += 2 * m * nrhs;
976  flops += m * k * ( 2 * 18 + 100 );
977  }
979  event.Set( label + name, flops, mops );
981  cost = flops / 1E+9;
983  if ( arg->treelist_id == 0 ) priority = true;
984  };
985 
986  void DependencyAnalysis()
987  {
988  if ( p == hmlp_get_mpi_rank() )
989  {
990  for ( auto src : Sources ) src->DependencyAnalysis( R, this );
991  }
992  else hmlp_msg_dependency_analysis( 306, p, R, this );
993  this->TryEnqueue();
994  };
995 
996  void Execute( Worker* user_worker )
997  {
998  auto *node = arg;
999  if ( !node->parent || !node->data.isskel ) return;
1000  size_t nrhs = node->setup->w->col();
1001  auto &K = *node->setup->K;
1002  auto &I = node->data.skels;
1003 
1005  Data<T> u( I.size(), nrhs, 0.0 );
1006 
1007  for ( auto src : Sources )
1008  {
1009  auto &J = src->data.skels;
1010  auto &w = src->data.w_skel;
1011  bool is_cached = true;
1012 
1013  auto &KIJ = node->DistFar[ p ][ src->morton ];
1014  if ( KIJ.row() != I.size() || KIJ.col() != J.size() )
1015  {
1016  //printf( "KIJ %lu %lu I %lu J %lu\n", KIJ.row(), KIJ.col(), I.size(), J.size() );
1017  KIJ = K( I, J );
1018  is_cached = false;
1019  }
1020 
1021  assert( w.col() == nrhs );
1022  assert( w.row() == J.size() );
1023  //xgemm
1024  //(
1025  // "N", "N", u.row(), u.col(), w.row(),
1026  // 1.0, KIJ.data(), KIJ.row(),
1027  // w.data(), w.row(),
1028  // 1.0, u.data(), u.row()
1029  //);
1030  gemm::xgemm( (T)1.0, KIJ, w, (T)1.0, u );
1031 
1033  if ( !is_cached )
1034  {
1035  KIJ.resize( 0, 0 );
1036  KIJ.shrink_to_fit();
1037  }
1038  }
1039 
1040  lock->Acquire();
1041  {
1042  auto &u_skel = node->data.u_skel;
1043  for ( int i = 0; i < u.size(); i ++ )
1044  u_skel[ i ] += u[ i ];
1045  }
1046  lock->Release();
1047  #pragma omp atomic update
1048  *num_arrived_subtasks += 1;
1049  };
1050 };
1051 
1052 template<typename NODE, typename LETNODE, typename T>
1053 class S2SReduceTask2 : public Task
1054 {
1055  public:
1056 
1057  NODE *arg = NULL;
1058 
1059  vector<S2STask2<NODE, LETNODE, T>*> subtasks;
1060 
1061  Lock lock;
1062 
1063  int num_arrived_subtasks = 0;
1064 
1065  const size_t batch_size = 2;
1066 
1067  void Set( NODE *user_arg )
1068  {
1069  arg = user_arg;
1070  name = string( "S2SR" );
1071  label = to_string( arg->treelist_id );
1072 
1074  if ( arg )
1075  {
1076  size_t nrhs = arg->setup->w->col();
1077  auto &I = arg->data.skels;
1078  arg->data.u_skel.resize( 0, 0 );
1079  arg->data.u_skel.resize( I.size(), nrhs, 0 );
1080  }
1081 
1083  for ( int p = 0; p < hmlp_get_mpi_size(); p ++ )
1084  {
1085  vector<LETNODE*> Sources;
1086  for ( auto &it : arg->DistFar[ p ] )
1087  {
1088  Sources.push_back( (*arg->morton2node)[ it.first ] );
1089  if ( Sources.size() == batch_size )
1090  {
1091  subtasks.push_back( new S2STask2<NODE, LETNODE, T>() );
1092  subtasks.back()->Submit();
1093  subtasks.back()->Set( user_arg, Sources, p, &lock, &num_arrived_subtasks );
1094  subtasks.back()->DependencyAnalysis();
1095  Sources.clear();
1096  }
1097  }
1098  if ( Sources.size() )
1099  {
1100  subtasks.push_back( new S2STask2<NODE, LETNODE, T>() );
1101  subtasks.back()->Submit();
1102  subtasks.back()->Set( user_arg, Sources, p, &lock, &num_arrived_subtasks );
1103  subtasks.back()->DependencyAnalysis();
1104  Sources.clear();
1105  }
1106  }
1108  double flops = 0, mops = 0;
1110  event.Set( label + name, flops, mops );
1112  priority = true;
1113  };
1114 
1115  void DependencyAnalysis()
1116  {
1117  for ( auto task : subtasks ) Scheduler::DependencyAdd( task, this );
1118  arg->DependencyAnalysis( RW, this );
1119  this->TryEnqueue();
1120  };
1121 
1122  void Execute( Worker* user_worker )
1123  {
1125  assert( num_arrived_subtasks == subtasks.size() );
1126  };
1127 };
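/**
 *  Design note: S2SReduceTask2 performs no GEMM itself.  In Set() it groups
 *  the far interactions owned by each rank into batches of batch_size,
 *  spawns one S2STask2 per batch, and in Execute() only asserts that every
 *  subtask has checked in.  Batching keeps the scheduler from drowning in
 *  tiny tasks, while the shared lock still serializes the accumulation of
 *  each batch's partial result into u_skel.
 */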
1128 
1129 
1130 
1131 
1132 
1133 
1134 
1135 
1136 
1137 
1138 
1139 
1140 
1141 
1142 
1143 
1144 
1145 
1146 
1147 
1148 template<bool NNPRUNE, typename NODE, typename T>
1149 void DistSkeletonsToNodes( NODE *node )
1150 {
1152  auto comm = node->GetComm();
1153  auto size = node->GetCommSize();
1154  auto rank = node->GetCommRank();
1155  mpi::Status status;
1156 
1158  auto &K = *node->setup->K;
1159  auto &w = *node->setup->w;
1160 
1161 
1162  size_t nrhs = w.col();
1163 
1164 
1166  if ( !node->parent || !node->data.isskel ) return;
1167 
1168  if ( size < 2 )
1169  {
1171  gofmm::SkeletonsToNodes( node );
1172  }
1173  else
1174  {
1175  auto &data = node->data;
1176  auto &proj = data.proj;
1177  auto &u_skel = data.u_skel;
1178 
1179  if ( rank == 0 )
1180  {
1181  size_t sl = node->child->data.skels.size();
1182  size_t sr = proj.col() - sl;
1184  mpi::SendVector( u_skel, size / 2, 0, comm );
1186  View<T> P( true, proj ), PL, PR;
1187  View<T> U( false, u_skel ), UL( false, node->child->data.u_skel );
1189  P.Partition2x1( PL,
1190  PR, sl, TOP );
1192  gemm::xgemm<GEMM_NB>( (T)1.0, PL, U, (T)1.0, UL );
1193  }
1194 
1196  if ( rank == size / 2 )
1197  {
1198  size_t s = proj.row();
1199  size_t sr = node->child->data.skels.size();
1200  size_t sl = proj.col() - sr;
1202  mpi::RecvVector( u_skel, 0, 0, comm, &status );
1203  u_skel.resize( s, nrhs );
1205  View<T> P( true, proj ), PL, PR;
1206  View<T> U( false, u_skel ), UR( false, node->child->data.u_skel );
1208  P.Partition2x1( PL,
1209  PR, sl, TOP );
1211  gemm::xgemm<GEMM_NB>( (T)1.0, PR, U, (T)1.0, UR );
1212  }
1213  }
1214 };
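/**
 *  This is the transpose of DistUpdateWeights(): the skeleton potentials of
 *  the distributed node are scattered back to its two children as
 *
 *  @f[
 *    \widetilde{u}_l \mathrel{+}= P_l^{T} \widetilde{u}, \qquad
 *    \widetilde{u}_r \mathrel{+}= P_r^{T} \widetilde{u},
 *  @f]
 *
 *  with rank 0 applying the left block and rank size/2, after receiving
 *  u_skel from rank 0, applying the right block.
 */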
1220 template<bool NNPRUNE, typename NODE, typename T>
1221 class DistSkeletonsToNodesTask : public Task
1222 {
1223  public:
1224 
1225  NODE *arg;
1226 
1227  void Set( NODE *user_arg )
1228  {
1229  arg = user_arg;
1230  name = string( "PS2N" );
1231  label = to_string( arg->l );
1232 
1233  double flops = 0.0, mops = 0.0;
1234  auto &gids = arg->gids;
1235  auto &skels = arg->data.skels;
1236  auto &w = *arg->setup->w;
1237 
1238  if ( !arg->child )
1239  {
1240  if ( arg->isleaf )
1241  {
1242  auto m = skels.size();
1243  auto n = w.col();
1244  auto k = gids.size();
1245  flops = 2.0 * m * n * k;
1246  mops = 2.0 * ( m * n + m * k + k * n );
1247  }
1248  else
1249  {
1250  auto &lskels = arg->lchild->data.skels;
1251  auto &rskels = arg->rchild->data.skels;
1252  auto m = skels.size();
1253  auto n = w.col();
1254  auto k = lskels.size() + rskels.size();
1255  flops = 2.0 * m * n * k;
1256  mops = 2.0 * ( m * n + m * k + k * n );
1257  }
1258  }
1259  else
1260  {
1261  if ( arg->GetCommRank() == 0 )
1262  {
1263  auto &lskels = arg->child->data.skels;
1264  auto m = skels.size();
1265  auto n = w.col();
1266  auto k = lskels.size();
1267  flops = 2.0 * m * n * k;
1268  mops = 2.0 * ( m * n + m * k + k * n );
1269  }
1270  if ( arg->GetCommRank() == arg->GetCommSize() / 2 )
1271  {
1272  auto &rskels = arg->child->data.skels;
1273  auto m = skels.size();
1274  auto n = w.col();
1275  auto k = rskels.size();
1276  flops = 2.0 * m * n * k;
1277  mops = 2.0 * ( m * n + m * k + k * n );
1278  }
1279  }
1280 
1282  event.Set( label + name, flops, mops );
1284  cost = flops / 1E+9;
1286  priority = true;
1287  };
1288 
1289  void DependencyAnalysis() { arg->DependOnParent( this ); };
1290 
1291  void Execute( Worker* user_worker ) { DistSkeletonsToNodes<NNPRUNE, NODE, T>( arg ); };
1292 
1293 };
1297 template<typename NODE, typename T>
1298 class L2LTask2 : public Task
1299 {
1300  public:
1301 
1302  NODE *arg = NULL;
1303 
1305  vector<NODE*> Sources;
1306 
1307  int p = 0;
1308 
1310  Lock *lock = NULL;
1311 
1312  int *num_arrived_subtasks = NULL;
1313 
1314  void Set( NODE *user_arg, vector<NODE*> user_src, int user_p, Lock *user_lock,
1315  int* user_num_arrived_subtasks )
1316  {
1317  arg = user_arg;
1318  Sources = user_src;
1319  p = user_p;
1320  lock = user_lock;
1321  num_arrived_subtasks = user_num_arrived_subtasks;
1322  name = string( "L2L" );
1323  label = to_string( arg->treelist_id );
1324 
1326  double flops = 0.0, mops = 0.0;
1327  size_t nrhs = arg->setup->w->col();
1328  size_t m = arg->gids.size();
1329  for ( auto src : Sources )
1330  {
1331  size_t k = src->gids.size();
1332  flops += 2 * m * k * nrhs;
1333  mops += 2 * ( m * k + ( m + k ) * nrhs );
1334  flops += 2 * m * nrhs;
1335  flops += m * k * ( 2 * 18 + 100 );
1336  }
1338  event.Set( label + name, flops, mops );
1340  cost = flops / 1E+9;
1342  priority = false;
1343  };
1344 
1345  void DependencyAnalysis()
1346  {
1348  if ( p != hmlp_get_mpi_rank() )
1349  hmlp_msg_dependency_analysis( 300, p, R, this );
1350  this->TryEnqueue();
1351  };
1352 
1353  void Execute( Worker* user_worker )
1354  {
1355  auto *node = arg;
1356  size_t nrhs = node->setup->w->col();
1357  auto &K = *node->setup->K;
1358  auto &I = node->gids;
1359 
1360  double beg = omp_get_wtime();
1362  Data<T> u( I.size(), nrhs, 0.0 );
1363  size_t k = 0;
1364 
1365  for ( auto src : Sources )
1366  {
1368  View<T> &W = src->data.w_view;
1369  Data<T> &w = src->data.w_leaf;
1370 
1371  bool is_cached = true;
1372  auto &J = src->gids;
1373  auto &KIJ = node->DistNear[ p ][ src->morton ];
1374  if ( KIJ.row() != I.size() || KIJ.col() != J.size() )
1375  {
1376  KIJ = K( I, J );
1377  is_cached = false;
1378  }
1379 
1380  if ( W.col() == nrhs && W.row() == J.size() )
1381  {
1382  k += W.row();
1383  xgemm
1384  (
1385  "N", "N", u.row(), u.col(), W.row(),
1386  1.0, KIJ.data(), KIJ.row(),
1387  W.data(), W.ld(),
1388  1.0, u.data(), u.row()
1389  );
1390  }
1391  else
1392  {
1393  k += w.row();
1394  xgemm
1395  (
1396  "N", "N", u.row(), u.col(), w.row(),
1397  1.0, KIJ.data(), KIJ.row(),
1398  w.data(), w.row(),
1399  1.0, u.data(), u.row()
1400  );
1401  }
1402 
1404  if ( !is_cached )
1405  {
1406  KIJ.resize( 0, 0 );
1407  KIJ.shrink_to_fit();
1408  }
1409  }
1410 
1411  double lock_beg = omp_get_wtime();
1412  lock->Acquire();
1413  {
1415  View<T> &U = node->data.u_view;
1416  for ( int j = 0; j < u.col(); j ++ )
1417  for ( int i = 0; i < u.row(); i ++ )
1418  U( i, j ) += u( i, j );
1419  }
1420  lock->Release();
1421  double lock_time = omp_get_wtime() - lock_beg;
1422 
1423  double gemm_time = omp_get_wtime() - beg;
1424  double GFLOPS = 2.0 * u.row() * u.col() * k / ( 1E+9 * gemm_time );
1425  //printf( "GEMM %4lu %4lu %4lu %lf GFLOPS, lock(%lf/%lf)\n",
1426  // u.row(), u.col(), k, GFLOPS, lock_time, gemm_time ); fflush( stdout );
1427  #pragma omp atomic update
1428  *num_arrived_subtasks += 1;
1429  };
1430 };
1431 
1432 
1433 
1434 
1435 template<typename NODE, typename T>
1436 class L2LReduceTask2 : public Task
1437 {
1438  public:
1439 
1440  NODE *arg = NULL;
1441 
1442  vector<L2LTask2<NODE, T>*> subtasks;
1443 
1444  Lock lock;
1445 
1446  int num_arrived_subtasks = 0;
1447 
1448  const size_t batch_size = 2;
1449 
1450  void Set( NODE *user_arg )
1451  {
1452  arg = user_arg;
1453  name = string( "L2LR" );
1454  label = to_string( arg->treelist_id );
1456  for ( int p = 0; p < hmlp_get_mpi_size(); p ++ )
1457  {
1458  vector<NODE*> Sources;
1459  for ( auto &it : arg->DistNear[ p ] )
1460  {
1461  Sources.push_back( (*arg->morton2node)[ it.first ] );
1462  if ( Sources.size() == batch_size )
1463  {
1464  subtasks.push_back( new L2LTask2<NODE, T>() );
1465  subtasks.back()->Submit();
1466  subtasks.back()->Set( user_arg, Sources, p, &lock, &num_arrived_subtasks );
1467  subtasks.back()->DependencyAnalysis();
1468  Sources.clear();
1469  }
1470  }
1471  if ( Sources.size() )
1472  {
1473  subtasks.push_back( new L2LTask2<NODE, T>() );
1474  subtasks.back()->Submit();
1475  subtasks.back()->Set( user_arg, Sources, p, &lock, &num_arrived_subtasks );
1476  subtasks.back()->DependencyAnalysis();
1477  Sources.clear();
1478  }
1479  }
1480 
1481 
1482 
1483 
1485  double flops = 0, mops = 0;
1487  event.Set( label + name, flops, mops );
1489  priority = false;
1490  };
1491 
1492  void DependencyAnalysis()
1493  {
1494  for ( auto task : subtasks ) Scheduler::DependencyAdd( task, this );
1495  arg->DependencyAnalysis( RW, this );
1496  this->TryEnqueue();
1497  };
1498 
1499  void Execute( Worker* user_worker )
1500  {
1501  assert( num_arrived_subtasks == subtasks.size() );
1502  };
1503 };
1504 
1505 
1506 
1507 
1508 
1509 
1510 
1511 
1512 
1513 
1514 
1515 
1516 
1517 
1518 
1519 
1520 
1537 template<typename TREE>
1538 void FindNearInteractions( TREE &tree )
1539 {
1540  mpi::PrintProgress( "[BEG] FindNearInteractions ...", tree.GetComm() );
1542  using NODE = typename TREE::NODE;
1543  auto &setup = tree.setup;
1544  auto &NN = *setup.NN;
1545  double budget = setup.Budget();
1546  size_t n_leafs = ( 1 << tree.depth );
1555  auto level_beg = tree.treelist.begin() + n_leafs - 1;
1556 
1558  #pragma omp parallel for
1559  for ( size_t node_ind = 0; node_ind < n_leafs; node_ind ++ )
1560  {
1561  auto *node = *(level_beg + node_ind);
1562  auto &data = node->data;
1563  size_t n_nodes = ( 1 << node->l );
1564 
1566  node->NNNearNodes.insert( node );
1567  node->NNNearNodeMortonIDs.insert( node->morton );
1568 
1570  multimap<size_t, size_t> sorted_ballot = gofmm::NearNodeBallots( node );
1571 
1573  for ( auto it = sorted_ballot.rbegin();
1574  it != sorted_ballot.rend(); it ++ )
1575  {
1577  if ( node->NNNearNodes.size() >= n_nodes * budget ) break;
1578 
1585  #pragma omp critical
1586  {
1587  if ( !(*node->morton2node).count( (*it).second ) )
1588  {
1590  (*node->morton2node)[ (*it).second ] = new NODE( (*it).second );
1591  }
1593  auto *target = (*node->morton2node)[ (*it).second ];
1594  node->NNNearNodeMortonIDs.insert( (*it).second );
1595  node->NNNearNodes.insert( target );
1596  }
1597  }
1598  }
1599  mpi::PrintProgress( "[END] Finish FindNearInteractions ...", tree.GetComm() );
1600 };
1605 template<typename NODE>
1606 void FindFarNodes( MortonHelper::Recursor r, NODE *target )
1607 {
1609  if ( r.second > target->l ) return;
1611  size_t node_morton = MortonHelper::MortonID( r );
1612 
1613  //bool prunable = true;
1614  auto & NearMortonIDs = target->NNNearNodeMortonIDs;
1615 
1617  if ( MortonHelper::ContainAny( node_morton, NearMortonIDs ) )
1618  {
1619  FindFarNodes( MortonHelper::RecurLeft( r ), target );
1620  FindFarNodes( MortonHelper::RecurRight( r ), target );
1621  }
1622  else
1623  {
1624  if ( node_morton >= target->morton )
1625  target->NNFarNodeMortonIDs.insert( node_morton );
1626  }
1627 };
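/**
 *  A minimal sketch of how the recursion is usually started (assuming the
 *  root recursor is the pair { morton = 0, depth = 0 }, which matches how
 *  MortonHelper::Recursor is consumed above):
 *
 *  @code
 *    for ( auto *leaf : leaf_nodes )
 *      FindFarNodes( MortonHelper::Recursor( 0, 0 ), leaf );
 *  @endcode
 *
 *  A candidate node whose Morton ID contains none of the target's near
 *  interactions is recorded as a far node; otherwise the recursion descends
 *  into both of its children.
 */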
1634 template<typename TREE>
1635 void SymmetrizeNearInteractions( TREE & tree )
1636 {
1637  mpi::PrintProgress( "[BEG] SymmetrizeNearInteractions ...", tree.GetComm() );
1638 
1640  using NODE = typename TREE::NODE;
1642  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
1643  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
1644 
1645  vector<vector<pair<size_t, size_t>>> sendlist( comm_size );
1646  vector<vector<pair<size_t, size_t>>> recvlist( comm_size );
1647 
1648 
1655  int n_nodes = 1 << tree.depth;
1656  auto level_beg = tree.treelist.begin() + n_nodes - 1;
1657 
1658  #pragma omp parallel
1659  {
1661  vector<vector<pair<size_t, size_t>>> list( comm_size );
1662 
1663  #pragma omp for
1664  for ( int node_ind = 0; node_ind < n_nodes; node_ind ++ )
1665  {
1666  auto *node = *(level_beg + node_ind);
1667  //auto & NearMortonIDs = node->NNNearNodeMortonIDs;
1668  for ( auto it : node->NNNearNodeMortonIDs )
1669  {
1670  int dest = tree.Morton2Rank( it );
1671  if ( dest >= comm_size ) printf( "%8lu dest %d\n", it, dest );
1672  list[ dest ].push_back( make_pair( it, node->morton ) );
1673  }
1674  }
1676  #pragma omp critical
1677  {
1678  for ( int p = 0; p < comm_size; p ++ )
1679  {
1680  sendlist[ p ].insert( sendlist[ p ].end(),
1681  list[ p ].begin(), list[ p ].end() );
1682  }
1683  }
1684  };
1688  mpi::AlltoallVector( sendlist, recvlist, tree.GetComm() );
1689 
1690 
1692  for ( int p = 0; p < comm_size; p ++ )
1693  {
1694  for ( auto & query : recvlist[ p ] )
1695  {
1697  #pragma omp critical
1698  {
1699  auto* node = tree.morton2node[ query.first ];
1700  if ( !tree.morton2node.count( query.second ) )
1701  {
1702  tree.morton2node[ query.second ] = new NODE( query.second );
1703  }
1704  node->data.lock.Acquire();
1705  {
1706  node->NNNearNodes.insert( tree.morton2node[ query.second ] );
1707  node->NNNearNodeMortonIDs.insert( query.second );
1708  }
1709  node->data.lock.Release();
1710  }
1711  };
1712  }
1713  mpi::Barrier( tree.GetComm() );
1714  mpi::PrintProgress( "[END] SymmetrizeNearInteractions ...", tree.GetComm() );
1715 };
1718 template<typename TREE>
1719 void SymmetrizeFarInteractions( TREE & tree )
1720 {
1721  mpi::PrintProgress( "[BEG] SymmetrizeFarInteractions ...", tree.GetComm() );
1722 
1724  using NODE = typename TREE::NODE;
1726  //int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
1727  //int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
1728 
1729  vector<vector<pair<size_t, size_t>>> sendlist( tree.GetCommSize() );
1730  vector<vector<pair<size_t, size_t>>> recvlist( tree.GetCommSize() );
1731 
1733  #pragma omp parallel
1734  {
1736  vector<vector<pair<size_t, size_t>>> list( tree.GetCommSize() );
1737 
1738  #pragma omp for
1739  for ( size_t i = 1; i < tree.treelist.size(); i ++ )
1740  {
1741  auto *node = tree.treelist[ i ];
1742  for ( auto it = node->NNFarNodeMortonIDs.begin();
1743  it != node->NNFarNodeMortonIDs.end(); it ++ )
1744  {
1746  #pragma omp critical
1747  {
1748  if ( !tree.morton2node.count( *it ) )
1749  {
1750  tree.morton2node[ *it ] = new NODE( *it );
1751  }
1752  node->NNFarNodes.insert( tree.morton2node[ *it ] );
1753  }
1754  int dest = tree.Morton2Rank( *it );
1755  if ( dest >= tree.GetCommSize() ) printf( "%8lu dest %d\n", *it, dest );
1756  list[ dest ].push_back( make_pair( *it, node->morton ) );
1757  }
1758  }
1759 
1760  #pragma omp critical
1761  {
1762  for ( int p = 0; p < tree.GetCommSize(); p ++ )
1763  {
1764  sendlist[ p ].insert( sendlist[ p ].end(),
1765  list[ p ].begin(), list[ p ].end() );
1766  }
1767  }
1768  }
1769 
1770 
1772  #pragma omp parallel
1773  {
1775  vector<vector<pair<size_t, size_t>>> list( tree.GetCommSize() );
1776 
1777  #pragma omp for
1778  for ( size_t i = 0; i < tree.mpitreelists.size(); i ++ )
1779  {
1780  auto *node = tree.mpitreelists[ i ];
1781  for ( auto it = node->NNFarNodeMortonIDs.begin();
1782  it != node->NNFarNodeMortonIDs.end(); it ++ )
1783  {
1785  #pragma omp critical
1786  {
1787  if ( !tree.morton2node.count( *it ) )
1788  {
1789  tree.morton2node[ *it ] = new NODE( *it );
1790  }
1791  node->NNFarNodes.insert( tree.morton2node[ *it ] );
1792  }
1793  int dest = tree.Morton2Rank( *it );
1794  if ( dest >= tree.GetCommSize() ) printf( "%8lu dest %d\n", *it, dest ); fflush( stdout );
1795  list[ dest ].push_back( make_pair( *it, node->morton ) );
1796  }
1797  }
1798 
1799  #pragma omp critical
1800  {
1801  for ( int p = 0; p < tree.GetCommSize(); p ++ )
1802  {
1803  sendlist[ p ].insert( sendlist[ p ].end(),
1804  list[ p ].begin(), list[ p ].end() );
1805  }
1806  }
1807  }
1808 
1810  mpi::AlltoallVector( sendlist, recvlist, tree.GetComm() );
1811 
1813  for ( int p = 0; p < tree.GetCommSize(); p ++ )
1814  {
1815  //#pragma omp parallel for
1816  for ( auto & query : recvlist[ p ] )
1817  {
1819  #pragma omp critical
1820  {
1821  if ( !tree.morton2node.count( query.second ) )
1822  {
1823  tree.morton2node[ query.second ] = new NODE( query.second );
1824  //printf( "rank %d, %8lu level %lu creates far LET %8lu (symmetrize)\n",
1825  // comm_rank, node->morton, node->l, query.second );
1826  }
1827  auto* node = tree.morton2node[ query.first ];
1828  node->data.lock.Acquire();
1829  {
1830  node->NNFarNodes.insert( tree.morton2node[ query.second ] );
1831  node->NNFarNodeMortonIDs.insert( query.second );
1832  }
1833  node->data.lock.Release();
1834  assert( tree.Morton2Rank( node->morton ) == tree.GetCommRank() );
1835  }
1836  }
1837  }
1838 
1839  mpi::Barrier( tree.GetComm() );
1840  mpi::PrintProgress( "[END] SymmetrizeFarInteractions ...", tree.GetComm() );
1841 };
1860 template<typename TREE>
1861 void BuildInteractionListPerRank( TREE &tree, bool is_near )
1862 {
1864  using T = typename TREE::T;
1866  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
1867  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
1868 
1870  vector<set<size_t>> lists( comm_size );
1871 
1872  if ( is_near )
1873  {
1875  int n_nodes = 1 << tree.depth;
1876  auto level_beg = tree.treelist.begin() + n_nodes - 1;
1877 
1878  #pragma omp parallel
1879  {
1881  vector<set<size_t>> list( comm_size );
1882 
1883  #pragma omp for
1884  for ( int node_ind = 0; node_ind < n_nodes; node_ind ++ )
1885  {
1886  auto *node = *(level_beg + node_ind);
1887  auto & NearMortonIDs = node->NNNearNodeMortonIDs;
1888  node->DistNear.resize( comm_size );
1889  for ( auto it : NearMortonIDs )
1890  {
1891  int dest = tree.Morton2Rank( it );
1892  if ( dest >= comm_size ) printf( "%8lu dest %d\n", it, dest );
1893  if ( dest != comm_rank ) list[ dest ].insert( node->morton );
1894  node->DistNear[ dest ][ it ] = Data<T>();
1895  }
1896  }
1898  #pragma omp critical
1899  {
1900  for ( int p = 0; p < comm_size; p ++ )
1901  lists[ p ].insert( list[ p ].begin(), list[ p ].end() );
1902  }
1903  };
1907  vector<vector<size_t>> recvlist( comm_size );
1908  if ( !tree.NearSentToRank.size() ) tree.NearSentToRank.resize( comm_size );
1909  if ( !tree.NearRecvFromRank.size() ) tree.NearRecvFromRank.resize( comm_size );
1910  #pragma omp parallel for
1911  for ( int p = 0; p < comm_size; p ++ )
1912  {
1913  tree.NearSentToRank[ p ].insert( tree.NearSentToRank[ p ].end(),
1914  lists[ p ].begin(), lists[ p ].end() );
1915  }
1916 
1918  mpi::AlltoallVector( tree.NearSentToRank, recvlist, tree.GetComm() );
1919 
1921  #pragma omp parallel for
1922  for ( int p = 0; p < comm_size; p ++ )
1923  for ( int i = 0; i < recvlist[ p ].size(); i ++ )
1924  tree.NearRecvFromRank[ p ][ recvlist[ p ][ i ] ] = i;
1925  }
1926  else
1927  {
1928  #pragma omp parallel
1929  {
1931  vector<set<size_t>> list( comm_size );
1932 
1934  #pragma omp for
1935  for ( size_t i = 1; i < tree.treelist.size(); i ++ )
1936  {
1937  auto *node = tree.treelist[ i ];
1938  node->DistFar.resize( comm_size );
1939  for ( auto it = node->NNFarNodeMortonIDs.begin();
1940  it != node->NNFarNodeMortonIDs.end(); it ++ )
1941  {
1942  int dest = tree.Morton2Rank( *it );
1943  if ( dest >= comm_size ) printf( "%8lu dest %d\n", *it, dest );
1944  if ( dest != comm_rank )
1945  {
1946  list[ dest ].insert( node->morton );
1947  //node->data.FarDependents.insert( dest );
1948  }
1949  node->DistFar[ dest ][ *it ] = Data<T>();
1950  }
1951  }
1952 
1954  #pragma omp for
1955  for ( size_t i = 0; i < tree.mpitreelists.size(); i ++ )
1956  {
1957  auto *node = tree.mpitreelists[ i ];
1958  node->DistFar.resize( comm_size );
1960  if ( tree.Morton2Rank( node->morton ) == comm_rank )
1961  {
1962  for ( auto it = node->NNFarNodeMortonIDs.begin();
1963  it != node->NNFarNodeMortonIDs.end(); it ++ )
1964  {
1965  int dest = tree.Morton2Rank( *it );
1966  if ( dest >= comm_size ) printf( "%8lu dest %d\n", *it, dest );
1967  if ( dest != comm_rank )
1968  {
1969  list[ dest ].insert( node->morton );
1970  //node->data.FarDependents.insert( dest );
1971  }
1972  node->DistFar[ dest ][ *it ] = Data<T>();
1973  }
1974  }
1975  }
1977  #pragma omp critical
1978  {
1979  for ( int p = 0; p < comm_size; p ++ )
1980  lists[ p ].insert( list[ p ].begin(), list[ p ].end() );
1981  }
1983  };
1986  vector<vector<size_t>> recvlist( comm_size );
1987  if ( !tree.FarSentToRank.size() ) tree.FarSentToRank.resize( comm_size );
1988  if ( !tree.FarRecvFromRank.size() ) tree.FarRecvFromRank.resize( comm_size );
1989  #pragma omp parallel for
1990  for ( int p = 0; p < comm_size; p ++ )
1991  {
1992  tree.FarSentToRank[ p ].insert( tree.FarSentToRank[ p ].end(),
1993  lists[ p ].begin(), lists[ p ].end() );
1994  }
1995 
1996 
1998  mpi::AlltoallVector( tree.FarSentToRank, recvlist, tree.GetComm() );
1999 
2001  #pragma omp parallel for
2002  for ( int p = 0; p < comm_size; p ++ )
2003  for ( int i = 0; i < recvlist[ p ].size(); i ++ )
2004  tree.FarRecvFromRank[ p ][ recvlist[ p ][ i ] ] = i;
2005  }
2006 
2007  mpi::Barrier( tree.GetComm() );
2008 };
2011 template<typename TREE>
2012 pair<double, double> NonCompressedRatio( TREE &tree )
2013 {
2015  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
2016  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
2017 
2019  double ratio_n = 0.0;
2020  double ratio_f = 0.0;
2021 
2022 
2024  for ( auto &tar : tree.treelist )
2025  {
2026  if ( tar->isleaf )
2027  {
2028  for ( auto nearID : tar->NNNearNodeMortonIDs )
2029  {
2030  auto *src = tree.morton2node[ nearID ];
2031  assert( src );
2032  double m = tar->gids.size();
2033  double n = src->gids.size();
2034  double N = tree.n;
2035  ratio_n += ( m / N ) * ( n / N );
2036  }
2037  }
2038 
2039  for ( auto farID : tar->NNFarNodeMortonIDs )
2040  {
2041  auto *src = tree.morton2node[ farID ];
2042  assert( src );
2043  double m = tar->data.skels.size();
2044  double n = src->data.skels.size();
2045  double N = tree.n;
2046  ratio_f += ( m / N ) * ( n / N );
2047  }
2048  }
2049 
2051  for ( auto &tar : tree.mpitreelists )
2052  {
2053  if ( !tar->child || tar->GetCommRank() ) continue;
2054  for ( auto farID : tar->NNFarNodeMortonIDs )
2055  {
2056  auto *src = tree.morton2node[ farID ];
2057  assert( src );
2058  double m = tar->data.skels.size();
2059  double n = src->data.skels.size();
2060  double N = tree.n;
2061  ratio_f += ( m / N ) * ( n / N );
2062  }
2063  }
2064 
2066  pair<double, double> ret( 0, 0 );
2067  mpi::Allreduce( &ratio_n, &(ret.first), 1, MPI_SUM, tree.GetComm() );
2068  mpi::Allreduce( &ratio_f, &(ret.second), 1, MPI_SUM, tree.GetComm() );
2069 
2070  return ret;
2071 };
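/**
 *  Interpretation of the return value: ret.first is the fraction of the full
 *  N-by-N matrix evaluated directly through near (leaf-to-leaf) interactions,
 *  and ret.second is the analogous fraction touched through low-rank far
 *  interactions, measured in skeleton sizes; both are reduced over all ranks.
 */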
2072 
2073 
2074 
2075 template<typename T, typename TREE>
2076 void PackNear( TREE &tree, string option, int p,
2077  vector<size_t> &sendsizes,
2078  vector<size_t> &sendskels,
2079  vector<T> &sendbuffs )
2080 {
2081  vector<size_t> offsets( 1, 0 );
2082 
2083  for ( auto it : tree.NearSentToRank[ p ] )
2084  {
2085  auto *node = tree.morton2node[ it ];
2086  auto &gids = node->gids;
2087  if ( !option.compare( string( "leafgids" ) ) )
2088  {
2089  sendsizes.push_back( gids.size() );
2090  sendskels.insert( sendskels.end(), gids.begin(), gids.end() );
2091  }
2092  else
2093  {
2094  auto &w_view = node->data.w_view;
2095  sendsizes.push_back( gids.size() * w_view.col() );
2096  offsets.push_back( sendsizes.back() + offsets.back() );
2097  }
2098  }
2099 
2100  if ( offsets.size() ) sendbuffs.resize( offsets.back() );
2101 
2102  if ( !option.compare( string( "leafweights" ) ) )
2103  {
2104  #pragma omp parallel for
2105  for ( size_t i = 0; i < tree.NearSentToRank[ p ].size(); i ++ )
2106  {
2107  auto *node = tree.morton2node[ tree.NearSentToRank[ p ][ i ] ];
2108  auto &gids = node->gids;
2109  auto &w_view = node->data.w_view;
2110  auto w_leaf = w_view.toData();
2111  size_t offset = offsets[ i ];
2112  for ( size_t j = 0; j < w_leaf.size(); j ++ )
2113  sendbuffs[ offset + j ] = w_leaf[ j ];
2114  }
2115  }
2116 };
2117 
2118 
2119 template<typename T, typename TREE>
2120 void UnpackLeaf( TREE &tree, string option, int p,
2121  const vector<size_t> &recvsizes,
2122  const vector<size_t> &recvskels,
2123  const vector<T> &recvbuffs )
2124 {
2125  vector<size_t> offsets( 1, 0 );
2126  for ( auto it : recvsizes ) offsets.push_back( offsets.back() + it );
2127 
2128  for ( auto it : tree.NearRecvFromRank[ p ] )
2129  {
2130  auto *node = tree.morton2node[ it.first ];
2131  if ( !option.compare( string( "leafgids" ) ) )
2132  {
2133  auto &gids = node->gids;
2134  size_t i = it.second;
2135  gids.reserve( recvsizes[ i ] );
2136  for ( uint64_t j = offsets[ i + 0 ];
2137  j < offsets[ i + 1 ];
2138  j ++ )
2139  {
2140  gids.push_back( recvskels[ j ] );
2141  }
2142  }
2143  else
2144  {
2146  size_t nrhs = tree.setup.w->col();
2147  auto &w_leaf = node->data.w_leaf;
2148  size_t i = it.second;
2149  w_leaf.resize( recvsizes[ i ] / nrhs, nrhs );
2150  //printf( "%d recv w_leaf from %d [%lu %lu]\n",
2151  // comm_rank, p, w_leaf.row(), w_leaf.col() ); fflush( stdout );
2152  for ( uint64_t j = offsets[ i + 0 ], jj = 0;
2153  j < offsets[ i + 1 ];
2154  j ++, jj ++ )
2155  {
2156  w_leaf[ jj ] = recvbuffs[ j ];
2157  }
2158  }
2159  }
2160 };
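/**
 *  The packing and unpacking helpers in this file share one layout
 *  convention: per-node payloads are concatenated into a flat buffer, and
 *  the size list is turned into exclusive prefix sums so that node i
 *  occupies [ offsets[ i ], offsets[ i + 1 ] ).  For example (illustrative
 *  numbers only):
 *
 *  @code
 *    recvsizes = { 6, 4, 8 };         // payload sizes of three nodes
 *    offsets   = { 0, 6, 10, 18 };    // exclusive prefix sum
 *    // node 1's entries are recvbuffs[ 6 ] .. recvbuffs[ 9 ]
 *  @endcode
 */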
2161 
2162 
2163 template<typename T, typename TREE>
2164 void PackFar( TREE &tree, string option, int p,
2165  vector<size_t> &sendsizes,
2166  vector<size_t> &sendskels,
2167  vector<T> &sendbuffs )
2168 {
2169  for ( auto it : tree.FarSentToRank[ p ] )
2170  {
2171  auto *node = tree.morton2node[ it ];
2172  auto &skels = node->data.skels;
2173  if ( !option.compare( string( "skelgids" ) ) )
2174  {
2175  sendsizes.push_back( skels.size() );
2176  sendskels.insert( sendskels.end(), skels.begin(), skels.end() );
2177  }
2178  else
2179  {
2180  auto &w_skel = node->data.w_skel;
2181  sendsizes.push_back( w_skel.size() );
2182  sendbuffs.insert( sendbuffs.end(), w_skel.begin(), w_skel.end() );
2183  }
2184  }
2185 };
2199 template<typename TREE, typename T>
2200 void PackWeights( TREE &tree, int p,
2201  vector<T> &sendbuffs, vector<size_t> &sendsizes )
2202 {
2203  for ( auto it : tree.NearSentToRank[ p ] )
2204  {
2205  auto *node = tree.morton2node[ it ];
2206  auto w_leaf = node->data.w_view.toData();
2207  sendbuffs.insert( sendbuffs.end(), w_leaf.begin(), w_leaf.end() );
2208  sendsizes.push_back( w_leaf.size() );
2209  }
2210 };
2214 template<typename TREE, typename T>
2215 void UnpackWeights( TREE &tree, int p,
2216  const vector<T> &recvbuffs, const vector<size_t> &recvsizes )
2217 {
2218  vector<size_t> offsets( 1, 0 );
2219  for ( auto it : recvsizes ) offsets.push_back( offsets.back() + it );
2220 
2221  for ( auto it : tree.NearRecvFromRank[ p ] )
2222  {
2224  auto *node = tree.morton2node[ it.first ];
2226  size_t nrhs = tree.setup.w->col();
2227  auto &w_leaf = node->data.w_leaf;
2228  size_t i = it.second;
2229  w_leaf.resize( recvsizes[ i ] / nrhs, nrhs );
2230  for ( uint64_t j = offsets[ i + 0 ], jj = 0;
2231  j < offsets[ i + 1 ];
2232  j ++, jj ++ )
2233  {
2234  w_leaf[ jj ] = recvbuffs[ j ];
2235  }
2236  }
2237 };
2242 template<typename TREE>
2243 void PackSkeletons( TREE &tree, int p,
2244  vector<size_t> &sendbuffs, vector<size_t> &sendsizes )
2245 {
2246  for ( auto it : tree.FarSentToRank[ p ] )
2247  {
2249  auto *node = tree.morton2node[ it ];
2250  auto &skels = node->data.skels;
2251  sendbuffs.insert( sendbuffs.end(), skels.begin(), skels.end() );
2252  sendsizes.push_back( skels.size() );
2253  }
2254 };
2258 template<typename TREE>
2259 void UnpackSkeletons( TREE &tree, int p,
2260  const vector<size_t> &recvbuffs, const vector<size_t> &recvsizes )
2261 {
2262  vector<size_t> offsets( 1, 0 );
2263  for ( auto it : recvsizes ) offsets.push_back( offsets.back() + it );
2264 
2265  for ( auto it : tree.FarRecvFromRank[ p ] )
2266  {
2268  auto *node = tree.morton2node[ it.first ];
2269  auto &skels = node->data.skels;
2270  size_t i = it.second;
2271  skels.clear();
2272  skels.reserve( recvsizes[ i ] );
2273  for ( uint64_t j = offsets[ i + 0 ];
2274  j < offsets[ i + 1 ];
2275  j ++ )
2276  {
2277  skels.push_back( recvbuffs[ j ] );
2278  }
2279  }
2280 };
2285 template<typename TREE, typename T>
2286 void PackSkeletonWeights( TREE &tree, int p,
2287  vector<T> &sendbuffs, vector<size_t> &sendsizes )
2288 {
2289  for ( auto it : tree.FarSentToRank[ p ] )
2290  {
2291  auto *node = tree.morton2node[ it ];
2292  auto &w_skel = node->data.w_skel;
2293  sendbuffs.insert( sendbuffs.end(), w_skel.begin(), w_skel.end() );
2294  sendsizes.push_back( w_skel.size() );
2295  }
2296 };
2300 template<typename TREE, typename T>
2301 void UnpackSkeletonWeights( TREE &tree, int p,
2302  const vector<T> &recvbuffs, const vector<size_t> &recvsizes )
2303 {
2304  vector<size_t> offsets( 1, 0 );
2305  for ( auto it : recvsizes ) offsets.push_back( offsets.back() + it );
2306 
2307  for ( auto it : tree.FarRecvFromRank[ p ] )
2308  {
2310  auto *node = tree.morton2node[ it.first ];
2312  size_t nrhs = tree.setup.w->col();
2313  auto &w_skel = node->data.w_skel;
2314  size_t i = it.second;
2315  w_skel.resize( recvsizes[ i ] / nrhs, nrhs );
2316  for ( uint64_t j = offsets[ i + 0 ], jj = 0;
2317  j < offsets[ i + 1 ];
2318  j ++, jj ++ )
2319  {
2320  w_skel[ jj ] = recvbuffs[ j ];
2321  }
2322  }
2323 };
2330 template<typename T, typename TREE>
2331 void UnpackFar( TREE &tree, string option, int p,
2332  const vector<size_t> &recvsizes,
2333  const vector<size_t> &recvskels,
2334  const vector<T> &recvbuffs )
2335 {
2336  vector<size_t> offsets( 1, 0 );
2337  for ( auto it : recvsizes ) offsets.push_back( offsets.back() + it );
2338 
2339  for ( auto it : tree.FarRecvFromRank[ p ] )
2340  {
2342  auto *node = tree.morton2node[ it.first ];
2343  if ( !option.compare( string( "skelgids" ) ) )
2344  {
2345  auto &skels = node->data.skels;
2346  size_t i = it.second;
2347  skels.clear();
2348  skels.reserve( recvsizes[ i ] );
2349  for ( uint64_t j = offsets[ i + 0 ];
2350  j < offsets[ i + 1 ];
2351  j ++ )
2352  {
2353  skels.push_back( recvskels[ j ] );
2354  }
2355  }
2356  else
2357  {
2359  size_t nrhs = tree.setup.w->col();
2360  auto &w_skel = node->data.w_skel;
2361  size_t i = it.second;
2362  w_skel.resize( recvsizes[ i ] / nrhs, nrhs );
2363  //printf( "%d recv w_skel (%8lu) from %d [%lu %lu], i %lu, offset[%lu %lu] \n",
2364  // comm_rank, (*it).first, p, w_skel.row(), w_skel.col(), i,
2365  // offsets[ p ][ i + 0 ], offsets[ p ][ i + 1 ] ); fflush( stdout );
2366  for ( uint64_t j = offsets[ i + 0 ], jj = 0;
2367  j < offsets[ i + 1 ];
2368  j ++, jj ++ )
2369  {
2370  w_skel[ jj ] = recvbuffs[ j ];
2371  //if ( jj < 5 ) printf( "%E ", w_skel[ jj ] ); fflush( stdout );
2372  }
2373  //printf( "\n" ); fflush( stdout );
2374  }
2375  }
2376 };
2377 
2378 
2379 template<typename T, typename TREE>
2380 class PackNearTask : public SendTask<T, TREE>
2381 {
2382  public:
2383 
2384  PackNearTask( TREE *tree, int src, int tar, int key )
2385  : SendTask<T, TREE>( tree, src, tar, key )
2386  {
2388  this->Submit();
2389  this->DependencyAnalysis();
2390  };
2391 
2392  void DependencyAnalysis()
2393  {
2394  TREE &tree = *(this->arg);
2395  tree.DependOnNearInteractions( this->tar, this );
2396  };
2397 
2399  void Pack()
2400  {
2401  PackWeights( *this->arg, this->tar,
2402  this->send_buffs, this->send_sizes );
2403  };
2404 
2405 };
2421 template<typename T, typename TREE>
2422 class UnpackLeafTask : public RecvTask<T, TREE>
2423 {
2424  public:
2425 
2426  UnpackLeafTask( TREE *tree, int src, int tar, int key )
2427  : RecvTask<T, TREE>( tree, src, tar, key )
2428  {
2430  this->Submit();
2431  this->DependencyAnalysis();
2432  };
2433 
2434  void Unpack()
2435  {
2436  UnpackWeights( *this->arg, this->src,
2437  this->recv_buffs, this->recv_sizes );
2438  };
2439 
2440 };
2444 template<typename T, typename TREE>
2445 class PackFarTask : public SendTask<T, TREE>
2446 {
2447  public:
2448 
2449  PackFarTask( TREE *tree, int src, int tar, int key )
2450  : SendTask<T, TREE>( tree, src, tar, key )
2451  {
2453  this->Submit();
2454  this->DependencyAnalysis();
2455  };
2456 
2457  void DependencyAnalysis()
2458  {
2459  TREE &tree = *(this->arg);
2460  tree.DependOnFarInteractions( this->tar, this );
2461  };
2462 
2464  void Pack()
2465  {
2466  PackSkeletonWeights( *this->arg, this->tar,
2467  this->send_buffs, this->send_sizes );
2468  };
2469 
2470 };
2474 template<typename T, typename TREE>
2475 class UnpackFarTask : public RecvTask<T, TREE>
2476 {
2477  public:
2478 
2479  UnpackFarTask( TREE *tree, int src, int tar, int key )
2480  : RecvTask<T, TREE>( tree, src, tar, key )
2481  {
2483  this->Submit();
2484  this->DependencyAnalysis();
2485  };
2486 
2487  void Unpack()
2488  {
2489  UnpackSkeletonWeights( *this->arg, this->src,
2490  this->recv_buffs, this->recv_sizes );
2491  };
2492 
2493 };
2512 template<typename TREE>
2513 void ExchangeLET( TREE &tree, string option )
2514 {
2516  using T = typename TREE::T;
2518  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
2519  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
2520 
2522  vector<vector<size_t>> sendsizes( comm_size );
2523  vector<vector<size_t>> recvsizes( comm_size );
2524  vector<vector<size_t>> sendskels( comm_size );
2525  vector<vector<size_t>> recvskels( comm_size );
2526  vector<vector<T>> sendbuffs( comm_size );
2527  vector<vector<T>> recvbuffs( comm_size );
2528 
2530  #pragma omp parallel for
2531  for ( int p = 0; p < comm_size; p ++ )
2532  {
2533  if ( !option.compare( 0, 4, "leaf" ) )
2534  {
2535  PackNear( tree, option, p, sendsizes[ p ], sendskels[ p ], sendbuffs[ p ] );
2536  }
2537  else if ( !option.compare( 0, 4, "skel" ) )
2538  {
2539  PackFar( tree, option, p, sendsizes[ p ], sendskels[ p ], sendbuffs[ p ] );
2540  }
2541  else
2542  {
2543  printf( "ExchangeLET: option <%s> not available.\n", option.data() );
2544  exit( 1 );
2545  }
2546  }
2547 
2549  mpi::AlltoallVector( sendsizes, recvsizes, tree.GetComm() );
2550  if ( !option.compare( string( "skelgids" ) ) ||
2551  !option.compare( string( "leafgids" ) ) )
2552  {
2553  auto &K = *tree.setup.K;
2554  mpi::AlltoallVector( sendskels, recvskels, tree.GetComm() );
2555  K.RequestIndices( recvskels );
2556  }
2557  else
2558  {
2559  double beg = omp_get_wtime();
2560  mpi::AlltoallVector( sendbuffs, recvbuffs, tree.GetComm() );
2561  double a2av_time = omp_get_wtime() - beg;
2562  if ( comm_rank == 0 ) printf( "a2av_time %lfs\n", a2av_time );
2563  }
2564 
2565 
2567  #pragma omp parallel for
2568  for ( int p = 0; p < comm_size; p ++ )
2569  {
2570  if ( !option.compare( 0, 4, "leaf" ) )
2571  {
2572  UnpackLeaf( tree, option, p, recvsizes[ p ], recvskels[ p ], recvbuffs[ p ] );
2573  }
2574  else if ( !option.compare( 0, 4, "skel" ) )
2575  {
2576  UnpackFar( tree, option, p, recvsizes[ p ], recvskels[ p ], recvbuffs[ p ] );
2577  }
2578  else
2579  {
2580  printf( "ExchangeLET: option <%s> not available.\n", option.data() );
2581  exit( 1 );
2582  }
2583  }
2584 
2585 
2586 };
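/**
 *  Usage sketch (editorial, illustrative only): ExchangeLET() accepts the four
 *  option strings handled above. On a tree returned by mpigofmm::Compress(),
 *  the gid exchanges are issued during compression and the weight exchanges
 *  during evaluation, e.g.
 *
 *    mpigofmm::ExchangeLET( tree, string( "leafgids" ) );     // near-interaction gids
 *    mpigofmm::ExchangeLET( tree, string( "skelgids" ) );     // far-interaction skeleton gids
 *    mpigofmm::ExchangeLET( tree, string( "leafweights" ) );  // leaf weights for L2L
 *    mpigofmm::ExchangeLET( tree, string( "skelweights" ) );  // skeleton weights for S2S
 */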
2590 template<typename T, typename TREE>
2591 void AsyncExchangeLET( TREE &tree, string option )
2592 {
2594  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
2595  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
2596 
2598  for ( int p = 0; p < comm_size; p ++ )
2599  {
2600  if ( !option.compare( 0, 4, "leaf" ) )
2601  {
2602  auto *task = new PackNearTask<T, TREE>( &tree, comm_rank, p, 300 );
2604  //task->Set( &tree, comm_rank, p, 300 );
2605  //task->Submit();
2606  //task->DependencyAnalysis();
2607  }
2608  else if ( !option.compare( 0, 4, "skel" ) )
2609  {
2610  auto *task = new PackFarTask<T, TREE>( &tree, comm_rank, p, 306 );
2612  //task->Set( &tree, comm_rank, p, 306 );
2613  //task->Submit();
2614  //task->DependencyAnalysis();
2615  }
2616  else
2617  {
2618  printf( "AsyncExchangeLET: option <%s> not available.\n", option.data() );
2619  exit( 1 );
2620  }
2621  }
2622 
2624  for ( int p = 0; p < comm_size; p ++ )
2625  {
2626  if ( !option.compare( 0, 4, "leaf" ) )
2627  {
2628  auto *task = new UnpackLeafTask<T, TREE>( &tree, p, comm_rank, 300 );
2630  //task->Set( &tree, p, comm_rank, 300 );
2631  //task->Submit();
2632  //task->DependencyAnalysis();
2633  }
2634  else if ( !option.compare( 0, 4, "skel" ) )
2635  {
2636  auto *task = new UnpackFarTask<T, TREE>( &tree, p, comm_rank, 306 );
2638  //task->Set( &tree, p, comm_rank, 306 );
2639  //task->Submit();
2640  //task->DependencyAnalysis();
2641  }
2642  else
2643  {
2644  printf( "AsyncExchangeLET: option <%s> not available.\n", option.data() );
2645  exit( 1 );
2646  }
2647  }
2648 
2649 };
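/**
 *  Editorial note: unlike the blocking ExchangeLET() above, AsyncExchangeLET()
 *  only enqueues Pack/Unpack message tasks; the actual communication overlaps
 *  with whatever else the runtime executes. A hypothetical caller therefore
 *  pairs it with a later runtime flush, as Evaluate() does below:
 *
 *    AsyncExchangeLET<T>( tree, string( "leafweights" ) );
 *    // ... enqueue the traversals that consume the received weights ...
 *    tree.ExecuteAllTasks();  // drains the dependency graph, completing the exchange
 */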
2654 template<typename T, typename TREE>
2655 void ExchangeNeighbors( TREE &tree )
2656 {
2657  mpi::PrintProgress( "[BEG] ExchangeNeighbors ...", tree.GetComm() );
2658 
2659  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
2660  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
2661 
2663  vector<vector<size_t>> send_buff( comm_size );
2664  vector<vector<size_t>> recv_buff( comm_size );
2665 
2667  unordered_set<size_t> requested_gids;
2668  auto &NN = *tree.setup.NN;
2669 
2671  for ( auto & it : NN )
2672  {
2673  if ( it.second >= 0 && it.second < tree.n )
2674  requested_gids.insert( it.second );
2675  }
2676 
2678  for ( auto it : tree.treelist[ 0 ]->gids ) requested_gids.erase( it );
2679 
2681  for ( auto it :requested_gids )
2682  {
2683  int p = it % comm_size;
2684  if ( p != comm_rank ) send_buff[ p ].push_back( it );
2685  }
2686 
2688  auto &K = *tree.setup.K;
2689  K.RequestIndices( send_buff );
2690 
2691  mpi::PrintProgress( "[END] ExchangeNeighbors ...", tree.GetComm() );
2692 };
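/**
 *  Illustration (hypothetical numbers): neighbor gids are owned cyclically, so
 *  the owner of a gid is simply ( gid % comm_size ). With comm_size = 4,
 *  gid 10 is requested from rank 2 and gid 7 from rank 3, while gids already
 *  present in the local root ( tree.treelist[ 0 ]->gids ) are erased before
 *  the K.RequestIndices() call above.
 */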
2704 template<bool SYMMETRIC, typename NODE, typename T>
2705 void MergeFarNodes( NODE *node )
2706 {
2708  //if ( !node->data.isskel ) return;
2709 
2713  //if ( node->isleaf )
2714  //{
2715  // auto & NearMortonIDs = node->NNNearNodeMortonIDs;
2716  // #pragma omp critical
2717  // {
2718  // int rank;
2719  // mpi::Comm_rank( MPI_COMM_WORLD, &rank );
2720  // string outfile = to_string( rank );
2721  // FILE * pFile = fopen( outfile.data(), "a+" );
2722  // fprintf( pFile, "(%8lu) ", node->morton );
2723  // for ( auto it = NearMortonIDs.begin(); it != NearMortonIDs.end(); it ++ )
2724  // fprintf( pFile, "%8lu, ", (*it) );
2725  // fprintf( pFile, "\n" ); //fflush( stdout );
2726  // }
2727 
2728  // //auto & NearNodes = node->NNNearNodes;
2729  // //for ( auto it = NearNodes.begin(); it != NearNodes.end(); it ++ )
2730  // //{
2731  // // if ( !(*it)->NNNearNodes.count( node ) )
2732  // // {
2733  // // printf( "(%8lu) misses %lu\n", (*it)->morton, node->morton ); fflush( stdout );
2734  // // }
2735  // //}
2736  //};
2737 
2738 
2740  assert( !node->FarNodeMortonIDs.size() );
2741  assert( !node->FarNodes.size() );
2742  node->FarNodeMortonIDs.insert( node->sibling->morton );
2743  node->FarNodes.insert( node->sibling );
2744 
2746  if ( node->isleaf )
2747  {
2748  FindFarNodes( MortonHelper::Root(), node );
2749  }
2750  else
2751  {
2753  auto *lchild = node->lchild;
2754  auto *rchild = node->rchild;
2755 
2757  auto &pNNFarNodes = node->NNFarNodeMortonIDs;
2758  auto &lNNFarNodes = lchild->NNFarNodeMortonIDs;
2759  auto &rNNFarNodes = rchild->NNFarNodeMortonIDs;
2760 
2762  for ( auto it = lNNFarNodes.begin();
2763  it != lNNFarNodes.end(); it ++ )
2764  {
2765  if ( rNNFarNodes.count( *it ) )
2766  {
2767  pNNFarNodes.insert( *it );
2768  }
2769  }
2771  for ( auto it = pNNFarNodes.begin();
2772  it != pNNFarNodes.end(); it ++ )
2773  {
2774  lNNFarNodes.erase( *it );
2775  rNNFarNodes.erase( *it );
2776  }
2777  }
2778 
2779 };
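/**
 *  Illustration (hypothetical Morton ids): if the left child prunes
 *  { 5, 9, 12 } and the right child prunes { 5, 12, 17 }, the loops above move
 *  the intersection { 5, 12 } into the parent's NNFarNodeMortonIDs and erase
 *  it from both children, so each far interaction is applied once, at the
 *  highest level where both siblings can prune it.
 */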
2783 template<bool SYMMETRIC, typename NODE, typename T>
2784 class MergeFarNodesTask : public Task
2785 {
2786  public:
2787 
2788  NODE *arg;
2789 
2790  void Set( NODE *user_arg )
2791  {
2792  arg = user_arg;
2793  name = string( "merge" );
2794  label = to_string( arg->treelist_id );
2796  cost = 5.0;
2798  priority = true;
2799  };
2800 
2802  void DependencyAnalysis()
2803  {
2804  arg->DependencyAnalysis( RW, this );
2805  if ( !arg->isleaf )
2806  {
2807  arg->lchild->DependencyAnalysis( RW, this );
2808  arg->rchild->DependencyAnalysis( RW, this );
2809  }
2810  this->TryEnqueue();
2811  };
2812 
2813  void Execute( Worker* user_worker )
2814  {
2815  MergeFarNodes<SYMMETRIC, NODE, T>( arg );
2816  };
2817 
2818 };
2831 template<bool SYMMETRIC, typename NODE, typename T>
2832 void DistMergeFarNodes( NODE *node )
2833 {
2835  mpi::Status status;
2836  mpi::Comm comm = node->GetComm();
2837  int comm_size = node->GetCommSize();
2838  int comm_rank = node->GetCommRank();
2839 
2841  //if ( !node->data.isskel ) return;
2842 
2843 
2845  if ( !node->parent ) return;
2846 
2848  if ( node->GetCommSize() < 2 )
2849  {
2850  MergeFarNodes<SYMMETRIC, NODE, T>( node );
2851  }
2852  else
2853  {
2855  auto *child = node->child;
2856 
2857  if ( comm_rank == 0 )
2858  {
2859  auto &pNNFarNodes = node->NNFarNodeMortonIDs;
2860  auto &lNNFarNodes = child->NNFarNodeMortonIDs;
2861  vector<size_t> recvFarNodes;
2862 
2864  mpi::RecvVector( recvFarNodes, comm_size / 2, 0, comm, &status );
2865 
2867  for ( auto it : recvFarNodes )
2868  {
2869  if ( lNNFarNodes.count( it ) )
2870  {
2871  pNNFarNodes.insert( it );
2872  }
2873  }
2874 
2876  recvFarNodes.clear();
2877  recvFarNodes.reserve( pNNFarNodes.size() );
2878 
2880  for ( auto it : pNNFarNodes )
2881  {
2882  lNNFarNodes.erase( it );
2883  recvFarNodes.push_back( it );
2884  }
2885 
2887  mpi::SendVector( recvFarNodes, comm_size / 2, 0, comm );
2888  }
2889 
2890 
2891  if ( comm_rank == comm_size / 2 )
2892  {
2893  auto &rNNFarNodes = child->NNFarNodeMortonIDs;
2894  vector<size_t> sendFarNodes( rNNFarNodes.begin(), rNNFarNodes.end() );
2895 
2897  mpi::SendVector( sendFarNodes, 0, 0, comm );
2899  mpi::RecvVector( sendFarNodes, 0, 0, comm, &status );
2901  for ( auto it : sendFarNodes ) rNNFarNodes.erase( it );
2902  }
2903  }
2904 
2905 };
2909 template<bool SYMMETRIC, typename NODE, typename T>
2910 class DistMergeFarNodesTask : public Task
2911 {
2912  public:
2913 
2914  NODE *arg = NULL;
2915 
2916  void Set( NODE *user_arg )
2917  {
2918  arg = user_arg;
2919  name = string( "dist-merge" );
2920  label = to_string( arg->treelist_id );
2922  cost = 5.0;
2924  priority = true;
2925  };
2926 
2928  void DependencyAnalysis()
2929  {
2930  arg->DependencyAnalysis( RW, this );
2931  if ( !arg->isleaf )
2932  {
2933  if ( arg->GetCommSize() > 1 )
2934  {
2935  arg->child->DependencyAnalysis( RW, this );
2936  }
2937  else
2938  {
2939  arg->lchild->DependencyAnalysis( RW, this );
2940  arg->rchild->DependencyAnalysis( RW, this );
2941  }
2942  }
2943  this->TryEnqueue();
2944  };
2945 
2946  void Execute( Worker* user_worker )
2947  {
2948  DistMergeFarNodes<SYMMETRIC, NODE, T>( arg );
2949  };
2950 
2951 };
2961 template<bool NNPRUNE, typename NODE>
2962 class CacheFarNodesTask : public Task
2963 {
2964  public:
2965 
2966  NODE *arg = NULL;
2967 
2968  void Set( NODE *user_arg )
2969  {
2970  arg = user_arg;
2971  name = string( "FKIJ" );
2972  label = to_string( arg->treelist_id );
2974  double flops = 0, mops = 0;
2976  cost = 5.0;
2977  };
2978 
2979  void DependencyAnalysis()
2980  {
2981  arg->DependencyAnalysis( RW, this );
2982  this->TryEnqueue();
2983  };
2984 
2985  void Execute( Worker* user_worker )
2986  {
2987  auto *node = arg;
2988  auto &K = *node->setup->K;
2989 
2990  for ( int p = 0; p < node->DistFar.size(); p ++ )
2991  {
2992  for ( auto &it : node->DistFar[ p ] )
2993  {
2994  auto *src = (*node->morton2node)[ it.first ];
2995  auto &I = node->data.skels;
2996  auto &J = src->data.skels;
2997  it.second = K( I, J );
2998  //printf( "Cache I %lu J %lu\n", I.size(), J.size() ); fflush( stdout );
2999  }
3000  }
3001  };
3002 
3003 };
3013 template<bool NNPRUNE, typename NODE>
3014 class CacheNearNodesTask : public Task
3015 {
3016  public:
3017 
3018  NODE *arg = NULL;
3019 
3020  void Set( NODE *user_arg )
3021  {
3022  arg = user_arg;
3023  name = string( "NKIJ" );
3024  label = to_string( arg->treelist_id );
3026  cost = 5.0;
3027  };
3028 
3029  void DependencyAnalysis()
3030  {
3031  arg->DependencyAnalysis( RW, this );
3032  this->TryEnqueue();
3033  };
3034 
3035  void Execute( Worker* user_worker )
3036  {
3037  auto *node = arg;
3038  auto &K = *node->setup->K;
3039 
3040  for ( int p = 0; p < node->DistNear.size(); p ++ )
3041  {
3042  for ( auto &it : node->DistNear[ p ] )
3043  {
3044  auto *src = (*node->morton2node)[ it.first ];
3045  auto &I = node->gids;
3046  auto &J = src->gids;
3047  it.second = K( I, J );
3048  //printf( "Cache I %lu J %lu\n", I.size(), J.size() ); fflush( stdout );
3049  }
3050  }
3051  };
3052 
3053 };
3066 template<typename NODE, typename T>
3067 void DistRowSamples( NODE *node, size_t nsamples )
3068 {
3070  mpi::Comm comm = node->GetComm();
3071  int size = node->GetCommSize();
3072  int rank = node->GetCommRank();
3073 
3075  auto &K = *node->setup->K;
3076 
3078  vector<size_t> &I = node->data.candidate_rows;
3079 
3081  I.clear();
3082 
3084  if ( rank == 0 )
3085  {
3087  I.reserve( nsamples );
3088 
3089  auto &snids = node->data.snids;
3090  multimap<T, size_t> ordered_snids = gofmm::flip_map( snids );
3091 
3092  for ( auto it = ordered_snids.begin();
3093  it != ordered_snids.end(); it++ )
3094  {
3096  I.push_back( (*it).second );
3097  if ( I.size() >= nsamples ) break;
3098  }
3099  }
3100 
3102  vector<size_t> candidates( nsamples );
3103 
3104  size_t n_required = nsamples - I.size();
3105 
3107  mpi::Bcast( &n_required, 1, 0, comm );
3108 
3109  while ( n_required )
3110  {
3111  if ( rank == 0 )
3112  {
3113  for ( size_t i = 0; i < nsamples; i ++ )
3114  {
3115  auto important_sample = K.ImportantSample( 0 );
3116  candidates[ i ] = important_sample.second;
3117  }
3118  }
3119 
3121  mpi::Bcast( candidates.data(), candidates.size(), 0, comm );
3122 
3124  vector<size_t> vconsensus( nsamples, 0 );
3125  vector<size_t> validation = node->setup->ContainAny( candidates, node->morton );
3126 
3128  mpi::Reduce( validation.data(), vconsensus.data(), nsamples, MPI_SUM, 0, comm );
3129 
3130  if ( rank == 0 )
3131  {
3132  for ( size_t i = 0; i < nsamples; i ++ )
3133  {
3135  if ( I.size() >= nsamples )
3136  {
3137  I.resize( nsamples );
3138  break;
3139  }
3141  if ( !vconsensus[ i ] )
3142  {
3143  if ( find( I.begin(), I.end(), candidates[ i ] ) == I.end() )
3144  I.push_back( candidates[ i ] );
3145  }
3146  };
3147 
3149  n_required = nsamples - I.size();
3150  }
3151 
3153  mpi::Bcast( &n_required, 1, 0, comm );
3154  }
3155 
3156 };
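/**
 *  Illustration of the consensus loop above (hypothetical values): rank 0
 *  broadcasts candidate gids, every rank marks the candidates that fall inside
 *  its own subtree via ContainAny(), and the marks are summed onto rank 0.
 *  Only candidates with a zero vote count, i.e. owned by none of the
 *  participating subtrees, are accepted as off-diagonal row samples:
 *
 *    candidates = { 3, 42, 17 }
 *    rank 0 votes { 0, 1, 0 }, rank 1 votes { 0, 0, 1 }  =>  vconsensus = { 0, 1, 1 }
 *    accepted   = { 3 }
 */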
3164 template<bool NNPRUNE, typename NODE>
3165 void DistSkeletonKIJ( NODE *node )
3166 {
3168  using T = typename NODE::T;
3170  if ( !node->parent ) return;
3172  auto &K = *(node->setup->K);
3174  auto &data = node->data;
3175  auto &candidate_rows = data.candidate_rows;
3176  auto &candidate_cols = data.candidate_cols;
3177  auto &KIJ = data.KIJ;
3178 
3180  auto comm = node->GetComm();
3181  auto size = node->GetCommSize();
3182  auto rank = node->GetCommRank();
3183  mpi::Status status;
3184 
3185  if ( size < 2 )
3186  {
3188  gofmm::SkeletonKIJ<NNPRUNE>( node );
3189  }
3190  else
3191  {
3199  NODE *child = node->child;
3200  size_t nsamples = 0;
3201 
3203  int child_isskel = child->data.isskel;
3204  mpi::Bcast( &child_isskel, 1, 0, child->GetComm() );
3205  child->data.isskel = child_isskel;
3206 
3207 
3209  if ( rank == 0 )
3210  {
3211  candidate_cols = child->data.skels;
3212  vector<size_t> rskel;
3214  mpi::RecvVector( rskel, size / 2, 10, comm, &status );
3216  K.RecvIndices( size / 2, comm, &status );
3218  candidate_cols.insert( candidate_cols.end(), rskel.begin(), rskel.end() );
3220  nsamples = 2 * candidate_cols.size();
3222  if ( nsamples < 2 * node->setup->LeafNodeSize() )
3223  nsamples = 2 * node->setup->LeafNodeSize();
3224 
3226  auto &lsnids = node->child->data.snids;
3227  vector<T> recv_rsdist;
3228  vector<size_t> recv_rsnids;
3229 
3231  mpi::RecvVector( recv_rsdist, size / 2, 20, comm, &status );
3232  mpi::RecvVector( recv_rsnids, size / 2, 30, comm, &status );
3234  K.RecvIndices( size / 2, comm, &status );
3235 
3236 
3238  auto &snids = node->data.snids;
3239  snids = lsnids;
3240 
3241  for ( size_t i = 0; i < recv_rsdist.size(); i ++ )
3242  {
3243  pair<size_t, T> query( recv_rsnids[ i ], recv_rsdist[ i ] );
3244  auto ret = snids.insert( query );
3245  if ( !ret.second )
3246  {
3247  if ( ret.first->second > recv_rsdist[ i ] )
3248  ret.first->second = recv_rsdist[ i ];
3249  }
3250  }
3251 
3253  for ( auto gid : node->gids ) snids.erase( gid );
3254  }
3255 
3256  if ( rank == size / 2 )
3257  {
3259  mpi::SendVector( child->data.skels, 0, 10, comm );
3261  K.SendIndices( child->data.skels, 0, comm );
3262 
3264  auto &rsnids = node->child->data.snids;
3265  vector<T> send_rsdist;
3266  vector<size_t> send_rsnids;
3267 
3269  send_rsdist.reserve( rsnids.size() );
3270  send_rsnids.reserve( rsnids.size() );
3271 
3272  for ( auto it = rsnids.begin(); it != rsnids.end(); it ++ )
3273  {
3275  send_rsnids.push_back( (*it).first );
3276  send_rsdist.push_back( (*it).second );
3277  }
3278 
3280  mpi::SendVector( send_rsdist, 0, 20, comm );
3281  mpi::SendVector( send_rsnids, 0, 30, comm );
3282 
3284  K.SendIndices( send_rsnids, 0, comm );
3285  }
3286 
3288  mpi::Bcast( &nsamples, 1, 0, comm );
3290  DistRowSamples<NODE, T>( node, nsamples );
3292  if ( rank != 0 )
3293  {
3294  assert( !candidate_rows.size() );
3295  assert( !candidate_cols.size() );
3296  }
3302  KIJ = K( candidate_rows, candidate_cols );
3303  }
3304 };
3310 template<bool NNPRUNE, typename NODE, typename T>
3311 class DistSkeletonKIJTask : public Task
3312 {
3313  public:
3314 
3315  NODE *arg = NULL;
3316 
3317  void Set( NODE *user_arg )
3318  {
3319  arg = user_arg;
3320  name = string( "par-gskm" );
3321  label = to_string( arg->treelist_id );
3323  cost = 5.0;
3325  priority = true;
3326  };
3327 
3328  void DependencyAnalysis() { arg->DependOnChildren( this ); };
3329 
3330  void Execute( Worker* user_worker ) { DistSkeletonKIJ<NNPRUNE>( arg ); };
3331 
3332 };
3348 template<typename NODE, typename T>
3349 void DistSkeletonize( NODE *node )
3350 {
3352  if ( !node->parent ) return;
3353 
3355  auto &K = *(node->setup->K);
3356  auto &NN = *(node->setup->NN);
3357  auto maxs = node->setup->MaximumRank();
3358  auto stol = node->setup->Tolerance();
3359  bool secure_accuracy = node->setup->SecureAccuracy();
3360  bool use_adaptive_ranks = node->setup->UseAdaptiveRanks();
3361 
3363  auto &data = node->data;
3364  auto &skels = data.skels;
3365  auto &proj = data.proj;
3366  auto &jpvt = data.jpvt;
3367  auto &KIJ = data.KIJ;
3368  auto &candidate_cols = data.candidate_cols;
3369 
3371  size_t N = K.col();
3372  size_t m = KIJ.row();
3373  size_t n = KIJ.col();
3374  size_t q = node->n;
3375 
3376  if ( secure_accuracy )
3377  {
3379  }
3380 
3381 
3383  T scaled_stol = std::sqrt( (T)n / q ) * std::sqrt( (T)m / (N - q) ) * stol;
3384 
3386  scaled_stol *= std::sqrt( (T)q / N );
3387 
3388  lowrank::id
3389  (
3390  use_adaptive_ranks, secure_accuracy,
3391  KIJ.row(), KIJ.col(), maxs, scaled_stol,
3392  KIJ, skels, proj, jpvt
3393  );
3394 
3396  KIJ.resize( 0, 0 );
3397  KIJ.shrink_to_fit();
3398 
3400  if ( secure_accuracy )
3401  {
3403  data.isskel = (skels.size() != 0);
3404  }
3405  else
3406  {
3407  assert( skels.size() );
3408  assert( proj.size() );
3409  assert( jpvt.size() );
3410  data.isskel = true;
3411  }
3412 
3414  for ( size_t i = 0; i < skels.size(); i ++ )
3415  {
3416  skels[ i ] = candidate_cols[ skels[ i ] ];
3417  }
3418 
3419 
3420 };
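/**
 *  Editorial note on the tolerance scaling above: with N = K.col(),
 *  q = node->n, m = KIJ.row(), and n = KIJ.col(), the two statements combine to
 *
 *    scaled_stol = stol * sqrt( n / q ) * sqrt( m / ( N - q ) ) * sqrt( q / N )
 *                = stol * sqrt( ( n * m ) / ( N * ( N - q ) ) ),
 *
 *  i.e. the user tolerance is shrunk by the sampling ratio of KIJ relative to
 *  the full off-diagonal block before it is handed to lowrank::id().
 */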
3425 template<typename NODE, typename T>
3426 class SkeletonizeTask : public Task
3427 {
3428  public:
3429 
3430  NODE *arg;
3431 
3432  void Set( NODE *user_arg )
3433  {
3434  arg = user_arg;
3435  name = string( "SK" );
3436  label = to_string( arg->treelist_id );
3438  cost = 5.0;
3440  priority = true;
3441  };
3442 
3443  void GetEventRecord()
3444  {
3445  double flops = 0.0, mops = 0.0;
3446 
3447  auto &K = *arg->setup->K;
3448  size_t n = arg->data.proj.col();
3449  size_t m = 2 * n;
3450  size_t k = arg->data.proj.row();
3451 
3453  flops += ( 4.0 / 3.0 ) * n * n * ( 3 * m - n );
3454  mops += ( 2.0 / 3.0 ) * n * n * ( 3 * m - n );
3455 
3456  /* TRSM */
3457  flops += k * ( k - 1 ) * ( n + 1 );
3458  mops += 2.0 * ( k * k + k * n );
3459 
3460  event.Set( label + name, flops, mops );
3461  arg->data.skeletonize = event;
3462  };
3463 
3464  void DependencyAnalysis()
3465  {
3466  arg->DependencyAnalysis( RW, this );
3467  this->TryEnqueue();
3468  };
3469 
3470  void Execute( Worker* user_worker )
3471  {
3472  //printf( "%d Par-Skel beg\n", global_rank );
3473 
3474  DistSkeletonize<NODE, T>( arg );
3475 
3476  //printf( "%d Par-Skel end\n", global_rank );
3477  };
3478 
3479 };
3487 template<typename NODE, typename T>
3488 class DistSkeletonizeTask : public Task
3489 {
3490  public:
3491 
3492  NODE *arg;
3493 
3494  void Set( NODE *user_arg )
3495  {
3496  arg = user_arg;
3497  name = string( "PSK" );
3498  label = to_string( arg->treelist_id );
3499 
3501  cost = 5.0;
3503  priority = true;
3504  };
3505 
3506  void GetEventRecord()
3507  {
3508  double flops = 0.0, mops = 0.0;
3509 
3510  auto &K = *arg->setup->K;
3511  size_t n = arg->data.proj.col();
3512  size_t m = 2 * n;
3513  size_t k = arg->data.proj.row();
3514 
3515  if ( arg->GetCommRank() == 0 )
3516  {
3518  flops += ( 4.0 / 3.0 ) * n * n * ( 3 * m - n );
3519  mops += ( 2.0 / 3.0 ) * n * n * ( 3 * m - n );
3520 
3521  /* TRSM */
3522  flops += k * ( k - 1 ) * ( n + 1 );
3523  mops += 2.0 * ( k * k + k * n );
3524  }
3525 
3526  event.Set( label + name, flops, mops );
3527  arg->data.skeletonize = event;
3528  };
3529 
3530  void DependencyAnalysis()
3531  {
3532  arg->DependencyAnalysis( RW, this );
3533  this->TryEnqueue();
3534  };
3535 
3536  void Execute( Worker* user_worker )
3537  {
3538  mpi::Comm comm = arg->GetComm();
3539 
3540  double beg = omp_get_wtime();
3541  if ( arg->GetCommRank() == 0 )
3542  {
3543  DistSkeletonize<NODE, T>( arg );
3544  }
3545  double skel_t = omp_get_wtime() - beg;
3546 
3548  int isskel = arg->data.isskel;
3549  mpi::Bcast( &isskel, 1, 0, comm );
3550  arg->data.isskel = isskel;
3551 
3553  auto &skels = arg->data.skels;
3554  size_t nskels = skels.size();
3555  mpi::Bcast( &nskels, 1, 0, comm );
3556  if ( skels.size() != nskels ) skels.resize( nskels );
3557  mpi::Bcast( skels.data(), skels.size(), 0, comm );
3558 
3559  };
3560 
3561 };
3569 template<typename NODE>
3570 class InterpolateTask : public Task
3571 {
3572  public:
3573 
3574  NODE *arg = NULL;
3575 
3576  void Set( NODE *user_arg )
3577  {
3578  arg = user_arg;
3579  name = string( "PROJ" );
3580  label = to_string( arg->treelist_id );
3581  // Need an accurate cost model.
3582  cost = 1.0;
3583  };
3584 
3585  void DependencyAnalysis() { arg->DependOnNoOne( this ); };
3586 
3587  void Execute( Worker* user_worker )
3588  {
3590  auto comm = arg->GetComm();
3592  if ( arg->GetCommRank() == 0 ) gofmm::Interpolate( arg );
3593 
3594  auto &proj = arg->data.proj;
3595  size_t nrow = proj.row();
3596  size_t ncol = proj.col();
3597  mpi::Bcast( &nrow, 1, 0, comm );
3598  mpi::Bcast( &ncol, 1, 0, comm );
3599  if ( proj.row() != nrow || proj.col() != ncol ) proj.resize( nrow, ncol );
3600  mpi::Bcast( proj.data(), proj.size(), 0, comm );
3601  };
3602 
3603 };
3637 template<bool NNPRUNE = true, typename TREE, typename T>
3638 DistData<RIDS, STAR, T> Evaluate( TREE &tree, DistData<RIDS, STAR, T> &weights )
3639 {
3640  try
3641  {
3643  int size; mpi::Comm_size( tree.GetComm(), &size );
3644  int rank; mpi::Comm_rank( tree.GetComm(), &rank );
3646  using NODE = typename TREE::NODE;
3647  using MPINODE = typename TREE::MPINODE;
3648 
3650  double beg, time_ratio, evaluation_time = 0.0;
3651  double direct_evaluation_time = 0.0, computeall_time = 0.0, telescope_time = 0.0, let_exchange_time = 0.0, async_time = 0.0;
3652  double overhead_time;
3653  double forward_permute_time, backward_permute_time;
3654 
3656  tree.DependencyCleanUp();
3657 
3659  size_t n = weights.row();
3660  size_t nrhs = weights.col();
3661 
3663  auto &gids_owned = tree.treelist[ 0 ]->gids;
3664  DistData<RIDS, STAR, T> potentials( n, nrhs, gids_owned, tree.GetComm() );
3665  potentials.setvalue( 0.0 );
3666 
3668  tree.setup.w = &weights;
3669  tree.setup.u = &potentials;
3670 
3672  gofmm::TreeViewTask<NODE> seqVIEWtask;
3673  mpigofmm::DistTreeViewTask<MPINODE> mpiVIEWtask;
3675  gofmm::UpdateWeightsTask<NODE, T> seqN2Stask;
3676  mpigofmm::DistUpdateWeightsTask<MPINODE, T> mpiN2Stask;
3678  //mpigofmm::DistLeavesToLeavesTask<NNPRUNE, NODE, T> seqL2Ltask;
3679  //mpigofmm::L2LReduceTask<NODE, T> seqL2LReducetask;
3680  mpigofmm::L2LReduceTask2<NODE, T> seqL2LReducetask2;
3682  //gofmm::SkeletonsToSkeletonsTask<NNPRUNE, NODE, T> seqS2Stask;
3683  //mpigofmm::DistSkeletonsToSkeletonsTask<NNPRUNE, MPINODE, T> mpiS2Stask;
3684  //mpigofmm::S2SReduceTask<NODE, T> seqS2SReducetask;
3685  //mpigofmm::S2SReduceTask<MPINODE, T> mpiS2SReducetask;
3686  mpigofmm::S2SReduceTask2<NODE, NODE, T> seqS2SReducetask2;
3687  mpigofmm::S2SReduceTask2<MPINODE, NODE, T> mpiS2SReducetask2;
3689  gofmm::SkeletonsToNodesTask<NNPRUNE, NODE, T> seqS2Ntask;
3690  mpigofmm::DistSkeletonsToNodesTask<NNPRUNE, MPINODE, T> mpiS2Ntask;
3691 
3693  mpi::Barrier( tree.GetComm() );
3694 
3695  //{
3696  // /** Stage 1: TreeView and upward telescoping */
3697  // beg = omp_get_wtime();
3698  // tree.DependencyCleanUp();
3699  // tree.DistTraverseDown( mpiVIEWtask );
3700  // tree.LocaTraverseDown( seqVIEWtask );
3701  // tree.LocaTraverseUp( seqN2Stask );
3702  // tree.DistTraverseUp( mpiN2Stask );
3703  // hmlp_run();
3704  // mpi::Barrier( tree.GetComm() );
3705  // telescope_time = omp_get_wtime() - beg;
3706 
3707  // /** Stage 2: LET exchange */
3708  // beg = omp_get_wtime();
3709  // ExchangeLET<T>( tree, string( "skelweights" ) );
3710  // mpi::Barrier( tree.GetComm() );
3711  // ExchangeLET<T>( tree, string( "leafweights" ) );
3712  // mpi::Barrier( tree.GetComm() );
3713  // let_exchange_time = omp_get_wtime() - beg;
3714 
3715  // /** Stage 3: L2L */
3716  // beg = omp_get_wtime();
3717  // tree.DependencyCleanUp();
3718  // tree.LocaTraverseLeafs( seqL2LReducetask2 );
3719  // hmlp_run();
3720  // mpi::Barrier( tree.GetComm() );
3721  // direct_evaluation_time = omp_get_wtime() - beg;
3722 
3723  // /** Stage 4: S2S and downward telescoping */
3724  // beg = omp_get_wtime();
3725  // tree.DependencyCleanUp();
3726  // tree.LocaTraverseUnOrdered( seqS2SReducetask2 );
3727  // tree.DistTraverseUnOrdered( mpiS2SReducetask2 );
3728  // tree.DistTraverseDown( mpiS2Ntask );
3729  // tree.LocaTraverseDown( seqS2Ntask );
3730  // hmlp_run();
3731  // mpi::Barrier( tree.GetComm() );
3732  // computeall_time = omp_get_wtime() - beg;
3733  //}
3734 
3735 
3737  potentials.setvalue( 0.0 );
3738  mpi::Barrier( tree.GetComm() );
3739 
3741  beg = omp_get_wtime();
3742  tree.DependencyCleanUp();
3743  tree.DistTraverseDown( mpiVIEWtask );
3744  tree.LocaTraverseDown( seqVIEWtask );
3745  tree.ExecuteAllTasks();
3747  AsyncExchangeLET<T>( tree, string( "leafweights" ) );
3749  tree.LocaTraverseUp( seqN2Stask );
3750  tree.DistTraverseUp( mpiN2Stask );
3752  AsyncExchangeLET<T>( tree, string( "skelweights" ) );
3754  tree.LocaTraverseLeafs( seqL2LReducetask2 );
3756  tree.LocaTraverseUnOrdered( seqS2SReducetask2 );
3757  tree.DistTraverseUnOrdered( mpiS2SReducetask2 );
3759  tree.DistTraverseDown( mpiS2Ntask );
3760  tree.LocaTraverseDown( seqS2Ntask );
3761  overhead_time = omp_get_wtime() - beg;
3762  tree.ExecuteAllTasks();
3763  async_time = omp_get_wtime() - beg;
3764 
3765 
3766 
3768  evaluation_time += direct_evaluation_time;
3769  evaluation_time += telescope_time;
3770  evaluation_time += let_exchange_time;
3771  evaluation_time += computeall_time;
3772  time_ratio = ( evaluation_time > 0.0 ) ? 100.0 / evaluation_time : 0.0;
3773 
3774  if ( rank == 0 && REPORT_EVALUATE_STATUS )
3775  {
3776  printf( "========================================================\n");
3777  printf( "GOFMM evaluation phase\n" );
3778  printf( "========================================================\n");
3779  //printf( "Allocate ------------------------------ %5.2lfs (%5.1lf%%)\n",
3780  // allocate_time, allocate_time * time_ratio );
3781  //printf( "Forward permute ----------------------- %5.2lfs (%5.1lf%%)\n",
3782  // forward_permute_time, forward_permute_time * time_ratio );
3783  printf( "Upward telescope ---------------------- %5.2lfs (%5.1lf%%)\n",
3784  telescope_time, telescope_time * time_ratio );
3785  printf( "LET exchange -------------------------- %5.2lfs (%5.1lf%%)\n",
3786  let_exchange_time, let_exchange_time * time_ratio );
3787  printf( "L2L ----------------------------------- %5.2lfs (%5.1lf%%)\n",
3788  direct_evaluation_time, direct_evaluation_time * time_ratio );
3789  printf( "S2S, S2N ------------------------------ %5.2lfs (%5.1lf%%)\n",
3790  computeall_time, computeall_time * time_ratio );
3791  //printf( "Backward permute ---------------------- %5.2lfs (%5.1lf%%)\n",
3792  // backward_permute_time, backward_permute_time * time_ratio );
3793  printf( "========================================================\n");
3794  printf( "Evaluate ------------------------------ %5.2lfs (%5.1lf%%)\n",
3795  evaluation_time, evaluation_time * time_ratio );
3796  printf( "Evaluate (Async) ---------------------- %5.2lfs (%5.2lfs)\n",
3797  async_time, overhead_time );
3798  printf( "========================================================\n\n");
3799  }
3800 
3801  return potentials;
3802  }
3803  catch ( const exception & e )
3804  {
3805  cout << e.what() << endl;
3806  exit( 1 );
3807  }
3808 };
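/**
 *  Usage sketch (editorial, mirrors SelfTesting() further below): the weights
 *  must be distributed by the gids owned by the local tree before calling
 *  Evaluate().
 *
 *    DistData<RIDS, STAR, T> w( n, nrhs, tree.treelist[ 0 ]->gids, tree.GetComm() );
 *    w.randn();
 *    auto u = mpigofmm::Evaluate<true>( tree, w );  // u is RIDS-distributed like w
 */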
3813 template<bool NNPRUNE = true, typename TREE, typename T>
3814 DistData<RBLK, STAR, T> Evaluate( TREE &tree, DistData<RBLK, STAR, T> &w_rblk )
3815 {
3816  size_t n = w_rblk.row();
3817  size_t nrhs = w_rblk.col();
3819  DistData<RIDS, STAR, T> w_rids( n, nrhs, tree.treelist[ 0 ]->gids, tree.GetComm() );
3820  w_rids = w_rblk;
3822  auto u_rids = Evaluate<NNPRUNE>( tree, w_rids );
3823  mpi::Barrier( tree.GetComm() );
3825  DistData<RBLK, STAR, T> u_rblk( n, nrhs, tree.GetComm() );
3826  u_rblk = u_rids;
3828  return u_rblk;
3829 };
3833 template<typename SPLITTER, typename T, typename SPDMATRIX>
3834 DistData<STAR, CBLK, pair<T, size_t>> FindNeighbors
3835 (
3836  SPDMATRIX &K,
3837  SPLITTER splitter,
3838  gofmm::Configuration<T> &config,
3839  mpi::Comm CommGOFMM,
3840  size_t n_iter = 10
3841 )
3842 {
3844  using DATA = gofmm::NodeData<T>;
3845  using SETUP = mpigofmm::Setup<SPDMATRIX, SPLITTER, T>;
3846  using TREE = mpitree::Tree<SETUP, DATA>;
3848  using NODE = typename TREE::NODE;
3850  DistanceMetric metric = config.MetricType();
3851  size_t n = config.ProblemSize();
3852  size_t k = config.NeighborSize();
3854  pair<T, size_t> init( numeric_limits<T>::max(), n );
3855  gofmm::NeighborsTask<NODE, T> NEIGHBORStask;
3856  TREE rkdt( CommGOFMM );
3857  rkdt.setup.FromConfiguration( config, K, splitter, NULL );
3858  return rkdt.AllNearestNeighbor( n_iter, n, k, init, NEIGHBORStask );
3859 };
3872 template<typename SPLITTER, typename RKDTSPLITTER, typename T, typename SPDMATRIX>
3873 mpitree::Tree<mpigofmm::Setup<SPDMATRIX, SPLITTER, T>, gofmm::NodeData<T>>
3874 *Compress
3875 (
3876  SPDMATRIX &K,
3877  DistData<STAR, CBLK, pair<T, size_t>> &NN_cblk,
3878  SPLITTER splitter,
3879  RKDTSPLITTER rkdtsplitter,
3880  gofmm::Configuration<T> &config,
3881  mpi::Comm CommGOFMM
3882 )
3883 {
3884  try
3885  {
3887  int size; mpi::Comm_size( CommGOFMM, &size );
3888  int rank; mpi::Comm_rank( CommGOFMM, &rank );
3889 
3891  DistanceMetric metric = config.MetricType();
3892  size_t n = config.ProblemSize();
3893  size_t m = config.LeafNodeSize();
3894  size_t k = config.NeighborSize();
3895  size_t s = config.MaximumRank();
3896 
3898  const bool SYMMETRIC = true;
3899  const bool NNPRUNE = true;
3900  const bool CACHE = true;
3901 
3903  using SETUP = mpigofmm::Setup<SPDMATRIX, SPLITTER, T>;
3904  using DATA = gofmm::NodeData<T>;
3905  using TREE = mpitree::Tree<SETUP, DATA>;
3907  using NODE = typename TREE::NODE;
3908  using MPINODE = typename TREE::MPINODE;
3909 
3911  double beg, omptask45_time, omptask_time, ref_time;
3912  double time_ratio, compress_time = 0.0, other_time = 0.0;
3913  double ann_time, tree_time, skel_time, mpi_skel_time, mergefarnodes_time = 0.0, cachefarnodes_time = 0.0;
3914  double local_skel_time, dist_skel_time, let_time;
3915  double nneval_time, nonneval_time, fmm_evaluation_time, symbolic_evaluation_time;
3916  double exchange_neighbor_time, symmetrize_time;
3917 
3919  beg = omp_get_wtime();
3920  if ( k && NN_cblk.row() * NN_cblk.col() != k * n )
3921  {
3922  NN_cblk = mpigofmm::FindNeighbors( K, rkdtsplitter,
3923  config, CommGOFMM );
3924  }
3925  ann_time = omp_get_wtime() - beg;
3926 
3928  auto *tree_ptr = new TREE( CommGOFMM );
3929  auto &tree = *tree_ptr;
3930 
3932  tree.setup.FromConfiguration( config, K, splitter, &NN_cblk );
3933 
3935  beg = omp_get_wtime();
3936  tree.TreePartition();
3937  tree_time = omp_get_wtime() - beg;
3938 
3940  vector<size_t> perm = tree.GetPermutation();
3941  if ( rank == 0 )
3942  {
3943  ofstream perm_file( "perm.txt" );
3944  for ( auto &id : perm ) perm_file << id << " ";
3945  perm_file.close();
3946  }
3947 
3948 
3950  DistData<STAR, CIDS, pair<T, size_t>> NN( k, n, tree.treelist[ 0 ]->gids, tree.GetComm() );
3951  NN = NN_cblk;
3952  tree.setup.NN = &NN;
3953  beg = omp_get_wtime();
3954  ExchangeNeighbors<T>( tree );
3955  exchange_neighbor_time = omp_get_wtime() - beg;
3956 
3957 
3958  beg = omp_get_wtime();
3960  FindNearInteractions( tree );
3962  mpigofmm::SymmetrizeNearInteractions( tree );
3964  BuildInteractionListPerRank( tree, true );
3966  ExchangeLET( tree, string( "leafgids" ) );
3967  symmetrize_time = omp_get_wtime() - beg;
3968 
3969 
3971  mpi::PrintProgress( "[BEG] MergeFarNodes ...", tree.GetComm() );
3972  beg = omp_get_wtime();
3973  tree.DependencyCleanUp();
3974  MergeFarNodesTask<true, NODE, T> seqMERGEtask;
3975  DistMergeFarNodesTask<true, MPINODE, T> mpiMERGEtask;
3976  tree.LocaTraverseUp( seqMERGEtask );
3977  tree.DistTraverseUp( mpiMERGEtask );
3978  tree.ExecuteAllTasks();
3979  mergefarnodes_time += omp_get_wtime() - beg;
3980  mpi::PrintProgress( "[END] MergeFarNodes ...", tree.GetComm() );
3981 
3983  beg = omp_get_wtime();
3984  mpigofmm::SymmetrizeFarInteractions( tree );
3986  BuildInteractionListPerRank( tree, false );
3987  symmetrize_time += omp_get_wtime() - beg;
3988 
3989  mpi::PrintProgress( "[BEG] Skeletonization ...", tree.GetComm() );
3991  beg = omp_get_wtime();
3992  tree.DependencyCleanUp();
3994  gofmm::SkeletonKIJTask<NNPRUNE, NODE, T> seqGETMTXtask;
3995  mpigofmm::DistSkeletonKIJTask<NNPRUNE, MPINODE, T> mpiGETMTXtask;
3996  mpigofmm::SkeletonizeTask<NODE, T> seqSKELtask;
3997  mpigofmm::DistSkeletonizeTask<MPINODE, T> mpiSKELtask;
3998  tree.LocaTraverseUp( seqGETMTXtask, seqSKELtask );
3999  //tree.DistTraverseUp( mpiGETMTXtask, mpiSKELtask );
4001  gofmm::InterpolateTask<NODE> seqPROJtask;
4002  mpigofmm::InterpolateTask<MPINODE> mpiPROJtask;
4003  tree.LocaTraverseUnOrdered( seqPROJtask );
4004  //tree.DistTraverseUnOrdered( mpiPROJtask );
4005 
4008  //tree.LocaTraverseLeafs( seqNEARKIJtask );
4009 
4010  tree.ExecuteAllTasks();
4011  skel_time = omp_get_wtime() - beg;
4012 
4013  beg = omp_get_wtime();
4014  tree.DistTraverseUp( mpiGETMTXtask, mpiSKELtask );
4015  tree.DistTraverseUnOrdered( mpiPROJtask );
4016  tree.ExecuteAllTasks();
4017  mpi_skel_time = omp_get_wtime() - beg;
4018  mpi::PrintProgress( "[END] Skeletonization ...", tree.GetComm() );
4019 
4020 
4021 
4023  ExchangeLET( tree, string( "skelgids" ) );
4024 
4025  beg = omp_get_wtime();
4027  //mpigofmm::CacheNearNodesTask<NNPRUNE, NODE> seqNEARKIJtask;
4028  //tree.LocaTraverseLeafs( seqNEARKIJtask );
4032  //tree.LocaTraverseUnOrdered( seqFARKIJtask );
4033  //tree.DistTraverseUnOrdered( mpiFARKIJtask );
4034  cachefarnodes_time = omp_get_wtime() - beg;
4035  tree.ExecuteAllTasks();
4036  cachefarnodes_time = omp_get_wtime() - beg;
4037 
4038 
4039 
4041  auto ratio = NonCompressedRatio( tree );
4042 
4043  double exact_ratio = (double) m / n;
4044 
4045  if ( rank == 0 && REPORT_COMPRESS_STATUS )
4046  {
4047  compress_time += ann_time;
4048  compress_time += tree_time;
4049  compress_time += exchange_neighbor_time;
4050  compress_time += symmetrize_time;
4051  compress_time += skel_time;
4052  compress_time += mpi_skel_time;
4053  compress_time += mergefarnodes_time;
4054  compress_time += cachefarnodes_time;
4055  time_ratio = 100.0 / compress_time;
4056  printf( "========================================================\n");
4057  printf( "GOFMM compression phase\n" );
4058  printf( "========================================================\n");
4059  printf( "NeighborSearch ------------------------ %5.2lfs (%5.1lf%%)\n", ann_time, ann_time * time_ratio );
4060  printf( "TreePartitioning ---------------------- %5.2lfs (%5.1lf%%)\n", tree_time, tree_time * time_ratio );
4061  printf( "ExchangeNeighbors --------------------- %5.2lfs (%5.1lf%%)\n", exchange_neighbor_time, exchange_neighbor_time * time_ratio );
4062  printf( "MergeFarNodes ------------------------- %5.2lfs (%5.1lf%%)\n", mergefarnodes_time, mergefarnodes_time * time_ratio );
4063  printf( "Symmetrize ---------------------------- %5.2lfs (%5.1lf%%)\n", symmetrize_time, symmetrize_time * time_ratio );
4064  printf( "Skeletonization (HMLP Runtime ) ----- %5.2lfs (%5.1lf%%)\n", skel_time, skel_time * time_ratio );
4065  printf( "Skeletonization (MPI ) ----- %5.2lfs (%5.1lf%%)\n", mpi_skel_time, mpi_skel_time * time_ratio );
4066  printf( "Cache KIJ ----------------------------- %5.2lfs (%5.1lf%%)\n", cachefarnodes_time, cachefarnodes_time * time_ratio );
4067  printf( "========================================================\n");
4068  printf( "%5.3lf%% and %5.3lf%% uncompressed--------- %5.2lfs (%5.1lf%%)\n",
4069  100 * ratio.first, 100 * ratio.second, compress_time, compress_time * time_ratio );
4070  printf( "========================================================\n\n");
4071  }
4072 
4074  tree_ptr->DependencyCleanUp();
4076  mpi::Barrier( tree.GetComm() );
4077 
4078  return tree_ptr;
4079  }
4080  catch ( const exception & e )
4081  {
4082  cout << e.what() << endl;
4083  exit( 1 );
4084  }
4085 };
4089 template<typename TREE, typename T>
4090 pair<T, T> ComputeError( TREE &tree, size_t gid, Data<T> potentials )
4091 {
4092  int comm_rank; mpi::Comm_rank( tree.GetComm(), &comm_rank );
4093  int comm_size; mpi::Comm_size( tree.GetComm(), &comm_size );
4094 
4096  pair<T, T> ret( 0, 0 );
4097 
4098  auto &K = *tree.setup.K;
4099  auto &w = *tree.setup.w;
4100 
4101  auto I = vector<size_t>( 1, gid );
4102  auto &J = tree.treelist[ 0 ]->gids;
4103 
4105  K.BcastIndices( I, gid % comm_size, tree.GetComm() );
4106 
4107  Data<T> Kab = K( I, J );
4108 
4109  auto loc_exact = potentials;
4110  auto glb_exact = potentials;
4111 
4112  xgemm( "N", "N", Kab.row(), w.col(), w.row(),
4113  1.0, Kab.data(), Kab.row(),
4114  w.data(), w.row(),
4115  0.0, loc_exact.data(), loc_exact.row() );
4116  //gemm::xgemm( (T)1.0, Kab, w, (T)0.0, loc_exact );
4117 
4118 
4119 
4120 
4122  mpi::Allreduce( loc_exact.data(), glb_exact.data(),
4123  loc_exact.size(), MPI_SUM, tree.GetComm() );
4124 
4125  for ( uint64_t j = 0; j < w.col(); j ++ )
4126  {
4127  T exac = glb_exact[ j ];
4128  T pred = potentials[ j ];
4130  ret.first += ( pred - exac ) * ( pred - exac );
4131  ret.second += exac * exac;
4132  }
4133 
4134  return ret;
4135 };
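/**
 *  Editorial note: for one testing gid the pair returned above accumulates,
 *  over all right-hand sides, the squared error sum ( pred - exac )^2 and the
 *  squared reference sum exac^2; callers such as SelfTesting() below form the
 *  relative error as sqrt( ret.first / ret.second ).
 */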
4146 template<typename TREE>
4147 void SelfTesting( TREE &tree, size_t ntest, size_t nrhs )
4148 {
4150  using T = typename TREE::T;
4152  int rank; mpi::Comm_rank( tree.GetComm(), &rank );
4153  int size; mpi::Comm_size( tree.GetComm(), &size );
4155  size_t n = tree.n;
4157  if ( ntest > n ) ntest = n;
4159  vector<size_t> all_rhs( nrhs );
4160  for ( size_t rhs = 0; rhs < nrhs; rhs ++ ) all_rhs[ rhs ] = rhs;
4161 
4162  //auto A = tree.CheckAllInteractions();
4163 
4165  DistData<RIDS, STAR, T> w_rids( n, nrhs, tree.treelist[ 0 ]->gids, tree.GetComm() );
4166  DistData<RBLK, STAR, T> u_rblk( n, nrhs, tree.GetComm() );
4168  w_rids.randn();
4170  auto u_rids = mpigofmm::Evaluate<true>( tree, w_rids );
4172  assert( !u_rids.HasIllegalValue() );
4174  u_rblk = u_rids;
4176  if ( rank == 0 )
4177  {
4178  printf( "========================================================\n");
4179  printf( "Accuracy report\n" );
4180  printf( "========================================================\n");
4181  }
4183  T nnerr_avg = 0.0, nonnerr_avg = 0.0, fmmerr_avg = 0.0;
4184  T sse_2norm = 0.0, ssv_2norm = 0.0;
4186  for ( size_t i = 0; i < ntest; i ++ )
4187  {
4188  size_t tar = i * n / ntest;
4189  Data<T> potentials( (size_t)1, nrhs );
4190  if ( rank == ( tar % size ) ) potentials = u_rblk( vector<size_t>( 1, tar ), all_rhs );
4192  mpi::Bcast( potentials.data(), nrhs, tar % size, tree.GetComm() );
4194  auto sse_ssv = mpigofmm::ComputeError( tree, tar, potentials );
4196  auto fmmerr = sqrt( sse_ssv.first / sse_ssv.second );
4198  fmmerr_avg += fmmerr;
4200  sse_2norm += sse_ssv.first;
4201  ssv_2norm += sse_ssv.second;
4203  if ( i < 10 && rank == 0 )
4204  {
4205  printf( "gid %6lu, ASKIT %3.1E, HODLR %3.1E, GOFMM %3.1E\n",
4206  tar, 0.0, 0.0, fmmerr );
4207  }
4208  }
4209  if ( rank == 0 )
4210  {
4211  printf( "========================================================\n");
4212  printf( "Elementwise ASKIT %3.1E, HODLR %3.1E, GOFMM %3.1E\n",
4213  nnerr_avg / ntest , nonnerr_avg / ntest, fmmerr_avg / ntest );
4214  printf( "F-norm ASKIT %3.1E, HODLR %3.1E, GOFMM %3.1E\n",
4215  0.0, 0.0, sqrt( sse_2norm / ssv_2norm ) );
4216  printf( "========================================================\n");
4217  }
4218 
4220  T lambda = 10.0;
4221  mpigofmm::DistFactorize( tree, lambda );
4222  mpigofmm::ComputeError( tree, lambda, w_rids, u_rids );
4223 };
4227 template<typename SPDMATRIX>
4228 void LaunchHelper( SPDMATRIX &K, gofmm::CommandLineHelper &cmd, mpi::Comm CommGOFMM )
4229 {
4230  using T = typename SPDMATRIX::T;
4231  const int N_CHILDREN = 2;
4233  using SPLITTER = mpigofmm::centersplit<SPDMATRIX, N_CHILDREN, T>;
4235  using RKDTSPLITTER = mpigofmm::randomsplit<SPDMATRIX, N_CHILDREN, T>;
4236  SPLITTER splitter( K );
4237  splitter.Kptr = &K;
4238  splitter.metric = cmd.metric;
4240  RKDTSPLITTER rkdtsplitter( K );
4241  rkdtsplitter.Kptr = &K;
4242  rkdtsplitter.metric = cmd.metric;
4244  gofmm::Configuration<T> config( cmd.metric,
4245  cmd.n, cmd.m, cmd.k, cmd.s, cmd.stol, cmd.budget );
4247  DistData<STAR, CBLK, pair<T, size_t>> NN( 0, cmd.n, CommGOFMM );
4249  auto *tree_ptr = mpigofmm::Compress( K, NN, splitter, rkdtsplitter, config, CommGOFMM );
4250  auto &tree = *tree_ptr;
4251 
4253  mpigofmm::SelfTesting( tree, 100, cmd.nrhs );
4254 
4256  delete tree_ptr;
4257 };
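/**
 *  A minimal driver sketch (editorial; the SPD matrix type, the
 *  CommandLineHelper constructor, and the runtime bring-up are assumptions
 *  about the surrounding HMLP application code, not guarantees of this header):
 *
 *    int main( int argc, char *argv[] )
 *    {
 *      int provided;
 *      MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided );
 *      {
 *        gofmm::CommandLineHelper cmd( argc, argv );  // assumed parsing ctor
 *        UserSPDMatrix K( cmd.n, cmd.n );             // hypothetical SPD matrix type
 *        // the HMLP runtime itself must also be initialized; see the library examples
 *        mpigofmm::LaunchHelper( K, cmd, MPI_COMM_WORLD );
 *      }
 *      MPI_Finalize();
 *      return 0;
 *    }
 */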
4260 };
4261 };
4263 #endif