GCC Code Coverage Report
Directory: .
File: frame/base/DistData.hpp
Date: 2019-01-14
Lines:    0 of 319 executed (0.0 %)
Branches: 0 of 538 executed (0.0 %)

Source:
#ifndef DISTDATA_HPP
#define DISTDATA_HPP

#include <Data.hpp>
#include <hmlp_mpi.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace mpi
{

template<typename T>
int SendData( Data<T> &senddata, int dest, mpi::Comm comm )
{
  int m = senddata.row();
  int n = senddata.col();
  Send( &m, 1, dest, 0, comm );
  Send( &n, 1, dest, 0, comm );
  Send( senddata.data(), m * n, dest, 0, comm );
  return 0;
}; /** end SendData() */

template<typename T>
int RecvData( Data<T> &recvdata, int source, mpi::Comm comm )
{
  int m, n;
  mpi::Status status;
  Recv( &m, 1, source, 0, comm, &status );
  Recv( &n, 1, source, 0, comm, &status );
  recvdata.resize( m, n );
  Recv( recvdata.data(), m * n, source, 0, comm, &status );
  return 0;
}; /** end RecvData() */
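
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  A minimal point-to-point exchange of an hmlp::Data block with
 *  SendData()/RecvData(), assuming MPI has been initialized and the
 *  communicator and matrix sizes below are placeholders:
 *
 *    int rank; mpi::Comm_rank( MPI_COMM_WORLD, &rank );
 *    if ( rank == 0 )
 *    {
 *      hmlp::Data<double> X; X.resize( 4, 8 );   // 4-by-8 block on rank 0
 *      mpi::SendData( X, 1, MPI_COMM_WORLD );    // send to rank 1 with tag 0
 *    }
 *    if ( rank == 1 )
 *    {
 *      hmlp::Data<double> Y;                     // resized to 4-by-8 inside RecvData()
 *      mpi::RecvData( Y, 0, MPI_COMM_WORLD );
 *    }
 */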

/**
 *  This interface supports hmlp::Data. All blocks must share the same
 *  leading dimension m; each entry of recvvector is reshaped to
 *  [ m, recvcount / m ] after the routine.
 */
template<typename T>
int AlltoallData( size_t m,
    vector<Data<T>> &sendvector,
    vector<Data<T>> &recvvector, mpi::Comm comm )
{
  int size = 0; Comm_size( comm, &size );
  int rank = 0; Comm_rank( comm, &rank );

  assert( sendvector.size() == size );
  assert( recvvector.size() == size );

  /** determine the concatenated dimension */
  //size_t m = 0;

  ///** all data need to have the same m */
  //for ( size_t p = 0; p < size; p ++ )
  //{
  //  if ( sendvector[ p ].row() )
  //  {
  //    if ( m ) assert( m == sendvector[ p ].row() );
  //    else     m = sendvector[ p ].row();
  //  }
  //}

  //printf( "after all assertion m %lu\n", m ); fflush( stdout );

  vector<T> sendbuf;
  vector<T> recvbuf;
  vector<int> sendcounts( size, 0 );
  vector<int> recvcounts( size, 0 );
  vector<int> sdispls( size + 1, 0 );
  vector<int> rdispls( size + 1, 0 );

  for ( size_t p = 0; p < size; p ++ )
  {
    sendcounts[ p ] = sendvector[ p ].size();
    sdispls[ p + 1 ] = sdispls[ p ] + sendcounts[ p ];
    sendbuf.insert( sendbuf.end(),
        sendvector[ p ].begin(),
        sendvector[ p ].end() );
  }

  /** Exchange sendcounts. */
  Alltoall( sendcounts.data(), 1, recvcounts.data(), 1, comm );

  /** Compute total receiving count. */
  size_t total_recvcount = 0;
  for ( size_t p = 0; p < size; p ++ )
  {
    rdispls[ p + 1 ] = rdispls[ p ] + recvcounts[ p ];
    total_recvcount += recvcounts[ p ];
  }

  /** Resize receiving buffer. */
  recvbuf.resize( total_recvcount );

  Alltoallv( sendbuf.data(), sendcounts.data(), sdispls.data(),
             recvbuf.data(), recvcounts.data(), rdispls.data(), comm );

  //printf( "after Alltoallv\n" ); fflush( stdout );

  recvvector.resize( size );
  for ( size_t p = 0; p < size; p ++ )
  {
    recvvector[ p ].insert( recvvector[ p ].end(),
        recvbuf.begin() + rdispls[ p ],
        recvbuf.begin() + rdispls[ p + 1 ] );

    if ( recvvector[ p ].size() && m )
    {
      recvvector[ p ].resize( m, recvvector[ p ].size() / m );
    }
  }

  //printf( "after resize\n" ); fflush( stdout );
  //mpi::Barrier( comm );

  return 0;
}; /** end AlltoallData() */
}; /** end namespace mpi */
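
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  A personalized all-to-all of hmlp::Data blocks with AlltoallData(),
 *  assuming MPI is initialized and every block has m rows:
 *
 *    int size; mpi::Comm_size( MPI_COMM_WORLD, &size );
 *    size_t m = 4;
 *    std::vector<hmlp::Data<double>> sendvector( size ), recvvector( size );
 *    for ( int p = 0; p < size; p ++ )
 *      sendvector[ p ].resize( m, 2 );   // two columns destined for rank p
 *    mpi::AlltoallData( m, sendvector, recvvector, MPI_COMM_WORLD );
 *    // recvvector[ p ] now holds the columns sent by rank p, reshaped to m rows.
 */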


typedef enum
{
  CBLK, /** Elemental MC */
  RBLK, /** Elemental MR */
  CIDS, /** Distributed according to column ids */
  RIDS, /** Distributed according to    row ids */
  USER, /** Distributed according to user defined maps */
  STAR, /** Elemental STAR */
  CIRC  /** Elemental CIRC */
} Distribution_t;
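
/**
 *  [Illustrative note, not part of the original header.]
 *  A DistData<ROWDIST, COLDIST, T> below reads as "rows distributed by
 *  ROWDIST, columns distributed by COLDIST".  Assuming an mpi::Comm comm
 *  and global sizes m, n (and a local gid list rids for RIDS):
 *
 *    DistData<STAR, CBLK, double> K( m, n, comm );        // columns cyclic over ranks
 *    DistData<RBLK, STAR, double> w( m, n, comm );        // rows    cyclic over ranks
 *    DistData<RIDS, STAR, double> u( m, n, rids, comm );  // rows given by the gids in rids
 */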

#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif  HMLP_USE_CUDA
/** use pinned (page-locked) memory for NVIDIA GPUs */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
class DistDataBase : public Data<T, Allocator>, public mpi::MPIObject
{
  public:

    /** Default constructor */
    DistDataBase( size_t m, size_t n, mpi::Comm comm ) : mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    };

    /** Construct a zero matrix (but mpi::Comm is still required) */
    DistDataBase( mpi::Comm comm ) : DistDataBase( 0, 0, comm ) {};

    /** Copy constructor for hmlp::Data. */
    DistDataBase( size_t m, size_t n, Data<T, Allocator>& other_data, mpi::Comm comm )
      : Data<T, Allocator>( other_data ), mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    }

    /** Copy constructor for std::vector. */
    DistDataBase( size_t m, size_t n, size_t owned_rows, size_t owned_cols,
        vector<T, Allocator>& other_vector, mpi::Comm comm )
      : Data<T, Allocator>( owned_rows, owned_cols, other_vector ), mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    }

    /** MPI support */
    mpi::Comm GetComm() { return comm; };
    int GetSize() { return comm_size; };
    int GetRank() { return comm_rank; };

    /** Get total row and column numbers across all MPI ranks */
    size_t row() { return global_m; };
    size_t col() { return global_n; };

    /** Get row and column numbers owned by this MPI rank */
    size_t row_owned() { return Data<T>::row(); };
    size_t col_owned() { return Data<T>::col(); };

  private:

    /** Total row and column numbers across all MPI ranks */
    size_t global_m = 0;
    size_t global_n = 0;

    /** MPI support */
    mpi::Comm comm = MPI_COMM_WORLD;
    int comm_size = 0;
    int comm_rank = 0;

}; /** end class DistDataBase */
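
/**
 *  [Illustrative note, not part of the original header.]
 *  row()/col() report the global dimensions of the distributed matrix,
 *  while row_owned()/col_owned() report the locally stored block.  For
 *  example, assuming comm spans 4 ranks:
 *
 *    DistData<STAR, CBLK, double> K( 100, 100, comm );
 *    K.col();        // 100 on every rank
 *    K.col_owned();  // 25 on every rank (columns are distributed cyclically)
 */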


#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = hbw::allocator<T> >
#elif  HMLP_USE_CUDA
/** use pinned (page-locked) memory for NVIDIA GPUs */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = std::allocator<T> >
#endif
class DistData : public DistDataBase<T, Allocator>
{
  public:
  private:
};

template<typename T>
class DistData<CIRC, CIRC, T> : public DistDataBase<T>
{
  public:

    #ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
    #elif  HMLP_USE_CUDA
    /** use pinned (page-locked) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
    #else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
    #endif

    DistData( size_t m, size_t n, int owner, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      this->owner = owner;
      if ( this->GetRank() == owner ) this->resize( m, n );
    };

    /** redistribute from CIDS to CIRC */
    DistData<CIRC, CIRC, T> & operator = ( DistData<STAR, CIDS, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      std::vector<std::vector<size_t>> sendids = A.CIRCOwnership( owner );
      std::vector<std::vector<size_t>> recvids( size );

      /** exchange cids */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      std::vector<std::vector<T, ALLOCATOR>> senddata( size );
      std::vector<std::vector<T, ALLOCATOR>> recvdata( size );

      std::vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract columns from A<STAR,CIDS> */
      senddata[ owner ] = A( amap, sendids[ owner ] );

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      if ( rank == owner )
      {
        for ( size_t p = 0; p < size; p ++ )
        {
          size_t ld = this->row();

          for ( size_t j = 0; j < recvids[ p ].size(); j ++ )
          {
            size_t cid = recvids[ p ][ j ];

            for ( size_t i = 0; i < this->row(); i ++ )
            {
              (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
            }
          };
        };
      }

      mpi::Barrier( comm );
      return (*this);
    };

  private:

    int owner = 0;

};
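
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  Collecting a column-distributed matrix onto a single rank through the
 *  CIDS-to-CIRC assignment, assuming comm, m, n, and a per-rank column
 *  gid list cids are defined elsewhere (rank 0 is the hypothetical owner):
 *
 *    DistData<STAR, CIDS, double> A( m, n, cids, comm );  // cids: columns owned locally
 *    DistData<CIRC, CIRC, double> B( m, n, 0, comm );     // only rank 0 allocates m-by-n
 *    B = A;                                               // all columns gathered on rank 0
 */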

/**
 *  TODO: ADD block size
 */
template<typename T>
class DistData<STAR, CBLK, T> : public DistDataBase<T>
{
  public:

    #ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
    #elif  HMLP_USE_CUDA
    /** use pinned (page-locked) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
    #else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
    #endif

    DistData( size_t m, size_t n, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      /** Columns are distributed. */
      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** Resize the local buffer using Data<T>::resize(). */
      this->resize( m, local_n );
    };

    /** TODO: we should use this one and move mpi::Comm to the front. */
    DistData( size_t m, size_t n, T initT, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** resize the local buffer */
      this->resize( m, local_n, initT );
    };

    /** Construct distributed data from local column data. */
    DistData( size_t m, size_t n, Data<T>& local_column_data, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_column_data, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** The row number of local_column_data must be m. */
      assert( m == local_column_data.row() );
      /** The column number of local_column_data must be local_n. */
      assert( local_n == local_column_data.col() );
    };

    /** Construct distributed data from local std::vector. */
    DistData( size_t m, size_t n, vector<T>& local_vector, mpi::Comm comm )
      : DistDataBase<T>( m, n, m, local_vector.size() / m, local_vector, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** The column number of the local vector must be local_n. */
      assert( local_n == this->col_owned() );
    };

    /** Constructor that reads a binary file. */
    DistData( size_t m, size_t n, mpi::Comm comm, string &filename )
      : DistData<STAR, CBLK, T>( m, n, comm )
    {
      read( m, n, filename );
    };

    /** Read a dense column-major matrix from a binary file. */
    void read( size_t m, size_t n, string &filename )
    {
      assert( this->row() == m );
      assert( this->col() == n );

      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      /** print out filename */
      cout << filename << endl;

      std::ifstream file( filename.data(),
          std::ios::in|std::ios::binary|std::ios::ate );

      if ( file.is_open() )
      {
        auto file_size = file.tellg();
        assert( file_size == m * n * sizeof(T) );

        //for ( size_t j = rank; j < n; j += size )
        //{
        //  size_t byte_offset = j * m * sizeof(T);
        //  file.seekg( byte_offset, std::ios::beg );
        //  file.read( (char*)this->columndata( j / size ), m * sizeof(T) );
        //}

        file.close();
      }

      #pragma omp parallel
      {
        std::ifstream ompfile( filename.data(),
          std::ios::in|std::ios::binary|std::ios::ate );

        if ( ompfile.is_open() )
        {
          #pragma omp for
          for ( size_t j = rank; j < n; j += size )
          {
            size_t byte_offset = j * m * sizeof(T);
            ompfile.seekg( byte_offset, std::ios::beg );
            ompfile.read( (char*)this->columndata( j / size ), m * sizeof(T) );
          }
          ompfile.close();
        }
      } /** end omp parallel */

    }; /** end void read() */

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** Assert that Kij is stored on this MPI process. */
      // TODO: take care of block sizes.
      assert( j % this->GetSize() == this->GetRank() );
      /** Return reference of Kij */
      return DistDataBase<T>::operator () ( i, j / this->GetSize() );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> &I, vector<size_t> &J )
    {
      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( (*it) % this->GetSize() == this->GetRank() );
        (*it) = (*it) / this->GetSize();
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** redistribute from CIRC to CBLK */
    DistData<STAR, CBLK, T> & operator = ( const DistData<CIRC, CIRC, T> &A )
    {
      printf( "not implemented yet\n" );
      exit( 1 );
      return (*this);
    };

    /** Redistribute from CIDS to CBLK */
    DistData<STAR, CBLK, T> & operator = ( DistData<STAR, CIDS, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = A.CBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /** exchange cids */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract columns from A<STAR, CIDS> */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** sendids are gids (not local positions) */
        senddata[ p ] = A( amap, sendids[ p ] );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      /**
       *  It is possible that the received cid has several copies on
       *  different MPI ranks.
       */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        size_t ld = this->row();

        for ( size_t j = 0; j < recvids[ p ].size(); j ++ )
        {
          size_t cid = recvids[ p ][ j ];

          for ( size_t i = 0; i < this->row(); i ++ )
          {
            (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      /** free all buffers and return */
      return (*this);
    };

  private:

}; /** end class DistData<STAR, CBLK, T> */
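
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  In the <STAR, CBLK> layout, global column j lives on rank ( j % size )
 *  at local column ( j / size ).  Assuming MPI is initialized:
 *
 *    int size; mpi::Comm_size( MPI_COMM_WORLD, &size );
 *    int rank; mpi::Comm_rank( MPI_COMM_WORLD, &rank );
 *    DistData<STAR, CBLK, double> K( 8, 10, 0.0, MPI_COMM_WORLD );
 *    if ( rank == 3 % size ) K( 0, 3 ) = 1.0;   // global column 3 is owned by rank 3 % size
 */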

/**
 *  @brief Each MPI process owns ( m / size ) rows of A
 *         in a cyclic fashion (Round Robin), i.e.
 *         if there are 3 MPI processes, then
 *
 *         rank0 owns A(0,:), A(3,:), A(6,:), ...
 *         rank1 owns A(1,:), A(4,:), A(7,:), ...
 *         rank2 owns A(2,:), A(5,:), A(8,:), ...
 */
template<typename T>
class DistData<RBLK, STAR, T> : public DistDataBase<T>
{
  public:

    #ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
    #elif  HMLP_USE_CUDA
    /** use pinned (page-locked) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
    #else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
    #endif

    DistData( size_t m, size_t n, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      /** resize the local buffer */
      this->resize( local_m, n );
    };

    /** Construct distributed data from local row data. */
    DistData( size_t m, size_t n, Data<T>& local_row_data, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_row_data, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      assert( local_m == local_row_data.row() );
      assert( n == local_row_data.col() );
    };

    /** Construct distributed data from local vector. */
    DistData( size_t m, size_t n, vector<T>& local_vector, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_vector.size() / n, n, local_vector, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      assert( local_m == this->row_owned() );
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      if ( i % this->GetSize() != this->GetRank() )
      {
        printf( "ERROR: accessing %lu on rank %d\n", i, this->GetRank() );
        exit( 1 );
      }
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i / this->GetSize(), j );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    hmlp::Data<T> operator () ( std::vector<size_t> I, std::vector<size_t> J )
    {
      for ( auto it = I.begin(); it != I.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( (*it) % this->GetSize() == this->GetRank() );
        (*it) = (*it) / this->GetSize();
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** redistribute from CIRC to RBLK */
    DistData<RBLK, STAR, T> & operator = ( const DistData<CIRC, CIRC, T> &A )
    {
      printf( "not implemented yet\n" );
      exit( 1 );
      return (*this);
    };

    /** redistribute from RIDS to RBLK */
    DistData<RBLK, STAR, T> & operator = ( DistData<RIDS, STAR, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = A.RBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /** Exchange rids. */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** Allocate buffer for data. */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> bmap( this->col() );
      for ( size_t j = 0; j < bmap.size(); j ++ ) bmap[ j ] = j;

      /** Extract rows from A<RIDS,STAR>. */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** sendids are gids (not local positions) */
        senddata[ p ] = A( sendids[ p ], bmap );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        size_t ld = recvdata[ p ].size() / this->col();
        assert( ld == recvids[ p ].size() );
        for ( size_t i = 0; i < recvids[ p ].size(); i ++ )
        {
          size_t rid = recvids[ p ][ i ];
          for ( size_t j = 0; j < this->col(); j ++ )
          {
            (*this)( rid, j ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      /** free all buffers and return */
      return (*this);
    };

  private:

}; /** end class DistData<RBLK, STAR, T> */
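
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  Converting a user-distributed (RIDS) right-hand side back to the
 *  row-cyclic (RBLK) layout, assuming comm, m, nrhs, and the locally
 *  owned row gids rids are defined elsewhere:
 *
 *    DistData<RIDS, STAR, double> u( m, nrhs, rids, comm );
 *    DistData<RBLK, STAR, double> w( m, nrhs, comm );
 *    w = u;   // MPI_Alltoallv-based redistribution defined above
 */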

/**
 *  @brief Each MPI process owns ( cids.size() ) columns of A,
 *         and cids denote the distribution, i.e.
 *         ranki owns A(:,cids[0]),
 *                    A(:,cids[1]),
 *                    A(:,cids[2]), ...
 */
template<typename T>
class DistData<STAR, CIDS, T> : public DistDataBase<T>
{
  public:

    #ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
    #elif  HMLP_USE_CUDA
    /** use pinned (page-locked) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
    #else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
    #endif

    /** Constructor with owned column gids cids. */
    DistData( size_t m, size_t n, vector<size_t> &cids, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      /** check that (sum of cids.size() over all ranks) == n */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;
      this->resize( m, cids.size() );

      for ( size_t j = 0; j < cids.size(); j ++ )
        cid2col[ cids[ j ] ] = j;
    };

    /** Constructor with owned column gids cids and initial value initT. */
    DistData( size_t m, size_t n, vector<size_t> &cids, T initT, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      /** check that (sum of cids.size() over all ranks) == n */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;
      this->resize( m, cids.size(), initT );

      for ( size_t j = 0; j < cids.size(); j ++ )
        cid2col[ cids[ j ] ] = j;
    };

    DistData( size_t m, size_t n, vector<size_t> &cids, Data<T> &A, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      assert( A.row() == m );
      assert( A.col() == cids.size() );

      /** check that (sum of cids.size() over all ranks) == n */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;

      this->insert( this->end(), A.begin(), A.end() );
      this->resize( A.row(), A.col() );

      for ( size_t j = 0; j < cids.size(); j ++ ) cid2col[ cids[ j ] ] = j;
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      assert( cid2col.count( j ) == 1 );
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i, cid2col[ j ] );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> I, vector<size_t> J )
    {
      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( cid2col.count( *it ) == 1 );
        (*it) = cid2col[ (*it) ];
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    bool HasColumn( size_t cid )
    {
      return cid2col.count( cid );
    };

    T *columndata( size_t cid )
    {
      assert( cid2col.count( cid ) == 1 );
      return Data<T>::columndata( cid2col[ cid ] );
    };

    pair<size_t, T*> GetIDAndColumnPointer( size_t j )
    {
      return pair<size_t, T*>( cids[ j ], Data<T>::columndata( j ) );
    };

    /**
     *  @brief Return a vector of vectors that indicates the CBLK ownership
     *         of each MPI rank.
     */
    vector<vector<size_t>> CBLKOwnership()
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      vector<vector<size_t>> ownership( size );

      for ( auto it = cids.begin(); it != cids.end(); it ++ )
      {
        size_t cid = (*it);
        /**
         *  In the CBLK distribution, cid is owned by
         *  rank ( cid % size ) at position ( cid / size ).
         */
        ownership[ cid % size ].push_back( cid );
      };

      return ownership;

    }; /** end CBLKOwnership() */

    vector<vector<size_t>> CIRCOwnership( int owner )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      vector<vector<size_t>> ownership( size );
      ownership[ owner ] = cids;
      return ownership;
    }; /** end CIRCOwnership() */

    /** Redistribution from CBLK to CIDS */
    DistData<STAR, CIDS, T> & operator = ( DistData<STAR, CBLK, T> &A )
    {
      /** assertion: must provide cids */
      assert( cids.size() );

      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = CBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /**
       *  exchange cids:
       *
       *  sendids contain all  required ids from each rank
       *  recvids contain all requested ids from each rank
       */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract columns from A<STAR,CBLK> */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** recvids are gids (not local positions) */
        senddata[ p ] = A( amap, recvids[ p ] );
        assert( senddata[ p ].size() == amap.size() * recvids[ p ].size() );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        assert( recvdata[ p ].size() == sendids[ p ].size() * this->row() );

        size_t ld = this->row();

        for ( size_t j = 0; j < sendids[ p ].size(); j ++ )
        {
          size_t cid = sendids[ p ][ j ];
          for ( size_t i = 0; i < this->row(); i ++ )
          {
            (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      return *this;
    };

  private:

    vector<size_t> cids;

    vector<size_t> cids_from_other_ranks;

    /** Use hash table: cid2col[ cid ] = local column index */
    unordered_map<size_t, size_t> cid2col;

}; /** end class DistData<STAR, CIDS, T> */
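
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  Moving a column-cyclic matrix into a user-chosen column distribution
 *  (e.g. a tree ordering), assuming comm, m, n, and the locally owned
 *  column gids cids are defined elsewhere:
 *
 *    DistData<STAR, CBLK, double> Kblk( m, n, comm );
 *    DistData<STAR, CIDS, double> Kcid( m, n, cids, comm );
 *    Kcid = Kblk;                        // CBLK-to-CIDS redistribution defined above
 *    double Kij = Kcid( 0, cids[ 0 ] );  // access by global column id
 */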

template<typename T>
class DistData<STAR, USER, T> : public DistDataBase<T>
{
  public:

    #ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
    #elif  HMLP_USE_CUDA
    /** use pinned (page-locked) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
    #else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
    #endif

    /** Default constructor */
    DistData( size_t m, size_t n, mpi::Comm comm )
    : DistDataBase<T>( m, n, comm ), all_rows( m )
    {
      for ( size_t i = 0; i < m; i ++ ) all_rows[ i ] = i;
    };

    /** Constructed from DistData<STAR, CBLK> */
    DistData( DistData<STAR, CBLK, T> &A )
    : DistData( A.row(), A.col(), A.GetComm() )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int comm_size; mpi::Comm_size( comm, &comm_size );
      int comm_rank; mpi::Comm_rank( comm, &comm_rank );

      /** Insert column indices into the hash table (cyclic) */
      for ( size_t j  = comm_rank, i = 0;
                   j  < this->col();
                   j += comm_size, i ++ )
      {
        cid2col[ j ] = i;
      }

      /** Check that the local column count matches */
      assert( cid2col.size() == A.col_owned() );
      assert( A.size() == cid2col.size() * this->row() );

      /** Use vector<T>::insert */
      this->insert( this->end(), A.begin(), A.end() );

      /** Resize to update row_owned() and col_owned() */
      this->resize( A.row_owned(), A.col_owned() );
    };

    /**
     *  Insert cids and A into this object.
     *  Note: this is not thread safe.
     */
    void InsertColumns( const vector<size_t>& cids, const Data<T>& A )
    {
      int comm_rank = this->GetRank();

      /** Check if all dimensions match */
      assert( cids.size() == A.col() );
      assert( this->row() == A.row() );

      /** Remove duplicated columns */
      set<size_t> unique_cids;
      vector<size_t> new_cids, msk_cids;
      new_cids.reserve( cids.size() );
      msk_cids.reserve( cids.size() );

      //printf( "rank %d Here\n", comm_rank ); fflush( stdout );

      /**
       * Check if cids already exist using HasColumn(). Make sure
       * there is no duplication using unique_cids.
       */
      //#pragma omp critical
      //{
        for ( size_t i = 0; i < cids.size(); i ++ )
        {
          size_t cid = cids[ i ];
          if ( !HasColumn( cid ) && !unique_cids.count( cid ) )
          {
            new_cids.push_back( cid );
            msk_cids.push_back( i );
            unique_cids.insert( cid );
          }
        }

        //if ( new_cids.size() )
        //  printf( "rank %d cids[ 0 ] %lu cids %lu new_cids %lu\n",
        //      comm_rank, cids[ 0 ], cids.size(), new_cids.size() ); fflush( stdout );

        /** Insert into the hash table one-by-one */
        for ( size_t i = 0; i < new_cids.size(); i ++ )
          cid2col[ new_cids[ i ] ] = i + this->col_owned();

        //printf( "rank %d new_cids %lu cid2col %lu\n",
        //    comm_rank, new_cids.size(), cid2col.size() ); fflush( stdout );
        /** Use vector<T>::insert */
        Data<T> new_A = A( all_rows, msk_cids );
        //printf( "rank %d A %lu %lu\n",
        //    comm_rank, A.row(), A.col() ); fflush( stdout );
        this->insert( this->end(), new_A.begin(), new_A.end() );
        //printf( "rank %d this->size %lu row_owned %lu col_owned %lu\n",
        //    comm_rank, this->size(), this->row_owned(), this->col_owned() ); fflush( stdout );

        /** Check that the total size matches */
        assert( this->row_owned() * cid2col.size() == this->size() );

        /** Resize to update row_owned() and col_owned() */
        //this->resize( this->row(), this->size() / this->row() );
        this->resize( this->row(), cid2col.size() );
        //printf( "rank %d this->size %lu row_owned %lu col_owned %lu done\n",
        //    comm_rank, this->size(), this->row_owned(), this->col_owned() ); fflush( stdout );

      //} /** end omp critical */
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      assert( cid2col.count( j ) == 1 );
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i, cid2col[ j ] );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> I, vector<size_t> J )
    {
      int comm_rank = this->GetRank();

      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** Assert that Kij is stored on this MPI process */
        if ( cid2col.count( *it ) != 1 )
        {
          printf( "rank %d cid2col.count(%lu) = %lu\n",
              comm_rank, *it, cid2col.count( *it ) ); fflush( stdout );
          exit( 1 );
        }
        /** Overwrite J with its corresponding local offset */
        (*it) = cid2col[ (*it) ];
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** Check if cid already exists */
    bool HasColumn( size_t cid )
    {
      return cid2col.count( cid );
    };

    /** Return pointer of cid column using Data<T>::columndata */
    T *columndata( size_t cid )
    {
      assert( cid2col.count( cid ) == 1 );
      return Data<T>::columndata( cid2col[ cid ] );
    };

  private:

    /** all_rows = [ 0, 1, ..., m-1 ] */
    vector<size_t> all_rows;

    /** Use hash table: cid2col[ cid ] = local column index */
    unordered_map<size_t, size_t> cid2col;

}; /** end class DistData<STAR, USER, T> */
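
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  A <STAR, USER> matrix starts from a CBLK copy and can later cache
 *  columns received from other ranks, assuming comm, m, n, and the
 *  hypothetical gid list extra_cids are defined elsewhere:
 *
 *    DistData<STAR, CBLK, double> Kblk( m, n, comm );
 *    DistData<STAR, USER, double> Kusr( Kblk );       // local columns: the cyclic subset
 *    hmlp::Data<double> extra_cols;
 *    extra_cols.resize( m, extra_cids.size() );       // values for the extra columns
 *    Kusr.InsertColumns( extra_cids, extra_cols );    // Kusr( i, cid ) is now valid for these cids
 */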
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
/**
1212
 *  @brief Ecah MPI process own ( rids.size() ) rows of A,
1213
 *         and rids denote the distribution. i.e.
1214
 *         ranki owns A(rids[0],:),
1215
 *                    A(rids[1],:),
1216
 *                    A(rids[2],:), ...
1217
 */
1218
template<typename T>
1219
class DistData<RIDS, STAR, T> : public DistDataBase<T>
1220
{
1221
  public:
1222
1223
    #ifdef HMLP_MIC_AVX512
1224
    /** use hbw::allocator for Intel Xeon Phi */
1225
    using ALLOCATOR = hbw::allocator<T>;
1226
    #elif  HMLP_USE_CUDA
1227
    /** use pinned (page-lock) memory for NVIDIA GPUs */
1228
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
1229
    #else
1230
    /** use default stl allocator */
1231
    using ALLOCATOR = std::allocator<T>;
1232
    #endif
1233
1234
1235
    /** default constructor */
1236
    DistData( size_t m, size_t n, std::vector<size_t> &rids, mpi::Comm comm ) :
1237
      DistDataBase<T>( m, n, comm )
1238
    {
1239
      /** now check if (sum rids.size() == m) */
1240
      size_t bcast_m = rids.size();
1241
      size_t reduc_m = 0;
1242
      mpi::Allreduce( &bcast_m, &reduc_m, 1, MPI_SUM, comm );
1243
1244
      if ( reduc_m != m )
1245
        printf( "%lu %lu\n", reduc_m, m ); fflush( stdout );
1246
1247
1248
      assert( reduc_m == m );
1249
      this->rids = rids;
1250
      this->resize( rids.size(), n );
1251
1252
      for ( size_t i = 0; i < rids.size(); i ++ )
1253
        rid2row[ rids[ i ] ] = i;
1254
    };
1255
1256
1257
    /**
1258
     *  Overload operator () to allow accessing data using gids
1259
     */
1260
    T & operator () ( size_t i , size_t j )
1261
    {
1262
      /** assert that Kij is stored on this MPI process */
1263
      assert( rid2row.count( i ) == 1 );
1264
      /** return reference of Kij */
1265
      return DistDataBase<T>::operator () ( rid2row[ i ], j );
1266
    };
1267
1268
1269
    /**
1270
     *  Overload operator () to return a local submatrix using gids
1271
     */
1272
    hmlp::Data<T> operator () ( std::vector<size_t> I, std::vector<size_t> J )
1273
    {
1274
      for ( auto it = I.begin(); it != I.end(); it ++ )
1275
      {
1276
        /** assert that Kij is stored on this MPI process */
1277
        assert( rid2row.count(*it) == 1 );
1278
        (*it) = rid2row[ (*it) ];
1279
      }
1280
      return DistDataBase<T>::operator () ( I, J );
1281
    };
1282
1283
1284
1285
1286
1287
1288
    /**
1289
     *  @brief Return a vector of vector that indicates the RBLK ownership
1290
     *         of each MPI rank.
1291
     */
1292
    std::vector<std::vector<size_t>> RBLKOwnership()
1293
    {
1294
      /** MPI */
1295
      mpi::Comm comm = this->GetComm();
1296
      int size = this->GetSize();
1297
      int rank = this->GetRank();
1298
1299
      std::vector<std::vector<size_t>> ownership( size );
1300
1301
      for ( auto it = rids.begin(); it != rids.end(); it ++ )
1302
      {
1303
        size_t rid = (*it);
1304
        /**
1305
         *  While in RBLK distribution, rid is owned by
1306
         *  rank ( rid % size ) at position ( rid / size )
1307
         */
1308
        ownership[ rid % size ].push_back( rid );
1309
      };
1310
1311
      return ownership;
1312
1313
    }; /** end RBLKOwnership() */
1314
1315
1316
1317
    /**
1318
     *  redistribution from RBLK to RIDS (MPI_Alltoallv)
1319
     */
1320
    DistData<RIDS, STAR, T> & operator = ( DistData<RBLK, STAR, T> &A )
1321
    {
1322
      /** assertion: must provide rids */
1323
      assert( rids.size() );
1324
1325
      /** MPI */
1326
      mpi::Comm comm = this->GetComm();
1327
      int size = this->GetSize();
1328
      int rank = this->GetRank();
1329
1330
      //printf( "Enter redistrivution rids.size() %lu\n", rids.size() ); fflush( stdout );
1331
      //mpi::Barrier( comm );
1332
1333
      /** allocate buffer for ids */
1334
      std::vector<std::vector<size_t>> sendids = RBLKOwnership();
1335
      std::vector<std::vector<size_t>> recvids( size );
1336
1337
      //for ( size_t i = 0; i < rids.size(); i ++ )
1338
      //{
1339
      //  /** since A has RBLK distribution, rid is stored at rank (rid) % size */
1340
      //  size_t rid = rids[ i ];
1341
      //  /** ( rid / size ) is the local id of A */
1342
      //  sendids[ rid % size ].push_back( rid );
1343
      //}
1344
1345
      //printf( "before All2allvector1\n" ); fflush( stdout );
1346
      //mpi::Barrier( comm );
1347
1348
      /**
1349
       *  exchange rids:
1350
       *
1351
       *  sendids contain all  required ids from each rank
1352
       *  recvids contain all requested ids from each rank
1353
       *
1354
       */
1355
      mpi::AlltoallVector( sendids, recvids, comm );
1356
1357
      //printf( "Finish All2allvector1\n" );fflush( stdout );
1358
      //mpi::Barrier( comm );
1359
1360
1361
      std::vector<std::vector<T, ALLOCATOR>> senddata( size );
1362
      std::vector<std::vector<T, ALLOCATOR>> recvdata( size );
1363
1364
      std::vector<size_t> bmap( this->col() );
1365
      for ( size_t j = 0; j < bmap.size(); j ++ ) bmap[ j ] = j;
1366
1367
      /** extract rows from A<RBLK,STAR> */
1368
      #pragma omp parallel for
1369
      for ( size_t p = 0; p < size; p ++ )
1370
      {
1371
        /** recvids should be gids (not local posiiton) */
1372
        senddata[ p ] = A( recvids[ p ], bmap );
1373
        assert( senddata[ p ].size() == recvids[ p ].size() * bmap.size() );
1374
      }
1375
1376
      //printf( "before All2allvector2\n" ); fflush( stdout );
1377
      /** exchange data */
1378
      mpi::AlltoallVector( senddata, recvdata, comm );
1379
1380
1381
      #pragma omp parallel for
1382
      for ( size_t p = 0; p < size; p ++ )
1383
      {
1384
        assert( recvdata[ p ].size() == sendids[ p ].size() * this->col() );
1385
1386
        size_t ld = recvdata[ p ].size() / this->col();
1387
1388
        assert( ld == sendids[ p ].size() );
1389
1390
1391
        for ( size_t i = 0; i < sendids[ p ].size(); i ++ )
1392
        {
1393
          size_t rid = sendids[ p ][ i ];
1394
          for ( size_t j = 0; j < this->col(); j ++ )
1395
          {
1396
            (*this)( rid, j ) = recvdata[ p ][ j * ld + i ];
1397
          }
1398
        };
1399
      };
1400
      mpi::Barrier( comm );
1401
1402
      return (*this);
1403
    };
1404
1405
1406
1407
1408
1409
1410
1411
  private:
1412
1413
    /** owned row gids */
1414
    vector<size_t> rids;
1415
1416
    map<size_t, size_t> rid2row;
1417
1418
1419
}; /** end class DistData<RIDS, STAR, T> */
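
/**
 *  [Illustrative usage sketch, not part of the original header.]
 *  Pulling the rows listed in rids out of a row-cyclic matrix and then
 *  accessing them by global row id, assuming comm, m, nrhs, and the
 *  locally owned row gids rids are defined elsewhere:
 *
 *    DistData<RBLK, STAR, double> w( m, nrhs, comm );
 *    DistData<RIDS, STAR, double> u( m, nrhs, rids, comm );
 *    u = w;                          // RBLK-to-RIDS redistribution defined above
 *    double u0 = u( rids[ 0 ], 0 );  // indexed by the global row id
 */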
1420
1421
1422
1423
1424
template<typename T>
1425
class DistData<STAR, STAR, T> : public DistDataBase<T>
1426
{
1427
  public:
1428
1429
  private:
1430
1431
}; /** end class DistData<STAR, STAR, T> */
1432
1433
1434
1435
1436
1437
1438
}; /** end namespace hmlp */
1439
1440
1441
#endif /** define DISTDATA_HPP */
1442