GCC Code Coverage Report
Directory: .
File:      frame/primitives/combinatorics.hpp
Date:      2019-01-14

              Exec    Total    Coverage
Lines:           0      127       0.0 %
Branches:        0      342       0.0 %

Source:
#ifndef COMBINATORICS_HPP
#define COMBINATORICS_HPP

/** Use STL and vector. */
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <random>
#include <algorithm>

/** Required for assert(), std::fabs(), and omp_get_max_threads() used below. */
#include <cassert>
#include <cmath>
#include <omp.h>

/** Use MPI support. */
#include <hmlp_mpi.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace combinatorics
{


template<typename T>
vector<T> SampleWithoutReplacement( int l, vector<T> v )
{
  if ( l >= v.size() ) return v;
  random_device rd;
  std::mt19937 generator( rd() );
  shuffle( v.begin(), v.end(), generator );
  vector<T> ret( l );
  for ( int i = 0; i < l; i ++ ) ret[ i ] = v[ i ];
  return ret;
}; /** end SampleWithoutReplacement() */
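
/**
 *  Usage sketch (illustrative only, names are hypothetical): draw a
 *  random subset of 10 values from a pool of 100 doubles.
 */
inline void ExampleSampleWithoutReplacement()
{
  std::vector<double> pool( 100 );
  for ( size_t i = 0; i < pool.size(); i ++ ) pool[ i ] = (double)i;
  /** v is passed by value, so the caller's pool is left untouched */
  auto sample = SampleWithoutReplacement( 10, pool );
  /** sample.size() == 10 and each value appears at most once */
  printf( "sampled %zu values\n", sample.size() );
}; /** end ExampleSampleWithoutReplacement() */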


#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif  HMLP_USE_CUDA
/** use pinned (page-lock) memory for NVIDIA GPUs */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
vector<T> Sum( size_t d, size_t n, vector<T, Allocator> &X, vector<size_t> &gids )
{
  bool do_general_stride = ( gids.size() == n );

  /** assertion */
  if ( !do_general_stride ) assert( X.size() == d * n );

  /** declaration */
  int n_split = omp_get_max_threads();
  std::vector<T> sum( d, 0.0 );
  std::vector<T> temp( d * n_split, 0.0 );

  /** compute partial sum on each thread */
  #pragma omp parallel for num_threads( n_split )
  for ( int j = 0; j < n_split; j ++ )
    for ( int i = j; i < n; i += n_split )
      for ( int p = 0; p < d; p ++ )
        if ( do_general_stride )
          temp[ j * d + p ] += X[ gids[ i ] * d + p ];
        else
          temp[ j * d + p ] += X[ i * d + p ];

  /** reduce all temporary buffers */
  for ( int j = 0; j < n_split; j ++ )
    for ( int p = 0; p < d; p ++ )
      sum[ p ] += temp[ j * d + p ];

  return sum;

}; /** end Sum() */
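
/**
 *  Usage sketch (illustrative only, names are hypothetical): sum n = 4
 *  points stored with d = 2 values per point, once over all points and
 *  once over a subset selected through gids.
 */
inline void ExampleSum()
{
  size_t d = 2, n = 4;
  /** point i occupies X[ i * d + 0 .. i * d + d - 1 ] */
  std::vector<double> X = { 1, 2,  3, 4,  5, 6,  7, 8 };
  /** empty gids: contiguous stride, sums all n points */
  std::vector<size_t> all;
  auto total = Sum( d, n, X, all );               /** { 16, 20 } */
  /** general stride: sum only points 0 and 2 */
  std::vector<size_t> gids = { 0, 2 };
  auto partial = Sum( d, gids.size(), X, gids );  /** {  6,  8 } */
  printf( "total[ 0 ] = %lf, partial[ 0 ] = %lf\n", total[ 0 ], partial[ 0 ] );
}; /** end ExampleSum() */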


/** TODO */
#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif  HMLP_USE_CUDA
/** use pinned (page-lock) memory for NVIDIA GPUs */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
vector<T> Sum( size_t d, size_t n, vector<T, Allocator> &X, mpi::Comm comm )
{
  size_t num_points_owned = X.size() / d;

  /** gids */
  vector<size_t> dummy_gids;

  /** local sum */
  auto local_sum = Sum( d, num_points_owned, X, dummy_gids );

  /** total sum */
  vector<T> total_sum( d, 0 );

  /** Allreduce */
  hmlp::mpi::Allreduce(
      local_sum.data(), total_sum.data(), d, MPI_SUM, comm );

  return total_sum;

}; /** end Sum() */
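
/**
 *  Usage sketch (illustrative only, names are hypothetical): each rank
 *  passes its locally owned points; the overload above reduces the
 *  per-rank sums with MPI_SUM so every rank receives the global sum.
 */
inline void ExampleDistributedSum( mpi::Comm comm )
{
  size_t d = 3;
  /** locally owned points (2 per rank here); layout is d values per point */
  std::vector<double> X_local = { 1, 1, 1,  2, 2, 2 };
  /** the overload recomputes the local count as X.size() / d */
  auto global_sum = Sum( d, X_local.size() / d, X_local, comm );
  /** global_sum[ p ] holds the sum over all ranks in dimension p */
  printf( "global_sum[ 0 ] = %lf\n", global_sum[ 0 ] );
}; /** end ExampleDistributedSum() */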


#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif  HMLP_USE_CUDA
/** use pinned (page-lock) memory for NVIDIA GPUs */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
std::vector<T> Mean( size_t d, size_t n, std::vector<T, Allocator> &X, std::vector<size_t> &gids )
{
  /** sum over n points in d dimensions */
  auto mean = Sum( d, n, X, gids );

  /** compute average */
  for ( int p = 0; p < d; p ++ ) mean[ p ] /= n;

  return mean;

}; /** end Mean() */


/**
 *  @brief Compute the mean values. (alternative interface)
 */
template<typename T>
std::vector<T> Mean( size_t d, size_t n, std::vector<T> &X )
{
  /** assertion */
  assert( X.size() == d * n );

  /** gids */
  std::vector<std::size_t> dummy_gids;

  return Mean( d, n, X, dummy_gids );

}; /** end Mean() */
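
/**
 *  Usage sketch (illustrative only, names are hypothetical): mean of
 *  n = 3 points in d = 2 dimensions using the gids-free interface.
 */
inline void ExampleMean()
{
  size_t d = 2, n = 3;
  std::vector<double> X = { 0, 0,  3, 6,  6, 12 };
  auto mean = Mean( d, n, X );
  /** mean == { 3, 6 } */
  printf( "mean = ( %lf, %lf )\n", mean[ 0 ], mean[ 1 ] );
}; /** end ExampleMean() */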


/** TODO */
/**
 *  @brief Distributed mean values over d dimensions.
 */
template<typename T>
std::vector<T> Mean( size_t d, size_t n, std::vector<T> &X,
    hmlp::mpi::Comm comm )
{
  /** sum over n points in d dimensions */
  auto mean = Sum( d, n, X, comm );

  /** compute average */
  for ( int p = 0; p < d; p ++ ) mean[ p ] /= n;

  return mean;

}; /** end Mean() */


/**
 *  @brief Parallel prefix scan (exclusive): B[ 0 ] = 0 and
 *         B[ j ] = A[ 0 ] + ... + A[ j - 1 ].
 */
template<typename TA, typename TB>
void Scan( std::vector<TA> &A, std::vector<TB> &B )
{
  assert( A.size() == B.size() - 1 );

  /** number of threads */
  size_t p = omp_get_max_threads();

  /** problem size */
  size_t n = B.size();

  /** step size */
  size_t nb = n / p;

  /** private temporary buffer for each thread */
  std::vector<TB> sum( p, (TB)0 );

  /** exclusive scan: B[ 0 ] = 0 */
  B[ 0 ] = (TB)0;

  /** small problem size: sequential */
  if ( n < 100 * p )
  {
    size_t beg = 0;
    size_t end = n;
    for ( size_t j = beg + 1; j < end; j ++ )
      B[ j ] = B[ j - 1 ] + A[ j - 1 ];
    return;
  }

  /** parallel local scan */
  #pragma omp parallel for schedule( static )
  for ( size_t i = 0; i < p; i ++ )
  {
    size_t beg = i * nb;
    size_t end = beg + nb;
    /** deal with the edge case */
    if ( i == p - 1 ) end = n;
    if ( i != 0 ) B[ beg ] = (TB)0;
    for ( size_t j = beg + 1; j < end; j ++ )
    {
      B[ j ] = B[ j - 1 ] + A[ j - 1 ];
    }
  }

  /** sequential scan on local sum */
  for ( size_t i = 1; i < p; i ++ )
  {
    sum[ i ] = sum[ i - 1 ] + B[ i * nb - 1 ] + A[ i * nb - 1 ];
  }

  #pragma omp parallel for schedule( static )
  for ( size_t i = 1; i < p; i ++ )
  {
    size_t beg = i * nb;
    size_t end = beg + nb;
    /** deal with the edge case */
    if ( i == p - 1 ) end = n;
    TB sum_ = sum[ i ];
    for ( size_t j = beg; j < end; j ++ )
    {
      B[ j ] += sum_;
    }
  }

}; /** end Scan() */
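
/**
 *  Usage sketch (illustrative only, names are hypothetical): exclusive
 *  prefix scan of 4 values; B needs exactly one more entry than A.
 */
inline void ExampleScan()
{
  std::vector<int> A = { 1, 2, 3, 4 };
  std::vector<int> B( A.size() + 1, 0 );
  Scan( A, B );
  /** B == { 0, 1, 3, 6, 10 }, i.e. B[ j ] = A[ 0 ] + ... + A[ j - 1 ] */
  printf( "B[ 4 ] = %d\n", B[ 4 ] );
}; /** end ExampleScan() */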


/**
 *  @brief Select the k-th smallest element of x (increasing order).
 *
 *  @TODO  The mean function is parallel, but the splitter is not.
 *         I need something like a parallel scan.
 */
template<typename T>
T Select( size_t n, size_t k, std::vector<T> &x )
{
  /** assertion */
  assert( k <= n && x.size() == n );

  /** early return */
  if ( n == 1 ) return x[ 0 ];

  std::vector<T> mean = Mean( (size_t)1, n, x );
  std::vector<T> lhs, rhs;
  std::vector<size_t> lflag( n, 0 );
  std::vector<size_t> rflag( n, 0 );
  std::vector<size_t> pscan( n + 1, 0 );

  /** mark flags */
  #pragma omp parallel for
  for ( size_t i = 0; i < n; i ++ )
  {
    if ( x[ i ] > mean[ 0 ] ) rflag[ i ] = 1;
    else                      lflag[ i ] = 1;
  }

  /**
   *  prefix sum on flags of left hand side
   *  input:  flags
   *  output: zero-based index
   **/
  Scan( lflag, pscan );

  /** resize left hand side */
  lhs.resize( pscan[ n ] );

  #pragma omp parallel for
  for ( size_t i = 0; i < n; i ++ )
  {
    if ( lflag[ i ] )
      lhs[ pscan[ i ] ] = x[ i ];
  }

  /**
   *  prefix sum on flags of right hand side
   *  input:  flags
   *  output: zero-based index
   **/
  Scan( rflag, pscan );

  /** resize right hand side */
  rhs.resize( pscan[ n ] );

  #pragma omp parallel for
  for ( size_t i = 0; i < n; i ++ )
  {
    if ( rflag[ i ] )
      rhs[ pscan[ i ] ] = x[ i ];
  }

  int nlhs = lhs.size();
  int nrhs = rhs.size();

  if ( nlhs == k || nlhs == n || nrhs == n )
  {
    return mean[ 0 ];
  }
  else if ( nlhs > k )
  {
    rhs.clear();
    return Select( nlhs, k, lhs );
  }
  else
  {
    lhs.clear();
    return Select( nrhs, k - nlhs, rhs );
  }

}; /** end Select() */
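
/**
 *  Usage sketch (illustrative only, names are hypothetical): approximate
 *  the median of x by selecting rank n / 2.  Because the splitter may
 *  stop at a mean value, treat the result as an approximate order
 *  statistic rather than an exact one.
 */
inline void ExampleSelect()
{
  std::vector<double> x = { 9, 1, 7, 3, 5 };
  auto median = Select( x.size(), x.size() / 2, x );
  printf( "approximate median = %lf\n", median );
}; /** end ExampleSelect() */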


template<typename T>
T Select( size_t k, std::vector<T> &x, hmlp::mpi::Comm comm )
{
  /** declaration */
  std::vector<T> lhs, rhs;
  lhs.reserve( x.size() );
  rhs.reserve( x.size() );
  int n = 0;
  int num_points_owned = x.size();

  /** reduce to get the total problem size */
  hmlp::mpi::Allreduce( &num_points_owned, &n, 1, MPI_SUM, comm );

  /** TODO: mean value */
  std::vector<T> mean = Mean( (size_t)1, n, x, comm );

  for ( size_t i = 0; i < x.size(); i ++ )
  {
    if ( x[ i ] < mean[ 0 ] ) lhs.push_back( x[ i ] );
    else                      rhs.push_back( x[ i ] );
  }

  /** reduce nlhs and nrhs */
  int nlhs = 0;
  int nrhs = 0;
  int num_lhs_owned = lhs.size();
  int num_rhs_owned = rhs.size();
  hmlp::mpi::Allreduce( &num_lhs_owned, &nlhs, 1, MPI_SUM, comm );
  hmlp::mpi::Allreduce( &num_rhs_owned, &nrhs, 1, MPI_SUM, comm );

  if ( nlhs == k || n == 1 || n == nlhs || n == nrhs )
  {
    return mean[ 0 ];
  }
  else if ( nlhs > k )
  {
    rhs.clear();
    return Select( k, lhs, comm );
  }
  else
  {
    lhs.clear();
    return Select( k - nlhs, rhs, comm );
  }

}; /** end Select() */
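
/**
 *  Usage sketch (illustrative only, names are hypothetical): every rank
 *  passes its local values and the same global rank k; all ranks obtain
 *  the same (approximate) k-th smallest value of the union of all
 *  local vectors.
 */
inline void ExampleDistributedSelect( mpi::Comm comm )
{
  /** locally owned values; sizes may differ across ranks */
  std::vector<double> x_local = { 0.3, 1.7, 2.9 };
  int n_local = x_local.size();
  int n_global = 0;
  hmlp::mpi::Allreduce( &n_local, &n_global, 1, MPI_SUM, comm );
  /** approximate global median: k = n_global / 2 */
  auto median = Select( (size_t)( n_global / 2 ), x_local, comm );
  printf( "approximate global median = %lf\n", median );
}; /** end ExampleDistributedSelect() */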


template<typename T>
vector<vector<size_t>> MedianThreeWaySplit( vector<T> &v, T tol )
{
  size_t n = v.size();
  auto median = Select( n, n / 2, v );
  /** Split indices of v into 3-way: lhs, rhs, and mid. */
  vector<vector<size_t>> three_ways( 3 );
  auto & lhs = three_ways[ 0 ];
  auto & rhs = three_ways[ 1 ];
  auto & mid = three_ways[ 2 ];
  for ( size_t i = 0; i < v.size(); i ++ )
  {
    if ( std::fabs( v[ i ] - median ) < tol ) mid.push_back( i );
    else if ( v[ i ] < median ) lhs.push_back( i );
    else rhs.push_back( i );
  }
  return three_ways;
}; /** end MedianThreeWaySplit() */


/** @brief Split indices into two halves according to the median. */
template<typename T>
vector<vector<size_t>> MedianSplit( vector<T> &v )
{
  auto three_ways = MedianThreeWaySplit( v, (T)1E-6 );
  vector<vector<size_t>> two_ways( 2 );
  two_ways[ 0 ] = three_ways[ 0 ];
  two_ways[ 1 ] = three_ways[ 1 ];
  auto & lhs = two_ways[ 0 ];
  auto & rhs = two_ways[ 1 ];
  auto & mid = three_ways[ 2 ];
  for ( auto it : mid )
  {
    if ( lhs.size() < rhs.size() ) lhs.push_back( it );
    else rhs.push_back( it );
  }
  return two_ways;
}; /** end MedianSplit() */
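
/**
 *  Usage sketch (illustrative only, names are hypothetical): split the
 *  indices of v into two balanced halves around the approximate median;
 *  ties near the median are assigned to whichever side is smaller.
 */
inline void ExampleMedianSplit()
{
  std::vector<double> v = { 4, 8, 1, 6, 3, 9, 2, 5 };
  auto two_ways = MedianSplit( v );
  auto & lhs = two_ways[ 0 ];  /** indices i with v[ i ] below the median */
  auto & rhs = two_ways[ 1 ];  /** indices i with v[ i ] above (plus rebalanced ties) */
  /** every index appears exactly once: lhs.size() + rhs.size() == v.size() */
  printf( "lhs %zu, rhs %zu\n", lhs.size(), rhs.size() );
}; /** end ExampleMedianSplit() */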


}; /** end namespace combinatorics */
}; /** end namespace hmlp */

#endif /** define COMBINATORICS_HPP */