// HMLP: High-performance Machine Learning Primitives
// File: combinatorics.hpp
1 #ifndef COMBINATORICS_HPP
2 #define COMBINATORICS_HPP
3 
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include <hmlp_mpi.hpp>
13 
14 using namespace std;
15 using namespace hmlp;
16 
17 
18 namespace hmlp
19 {
20 namespace combinatorics
21 {
22 
23 
24 template<typename T>
25 vector<T> SampleWithoutReplacement( int l, vector<T> v )
26 {
27  if ( l >= v.size() ) return v;
28  random_device rd;
29  std::mt19937 generator( rd() );
30  shuffle( v.begin(), v.end(), generator );
31  vector<T> ret( l );
32  for ( int i = 0; i < l; i ++ ) ret[ i ] = v[ i ];
33  return ret;
34 };
40 #ifdef HMLP_MIC_AVX512
41 
42 template<class T, class Allocator = hbw::allocator<T> >
43 #elif HMLP_USE_CUDA
44 
45 template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
46 #else
47 
48 template<class T, class Allocator = std::allocator<T> >
49 #endif
50 vector<T> Sum( size_t d, size_t n, vector<T, Allocator> &X, vector<size_t> &gids )
51 {
52  bool do_general_stride = ( gids.size() == n );
53 
55  if ( !do_general_stride ) assert( X.size() == d * n );
56 
58  int n_split = omp_get_max_threads();
59  std::vector<T> sum( d, 0.0 );
60  std::vector<T> temp( d * n_split, 0.0 );
61 
63  #pragma omp parallel for num_threads( n_split )
64  for ( int j = 0; j < n_split; j ++ )
65  for ( int i = j; i < n; i += n_split )
66  for ( int p = 0; p < d; p ++ )
67  if ( do_general_stride )
68  temp[ j * d + p ] += X[ gids[ i ] * d + p ];
69  else
70  temp[ j * d + p ] += X[ i * d + p ];
71 
73  for ( int j = 0; j < n_split; j ++ )
74  for ( int p = 0; p < d; p ++ )
75  sum[ p ] += temp[ j * d + p ];
76 
77  return sum;
78 
79 };
83 #ifdef HMLP_MIC_AVX512
84 
85 template<class T, class Allocator = hbw::allocator<T> >
86 #elif HMLP_USE_CUDA
87 
88 template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
89 #else
90 
91 template<class T, class Allocator = std::allocator<T> >
92 #endif
93 vector<T> Sum( size_t d, size_t n, vector<T, Allocator> &X, mpi::Comm comm )
94 {
95  size_t num_points_owned = X.size() / d;
96 
98  vector<size_t> dummy_gids;
99 
101  auto local_sum = Sum( d, num_points_owned, X, dummy_gids );
102 
104  vector<T> total_sum( d, 0 );
105 
108  local_sum.data(), total_sum.data(), d, MPI_SUM, comm );
109 
110  return total_sum;
111 
112 };
117 #ifdef HMLP_MIC_AVX512
118 
119 template<class T, class Allocator = hbw::allocator<T> >
120 #elif HMLP_USE_CUDA
121 
122 template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
123 #else
124 
125 template<class T, class Allocator = std::allocator<T> >
126 #endif
127 std::vector<T> Mean( size_t d, size_t n, std::vector<T, Allocator> &X, std::vector<size_t> &gids )
128 {
130  auto mean = Sum( d, n, X, gids );
131 
133  for ( int p = 0; p < d; p ++ ) mean[ p ] /= n;
134 
135  return mean;
136 
137 };
144 template<typename T>
145 std::vector<T> Mean( size_t d, size_t n, std::vector<T> &X )
146 {
148  assert( X.size() == d * n );
149 
151  std::vector<std::size_t> dummy_gids;
152 
153  return Mean( d, n, X, dummy_gids );
154 
155 };
164 template<typename T>
165 std::vector<T> Mean( size_t d, size_t n, std::vector<T> &X,
166  hmlp::mpi::Comm comm )
167 {
169  auto mean = Sum( d, n, X, comm );
170 
172  for ( int p = 0; p < d; p ++ ) mean[ p ] /= n;
173 
174  return mean;
175 
176 };
183 template<typename TA, typename TB>
184 void Scan( std::vector<TA> &A, std::vector<TB> &B )
185 {
186  assert( A.size() == B.size() - 1 );
187 
189  size_t p = omp_get_max_threads();
190 
192  size_t n = B.size();
193 
195  size_t nb = n / p;
196 
198  std::vector<TB> sum( p, (TB)0 );
199 
201  B[ 0 ] = (TB)0;
202 
204  if ( n < 100 * p )
205  {
206  size_t beg = 0;
207  size_t end = n;
208  for ( size_t j = beg + 1; j < end; j ++ )
209  B[ j ] = B[ j - 1 ] + A[ j - 1 ];
210  return;
211  }
212 
214  #pragma omp parallel for schedule( static )
215  for ( size_t i = 0; i < p; i ++ )
216  {
217  size_t beg = i * nb;
218  size_t end = beg + nb;
220  if ( i == p - 1 ) end = n;
221  if ( i != 0 ) B[ beg ] = (TB)0;
222  for ( size_t j = beg + 1; j < end; j ++ )
223  {
224  B[ j ] = B[ j - 1 ] + A[ j - 1 ];
225  }
226  }
227 
229  for ( size_t i = 1; i < p; i ++ )
230  {
231  sum[ i ] = sum[ i - 1 ] + B[ i * nb - 1 ] + A[ i * nb - 1 ];
232  }
233 
234  #pragma omp parallel for schedule( static )
235  for ( size_t i = 1; i < p; i ++ )
236  {
237  size_t beg = i * nb;
238  size_t end = beg + nb;
240  if ( i == p - 1 ) end = n;
241  TB sum_ = sum[ i ];
242  for ( size_t j = beg; j < end; j ++ )
243  {
244  B[ j ] += sum_;
245  }
246  }
247 
248 };
260 template<typename T>
261 T Select( size_t n, size_t k, std::vector<T> &x )
262 {
264  assert( k <= n && x.size() == n );
265 
267  if ( n == 1 ) return x[ 0 ];
268 
269  std::vector<T> mean = Mean( (size_t)1, n, x );
270  std::vector<T> lhs, rhs;
271  std::vector<size_t> lflag( n, 0 );
272  std::vector<size_t> rflag( n, 0 );
273  std::vector<size_t> pscan( n + 1, 0 );
274 
276  #pragma omp parallel for
277  for ( size_t i = 0; i < n; i ++ )
278  {
279  if ( x[ i ] > mean[ 0 ] ) rflag[ i ] = 1;
280  else lflag[ i ] = 1;
281  }
282 
288  Scan( lflag, pscan );
289 
291  lhs.resize( pscan[ n ] );
292 
293  #pragma omp parallel for
294  for ( size_t i = 0; i < n; i ++ )
295  {
296  if ( lflag[ i ] )
297  lhs[ pscan[ i ] ] = x[ i ];
298  }
299 
305  Scan( rflag, pscan );
306 
308  rhs.resize( pscan[ n ] );
309 
310  #pragma omp parallel for
311  for ( size_t i = 0; i < n; i ++ )
312  {
313  if ( rflag[ i ] )
314  rhs[ pscan[ i ] ] = x[ i ];
315  }
316 
317  int nlhs = lhs.size();
318  int nrhs = rhs.size();
319 
320  if ( nlhs == k || nlhs == n || nrhs == n )
321  {
322  return mean[ 0 ];
323  }
324  else if ( nlhs > k )
325  {
326  rhs.clear();
327  return Select( nlhs, k, lhs );
328  }
329  else
330  {
331  lhs.clear();
332  return Select( nrhs, k - nlhs, rhs );
333  }
334 
335 };
339 template<typename T>
340 T Select( size_t k, std::vector<T> &x, hmlp::mpi::Comm comm )
341 {
343  std::vector<T> lhs, rhs;
344  lhs.reserve( x.size() );
345  rhs.reserve( x.size() );
346  int n = 0;
347  int num_points_owned = x.size();
348 
350  hmlp::mpi::Allreduce( &num_points_owned, &n, 1, MPI_SUM, comm );
351 
353  std::vector<T> mean = Mean( (size_t)1, n, x, comm );
354 
355  for ( size_t i = 0; i < x.size(); i ++ )
356  {
357  if ( x[ i ] < mean[ 0 ] ) lhs.push_back( x[ i ] );
358  else rhs.push_back( x[ i ] );
359  }
360 
362  int nlhs = 0;
363  int nrhs = 0;
364  int num_lhs_owned = lhs.size();
365  int num_rhs_owned = rhs.size();
366  hmlp::mpi::Allreduce( &num_lhs_owned, &nlhs, 1, MPI_SUM, comm );
367  hmlp::mpi::Allreduce( &num_rhs_owned, &nrhs, 1, MPI_SUM, comm );
368 
369  if ( nlhs == k || n == 1 || n == nlhs || n == nrhs )
370  {
371  return mean[ 0 ];
372  }
373  else if ( nlhs > k )
374  {
375  rhs.clear();
376  return Select( k, lhs, comm );
377  }
378  else
379  {
380  lhs.clear();
381  return Select( k - nlhs, rhs, comm );
382  }
383 
384 };
388 template<typename T>
389 vector<vector<size_t>> MedianThreeWaySplit( vector<T> &v, T tol )
390 {
391  size_t n = v.size();
392  auto median = Select( n, n / 2, v );
394  vector<vector<size_t>> three_ways( 3 );
395  auto & lhs = three_ways[ 0 ];
396  auto & rhs = three_ways[ 1 ];
397  auto & mid = three_ways[ 2 ];
398  for ( size_t i = 0; i < v.size(); i ++ )
399  {
400  if ( std::fabs( v[ i ] - median ) < tol ) mid.push_back( i );
401  else if ( v[ i ] < median ) lhs.push_back( i );
402  else rhs.push_back( i );
403  }
404  return three_ways;
405 };
410 template<typename T>
411 vector<vector<size_t>> MedianSplit( vector<T> &v )
412 {
413  auto three_ways = MedianThreeWaySplit( v, (T)1E-6 );
414  vector<vector<size_t>> two_ways( 2 );
415  two_ways[ 0 ] = three_ways[ 0 ];
416  two_ways[ 1 ] = three_ways[ 1 ];
417  auto & lhs = two_ways[ 0 ];
418  auto & rhs = two_ways[ 1 ];
419  auto & mid = three_ways[ 2 ];
420  for ( auto it : mid )
421  {
422  if ( lhs.size() < rhs.size() ) lhs.push_back( it );
423  else rhs.push_back( it );
424  }
425  return two_ways;
426 };
435 };
436 };
438 #endif
// Cross-reference residue from documentation extraction:
//   mpi::Allreduce(T *sendbuf, T *recvbuf, int count, Op op, Comm comm) — defined in hmlp_mpi.hpp:274
//   (see also gofmm.hpp:83)