1 #ifndef COMBINATORICS_HPP 2 #define COMBINATORICS_HPP 12 #include <hmlp_mpi.hpp> 20 namespace combinatorics
/**
 *  @brief Draw l distinct samples from v without replacement.
 *
 *  The input vector is taken by value, shuffled uniformly at random,
 *  and the first l entries are returned.
 *
 *  @param l number of samples requested; if l >= v.size() the whole
 *           (unshuffled) population is returned, if l <= 0 the result
 *           is empty.
 *  @param v the population to sample from (copied).
 *  @return a vector of l elements of v, each chosen at most once.
 */
template<typename T>
std::vector<T> SampleWithoutReplacement( int l, std::vector<T> v )
{
  /** guard the signed/unsigned comparison below against negative l */
  if ( l <= 0 ) return std::vector<T>();

  /** asking for at least the whole population: return it unchanged */
  if ( l >= (int)v.size() ) return v;

  /** shuffle with a freshly seeded Mersenne Twister */
  std::random_device rd;
  std::mt19937 generator( rd() );
  std::shuffle( v.begin(), v.end(), generator );

  /** the first l entries of a uniform shuffle are a uniform sample */
  std::vector<T> ret( l );
  for ( int i = 0; i < l; i ++ ) ret[ i ] = v[ i ];
  return ret;
}
40 #ifdef HMLP_MIC_AVX512 42 template<
class T,
class Allocator = hbw::allocator<T> >
45 template<
class T,
class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
48 template<
class T,
class Allocator = std::allocator<T> >
50 vector<T> Sum(
size_t d,
size_t n, vector<T, Allocator> &X, vector<size_t> &gids )
52 bool do_general_stride = ( gids.size() == n );
55 if ( !do_general_stride ) assert( X.size() == d * n );
58 int n_split = omp_get_max_threads();
59 std::vector<T> sum( d, 0.0 );
60 std::vector<T> temp( d * n_split, 0.0 );
63 #pragma omp parallel for num_threads( n_split ) 64 for (
int j = 0; j < n_split; j ++ )
65 for (
int i = j; i < n; i += n_split )
66 for (
int p = 0; p < d; p ++ )
67 if ( do_general_stride )
68 temp[ j * d + p ] += X[ gids[ i ] * d + p ];
70 temp[ j * d + p ] += X[ i * d + p ];
73 for (
int j = 0; j < n_split; j ++ )
74 for (
int p = 0; p < d; p ++ )
75 sum[ p ] += temp[ j * d + p ];
83 #ifdef HMLP_MIC_AVX512 85 template<
class T,
class Allocator = hbw::allocator<T> >
88 template<
class T,
class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
91 template<
class T,
class Allocator = std::allocator<T> >
93 vector<T> Sum(
size_t d,
size_t n, vector<T, Allocator> &X, mpi::Comm comm )
95 size_t num_points_owned = X.size() / d;
98 vector<size_t> dummy_gids;
101 auto local_sum = Sum( d, num_points_owned, X, dummy_gids );
104 vector<T> total_sum( d, 0 );
108 local_sum.data(), total_sum.data(), d, MPI_SUM, comm );
/**
 *  @brief Mean of n d-dimensional points: Sum() divided by n.
 *
 *  Supports the same two addressing modes as Sum(): indirect through
 *  gids when gids.size() == n, otherwise packed d-by-n.
 *
 *  @param d    dimension of each point.
 *  @param n    number of points (divisor); must be nonzero.
 *  @param X    point coordinates (column-major).
 *  @param gids optional indirection indices into X's columns.
 *  @return length-d vector of coordinate-wise means.
 */
#ifdef HMLP_MIC_AVX512
/** high-bandwidth MCDRAM allocator on Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif defined(HMLP_USE_CUDA)
/** pinned (page-locked) allocation for NVIDIA GPU transfers */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** default standard allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
std::vector<T> Mean( size_t d, size_t n, std::vector<T, Allocator> &X, std::vector<size_t> &gids )
{
  /** reuse Sum(), then scale in place by 1 / n */
  auto mean = Sum( d, n, X, gids );
  for ( int p = 0; p < d; p ++ ) mean[ p ] /= n;
  return mean;
}
/**
 *  @brief Convenience overload: mean of a packed d-by-n buffer.
 *
 *  Forwards to the gids overload with an empty index list, which
 *  selects the packed (non-indirect) layout.
 *
 *  @param d dimension of each point.
 *  @param n number of points; X.size() must equal d * n.
 *  @param X packed column-major point coordinates.
 *  @return length-d vector of coordinate-wise means.
 */
template<typename T>
std::vector<T> Mean( size_t d, size_t n, std::vector<T> &X )
{
  /** this overload only accepts the packed layout */
  assert( X.size() == d * n );

  /** empty gids → packed addressing inside Mean/Sum */
  std::vector<std::size_t> dummy_gids;
  return Mean( d, n, X, dummy_gids );
}
165 std::vector<T> Mean(
size_t d,
size_t n, std::vector<T> &X,
166 hmlp::mpi::Comm comm )
169 auto mean = Sum( d, n, X, comm );
172 for (
int p = 0; p < d; p ++ ) mean[ p ] /= n;
183 template<
typename TA,
typename TB>
184 void Scan( std::vector<TA> &A, std::vector<TB> &B )
186 assert( A.size() == B.size() - 1 );
189 size_t p = omp_get_max_threads();
198 std::vector<TB> sum( p, (TB)0 );
208 for (
size_t j = beg + 1; j < end; j ++ )
209 B[ j ] = B[ j - 1 ] + A[ j - 1 ];
214 #pragma omp parallel for schedule( static ) 215 for (
size_t i = 0; i < p; i ++ )
218 size_t end = beg + nb;
220 if ( i == p - 1 ) end = n;
221 if ( i != 0 ) B[ beg ] = (TB)0;
222 for (
size_t j = beg + 1; j < end; j ++ )
224 B[ j ] = B[ j - 1 ] + A[ j - 1 ];
229 for (
size_t i = 1; i < p; i ++ )
231 sum[ i ] = sum[ i - 1 ] + B[ i * nb - 1 ] + A[ i * nb - 1 ];
234 #pragma omp parallel for schedule( static ) 235 for (
size_t i = 1; i < p; i ++ )
238 size_t end = beg + nb;
240 if ( i == p - 1 ) end = n;
242 for (
size_t j = beg; j < end; j ++ )
261 T Select(
size_t n,
size_t k, std::vector<T> &x )
264 assert( k <= n && x.size() == n );
267 if ( n == 1 )
return x[ 0 ];
269 std::vector<T> mean = Mean( (
size_t)1, n, x );
270 std::vector<T> lhs, rhs;
271 std::vector<size_t> lflag( n, 0 );
272 std::vector<size_t> rflag( n, 0 );
273 std::vector<size_t> pscan( n + 1, 0 );
276 #pragma omp parallel for 277 for (
size_t i = 0; i < n; i ++ )
279 if ( x[ i ] > mean[ 0 ] ) rflag[ i ] = 1;
288 Scan( lflag, pscan );
291 lhs.resize( pscan[ n ] );
293 #pragma omp parallel for 294 for (
size_t i = 0; i < n; i ++ )
297 lhs[ pscan[ i ] ] = x[ i ];
305 Scan( rflag, pscan );
308 rhs.resize( pscan[ n ] );
310 #pragma omp parallel for 311 for (
size_t i = 0; i < n; i ++ )
314 rhs[ pscan[ i ] ] = x[ i ];
317 int nlhs = lhs.size();
318 int nrhs = rhs.size();
320 if ( nlhs == k || nlhs == n || nrhs == n )
327 return Select( nlhs, k, lhs );
332 return Select( nrhs, k - nlhs, rhs );
340 T Select(
size_t k, std::vector<T> &x, hmlp::mpi::Comm comm )
343 std::vector<T> lhs, rhs;
344 lhs.reserve( x.size() );
345 rhs.reserve( x.size() );
347 int num_points_owned = x.size();
353 std::vector<T> mean = Mean( (
size_t)1, n, x, comm );
355 for (
size_t i = 0; i < x.size(); i ++ )
357 if ( x[ i ] < mean[ 0 ] ) lhs.push_back( x[ i ] );
358 else rhs.push_back( x[ i ] );
364 int num_lhs_owned = lhs.size();
365 int num_rhs_owned = rhs.size();
369 if ( nlhs == k || n == 1 || n == nlhs || n == nrhs )
376 return Select( k, lhs, comm );
381 return Select( k - nlhs, rhs, comm );
/**
 *  @brief Split indices of v into three groups relative to its median.
 *
 *  three_ways[ 0 ] holds indices with v[ i ] < median,
 *  three_ways[ 1 ] indices with v[ i ] > median, and
 *  three_ways[ 2 ] indices within tol of the median (ties).
 *
 *  @param v   the values; their median is found with Select().
 *  @param tol absolute tolerance for counting an element as "equal"
 *             to the median.
 *  @return three index lists { lhs, rhs, mid } covering [ 0, v.size() ).
 */
template<typename T>
std::vector<std::vector<size_t>> MedianThreeWaySplit( std::vector<T> &v, T tol )
{
  /** median = rank n / 2 order statistic */
  size_t n = v.size();
  auto median = Select( n, n / 2, v );

  std::vector<std::vector<size_t>> three_ways( 3 );
  auto & lhs = three_ways[ 0 ];
  auto & rhs = three_ways[ 1 ];
  auto & mid = three_ways[ 2 ];

  /** classify every index against the median */
  for ( size_t i = 0; i < v.size(); i ++ )
  {
    if ( std::fabs( v[ i ] - median ) < tol ) mid.push_back( i );
    else if ( v[ i ] < median ) lhs.push_back( i );
    else rhs.push_back( i );
  }
  return three_ways;
}
/**
 *  @brief Balanced two-way split of v's indices at the median.
 *
 *  Performs a three-way split with a fixed tolerance of 1E-6, then
 *  distributes the median-tied indices one by one to whichever side
 *  is currently smaller, so the two halves stay balanced even with
 *  many duplicates of the median.
 *
 *  @param v the values to split.
 *  @return two index lists { lhs, rhs } covering [ 0, v.size() ).
 */
template<typename T>
std::vector<std::vector<size_t>> MedianSplit( std::vector<T> &v )
{
  auto three_ways = MedianThreeWaySplit( v, (T)1E-6 );

  std::vector<std::vector<size_t>> two_ways( 2 );
  two_ways[ 0 ] = three_ways[ 0 ];
  two_ways[ 1 ] = three_ways[ 1 ];
  auto & lhs = two_ways[ 0 ];
  auto & rhs = two_ways[ 1 ];
  auto & mid = three_ways[ 2 ];

  /** hand each tied index to the currently smaller side */
  for ( auto it : mid )
  {
    if ( lhs.size() < rhs.size() ) lhs.push_back( it );
    else rhs.push_back( it );
  }
  return two_ways;
}
int Allreduce(T *sendbuf, T *recvbuf, int count, Op op, Comm comm)
Definition: hmlp_mpi.hpp:274