#ifndef DISTDATA_HPP
#define DISTDATA_HPP

#include <Data.hpp>
#include <hmlp_mpi.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
namespace mpi
{

template<typename T>
int SendData( Data<T> &senddata, int dest, mpi::Comm comm )
{
  int m = senddata.row();
  int n = senddata.col();
  Send( &m, 1, dest, 0, comm );
  Send( &n, 1, dest, 0, comm );
  Send( senddata.data(), m * n, dest, 0, comm );
  return 0;
}; /** end SendData() */

template<typename T>
int RecvData( Data<T> &recvdata, int source, mpi::Comm comm )
{
  int m, n;
  mpi::Status status;
  Recv( &m, 1, source, 0, comm, &status );
  Recv( &n, 1, source, 0, comm, &status );
  recvdata.resize( m, n );
  Recv( recvdata.data(), m * n, source, 0, comm, &status );
  return 0;
}; /** end RecvData() */
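
/**
 *  Usage sketch (illustrative only, not part of the original interface): send a
 *  10-by-4 hmlp::Data<double> from rank 0 to rank 1. The names X and comm are
 *  assumptions for this example. SendData()/RecvData() also exchange the
 *  dimensions, so the receiver does not need to size X beforehand.
 *
 *    hmlp::Data<double> X;
 *    int rank; hmlp::mpi::Comm_rank( comm, &rank );
 *    if ( rank == 0 )
 *    {
 *      X.resize( 10, 4 );
 *      hmlp::mpi::SendData( X, 1, comm );
 *    }
 *    if ( rank == 1 ) hmlp::mpi::RecvData( X, 0, comm );
 */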

/**
 *  This all-to-all interface exchanges vectors of hmlp::Data. All blocks are
 *  assumed to share the same leading dimension m; after the routine each
 *  received block is reshaped to [ m, recvcount / m ].
 */
template<typename T>
int AlltoallData( size_t m,
    vector<Data<T>> &sendvector,
    vector<Data<T>> &recvvector, mpi::Comm comm )
{
  int size = 0; Comm_size( comm, &size );
  int rank = 0; Comm_rank( comm, &rank );

  assert( sendvector.size() == size );
  assert( recvvector.size() == size );

  /** determine the concatenated dimension */
  //size_t m = 0;

  ///** all data need to have the same m */
  //for ( size_t p = 0; p < size; p ++ )
  //{
  //  if ( sendvector[ p ].row() )
  //  {
  //    if ( m ) assert( m == sendvector[ p ].row() );
  //    else m = sendvector[ p ].row();
  //  }
  //}

  //printf( "after all assertion m %lu\n", m ); fflush( stdout );

  vector<T> sendbuf;
  vector<T> recvbuf;
  vector<int> sendcounts( size, 0 );
  vector<int> recvcounts( size, 0 );
  vector<int> sdispls( size + 1, 0 );
  vector<int> rdispls( size + 1, 0 );

  for ( size_t p = 0; p < size; p ++ )
  {
    sendcounts[ p ] = sendvector[ p ].size();
    sdispls[ p + 1 ] = sdispls[ p ] + sendcounts[ p ];
    sendbuf.insert( sendbuf.end(),
        sendvector[ p ].begin(),
        sendvector[ p ].end() );
  }

  /** Exchange sendcounts. */
  Alltoall( sendcounts.data(), 1, recvcounts.data(), 1, comm );

  /** Compute the total receiving count. */
  size_t total_recvcount = 0;
  for ( size_t p = 0; p < size; p ++ )
  {
    rdispls[ p + 1 ] = rdispls[ p ] + recvcounts[ p ];
    total_recvcount += recvcounts[ p ];
  }

  /** Resize the receiving buffer. */
  recvbuf.resize( total_recvcount );

  Alltoallv( sendbuf.data(), sendcounts.data(), sdispls.data(),
      recvbuf.data(), recvcounts.data(), rdispls.data(), comm );

  //printf( "after Alltoallv\n" ); fflush( stdout );

  recvvector.resize( size );
  for ( size_t p = 0; p < size; p ++ )
  {
    recvvector[ p ].insert( recvvector[ p ].end(),
        recvbuf.begin() + rdispls[ p ],
        recvbuf.begin() + rdispls[ p + 1 ] );

    if ( recvvector[ p ].size() && m )
    {
      recvvector[ p ].resize( m, recvvector[ p ].size() / m );
    }
  }

  //printf( "after resize\n" ); fflush( stdout );
  //mpi::Barrier( comm );

  return 0;

}; /** end AlltoallData() */
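
/**
 *  Usage sketch (illustrative only, not part of the original interface): every
 *  rank prepares one Data<double> block per destination, all with the same
 *  leading dimension m; the column counts may differ. The variable names below
 *  are assumptions for this example.
 *
 *    size_t m = 8;
 *    int size; hmlp::mpi::Comm_size( comm, &size );
 *    vector<hmlp::Data<double>> sendvector( size ), recvvector( size );
 *    for ( int p = 0; p < size; p ++ ) sendvector[ p ].resize( m, 2 );
 *    hmlp::mpi::AlltoallData( m, sendvector, recvvector, comm );
 */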
}; /** end namespace mpi */


typedef enum
{
  CBLK, /** Elemental MC */
  RBLK, /** Elemental MR */
  CIDS, /** Distributed according to column ids */
  RIDS, /** Distributed according to row ids */
  USER, /** Distributed according to user defined maps */
  STAR, /** Elemental STAR */
  CIRC  /** Elemental CIRC */
} Distribution_t;
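
/**
 *  Note (summary added for clarity): the specializations defined below pair
 *  these tags as DistData<ROWDIST, COLDIST, T>, e.g. DistData<STAR, CBLK, T>
 *  for cyclic column ownership, DistData<RBLK, STAR, T> for cyclic row
 *  ownership, DistData<STAR, CIDS, T> and DistData<RIDS, STAR, T> for
 *  ownership given by explicit column or row gids, DistData<STAR, USER, T>
 *  for a user-managed column map, and DistData<CIRC, CIRC, T> for a single
 *  owner rank.
 */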

#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<class T, class Allocator = hbw::allocator<T> >
#elif HMLP_USE_CUDA
/** use pinned (page-lock) memory for NVIDIA GPUs */
template<class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<class T, class Allocator = std::allocator<T> >
#endif
class DistDataBase : public Data<T, Allocator>, public mpi::MPIObject
{
  public:

    /** Default constructor */
    DistDataBase( size_t m, size_t n, mpi::Comm comm ) : mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    };

    /** Construct a zero matrix (but mpi::Comm is still required) */
    DistDataBase( mpi::Comm comm ) : DistDataBase( 0, 0, comm ) {};

    /** Copy constructor for hmlp::Data. */
    DistDataBase( size_t m, size_t n, Data<T, Allocator>& other_data, mpi::Comm comm )
      : Data<T, Allocator>( other_data ), mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    }

    /** Copy constructor for std::vector. */
    DistDataBase( size_t m, size_t n, size_t owned_rows, size_t owned_cols,
        vector<T, Allocator>& other_vector, mpi::Comm comm )
      : Data<T, Allocator>( owned_rows, owned_cols, other_vector ), mpi::MPIObject( comm )
    {
      this->global_m = m;
      this->global_n = n;
      this->comm = comm;
      mpi::Comm_size( comm, &comm_size );
      mpi::Comm_rank( comm, &comm_rank );
    }

    /** MPI support */
    mpi::Comm GetComm() { return comm; };
    int GetSize() { return comm_size; };
    int GetRank() { return comm_rank; };

    /** Get total row and column numbers across all MPI ranks */
    size_t row() { return global_m; };
    size_t col() { return global_n; };

    /** Get row and column numbers owned by this MPI rank */
    size_t row_owned() { return Data<T>::row(); };
    size_t col_owned() { return Data<T>::col(); };

  private:

    /** Total row and column numbers across all MPI ranks */
    size_t global_m = 0;
    size_t global_n = 0;

    /** MPI support */
    mpi::Comm comm = MPI_COMM_WORLD;
    int comm_size = 0;
    int comm_rank = 0;

}; /** end class DistDataBase */
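
/**
 *  Note (illustrative sketch, not part of the original interface): row()/col()
 *  report the global dimensions, while row_owned()/col_owned() report the
 *  portion stored on the calling rank. For example, with the cyclic column
 *  distribution defined below:
 *
 *    hmlp::DistData<STAR, CBLK, double> X( 100, 40, comm );
 *    X.row();       // 100 on every rank
 *    X.col();       // 40 on every rank
 *    X.col_owned(); // roughly 40 / comm_size columns on this rank
 */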

/** */
#ifdef HMLP_MIC_AVX512
/** use hbw::allocator for Intel Xeon Phi */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = hbw::allocator<T> >
#elif HMLP_USE_CUDA
/** use pinned (page-lock) memory for NVIDIA GPUs */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = thrust::system::cuda::experimental::pinned_allocator<T> >
#else
/** use default stl allocator */
template<Distribution_t ROWDIST, Distribution_t COLDIST, class T, class Allocator = std::allocator<T> >
#endif
class DistData : public DistDataBase<T, Allocator>
{
  public:
  private:
};


template<typename T>
class DistData<CIRC, CIRC, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    DistData( size_t m, size_t n, int owner, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      this->owner = owner;
      if ( this->GetRank() == owner ) this->resize( m, n );
    };

    /** Redistribute from <STAR, CIDS> to <CIRC, CIRC> (gather onto the owner). */
    DistData<CIRC, CIRC, T> & operator = ( DistData<STAR, CIDS, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      std::vector<std::vector<size_t>> sendids = A.CIRCOwnership( owner );
      std::vector<std::vector<size_t>> recvids( size );

      /** exchange cids */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      std::vector<std::vector<T, ALLOCATOR>> senddata( size );
      std::vector<std::vector<T, ALLOCATOR>> recvdata( size );

      std::vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract the owned columns (all rows) from A<STAR,CIDS> */
      senddata[ owner ] = A( amap, sendids[ owner ] );

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      if ( rank == owner )
      {
        for ( size_t p = 0; p < size; p ++ )
        {
          size_t ld = this->row();

          for ( size_t j = 0; j < recvids[ p ].size(); j ++ )
          {
            size_t cid = recvids[ p ][ j ];

            for ( size_t i = 0; i < this->row(); i ++ )
            {
              (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
            }
          };
        };
      }

      mpi::Barrier( comm );
      return (*this);
    };

  private:

    int owner = 0;

};
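
/**
 *  Usage sketch (illustrative only, not part of the original interface): gather
 *  a column-distributed matrix onto a single owner rank. The names m, n, cids,
 *  and comm are assumptions for this example; cids are the column gids owned
 *  locally.
 *
 *    hmlp::DistData<STAR, CIDS, double> A( m, n, cids, comm );
 *    hmlp::DistData<CIRC, CIRC, double> B( m, n, 0, comm );
 *    B = A; // after this, rank 0 holds the full m-by-n matrix
 */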

/**
 *  TODO: ADD block size
 */
template<typename T>
class DistData<STAR, CBLK, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    DistData( size_t m, size_t n, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      /** Columns are distributed cyclically. */
      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** Resize the local buffer using Data<T>::resize(). */
      this->resize( m, local_n );
    };

    /** TODO: prefer this constructor and move mpi::Comm to the front. */
    DistData( size_t m, size_t n, T initT, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** resize the local buffer */
      this->resize( m, local_n, initT );
    };

    /** Construct distributed data from local column data. */
    DistData( size_t m, size_t n, Data<T>& local_column_data, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_column_data, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** The row number of local_column_data must be m. */
      assert( m == local_column_data.row() );
      /** The column number of local_column_data must be local_n. */
      assert( local_n == local_column_data.col() );
    };

    /** Construct distributed data from a local std::vector. */
    DistData( size_t m, size_t n, vector<T>& local_vector, mpi::Comm comm )
      : DistDataBase<T>( m, n, m, local_vector.size() / m, local_vector, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      size_t edge_n = n % size;
      size_t local_n = ( n - edge_n ) / size;
      if ( rank < edge_n ) local_n ++;

      /** The local column number (local_vector.size() / m) must be local_n. */
      assert( local_n == this->col_owned() );
    };

    /** Constructor that reads a binary file. */
    DistData( size_t m, size_t n, mpi::Comm comm, string &filename )
      : DistData<STAR, CBLK, T>( m, n, comm )
    {
      read( m, n, filename );
    };

    /**
     *  Read a dense column-major matrix from a binary file.
     */
    void read( size_t m, size_t n, string &filename )
    {
      assert( this->row() == m );
      assert( this->col() == n );

      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();

      /** print out filename */
      cout << filename << endl;

      std::ifstream file( filename.data(),
          std::ios::in|std::ios::binary|std::ios::ate );

      if ( file.is_open() )
      {
        auto size = file.tellg();
        assert( size == m * n * sizeof(T) );

        //for ( size_t j = rank; j < n; j += size )
        //{
        //  size_t byte_offset = j * m * sizeof(T);
        //  file.seekg( byte_offset, std::ios::beg );
        //  file.read( (char*)this->columndata( j / size ), m * sizeof(T) );
        //}

        file.close();
      }

      #pragma omp parallel
      {
        std::ifstream ompfile( filename.data(),
            std::ios::in|std::ios::binary|std::ios::ate );

        if ( ompfile.is_open() )
        {
          #pragma omp for
          for ( size_t j = rank; j < n; j += size )
          {
            size_t byte_offset = j * m * sizeof(T);
            ompfile.seekg( byte_offset, std::ios::beg );
            ompfile.read( (char*)this->columndata( j / size ), m * sizeof(T) );
          }
          ompfile.close();
        }
      } /** end omp parallel */

    }; /** end void read() */

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** Assert that Kij is stored on this MPI process. */
      // TODO: take care of block sizes.
      assert( j % this->GetSize() == this->GetRank() );
      /** Return reference of Kij */
      return DistDataBase<T>::operator () ( i, j / this->GetSize() );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> &I, vector<size_t> &J )
    {
      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( (*it) % this->GetSize() == this->GetRank() );
        (*it) = (*it) / this->GetSize();
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** redistribute from CIRC to CBLK */
    DistData<STAR, CBLK, T> & operator = ( const DistData<CIRC, CIRC, T> &A )
    {
      printf( "not implemented yet\n" );
      exit( 1 );
      return (*this);
    };

    /** Redistribute from CIDS to CBLK */
    DistData<STAR, CBLK, T> & operator = ( DistData<STAR, CIDS, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = A.CBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /** exchange cids */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract columns from A<STAR, CIDS> */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** sendids are column gids (not local positions) */
        senddata[ p ] = A( amap, sendids[ p ] );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      /**
       *  It is possible that the received cid has several copies on
       *  different MPI ranks.
       */

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        size_t ld = this->row();

        for ( size_t j = 0; j < recvids[ p ].size(); j ++ )
        {
          size_t cid = recvids[ p ][ j ];

          for ( size_t i = 0; i < this->row(); i ++ )
          {
            (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      /** free all buffers and return */
      return (*this);
    };

  private:

}; /** end class DistData<STAR, CBLK, T> */
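
/**
 *  Usage sketch (illustrative only, not part of the original interface): create
 *  a column-cyclic matrix and access an entry by its global column id. The file
 *  name argument of read() expects a dense column-major binary file of exactly
 *  m * n * sizeof(T) bytes; names m, n, and comm are assumptions here.
 *
 *    hmlp::DistData<STAR, CBLK, double> X( m, n, comm );
 *    int rank; hmlp::mpi::Comm_rank( comm, &rank );
 *    size_t j = rank;        // column j is owned here since j % size == rank
 *    double Xij = X( 0, j ); // operator () takes global (row, column) gids
 */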

/**
 *  @brief Each MPI process owns ( m / size ) rows of A
 *         in a cyclic fashion (round robin), i.e.
 *         if there are 3 MPI processes, then
 *
 *         rank0 owns A(0,:), A(3,:), A(6,:), ...
 *         rank1 owns A(1,:), A(4,:), A(7,:), ...
 *         rank2 owns A(2,:), A(5,:), A(8,:), ...
 */
template<typename T>
class DistData<RBLK, STAR, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    DistData( size_t m, size_t n, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      /** resize the local buffer */
      this->resize( local_m, n );
    };

    /** Construct distributed data from local row data. */
    DistData( size_t m, size_t n, Data<T>& local_row_data, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_row_data, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      assert( local_m == local_row_data.row() );
      assert( n == local_row_data.col() );
    };

    /** Construct distributed data from a local vector. */
    DistData( size_t m, size_t n, vector<T>& local_vector, mpi::Comm comm )
      : DistDataBase<T>( m, n, local_vector.size() / n, n, local_vector, comm )
    {
      /** MPI */
      int size = this->GetSize();
      int rank = this->GetRank();
      size_t edge_m = m % size;
      size_t local_m = ( m - edge_m ) / size;
      if ( rank < edge_m ) local_m ++;

      assert( local_m == this->row_owned() );
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      if ( i % this->GetSize() != this->GetRank() )
      {
        printf( "ERROR: accessing %lu on rank %d\n", i, this->GetRank() );
        exit( 1 );
      }
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i / this->GetSize(), j );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    hmlp::Data<T> operator () ( std::vector<size_t> I, std::vector<size_t> J )
    {
      for ( auto it = I.begin(); it != I.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( (*it) % this->GetSize() == this->GetRank() );
        (*it) = (*it) / this->GetSize();
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** redistribute from CIRC to RBLK */
    DistData<RBLK, STAR, T> & operator = ( const DistData<CIRC, CIRC, T> &A )
    {
      printf( "not implemented yet\n" );
      exit( 1 );
      return (*this);
    };

    /** redistribute from RIDS to RBLK */
    DistData<RBLK, STAR, T> & operator = ( DistData<RIDS, STAR, T> &A )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = A.RBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /** Exchange rids. */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** Allocate buffer for data. */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> bmap( this->col() );
      for ( size_t j = 0; j < bmap.size(); j ++ ) bmap[ j ] = j;

      /** Extract rows from A<RIDS, STAR>. */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** sendids are row gids (not local positions) */
        senddata[ p ] = A( sendids[ p ], bmap );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        size_t ld = recvdata[ p ].size() / this->col();
        assert( ld == recvids[ p ].size() );
        for ( size_t i = 0; i < recvids[ p ].size(); i ++ )
        {
          size_t rid = recvids[ p ][ i ];
          for ( size_t j = 0; j < this->col(); j ++ )
          {
            (*this)( rid, j ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      /** free all buffers and return */
      return (*this);
    };

  private:

}; /** end class DistData<RBLK, STAR, T> */
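
/**
 *  Usage sketch (illustrative only, not part of the original interface):
 *  convert a matrix whose rows are owned by arbitrary gids (RIDS) back to the
 *  cyclic row distribution (RBLK). The names rids, m, n, and comm are
 *  assumptions for this example.
 *
 *    hmlp::DistData<RIDS, STAR, double> W( m, n, rids, comm );
 *    hmlp::DistData<RBLK, STAR, double> W_cyc( m, n, comm );
 *    W_cyc = W; // MPI_Alltoallv-based redistribution
 */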

/**
 *  @brief Each MPI process owns ( cids.size() ) columns of A,
 *         and cids denotes the distribution, i.e.
 *         rank i owns A(:,cids[0]),
 *         A(:,cids[1]),
 *         A(:,cids[2]), ...
 */
template<typename T>
class DistData<STAR, CIDS, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    /** Default constructor */
    DistData( size_t m, size_t n, vector<size_t> &cids, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      /** now check that (sum cids.size() == n) */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;
      this->resize( m, cids.size() );

      for ( size_t j = 0; j < cids.size(); j ++ )
        cid2col[ cids[ j ] ] = j;
    };

    /** Constructor with an initial value */
    DistData( size_t m, size_t n, vector<size_t> &cids, T initT, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      /** now check that (sum cids.size() == n) */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;
      this->resize( m, cids.size(), initT );

      for ( size_t j = 0; j < cids.size(); j ++ )
        cid2col[ cids[ j ] ] = j;
    };

    DistData( size_t m, size_t n, vector<size_t> &cids, Data<T> &A, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      assert( A.row() == m );
      assert( A.col() == cids.size() );

      /** now check that (sum cids.size() == n) */
      size_t bcast_n = cids.size();
      size_t reduc_n = 0;
      mpi::Allreduce( &bcast_n, &reduc_n, 1, MPI_SUM, comm );
      assert( reduc_n == n );
      this->cids = cids;

      this->insert( this->end(), A.begin(), A.end() );
      this->resize( A.row(), A.col() );

      for ( size_t j = 0; j < cids.size(); j ++ ) cid2col[ cids[ j ] ] = j;
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      assert( cid2col.count( j ) == 1 );
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i, cid2col[ j ] );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> I, vector<size_t> J )
    {
      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( cid2col.count(*it) == 1 );
        (*it) = cid2col[ (*it) ];
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    bool HasColumn( size_t cid )
    {
      return cid2col.count( cid );
    };

    T *columndata( size_t cid )
    {
      assert( cid2col.count( cid ) == 1 );
      return Data<T>::columndata( cid2col[ cid ] );
    };

    pair<size_t, T*> GetIDAndColumnPointer( size_t j )
    {
      return pair<size_t, T*>( cids[ j ], Data<T>::columndata( j ) );
    };

    /**
     *  @brief Return a vector of vectors that indicates the CBLK ownership
     *         of each MPI rank.
     */
    vector<vector<size_t>> CBLKOwnership()
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      vector<vector<size_t>> ownership( size );

      for ( auto it = cids.begin(); it != cids.end(); it ++ )
      {
        size_t cid = (*it);
        /**
         *  While in CBLK distribution, cid is owned by
         *  rank ( cid % size ) at position ( cid / size )
         */
        ownership[ cid % size ].push_back( cid );
      };

      return ownership;

    }; /** end CBLKOwnership() */


    vector<vector<size_t>> CIRCOwnership( int owner )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      vector<vector<size_t>> ownership( size );
      ownership[ owner ] = cids;
      return ownership;
    }; /** end CIRCOwnership() */


    /** Redistribution from CBLK to CIDS */
    DistData<STAR, CIDS, T> & operator = ( DistData<STAR, CBLK, T> &A )
    {
      /** assertion: must provide cids */
      assert( cids.size() );

      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      /** allocate buffer for ids */
      vector<vector<size_t>> sendids = CBLKOwnership();
      vector<vector<size_t>> recvids( size );

      /**
       *  exchange cids:
       *
       *  sendids contain all required ids from each rank
       *  recvids contain all requested ids from each rank
       *
       */
      mpi::AlltoallVector( sendids, recvids, comm );

      /** allocate buffer for data */
      vector<vector<T, ALLOCATOR>> senddata( size );
      vector<vector<T, ALLOCATOR>> recvdata( size );

      vector<size_t> amap( this->row() );
      for ( size_t i = 0; i < amap.size(); i ++ ) amap[ i ] = i;

      /** extract columns from A<STAR,CBLK> */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** recvids are gids (not local positions) */
        senddata[ p ] = A( amap, recvids[ p ] );
        assert( senddata[ p ].size() == amap.size() * recvids[ p ].size() );
      }

      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        assert( recvdata[ p ].size() == sendids[ p ].size() * this->row() );

        size_t ld = this->row();

        for ( size_t j = 0; j < sendids[ p ].size(); j ++ )
        {
          size_t cid = sendids[ p ][ j ];
          for ( size_t i = 0; i < this->row(); i ++ )
          {
            (*this)( i, cid ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };

      mpi::Barrier( comm );
      return *this;
    };

  private:

    vector<size_t> cids;

    vector<size_t> cids_from_other_ranks;

    /** Use hash table: cid2col[ cid ] = local column index */
    unordered_map<size_t, size_t> cid2col;

}; /** end class DistData<STAR, CIDS, T> */
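
/**
 *  Usage sketch (illustrative only, not part of the original interface): each
 *  rank lists the column gids it wants to own (the union over all ranks must
 *  cover all n columns), then pulls those columns out of a cyclic <STAR, CBLK>
 *  matrix. The names below are assumptions for this example.
 *
 *    vector<size_t> cids = ...; // column gids owned by this rank
 *    hmlp::DistData<STAR, CBLK, double> X_cyc( m, n, comm );
 *    hmlp::DistData<STAR, CIDS, double> X( m, n, cids, comm );
 *    X = X_cyc; // MPI_Alltoallv-based redistribution
 */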

template<typename T>
class DistData<STAR, USER, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    /** Default constructor */
    DistData( size_t m, size_t n, mpi::Comm comm )
      : DistDataBase<T>( m, n, comm ), all_rows( m )
    {
      for ( size_t i = 0; i < m; i ++ ) all_rows[ i ] = i;
    };

    /** Construct from DistData<STAR, CBLK>. */
    DistData( DistData<STAR, CBLK, T> &A )
      : DistData( A.row(), A.col(), A.GetComm() )
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int comm_size; mpi::Comm_size( comm, &comm_size );
      int comm_rank; mpi::Comm_rank( comm, &comm_rank );

      /** Insert column indices into the hash table (cyclic). */
      for ( size_t j = comm_rank, i = 0;
            j < this->col();
            j += comm_size, i ++ )
      {
        cid2col[ j ] = i;
      }

      /** Check that the counts match the owned column number. */
      assert( cid2col.size() == A.col_owned() );
      assert( A.size() == cid2col.size() * this->row() );

      /** Use vector<T>::insert */
      this->insert( this->end(), A.begin(), A.end() );

      /** Resize to update row_owned() and col_owned() */
      this->resize( A.row_owned(), A.col_owned() );
    };

    /**
     *  Insert cids and A into this object.
     *  Note: this is not thread safe.
     */
    void InsertColumns( const vector<size_t>& cids, const Data<T>& A )
    {
      int comm_rank = this->GetRank();

      /** Check if all dimensions match */
      assert( cids.size() == A.col() );
      assert( this->row() == A.row() );

      /** Remove duplicated columns */
      set<size_t> unique_cids;
      vector<size_t> new_cids, msk_cids;
      new_cids.reserve( cids.size() );
      msk_cids.reserve( cids.size() );

      //printf( "rank %d Here\n", comm_rank ); fflush( stdout );

      /**
       *  Check if cids already exist using HasColumn(). Make sure
       *  there is no duplication using unique_cids.
       */
      //#pragma omp critical
      //{
      for ( size_t i = 0; i < cids.size(); i ++ )
      {
        size_t cid = cids[ i ];
        if ( !HasColumn( cid ) && !unique_cids.count( cid ) )
        {
          new_cids.push_back( cid );
          msk_cids.push_back( i );
          unique_cids.insert( cid );
        }
      }

      //if ( new_cids.size() )
      //  printf( "rank %d cids[ 0 ] %lu cids %lu new_cids %lu\n",
      //    comm_rank, cids[ 0 ], cids.size(), new_cids.size() ); fflush( stdout );

      /** Insert to hash table one-by-one */
      for ( size_t i = 0; i < new_cids.size(); i ++ )
        cid2col[ new_cids[ i ] ] = i + this->col_owned();

      //printf( "rank %d new_cids %lu cid2col %lu\n",
      //  comm_rank, new_cids.size(), cid2col.size() ); fflush( stdout );
      /** Use vector<T>::insert */
      Data<T> new_A = A( all_rows, msk_cids );
      //printf( "rank %d A %lu %lu\n",
      //  comm_rank, A.row(), A.col() ); fflush( stdout );
      this->insert( this->end(), new_A.begin(), new_A.end() );
      //printf( "rank %d this->size %lu row_owned %lu col_owned %lu\n",
      //  comm_rank, this->size(), this->row_owned(), this->col_owned() ); fflush( stdout );

      /** Check if the total size matches */
      assert( this->row_owned() * cid2col.size() == this->size() );

      /** Resize to update row_owned() and col_owned() */
      //this->resize( this->row(), this->size() / this->row() );
      this->resize( this->row(), cid2col.size() );
      //printf( "rank %d this->size %lu row_owned %lu col_owned %lu done\n",
      //  comm_rank, this->size(), this->row_owned(), this->col_owned() ); fflush( stdout );

      //} /** end omp critical */
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      assert( cid2col.count( j ) == 1 );
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( i, cid2col[ j ] );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    Data<T> operator () ( vector<size_t> I, vector<size_t> J )
    {
      int comm_rank = this->GetRank();

      for ( auto it = J.begin(); it != J.end(); it ++ )
      {
        /** Assert that Kij is stored on this MPI process */
        if ( cid2col.count( *it ) != 1 )
        {
          printf( "rank %d cid2col.count(%lu) = %lu\n",
              comm_rank, *it, cid2col.count( *it ) ); fflush( stdout );
          exit( 1 );
        }
        /** Overwrite J with its corresponding offset */
        (*it) = cid2col[ (*it) ];
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /** Check if cid already exists */
    bool HasColumn( size_t cid )
    {
      return cid2col.count( cid );
    };

    /** Return the pointer of column cid using Data<T>::columndata(). */
    T *columndata( size_t cid )
    {
      assert( cid2col.count( cid ) == 1 );
      return Data<T>::columndata( cid2col[ cid ] );
    };

  private:

    /** all_rows = [ 0, 1, ..., m-1 ] */
    vector<size_t> all_rows;

    /** Use hash table: cid2col[ cid ] = local column index */
    unordered_map<size_t, size_t> cid2col;

}; /** end class DistData<STAR, USER, T> */
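
/**
 *  Usage sketch (illustrative only, not part of the original interface): a
 *  <STAR, USER> matrix starts with the cyclic columns of a <STAR, CBLK> matrix
 *  and can later cache additional columns by gid. The names below are
 *  assumptions for this example.
 *
 *    hmlp::DistData<STAR, CBLK, double> X_cyc( m, n, comm );
 *    hmlp::DistData<STAR, USER, double> X_usr( X_cyc );
 *    vector<size_t> extra_cids = ...; // gids fetched from other ranks
 *    hmlp::Data<double> extra_cols( m, extra_cids.size() );
 *    X_usr.InsertColumns( extra_cids, extra_cols );
 *    if ( X_usr.HasColumn( cid ) )
 *    {
 *      double *col = X_usr.columndata( cid );
 *    }
 */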

/**
 *  @brief Each MPI process owns ( rids.size() ) rows of A,
 *         and rids denotes the distribution, i.e.
 *         rank i owns A(rids[0],:),
 *         A(rids[1],:),
 *         A(rids[2],:), ...
 */
template<typename T>
class DistData<RIDS, STAR, T> : public DistDataBase<T>
{
  public:

#ifdef HMLP_MIC_AVX512
    /** use hbw::allocator for Intel Xeon Phi */
    using ALLOCATOR = hbw::allocator<T>;
#elif HMLP_USE_CUDA
    /** use pinned (page-lock) memory for NVIDIA GPUs */
    using ALLOCATOR = thrust::system::cuda::experimental::pinned_allocator<T>;
#else
    /** use default stl allocator */
    using ALLOCATOR = std::allocator<T>;
#endif

    /** default constructor */
    DistData( size_t m, size_t n, std::vector<size_t> &rids, mpi::Comm comm ) :
      DistDataBase<T>( m, n, comm )
    {
      /** now check that (sum rids.size() == m) */
      size_t bcast_m = rids.size();
      size_t reduc_m = 0;
      mpi::Allreduce( &bcast_m, &reduc_m, 1, MPI_SUM, comm );

      if ( reduc_m != m )
      {
        printf( "%lu %lu\n", reduc_m, m ); fflush( stdout );
      }

      assert( reduc_m == m );
      this->rids = rids;
      this->resize( rids.size(), n );

      for ( size_t i = 0; i < rids.size(); i ++ )
        rid2row[ rids[ i ] ] = i;
    };

    /**
     *  Overload operator () to allow accessing data using gids
     */
    T & operator () ( size_t i , size_t j )
    {
      /** assert that Kij is stored on this MPI process */
      assert( rid2row.count( i ) == 1 );
      /** return reference of Kij */
      return DistDataBase<T>::operator () ( rid2row[ i ], j );
    };

    /**
     *  Overload operator () to return a local submatrix using gids
     */
    hmlp::Data<T> operator () ( std::vector<size_t> I, std::vector<size_t> J )
    {
      for ( auto it = I.begin(); it != I.end(); it ++ )
      {
        /** assert that Kij is stored on this MPI process */
        assert( rid2row.count(*it) == 1 );
        (*it) = rid2row[ (*it) ];
      }
      return DistDataBase<T>::operator () ( I, J );
    };

    /**
     *  @brief Return a vector of vectors that indicates the RBLK ownership
     *         of each MPI rank.
     */
    std::vector<std::vector<size_t>> RBLKOwnership()
    {
      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      std::vector<std::vector<size_t>> ownership( size );

      for ( auto it = rids.begin(); it != rids.end(); it ++ )
      {
        size_t rid = (*it);
        /**
         *  While in RBLK distribution, rid is owned by
         *  rank ( rid % size ) at position ( rid / size )
         */
        ownership[ rid % size ].push_back( rid );
      };

      return ownership;

    }; /** end RBLKOwnership() */


    /**
     *  redistribution from RBLK to RIDS (MPI_Alltoallv)
     */
    DistData<RIDS, STAR, T> & operator = ( DistData<RBLK, STAR, T> &A )
    {
      /** assertion: must provide rids */
      assert( rids.size() );

      /** MPI */
      mpi::Comm comm = this->GetComm();
      int size = this->GetSize();
      int rank = this->GetRank();

      //printf( "Enter redistribution rids.size() %lu\n", rids.size() ); fflush( stdout );
      //mpi::Barrier( comm );

      /** allocate buffer for ids */
      std::vector<std::vector<size_t>> sendids = RBLKOwnership();
      std::vector<std::vector<size_t>> recvids( size );

      //for ( size_t i = 0; i < rids.size(); i ++ )
      //{
      //  /** since A has RBLK distribution, rid is stored at rank (rid) % size */
      //  size_t rid = rids[ i ];
      //  /** ( rid / size ) is the local id of A */
      //  sendids[ rid % size ].push_back( rid );
      //}

      //printf( "before All2allvector1\n" ); fflush( stdout );
      //mpi::Barrier( comm );

      /**
       *  exchange rids:
       *
       *  sendids contain all required ids from each rank
       *  recvids contain all requested ids from each rank
       *
       */
      mpi::AlltoallVector( sendids, recvids, comm );

      //printf( "Finish All2allvector1\n" ); fflush( stdout );
      //mpi::Barrier( comm );

      std::vector<std::vector<T, ALLOCATOR>> senddata( size );
      std::vector<std::vector<T, ALLOCATOR>> recvdata( size );

      std::vector<size_t> bmap( this->col() );
      for ( size_t j = 0; j < bmap.size(); j ++ ) bmap[ j ] = j;

      /** extract rows from A<RBLK,STAR> */
      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        /** recvids are gids (not local positions) */
        senddata[ p ] = A( recvids[ p ], bmap );
        assert( senddata[ p ].size() == recvids[ p ].size() * bmap.size() );
      }

      //printf( "before All2allvector2\n" ); fflush( stdout );
      /** exchange data */
      mpi::AlltoallVector( senddata, recvdata, comm );

      #pragma omp parallel for
      for ( size_t p = 0; p < size; p ++ )
      {
        assert( recvdata[ p ].size() == sendids[ p ].size() * this->col() );

        size_t ld = recvdata[ p ].size() / this->col();

        assert( ld == sendids[ p ].size() );

        for ( size_t i = 0; i < sendids[ p ].size(); i ++ )
        {
          size_t rid = sendids[ p ][ i ];
          for ( size_t j = 0; j < this->col(); j ++ )
          {
            (*this)( rid, j ) = recvdata[ p ][ j * ld + i ];
          }
        };
      };
      mpi::Barrier( comm );

      return (*this);
    };

  private:

    /** owned row gids */
    vector<size_t> rids;

    map<size_t, size_t> rid2row;

}; /** end class DistData<RIDS, STAR, T> */
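
/**
 *  Usage sketch (illustrative only, not part of the original interface): each
 *  rank names the row gids it wants to own (their union must cover all m rows),
 *  then pulls those rows out of a cyclic <RBLK, STAR> matrix. The names rids,
 *  m, n, and comm are assumptions for this example.
 *
 *    vector<size_t> rids = ...; // row gids owned by this rank
 *    hmlp::DistData<RBLK, STAR, double> W_cyc( m, n, comm );
 *    hmlp::DistData<RIDS, STAR, double> W( m, n, rids, comm );
 *    W = W_cyc;                      // MPI_Alltoallv-based redistribution
 *    double Wij = W( rids[ 0 ], 0 ); // access by global row gid
 */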


template<typename T>
class DistData<STAR, STAR, T> : public DistDataBase<T>
{
  public:

  private:

}; /** end class DistData<STAR, STAR, T> */


}; /** end namespace hmlp */


#endif /** define DISTDATA_HPP */