1 |
|
/** |
2 |
|
* HMLP (High-Performance Machine Learning Primitives) |
3 |
|
* |
4 |
|
* Copyright (C) 2014-2017, The University of Texas at Austin |
5 |
|
* |
6 |
|
* This program is free software: you can redistribute it and/or modify |
7 |
|
* it under the terms of the GNU General Public License as published by |
8 |
|
* the Free Software Foundation, either version 3 of the License, or |
9 |
|
* (at your option) any later version. |
10 |
|
* |
11 |
|
* This program is distributed in the hope that it will be useful, |
12 |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 |
|
* GNU General Public License for more details. |
15 |
|
* |
16 |
|
* You should have received a copy of the GNU General Public License |
17 |
|
* along with this program. If not, see the LICENSE file. |
18 |
|
* |
19 |
|
**/ |
20 |
|
|
21 |
|
|
22 |
|
#ifndef HMLP_UTIL_HPP |
23 |
|
#define HMLP_UTIL_HPP |
24 |
|
|
25 |
|
#include <stdio.h> |
26 |
|
#include <stdlib.h> |
27 |
|
#include <cmath> |
28 |
|
#include <tuple> |
29 |
|
#include <limits> |
30 |
|
#include <algorithm> |
31 |
|
#include <type_traits> |
32 |
|
#include <cstdint> |
33 |
|
#include <omp.h> |
34 |
|
|
35 |
|
|
36 |
|
#include <hmlp.h> |
37 |
|
|
38 |
|
#define HANDLE_ERROR( err ) (hmlp::handleError( err, __FILE__, __LINE__ )) |
39 |
|
|
40 |
|
|
41 |
|
|
42 |
|
using namespace std; |
43 |
|
|
44 |
|
|
45 |
|
namespace hmlp |
46 |
|
{ |
47 |
|
|
48 |
|
/** |
49 |
|
* \breif Handling runtime error with information. |
50 |
|
*/ |
51 |
|
void handleError( hmlpError_t error, const char* file, int line ); |
52 |
|
|
53 |
|
/** |
54 |
|
* \brief The default function to allocate memory for HMLP. |
55 |
|
* Memory allocated by this function is aligned. Most of the |
56 |
|
* HMLP primitives require memory alignment. |
57 |
|
*/ |
58 |
|
template<int ALIGN_SIZE, typename T> |
59 |
|
T *hmlp_malloc( int m, int n, int size ) |
60 |
|
{ |
61 |
|
T *ptr = NULL; |
62 |
|
#ifdef HMLP_MIC_AVX512 |
63 |
|
int err = hbw_posix_memalign( (void**)&ptr, (size_t)ALIGN_SIZE, size * m * n ); |
64 |
|
#else |
65 |
|
int err = posix_memalign( (void**)&ptr, (size_t)ALIGN_SIZE, size * m * n ); |
66 |
|
#endif |
67 |
|
|
68 |
|
if ( err ) |
69 |
|
{ |
70 |
|
printf( "hmlp_malloc(): posix_memalign() failures\n" ); |
71 |
|
exit( 1 ); |
72 |
|
} |
73 |
|
|
74 |
|
return ptr; |
75 |
|
}; |
76 |
|
|
77 |
|
/** @brief Another interface. */ |
78 |
|
template<int ALIGN_SIZE, typename T> |
79 |
|
T *hmlp_malloc( int n ) |
80 |
|
{ |
81 |
|
return hmlp_malloc<ALIGN_SIZE, T>( n, 1, sizeof(T) ); |
82 |
|
}; |
83 |
|
|
84 |
|
/** |
85 |
|
* @brief Free the aligned memory. |
86 |
|
*/ |
87 |
|
template<typename T> |
88 |
|
void hmlp_free( T *ptr ) |
89 |
|
{ |
90 |
|
#ifdef HMLP_MIC_AVX512 |
91 |
|
if ( ptr ) hbw_free( ptr ); |
92 |
|
#else |
93 |
|
if ( ptr ) free( ptr ); |
94 |
|
#endif |
95 |
|
}; |
96 |
|
|
97 |
|
template<typename T> |
98 |
|
void hmlp_print_binary( T number ) |
99 |
|
{ |
100 |
|
char binary[ 33 ]; |
101 |
|
|
102 |
|
for ( int i = 31; i >= 0; i -- ) |
103 |
|
{ |
104 |
|
if ( i % 5 ) printf( " " ); |
105 |
|
else printf( "%d", i / 5 ); |
106 |
|
} |
107 |
|
printf( "\n" ); |
108 |
|
|
109 |
|
|
110 |
|
for ( size_t i = 0; i < sizeof(T) * 4; i ++ ) |
111 |
|
{ |
112 |
|
if ( number & 1 ) binary[ 31 - i ] = '1'; |
113 |
|
else binary[ 31 - i ] = '0'; |
114 |
|
number >>= 1; |
115 |
|
if ( i == 31 ) break; |
116 |
|
} |
117 |
|
binary[ 32 ] = '\0'; |
118 |
|
|
119 |
|
//size_t i = 0; |
120 |
|
|
121 |
|
//while ( number ) |
122 |
|
//{ |
123 |
|
// if ( i < 32 && ( number & 1 ) ) binary[ i ] = '1'; |
124 |
|
// else binary[ i ] = '0'; |
125 |
|
// //if ( number & 1) printf( "1" ); |
126 |
|
// //else printf( "0" ); |
127 |
|
// number >>= 1; |
128 |
|
// i ++; |
129 |
|
//} |
130 |
|
//binary[ 32 ] = 3; |
131 |
|
//printf("\n"); |
132 |
|
printf( "%s\n", binary ); |
133 |
|
}; |
134 |
|
|
135 |
|
|
136 |
|
|
137 |
|
/** |
138 |
|
* @brief Split into m x n, get the subblock starting from ith row |
139 |
|
* and jth column. (for STRASSEN) |
140 |
|
*/ |
141 |
|
template<typename T> |
142 |
|
void hmlp_acquire_mpart |
143 |
|
( |
144 |
|
hmlpOperation_t transX, |
145 |
|
int m, |
146 |
|
int n, |
147 |
|
T *src_buff, |
148 |
|
int lda, |
149 |
|
int x, |
150 |
|
int y, |
151 |
|
int i, |
152 |
|
int j, |
153 |
|
T **dst_buff |
154 |
|
) |
155 |
|
{ |
156 |
|
//printf( "m: %d, n: %d, lda: %d, x: %d, y: %d, i: %d, j: %d\n", m, n, lda, x, y, i, j ); |
157 |
|
if ( transX == HMLP_OP_N ) |
158 |
|
{ |
159 |
|
*dst_buff = &src_buff[ ( m / x * i ) + ( n / y * j ) * lda ]; //src( m/x*i, n/y*j ) |
160 |
|
} |
161 |
|
else |
162 |
|
{ |
163 |
|
*dst_buff = &src_buff[ ( m / x * i ) * lda + ( n / y * j ) ]; //src( m/x*i, n/y*j ) |
164 |
|
/* x,y,i,j split partition dimension and id after the transposition */ |
165 |
|
} |
166 |
|
}; |
167 |
|
|
168 |
|
template<typename T> |
169 |
|
T hmlp_norm |
170 |
|
( |
171 |
|
int m, int n, |
172 |
|
T *A, int lda |
173 |
|
) |
174 |
|
{ |
175 |
|
T nrm2 = 0.0; |
176 |
|
for ( int j = 0; j < n; j ++ ) |
177 |
|
{ |
178 |
|
for ( int i = 0; i < m; i ++ ) |
179 |
|
{ |
180 |
|
nrm2 += A[ j * lda + i ] * A[ j * lda + i ]; |
181 |
|
} |
182 |
|
} |
183 |
|
return std::sqrt( nrm2 ); |
184 |
|
}; // end hmlp_norm() |
185 |
|
|
186 |
|
|
187 |
|
template<typename TA, typename TB> |
188 |
|
TB hmlp_relative_error |
189 |
|
( |
190 |
|
int m, int n, |
191 |
|
TA *A, int lda, |
192 |
|
TB *B, int ldb |
193 |
|
) |
194 |
|
{ |
195 |
|
TB nrmB = 0.0; |
196 |
|
TB nrm2 = 0.0; |
197 |
|
|
198 |
|
std::tuple<int, int, TB> max_error( -1, -1, 0.0 ); |
199 |
|
for ( int j = 0; j < n; j ++ ) |
200 |
|
{ |
201 |
|
for ( int i = 0; i < m; i ++ ) |
202 |
|
{ |
203 |
|
TA a = A[ j * lda + i ]; |
204 |
|
TB b = B[ j * ldb + i ]; |
205 |
|
TB r = a - b; |
206 |
|
nrmB += b * b; |
207 |
|
nrm2 += r * r; |
208 |
|
if ( r * r > std::get<2>( max_error ) ) |
209 |
|
{ |
210 |
|
max_error = std::make_tuple( i, j, r ); |
211 |
|
} |
212 |
|
} |
213 |
|
} |
214 |
|
nrm2 = std::sqrt( nrm2 ) / std::sqrt( nrmB ); |
215 |
|
|
216 |
|
if ( nrm2 > 1E-7 ) |
217 |
|
{ |
218 |
|
printf( "relative error % .2E maxinum elemenwise error % .2E ( %d, %d )\n", |
219 |
|
nrm2, |
220 |
|
std::get<2>( max_error ), |
221 |
|
std::get<0>( max_error ), std::get<1>( max_error ) ); |
222 |
|
} |
223 |
|
|
224 |
|
return nrm2; |
225 |
|
}; |
226 |
|
|
227 |
|
|
228 |
|
template<typename TA, typename TB> |
229 |
|
TB hmlp_relative_error |
230 |
|
( |
231 |
|
int m, int n, |
232 |
|
TA *A, int lda, int loa, |
233 |
|
TB *B, int ldb, int lob, |
234 |
|
int batchSize |
235 |
|
) |
236 |
|
{ |
237 |
|
TB err = 0.0; |
238 |
|
for ( int b = 0; b < batchSize; b ++ ) |
239 |
|
{ |
240 |
|
err += |
241 |
|
hmlp_relative_error |
242 |
|
( |
243 |
|
m, n, |
244 |
|
A + b * loa, lda, |
245 |
|
B + b * lob, ldb |
246 |
|
); |
247 |
|
} |
248 |
|
if ( err > 1E-7 ) |
249 |
|
{ |
250 |
|
printf( "average relative error % .2E\n", err / batchSize ); |
251 |
|
} |
252 |
|
return err; |
253 |
|
}; |
254 |
|
|
255 |
|
|
256 |
|
|
257 |
|
template<typename T> |
258 |
|
int hmlp_count_error |
259 |
|
( |
260 |
|
int m, int n, |
261 |
|
T *A, int lda, |
262 |
|
T *B, int ldb |
263 |
|
) |
264 |
|
{ |
265 |
|
int error_count = 0; |
266 |
|
std::tuple<int, int> err_location( -1, -1 ); |
267 |
|
|
268 |
|
for ( int j = 0; j < n; j ++ ) |
269 |
|
{ |
270 |
|
for ( int i = 0; i < m; i ++ ) |
271 |
|
{ |
272 |
|
T a = A[ j * lda + i ]; |
273 |
|
T b = B[ j * ldb + i ]; |
274 |
|
if ( a != b ) |
275 |
|
{ |
276 |
|
err_location = std::make_tuple( i, j ); |
277 |
|
error_count ++; |
278 |
|
} |
279 |
|
} |
280 |
|
} |
281 |
|
if ( error_count ) |
282 |
|
{ |
283 |
|
printf( "total error count %d\n", error_count ); |
284 |
|
} |
285 |
|
return error_count; |
286 |
|
}; |
287 |
|
|
288 |
|
|
289 |
|
template<typename T> |
290 |
|
int hmlp_count_error |
291 |
|
( |
292 |
|
int m, int n, |
293 |
|
T *A, int lda, int loa, |
294 |
|
T *B, int ldb, int lob, |
295 |
|
int batchSize |
296 |
|
) |
297 |
|
{ |
298 |
|
int error_count = 0; |
299 |
|
for ( int b = 0; b < batchSize; b ++ ) |
300 |
|
{ |
301 |
|
error_count += |
302 |
|
hmlp_count_error |
303 |
|
( |
304 |
|
m, n, |
305 |
|
A + b * loa, lda, |
306 |
|
B + b * lob, ldb |
307 |
|
); |
308 |
|
} |
309 |
|
if ( error_count ) |
310 |
|
{ |
311 |
|
printf( "total error count %d\n", error_count ); |
312 |
|
} |
313 |
|
return error_count; |
314 |
|
}; |
315 |
|
|
316 |
|
|
317 |
|
|
318 |
|
template<bool IGNOREZERO=false, bool COLUMNINDEX=true, typename T> |
319 |
|
void hmlp_printmatrix( int m, int n, T *A, int lda ) |
320 |
|
{ |
321 |
|
if ( COLUMNINDEX ) |
322 |
|
{ |
323 |
|
for ( int j = 0; j < n; j ++ ) |
324 |
|
{ |
325 |
|
if ( j % 5 == 0 || j == 0 || j == n - 1 ) |
326 |
|
{ |
327 |
|
if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value ) |
328 |
|
{ |
329 |
|
printf( "col[%10d] ", j ); |
330 |
|
} |
331 |
|
else |
332 |
|
{ |
333 |
|
printf( "col[%4d] ", j ); |
334 |
|
} |
335 |
|
} |
336 |
|
else |
337 |
|
{ |
338 |
|
if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value ) |
339 |
|
{ |
340 |
|
printf( " " ); |
341 |
|
} |
342 |
|
else |
343 |
|
{ |
344 |
|
printf( " " ); |
345 |
|
} |
346 |
|
} |
347 |
|
} |
348 |
|
printf( "\n" ); |
349 |
|
if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value ) |
350 |
|
{ |
351 |
|
printf( "===============================================================================\n" ); |
352 |
|
} |
353 |
|
else |
354 |
|
{ |
355 |
|
printf( "===========================================================\n" ); |
356 |
|
} |
357 |
|
} |
358 |
|
printf( "A = [\n" ); |
359 |
|
for ( int i = 0; i < m; i ++ ) |
360 |
|
{ |
361 |
|
for ( int j = 0; j < n; j ++ ) |
362 |
|
{ |
363 |
|
if ( is_same<T, pair<double, size_t>>::value ) |
364 |
|
{ |
365 |
|
auto* A_pair = reinterpret_cast<pair<double, size_t>*>( A ); |
366 |
|
printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second ); |
367 |
|
} |
368 |
|
else if ( is_same<T, pair<float, size_t>>::value ) |
369 |
|
{ |
370 |
|
auto* A_pair = reinterpret_cast<pair<float, size_t>*>( A ); |
371 |
|
printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second ); |
372 |
|
} |
373 |
|
else if ( is_same<T, double>::value ) |
374 |
|
{ |
375 |
|
auto* A_double = reinterpret_cast<double*>( A ); |
376 |
|
if ( std::fabs( A_double[ j * lda + i ] ) < 1E-15 ) |
377 |
|
{ |
378 |
|
printf( " " ); |
379 |
|
} |
380 |
|
else |
381 |
|
{ |
382 |
|
printf( "% .4E ", (double) A_double[ j * lda + i ] ); |
383 |
|
} |
384 |
|
} |
385 |
|
else if ( is_same<T, double>::value ) |
386 |
|
{ |
387 |
|
auto* A_float = reinterpret_cast<float*>( A ); |
388 |
|
if ( std::fabs( A_float[ j * lda + i ] ) < 1E-15 ) |
389 |
|
{ |
390 |
|
printf( " " ); |
391 |
|
} |
392 |
|
else |
393 |
|
{ |
394 |
|
printf( "% .4E ", (double) A_float[ j * lda + i ] ); |
395 |
|
} |
396 |
|
} |
397 |
|
} |
398 |
|
printf(";\n"); |
399 |
|
} |
400 |
|
printf("];\n"); |
401 |
|
}; |
402 |
|
|
403 |
|
|
404 |
|
//__host__ __device__ |
405 |
|
static inline int hmlp_ceildiv( int x, int y ) |
406 |
|
{ |
407 |
|
return ( x + y - 1 ) / y; |
408 |
|
} |
409 |
|
|
410 |
|
static inline int hmlp_read_nway_from_env( const char* env ) |
411 |
|
{ |
412 |
|
int number = 1; |
413 |
|
char* str = getenv( env ); |
414 |
|
if( str != NULL ) |
415 |
|
{ |
416 |
|
number = strtol( str, NULL, 10 ); |
417 |
|
} |
418 |
|
return number; |
419 |
|
}; |
420 |
|
|
421 |
|
/** |
422 |
|
* @brief A swap function. Just in case we do not have one. |
423 |
|
* (for GSKNN) |
424 |
|
*/ |
425 |
|
template<typename T> |
426 |
|
inline void swap( T *x, int i, int j ) |
427 |
|
{ |
428 |
|
T tmp = x[ i ]; |
429 |
|
x[ i ] = x[ j ]; |
430 |
|
x[ j ] = tmp; |
431 |
|
}; |
432 |
|
|
433 |
|
/** |
434 |
|
* @brief This function is called after the root of the heap |
435 |
|
* is replaced by an new candidate. We need to readjust |
436 |
|
* such the heap condition is satisfied. |
437 |
|
*/ |
438 |
|
template<typename T> |
439 |
|
inline void heap_adjust |
440 |
|
( |
441 |
|
T *D, |
442 |
|
int s, |
443 |
|
int n, |
444 |
|
int *I |
445 |
|
) |
446 |
|
{ |
447 |
|
int j; |
448 |
|
|
449 |
|
while ( 2 * s + 1 < n ) |
450 |
|
{ |
451 |
|
j = 2 * s + 1; |
452 |
|
if ( ( j + 1 ) < n ) |
453 |
|
{ |
454 |
|
if ( D[ j ] < D[ j + 1 ] ) j ++; |
455 |
|
} |
456 |
|
if ( D[ s ] < D[ j ] ) |
457 |
|
{ |
458 |
|
swap<T>( D, s, j ); |
459 |
|
swap<int>( I, s, j ); |
460 |
|
s = j; |
461 |
|
} |
462 |
|
else break; |
463 |
|
} |
464 |
|
} |
465 |
|
|
466 |
|
template<typename T> |
467 |
|
inline void heap_select |
468 |
|
( |
469 |
|
int m, |
470 |
|
int r, |
471 |
|
T *x, |
472 |
|
int *alpha, |
473 |
|
T *D, |
474 |
|
int *I |
475 |
|
) |
476 |
|
{ |
477 |
|
int i; |
478 |
|
|
479 |
|
for ( i = 0; i < m; i ++ ) |
480 |
|
{ |
481 |
|
if ( x[ i ] > D[ 0 ] ) |
482 |
|
{ |
483 |
|
continue; |
484 |
|
} |
485 |
|
else // Replace the root with the new query. |
486 |
|
{ |
487 |
|
D[ 0 ] = x[ i ]; |
488 |
|
I[ 0 ] = alpha[ i ]; |
489 |
|
heap_adjust<T>( D, 0, r, I ); |
490 |
|
} |
491 |
|
} |
492 |
|
}; |
493 |
|
|
494 |
|
|
495 |
|
template<typename T> |
496 |
|
void HeapAdjust |
497 |
|
( |
498 |
|
size_t s, size_t n, |
499 |
|
std::pair<T, size_t> *NN |
500 |
|
) |
501 |
|
{ |
502 |
|
while ( 2 * s + 1 < n ) |
503 |
|
{ |
504 |
|
size_t j = 2 * s + 1; |
505 |
|
if ( ( j + 1 ) < n ) |
506 |
|
{ |
507 |
|
if ( NN[ j ].first < NN[ j + 1 ].first ) j ++; |
508 |
|
} |
509 |
|
if ( NN[ s ].first < NN[ j ].first ) |
510 |
|
{ |
511 |
|
std::swap( NN[ s ], NN[ j ] ); |
512 |
|
s = j; |
513 |
|
} |
514 |
|
else break; |
515 |
|
} |
516 |
|
}; /** end HeapAdjust() */ |
517 |
|
|
518 |
|
template<typename T> |
519 |
|
void HeapSelect |
520 |
|
( |
521 |
|
size_t n, size_t k, |
522 |
|
std::pair<T, size_t> *Query, |
523 |
|
std::pair<T, size_t> *NN |
524 |
|
) |
525 |
|
{ |
526 |
|
for ( size_t i = 0; i < n; i ++ ) |
527 |
|
{ |
528 |
|
if ( Query[ i ].first > NN[ 0 ].first ) |
529 |
|
{ |
530 |
|
continue; |
531 |
|
} |
532 |
|
else // Replace the root with the new query. |
533 |
|
{ |
534 |
|
NN[ 0 ] = Query[ i ]; |
535 |
|
HeapAdjust<T>( 0, k, NN ); |
536 |
|
} |
537 |
|
} |
538 |
|
}; /** end HeapSelect() */ |
539 |
|
|
540 |
|
|
541 |
|
/** |
542 |
|
* @brief A bubble sort for reference. |
543 |
|
*/ |
544 |
|
template<typename T> |
545 |
|
void bubble_sort |
546 |
|
( |
547 |
|
int n, |
548 |
|
T *D, |
549 |
|
int *I |
550 |
|
) |
551 |
|
{ |
552 |
|
int i, j; |
553 |
|
|
554 |
|
for ( i = 0; i < n - 1; i ++ ) |
555 |
|
{ |
556 |
|
for ( j = 0; j < n - 1 - i; j ++ ) |
557 |
|
{ |
558 |
|
if ( D[ j ] > D[ j + 1 ] ) |
559 |
|
{ |
560 |
|
swap<T>( D, j, j + 1 ); |
561 |
|
swap<int>( I, j, j + 1 ); |
562 |
|
} |
563 |
|
} |
564 |
|
} |
565 |
|
}; /** end bubble_sort() */ |
566 |
|
|
567 |
|
|
568 |
|
/** |
569 |
|
* |
570 |
|
* |
571 |
|
**/ |
572 |
|
class Statistic |
573 |
|
{ |
574 |
|
public: |
575 |
|
|
576 |
|
Statistic() |
577 |
|
{ |
578 |
|
_num = 0; |
579 |
|
_max = std::numeric_limits<double>::min(); |
580 |
|
_min = std::numeric_limits<double>::max(); |
581 |
|
_avg = 0.0; |
582 |
|
}; |
583 |
|
|
584 |
|
std::size_t _num; |
585 |
|
|
586 |
|
double _max; |
587 |
|
|
588 |
|
double _min; |
589 |
|
|
590 |
|
double _avg; |
591 |
|
|
592 |
|
void Update( double query ) |
593 |
|
{ |
594 |
|
// Compute the sum |
595 |
|
_avg = _num * _avg; |
596 |
|
_num += 1; |
597 |
|
_max = std::max( _max, query ); |
598 |
|
_min = std::min( _min, query ); |
599 |
|
_avg += query; |
600 |
|
_avg /= _num; |
601 |
|
}; |
602 |
|
|
603 |
|
void Print() |
604 |
|
{ |
605 |
|
printf( "num %5lu min %.1E max %.1E avg %.1E\n", _num, _min, _max, _avg ); |
606 |
|
}; |
607 |
|
|
608 |
|
}; /** end class Statistic */ |
609 |
|
|
610 |
|
}; /** end namespace hmlp */ |
611 |
|
|
612 |
|
#endif /** define HMLP_UTIL_HPP */ |