Head

GCC Code Coverage Report

Directory:	.		Exec	Total	Coverage
File:	frame/base/util.hpp	Lines:	0	57	0.0 %
Date:	2019-01-14	Branches:	0	60	0.0 %


/**
 *  HMLP (High-Performance Machine Learning Primitives)
 *
 *  Copyright (C) 2014-2017, The University of Texas at Austin
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see the LICENSE file.
 *
 **/


#ifndef HMLP_UTIL_HPP
#define HMLP_UTIL_HPP

#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <tuple>
#include <limits>
#include <algorithm>
#include <type_traits>
#include <cstdint>
#include <omp.h>


#include <hmlp.h>

#define HANDLE_ERROR( err ) (hmlp::handleError( err, __FILE__, __LINE__ ))



using namespace std;


namespace hmlp
{

/**
 *  \breif Handling runtime error with information.
 */
void handleError( hmlpError_t error, const char* file, int line );

/**
 *  \brief The default function to allocate memory for HMLP.
 *         Memory allocated by this function is aligned. Most of the
 *         HMLP primitives require memory alignment.
 */
template<int ALIGN_SIZE, typename T>
T *hmlp_malloc( int m, int n, int size )
{
  T *ptr = NULL;
#ifdef HMLP_MIC_AVX512
  int err = hbw_posix_memalign( (void**)&ptr, (size_t)ALIGN_SIZE, size * m * n );
#else
  int err = posix_memalign( (void**)&ptr, (size_t)ALIGN_SIZE, size * m * n );
#endif

  if ( err )
  {
    printf( "hmlp_malloc(): posix_memalign() failures\n" );
    exit( 1 );
  }

  return ptr;
};

/** @brief Another interface. */
template<int ALIGN_SIZE, typename T>
T *hmlp_malloc( int n )
{
  return hmlp_malloc<ALIGN_SIZE, T>( n, 1, sizeof(T) );
};

/**
 *  @brief Free the aligned memory.
 */
template<typename T>
void hmlp_free( T *ptr )
{
#ifdef HMLP_MIC_AVX512
  if ( ptr ) hbw_free( ptr );
#else
  if ( ptr ) free( ptr );
#endif
};

template<typename T>
void hmlp_print_binary( T number )
{
  char binary[ 33 ];

  for ( int i = 31; i >= 0; i -- )
  {
    if ( i % 5 ) printf( " " );
    else         printf( "%d", i / 5 );
  }
  printf( "\n" );


  for ( size_t i = 0; i < sizeof(T) * 4; i ++ )
  {
    if ( number & 1 ) binary[ 31 - i ] = '1';
    else              binary[ 31 - i ] = '0';
    number >>= 1;
    if ( i == 31 ) break;
  }
  binary[ 32 ] = '\0';

  //size_t i = 0;

  //while ( number )
  //{
  //  if ( i < 32 && ( number & 1 )  ) binary[ i ] = '1';
  //  else                             binary[ i ] = '0';
  //  //if ( number & 1) printf( "1" );
  //  //else             printf( "0" );
  //  number >>= 1;
  //  i ++;
  //}
  //binary[ 32 ] = 3;
  //printf("\n");
  printf( "%s\n", binary );
};



/**
 *  @brief Split into m x n, get the subblock starting from ith row
 *         and jth column. (for STRASSEN)
 */
template<typename T>
void hmlp_acquire_mpart
(
  hmlpOperation_t transX,
  int m,
  int n,
  T *src_buff,
  int lda,
  int x,
  int y,
  int i,
  int j,
  T **dst_buff
)
{
  //printf( "m: %d, n: %d, lda: %d, x: %d, y: %d, i: %d, j: %d\n", m, n, lda, x, y, i, j );
  if ( transX == HMLP_OP_N )
  {
    *dst_buff = &src_buff[ ( m / x * i ) + ( n / y * j ) * lda ]; //src( m/x*i, n/y*j )
  }
  else
  {
    *dst_buff = &src_buff[ ( m / x * i ) * lda + ( n / y * j ) ]; //src( m/x*i, n/y*j )
    /* x,y,i,j split partition dimension and id after the transposition */
  }
};

template<typename T>
T hmlp_norm
(
  int m, int n,
  T *A, int lda
)
{
  T nrm2 = 0.0;
  for ( int j = 0; j < n; j ++ )
  {
    for ( int i = 0; i < m; i ++ )
    {
      nrm2 += A[ j * lda + i ] * A[ j * lda + i ];
    }
  }
  return std::sqrt( nrm2 );
}; // end hmlp_norm()


template<typename TA, typename TB>
TB hmlp_relative_error
(
  int m, int n,
  TA *A, int lda,
  TB *B, int ldb
)
{
  TB nrmB = 0.0;
  TB nrm2 = 0.0;

  std::tuple<int, int, TB> max_error( -1, -1, 0.0 );
  for ( int j = 0; j < n; j ++ )
  {
    for ( int i = 0; i < m; i ++ )
    {
      TA a = A[ j * lda + i ];
      TB b = B[ j * ldb + i ];
      TB r = a - b;
      nrmB += b * b;
      nrm2 += r * r;
      if ( r * r > std::get<2>( max_error ) )
      {
        max_error = std::make_tuple( i, j, r );
      }
    }
  }
  nrm2 = std::sqrt( nrm2 ) / std::sqrt( nrmB );

  if ( nrm2 > 1E-7 )
  {
    printf( "relative error % .2E maxinum elemenwise error % .2E ( %d, %d )\n",
        nrm2,
        std::get<2>( max_error ),
        std::get<0>( max_error ), std::get<1>( max_error ) );
  }

  return nrm2;
};


template<typename TA, typename TB>
TB hmlp_relative_error
(
  int m, int n,
  TA *A, int lda, int loa,
  TB *B, int ldb, int lob,
  int batchSize
)
{
  TB err = 0.0;
  for ( int b = 0; b < batchSize; b ++ )
  {
    err +=
    hmlp_relative_error
    (
      m, n,
      A + b * loa, lda,
      B + b * lob, ldb
    );
  }
  if ( err > 1E-7 )
  {
    printf( "average relative error % .2E\n", err / batchSize );
  }
  return err;
};



template<typename T>
int hmlp_count_error
(
  int m, int n,
  T *A, int lda,
  T *B, int ldb
)
{
  int error_count = 0;
  std::tuple<int, int> err_location( -1, -1 );

  for ( int j = 0; j < n; j ++ )
  {
    for ( int i = 0; i < m; i ++ )
    {
      T a = A[ j * lda + i ];
      T b = B[ j * ldb + i ];
      if ( a != b )
      {
        err_location = std::make_tuple( i, j );
        error_count ++;
      }
    }
  }
  if ( error_count )
  {
    printf( "total error count %d\n", error_count );
  }
  return error_count;
};


template<typename T>
int hmlp_count_error
(
  int m, int n,
  T *A, int lda, int loa,
  T *B, int ldb, int lob,
  int batchSize
)
{
  int error_count = 0;
  for ( int b = 0; b < batchSize; b ++ )
  {
    error_count +=
    hmlp_count_error
    (
      m, n,
      A + b * loa, lda,
      B + b * lob, ldb
    );
  }
  if ( error_count )
  {
    printf( "total error count %d\n", error_count );
  }
  return error_count;
};



template<bool IGNOREZERO=false, bool COLUMNINDEX=true, typename T>
void hmlp_printmatrix( int m, int n, T *A, int lda )
{
  if ( COLUMNINDEX )
  {
    for ( int j = 0; j < n; j ++ )
    {
      if ( j % 5 == 0 || j == 0 || j == n - 1 )
      {
        if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value )
        {
          printf( "col[%10d] ", j );
        }
        else
        {
          printf( "col[%4d] ", j );
        }
      }
      else
      {
        if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value )
        {
          printf( "                " );
        }
        else
        {
          printf( "          " );
        }
      }
    }
    printf( "\n" );
    if ( is_same<T, pair<double, size_t>>::value || is_same<T, pair<float, size_t>>::value )
    {
      printf( "===============================================================================\n" );
    }
    else
    {
      printf( "===========================================================\n" );
    }
  }
  printf( "A = [\n" );
  for ( int i = 0; i < m; i ++ )
  {
    for ( int j = 0; j < n; j ++ )
    {
      if ( is_same<T, pair<double, size_t>>::value )
      {
        auto* A_pair = reinterpret_cast<pair<double, size_t>*>( A );
        printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second );
      }
      else if ( is_same<T, pair<float, size_t>>::value )
      {
        auto* A_pair = reinterpret_cast<pair<float, size_t>*>( A );
        printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second );
      }
      else if ( is_same<T, double>::value )
      {
        auto* A_double = reinterpret_cast<double*>( A );
        if ( std::fabs( A_double[ j * lda + i ] ) < 1E-15 )
        {
          printf( "          " );
        }
        else
        {
          printf( "% .4E ", (double) A_double[ j * lda + i ] );
        }
      }
      else if ( is_same<T, double>::value )
      {
        auto* A_float = reinterpret_cast<float*>( A );
        if ( std::fabs( A_float[ j * lda + i ] ) < 1E-15 )
        {
          printf( "          " );
        }
        else
        {
          printf( "% .4E ", (double) A_float[ j * lda + i ] );
        }
      }
    }
    printf(";\n");
  }
  printf("];\n");
};


//__host__ __device__
static inline int hmlp_ceildiv( int x, int y )
{
    return ( x + y - 1 ) / y;
}

static inline int hmlp_read_nway_from_env( const char* env )
{
  int number = 1;
  char* str = getenv( env );
  if( str != NULL )
  {
    number = strtol( str, NULL, 10 );
  }
  return number;
};

/**
 *  @brief A swap function. Just in case we do not have one.
 *         (for GSKNN)
 */
template<typename T>
inline void swap( T *x, int i, int j )
{
  T tmp = x[ i ];
  x[ i ] = x[ j ];
  x[ j ] = tmp;
};

/**
 *  @brief This function is called after the root of the heap
 *         is replaced by an new candidate. We need to readjust
 *         such the heap condition is satisfied.
 */
template<typename T>
inline void heap_adjust
(
  T *D,
  int s,
  int n,
  int *I
)
{
  int j;

  while ( 2 * s + 1 < n )
  {
    j = 2 * s + 1;
    if ( ( j + 1 ) < n )
    {
      if ( D[ j ] < D[ j + 1 ] ) j ++;
    }
    if ( D[ s ] < D[ j ] )
    {
      swap<T>( D, s, j );
      swap<int>( I, s, j );
      s = j;
    }
    else break;
  }
}

template<typename T>
inline void heap_select
(
  int m,
  int r,
  T *x,
  int *alpha,
  T *D,
  int *I
)
{
  int i;

  for ( i = 0; i < m; i ++ )
  {
    if ( x[ i ] > D[ 0 ] )
    {
      continue;
    }
    else // Replace the root with the new query.
    {
      D[ 0 ] = x[ i ];
      I[ 0 ] = alpha[ i ];
      heap_adjust<T>( D, 0, r, I );
    }
  }
};


template<typename T>
void HeapAdjust
(
  size_t s, size_t n,
  std::pair<T, size_t> *NN
)
{
  while ( 2 * s + 1 < n )
  {
    size_t j = 2 * s + 1;
    if ( ( j + 1 ) < n )
    {
      if ( NN[ j ].first < NN[ j + 1 ].first ) j ++;
    }
    if ( NN[ s ].first < NN[ j ].first )
    {
      std::swap( NN[ s ], NN[ j ] );
      s = j;
    }
    else break;
  }
}; /** end HeapAdjust() */

template<typename T>
void HeapSelect
(
  size_t n, size_t k,
  std::pair<T, size_t> *Query,
  std::pair<T, size_t> *NN
)
{
  for ( size_t i = 0; i < n; i ++ )
  {
    if ( Query[ i ].first > NN[ 0 ].first )
    {
      continue;
    }
    else // Replace the root with the new query.
    {
      NN[ 0 ] = Query[ i ];
      HeapAdjust<T>( 0, k, NN );
    }
  }
}; /** end HeapSelect() */


/**
 *  @brief A bubble sort for reference.
 */
template<typename T>
void bubble_sort
(
  int n,
  T *D,
  int *I
)
{
  int i, j;

  for ( i = 0; i < n - 1; i ++ )
  {
    for ( j = 0; j < n - 1 - i; j ++ )
    {
      if ( D[ j ] > D[ j + 1 ] )
      {
        swap<T>( D, j, j + 1 );
        swap<int>( I, j, j + 1 );
      }
    }
  }
}; /** end bubble_sort() */


/**
 *
 *
 **/
class Statistic
{
  public:

    Statistic()
    {
      _num = 0;
      _max = std::numeric_limits<double>::min();
      _min = std::numeric_limits<double>::max();
      _avg = 0.0;
    };

    std::size_t _num;

    double _max;

    double _min;

    double _avg;

    void Update( double query )
    {
      // Compute the sum
      _avg = _num * _avg;
      _num += 1;
      _max = std::max( _max, query );
      _min = std::min( _min, query );
      _avg += query;
      _avg /= _num;
    };

    void Print()
    {
      printf( "num %5lu min %.1E max %.1E avg %.1E\n", _num, _min, _max, _avg );
    };

}; /** end class Statistic */

}; /** end namespace hmlp */

#endif /** define HMLP_UTIL_HPP */


Generated by: GCOVR (Version 3.2)

Line	Exec	Source
1		/**
2		* HMLP (High-Performance Machine Learning Primitives)
3		*
4		* Copyright (C) 2014-2017, The University of Texas at Austin
5		*
6		* This program is free software: you can redistribute it and/or modify
7		* it under the terms of the GNU General Public License as published by
8		* the Free Software Foundation, either version 3 of the License, or
9		* (at your option) any later version.
10		*
11		* This program is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		* GNU General Public License for more details.
15		*
16		* You should have received a copy of the GNU General Public License
17		* along with this program. If not, see the LICENSE file.
18		*
19		**/
20
21
22		#ifndef HMLP_UTIL_HPP
23		#define HMLP_UTIL_HPP
24
25		#include <stdio.h>
26		#include <stdlib.h>
27		#include <cmath>
28		#include <tuple>
29		#include <limits>
30		#include <algorithm>
31		#include <type_traits>
32		#include <cstdint>
33		#include <omp.h>
34
35
36		#include <hmlp.h>
37
38		#define HANDLE_ERROR( err ) (hmlp::handleError( err, __FILE__, __LINE__ ))
39
40
41
42		using namespace std;
43
44
45		namespace hmlp
46		{
47
48		/**
49		* \breif Handling runtime error with information.
50		*/
51		void handleError( hmlpError_t error, const char* file, int line );
52
53		/**
54		* \brief The default function to allocate memory for HMLP.
55		* Memory allocated by this function is aligned. Most of the
56		* HMLP primitives require memory alignment.
57		*/
58		template<int ALIGN_SIZE, typename T>
59		T *hmlp_malloc( int m, int n, int size )
60		{
61		T *ptr = NULL;
62		#ifdef HMLP_MIC_AVX512
63		int err = hbw_posix_memalign( (void*)&ptr, (size_t)ALIGN_SIZE, size m * n );
64		#else
65		int err = posix_memalign( (void*)&ptr, (size_t)ALIGN_SIZE, size m * n );
66		#endif
67
68		if ( err )
69		{
70		printf( "hmlp_malloc(): posix_memalign() failures\n" );
71		exit( 1 );
72		}
73
74		return ptr;
75		};
76
77		/** @brief Another interface. */
78		template<int ALIGN_SIZE, typename T>
79		T *hmlp_malloc( int n )
80		{
81		return hmlp_malloc<ALIGN_SIZE, T>( n, 1, sizeof(T) );
82		};
83
84		/**
85		* @brief Free the aligned memory.
86		*/
87		template<typename T>
88		void hmlp_free( T *ptr )
89		{
90		#ifdef HMLP_MIC_AVX512
91		if ( ptr ) hbw_free( ptr );
92		#else
93		if ( ptr ) free( ptr );
94		#endif
95		};
96
97		template<typename T>
98		void hmlp_print_binary( T number )
99		{
100		char binary[ 33 ];
101
102		for ( int i = 31; i >= 0; i -- )
103		{
104		if ( i % 5 ) printf( " " );
105		else printf( "%d", i / 5 );
106		}
107		printf( "\n" );
108
109
110		for ( size_t i = 0; i < sizeof(T) * 4; i ++ )
111		{
112		if ( number & 1 ) binary[ 31 - i ] = '1';
113		else binary[ 31 - i ] = '0';
114		number >>= 1;
115		if ( i == 31 ) break;
116		}
117		binary[ 32 ] = '\0';
118
119		//size_t i = 0;
120
121		//while ( number )
122		//{
123		// if ( i < 32 && ( number & 1 ) ) binary[ i ] = '1';
124		// else binary[ i ] = '0';
125		// //if ( number & 1) printf( "1" );
126		// //else printf( "0" );
127		// number >>= 1;
128		// i ++;
129		//}
130		//binary[ 32 ] = 3;
131		//printf("\n");
132		printf( "%s\n", binary );
133		};
134
135
136
137		/**
138		* @brief Split into m x n, get the subblock starting from ith row
139		* and jth column. (for STRASSEN)
140		*/
141		template<typename T>
142		void hmlp_acquire_mpart
143		(
144		hmlpOperation_t transX,
145		int m,
146		int n,
147		T *src_buff,
148		int lda,
149		int x,
150		int y,
151		int i,
152		int j,
153		T **dst_buff
154		)
155		{
156		//printf( "m: %d, n: %d, lda: %d, x: %d, y: %d, i: %d, j: %d\n", m, n, lda, x, y, i, j );
157		if ( transX == HMLP_OP_N )
158		{
159		dst_buff = &src_buff[ ( m / x i ) + ( n / y * j ) * lda ]; //src( m/xi, n/yj )
160		}
161		else
162		{
163		dst_buff = &src_buff[ ( m / x i ) * lda + ( n / y * j ) ]; //src( m/xi, n/yj )
164		/* x,y,i,j split partition dimension and id after the transposition */
165		}
166		};
167
168		template<typename T>
169		T hmlp_norm
170		(
171		int m, int n,
172		T *A, int lda
173		)
174		{
175		T nrm2 = 0.0;
176		for ( int j = 0; j < n; j ++ )
177		{
178		for ( int i = 0; i < m; i ++ )
179		{
180		nrm2 += A[ j * lda + i ] * A[ j * lda + i ];
181		}
182		}
183		return std::sqrt( nrm2 );
184		}; // end hmlp_norm()
185
186
187		template<typename TA, typename TB>
188		TB hmlp_relative_error
189		(
190		int m, int n,
191		TA *A, int lda,
192		TB *B, int ldb
193		)
194		{
195		TB nrmB = 0.0;
196		TB nrm2 = 0.0;
197
198		std::tuple<int, int, TB> max_error( -1, -1, 0.0 );
199		for ( int j = 0; j < n; j ++ )
200		{
201		for ( int i = 0; i < m; i ++ )
202		{
203		TA a = A[ j * lda + i ];
204		TB b = B[ j * ldb + i ];
205		TB r = a - b;
206		nrmB += b * b;
207		nrm2 += r * r;
208		if ( r * r > std::get<2>( max_error ) )
209		{
210		max_error = std::make_tuple( i, j, r );
211		}
212		}
213		}
214		nrm2 = std::sqrt( nrm2 ) / std::sqrt( nrmB );
215
216		if ( nrm2 > 1E-7 )
217		{
218		printf( "relative error % .2E maxinum elemenwise error % .2E ( %d, %d )\n",
219		nrm2,
220		std::get<2>( max_error ),
221		std::get<0>( max_error ), std::get<1>( max_error ) );
222		}
223
224		return nrm2;
225		};
226
227
228		template<typename TA, typename TB>
229		TB hmlp_relative_error
230		(
231		int m, int n,
232		TA *A, int lda, int loa,
233		TB *B, int ldb, int lob,
234		int batchSize
235		)
236		{
237		TB err = 0.0;
238		for ( int b = 0; b < batchSize; b ++ )
239		{
240		err +=
241		hmlp_relative_error
242		(
243		m, n,
244		A + b * loa, lda,
245		B + b * lob, ldb
246		);
247		}
248		if ( err > 1E-7 )
249		{
250		printf( "average relative error % .2E\n", err / batchSize );
251		}
252		return err;
253		};
254
255
256
257		template<typename T>
258		int hmlp_count_error
259		(
260		int m, int n,
261		T *A, int lda,
262		T *B, int ldb
263		)
264		{
265		int error_count = 0;
266		std::tuple<int, int> err_location( -1, -1 );
267
268		for ( int j = 0; j < n; j ++ )
269		{
270		for ( int i = 0; i < m; i ++ )
271		{
272		T a = A[ j * lda + i ];
273		T b = B[ j * ldb + i ];
274		if ( a != b )
275		{
276		err_location = std::make_tuple( i, j );
277		error_count ++;
278		}
279		}
280		}
281		if ( error_count )
282		{
283		printf( "total error count %d\n", error_count );
284		}
285		return error_count;
286		};
287
288
289		template<typename T>
290		int hmlp_count_error
291		(
292		int m, int n,
293		T *A, int lda, int loa,
294		T *B, int ldb, int lob,
295		int batchSize
296		)
297		{
298		int error_count = 0;
299		for ( int b = 0; b < batchSize; b ++ )
300		{
301		error_count +=
302		hmlp_count_error
303		(
304		m, n,
305		A + b * loa, lda,
306		B + b * lob, ldb
307		);
308		}
309		if ( error_count )
310		{
311		printf( "total error count %d\n", error_count );
312		}
313		return error_count;
314		};
315
316
317
318		template<bool IGNOREZERO=false, bool COLUMNINDEX=true, typename T>
319		void hmlp_printmatrix( int m, int n, T *A, int lda )
320		{
321		if ( COLUMNINDEX )
322		{
323		for ( int j = 0; j < n; j ++ )
324		{
325		if ( j % 5 == 0 \|\| j == 0 \|\| j == n - 1 )
326		{
327		if ( is_same<T, pair<double, size_t>>::value \|\| is_same<T, pair<float, size_t>>::value )
328		{
329		printf( "col[%10d] ", j );
330		}
331		else
332		{
333		printf( "col[%4d] ", j );
334		}
335		}
336		else
337		{
338		if ( is_same<T, pair<double, size_t>>::value \|\| is_same<T, pair<float, size_t>>::value )
339		{
340		printf( " " );
341		}
342		else
343		{
344		printf( " " );
345		}
346		}
347		}
348		printf( "\n" );
349		if ( is_same<T, pair<double, size_t>>::value \|\| is_same<T, pair<float, size_t>>::value )
350		{
351		printf( "===============================================================================\n" );
352		}
353		else
354		{
355		printf( "===========================================================\n" );
356		}
357		}
358		printf( "A = [\n" );
359		for ( int i = 0; i < m; i ++ )
360		{
361		for ( int j = 0; j < n; j ++ )
362		{
363		if ( is_same<T, pair<double, size_t>>::value )
364		{
365		auto* A_pair = reinterpret_cast<pair<double, size_t>*>( A );
366		printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second );
367		}
368		else if ( is_same<T, pair<float, size_t>>::value )
369		{
370		auto* A_pair = reinterpret_cast<pair<float, size_t>*>( A );
371		printf( "(% .1E,%5lu)", (double) A_pair[ j * lda + i ].first, A_pair[ j * lda + i ].second );
372		}
373		else if ( is_same<T, double>::value )
374		{
375		auto* A_double = reinterpret_cast<double*>( A );
376		if ( std::fabs( A_double[ j * lda + i ] ) < 1E-15 )
377		{
378		printf( " " );
379		}
380		else
381		{
382		printf( "% .4E ", (double) A_double[ j * lda + i ] );
383		}
384		}
385		else if ( is_same<T, double>::value )
386		{
387		auto* A_float = reinterpret_cast<float*>( A );
388		if ( std::fabs( A_float[ j * lda + i ] ) < 1E-15 )
389		{
390		printf( " " );
391		}
392		else
393		{
394		printf( "% .4E ", (double) A_float[ j * lda + i ] );
395		}
396		}
397		}
398		printf(";\n");
399		}
400		printf("];\n");
401		};
402
403
404		//__host__ __device__
405		static inline int hmlp_ceildiv( int x, int y )
406		{
407		return ( x + y - 1 ) / y;
408		}
409
410		static inline int hmlp_read_nway_from_env( const char* env )
411		{
412		int number = 1;
413		char* str = getenv( env );
414		if( str != NULL )
415		{
416		number = strtol( str, NULL, 10 );
417		}
418		return number;
419		};
420
421		/**
422		* @brief A swap function. Just in case we do not have one.
423		* (for GSKNN)
424		*/
425		template<typename T>
426		inline void swap( T *x, int i, int j )
427		{
428		T tmp = x[ i ];
429		x[ i ] = x[ j ];
430		x[ j ] = tmp;
431		};
432
433		/**
434		* @brief This function is called after the root of the heap
435		* is replaced by an new candidate. We need to readjust
436		* such the heap condition is satisfied.
437		*/
438		template<typename T>
439		inline void heap_adjust
440		(
441		T *D,
442		int s,
443		int n,
444		int *I
445		)
446		{
447		int j;
448
449		while ( 2 * s + 1 < n )
450		{
451		j = 2 * s + 1;
452		if ( ( j + 1 ) < n )
453		{
454		if ( D[ j ] < D[ j + 1 ] ) j ++;
455		}
456		if ( D[ s ] < D[ j ] )
457		{
458		swap<T>( D, s, j );
459		swap<int>( I, s, j );
460		s = j;
461		}
462		else break;
463		}
464		}
465
466		template<typename T>
467		inline void heap_select
468		(
469		int m,
470		int r,
471		T *x,
472		int *alpha,
473		T *D,
474		int *I
475		)
476		{
477		int i;
478
479		for ( i = 0; i < m; i ++ )
480		{
481		if ( x[ i ] > D[ 0 ] )
482		{
483		continue;
484		}
485		else // Replace the root with the new query.
486		{
487		D[ 0 ] = x[ i ];
488		I[ 0 ] = alpha[ i ];
489		heap_adjust<T>( D, 0, r, I );
490		}
491		}
492		};
493
494
495		template<typename T>
496		void HeapAdjust
497		(
498		size_t s, size_t n,
499		std::pair<T, size_t> *NN
500		)
501		{
502		while ( 2 * s + 1 < n )
503		{
504		size_t j = 2 * s + 1;
505		if ( ( j + 1 ) < n )
506		{
507		if ( NN[ j ].first < NN[ j + 1 ].first ) j ++;
508		}
509		if ( NN[ s ].first < NN[ j ].first )
510		{
511		std::swap( NN[ s ], NN[ j ] );
512		s = j;
513		}
514		else break;
515		}
516		}; /** end HeapAdjust() */
517
518		template<typename T>
519		void HeapSelect
520		(
521		size_t n, size_t k,
522		std::pair<T, size_t> *Query,
523		std::pair<T, size_t> *NN
524		)
525		{
526		for ( size_t i = 0; i < n; i ++ )
527		{
528		if ( Query[ i ].first > NN[ 0 ].first )
529		{
530		continue;
531		}
532		else // Replace the root with the new query.
533		{
534		NN[ 0 ] = Query[ i ];
535		HeapAdjust<T>( 0, k, NN );
536		}
537		}
538		}; /** end HeapSelect() */
539
540
541		/**
542		* @brief A bubble sort for reference.
543		*/
544		template<typename T>
545		void bubble_sort
546		(
547		int n,
548		T *D,
549		int *I
550		)
551		{
552		int i, j;
553
554		for ( i = 0; i < n - 1; i ++ )
555		{
556		for ( j = 0; j < n - 1 - i; j ++ )
557		{
558		if ( D[ j ] > D[ j + 1 ] )
559		{
560		swap<T>( D, j, j + 1 );
561		swap<int>( I, j, j + 1 );
562		}
563		}
564		}
565		}; /** end bubble_sort() */
566
567
568		/**
569		*
570		*
571		**/
572		class Statistic
573		{
574		public:
575
576		Statistic()
577		{
578		_num = 0;
579		_max = std::numeric_limits<double>::min();
580		_min = std::numeric_limits<double>::max();
581		_avg = 0.0;
582		};
583
584		std::size_t _num;
585
586		double _max;
587
588		double _min;
589
590		double _avg;
591
592		void Update( double query )
593		{
594		// Compute the sum
595		_avg = _num * _avg;
596		_num += 1;
597		_max = std::max( _max, query );
598		_min = std::min( _min, query );
599		_avg += query;
600		_avg /= _num;
601		};
602
603		void Print()
604		{
605		printf( "num %5lu min %.1E max %.1E avg %.1E\n", _num, _min, _max, _avg );
606		};
607
608		}; /** end class Statistic */
609
610		}; /** end namespace hmlp */
611
612		#endif /** define HMLP_UTIL_HPP */