HMLP: High-performance Machine Learning Primitives
hmlp Namespace Reference

Namespaces

 mpi
 
 tci
 
 tree
 

Classes

class  Cache
 
class  Cache1D
 Cache1D<NSET, NWAY, T> creates a cache layer with NSET sets that directly maps a 1D array. The direct map is [ id % NSET ]. Each set has NWAY ways that are fully associative. More...
 
class  Cache2D
 
class  CacheLine
 
class  Cluster
 
class  CovReduceTask
 
class  CovTask
 
class  Data
 
class  Device
 This class describes devices or accelerators that require a master thread to control. A device can accept tasks from multiple workers. All received tasks are expected to be executed independently in a time-sharing fashion. Whether these tasks are executed in parallel, sequentially, or with some built-in context-switching scheme does not matter. More...
 
class  DeviceMemory
 
class  DistData
 
class  DistData< CIRC, CIRC, T >
 
class  DistData< RBLK, STAR, T >
 Each MPI process owns ( n / size ) rows of A in a cyclic (round-robin) fashion, i.e. with 3 MPI processes rank 0 owns rows 0, 3, 6, .... More...
 
class  DistData< RIDS, STAR, T >
 Each MPI process owns ( rids.size() ) rows of A, and rids denotes the distribution, i.e. rank i owns A(rids[0],:), A(rids[1],:), A(rids[2],:), ... More...
 
class  DistData< STAR, CBLK, T >
 
class  DistData< STAR, CIDS, T >
 Each MPI process owns ( cids.size() ) columns of A, and cids denotes the distribution, i.e. rank i owns A(:,cids[0]), A(:,cids[1]), A(:,cids[2]), ... More...
 
class  DistData< STAR, STAR, T >
 
class  DistData< STAR, USER, T >
 
class  DistDataBase
 
class  DistKernelMatrix
 
class  DistVirtualMatrix
 DistVirtualMatrix is the abstract base class for matrix-free access and operations. Most of the public functions will be virtual. To inherit DistVirtualMatrix, you "must" implement the evaluation operator. Otherwise, the code won't compile. More...
 
class  Event
 Wrapper for omp or pthread mutex. More...
 
class  FunctionBase
 
struct  gsks_ref_mrxnr
 
struct  kernel_s
 
class  KernelMatrix
 
class  Layer
 
class  Layer< FC, T >
 
class  Layer< INPUT, T >
 
class  LayerBase
 
class  ListenerTask
 This task is the abstraction for all tasks handled by Listeners. More...
 
class  Lock
 Wrapper for omp or pthread mutex. More...
 
struct  MatrifyableObject
 
struct  MatrixLike
 
class  MatrixReadWrite
 This class creates 2D grids for 2D matrix partition. More...
 
class  MessageTask
 This task is designed to take care of MPI communications. More...
 
class  MLPGaussNewton
 
class  MortonHelper
 
class  NULLTask
 This is a specific type of task that represents NOP. More...
 
class  OOCCovMatrix
 
class  OOCData
 
class  OOCSPDMatrix
 
struct  pack2D_pbxib
 
struct  pack_pbxib
 
class  PVFMMKernelMatrix
 
class  Range
 
class  range
 
class  ReadWrite
 This class provides the ability to perform dependency analysis. More...
 
class  RecvTask
 
class  Regression
 
class  RunTime
 RunTime is statically created in hmlp_runtime.cpp. More...
 
class  Scheduler
 
class  SendTask
 
class  SparseData
 
class  SPDMatrix
 This class does not need to inherit hmlp::Data<T>, but it should support two interfaces for data fetching. More...
 
class  SPDMatrixMPISupport
 
class  Statistic
 
class  Task
 
class  thread_communicator
 
struct  unpack2D_ibxjb
 
struct  unpack_ibxjb
 
class  View
 
class  VirtualFunction
 
class  VirtualMatrix
 VirtualMatrix is the abstract base class for matrix-free access and operations. Most of the public functions will be virtual. To inherit VirtualMatrix, you "must" implement the evaluation operator. Otherwise, the code won't compile. More...
 
class  VirtualNormalizedGraph
 
class  Worker
 

Enumerations

enum  CacheStatus { CACHE_CLEAN, CACHE_DIRTY }
 
enum  DeviceType { HOST, NVIDIA_GPU, OTHER_GPU, TI_DSP }
 
enum  Distribution_t {
  CBLK, RBLK, CIDS, RIDS,
  USER, STAR, CIRC
}
 
enum  TaskStatus {
  ALLOCATED, NOTREADY, QUEUED, RUNNING,
  EXECUTED, DONE, CANCELLED
}
 
enum  ReadWriteType { R, W, RW }
 
enum  SchedulePolicy { HMLP_SCHEDULE_DEFAULT, HMLP_SCHEDULE_ROUND_ROBIN, HMLP_SCHEDULE_UNIFORM, HMLP_SCHEDULE_HEFT }
 
enum  TransposeType { TRANSPOSE, NOTRANSPOSE }
 
enum  SideType { LEFT, RIGHT, BOTTOM, TOP }
 
enum  QuadrantType { TOPLEFT, TOPRIGHT, BOTTOMLEFT, BOTTOMRIGHT }
 
enum  kernel_type {
  GAUSSIAN, SIGMOID, POLYNOMIAL, LAPLACE,
  GAUSSIAN_VAR_BANDWIDTH, TANH, QUARTIC, MULTIQUADRATIC,
  EPANECHNIKOV, USER_DEFINE
}
 
enum  LayerType { INPUT, FC, CONV2D, POOLING }
 

Functions

template<typename T >
bool less_first (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
bool less_second (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
bool equal_second (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
void MergeNeighbors (size_t k, pair< T, size_t > *A, pair< T, size_t > *B, vector< pair< T, size_t >> &aux)
 
template<typename T >
void MergeNeighbors (size_t k, size_t n, vector< pair< T, size_t >> &A, vector< pair< T, size_t >> &B)
 
double xdot (int n, const double *dx, int incx, const double *dy, int incy)
 DDOT wrapper. More...
 
float xdot (int n, const float *dx, int incx, const float *dy, int incy)
 SDOT wrapper. More...
 
double xnrm2 (int n, double *x, int incx)
 DNRM2 wrapper. More...
 
float xnrm2 (int n, float *x, int incx)
 SNRM2 wrapper. More...
 
void xgemm (const char *transA, const char *transB, int m, int n, int k, double alpha, const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc)
 DGEMM wrapper. More...
 
void xgemm (const char *transA, const char *transB, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc)
 SGEMM wrapper.
 
void xsyrk (const char *uplo, const char *trans, int n, int k, double alpha, double *A, int lda, double beta, double *C, int ldc)
 
void xsyrk (const char *uplo, const char *trans, int n, int k, float alpha, float *A, int lda, float beta, float *C, int ldc)
 
void xtrsm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, double alpha, double *A, int lda, double *B, int ldb)
 DTRSM wrapper. More...
 
void xtrsm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, float alpha, float *A, int lda, float *B, int ldb)
 STRSM wrapper. More...
 
void xtrmm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, double alpha, double *A, int lda, double *B, int ldb)
 DTRMM wrapper. More...
 
void xtrmm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, float alpha, float *A, int lda, float *B, int ldb)
 STRMM wrapper. More...
 
void xlaswp (int n, double *A, int lda, int k1, int k2, int *ipiv, int incx)
 DLASWP wrapper. More...
 
void xlaswp (int n, float *A, int lda, int k1, int k2, int *ipiv, int incx)
 SLASWP wrapper. More...
 
void xpotrf (const char *uplo, int n, double *A, int lda)
 DPOTRF wrapper. More...
 
void xpotrf (const char *uplo, int n, float *A, int lda)
 SPOTRF wrapper. More...
 
void xpotrs (const char *uplo, int n, int nrhs, double *A, int lda, double *B, int ldb)
 DPOTRS wrapper. More...
 
void xpotrs (const char *uplo, int n, int nrhs, float *A, int lda, float *B, int ldb)
 SPOTRS wrapper. More...
 
void xgetrf (int m, int n, double *A, int lda, int *ipiv)
 DGETRF wrapper. More...
 
void xgetrf (int m, int n, float *A, int lda, int *ipiv)
 SGETRF wrapper. More...
 
void xgetrs (const char *trans, int m, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb)
 DGETRS wrapper. More...
 
void xgetrs (const char *trans, int m, int nrhs, float *A, int lda, int *ipiv, float *B, int ldb)
 SGETRS wrapper. More...
 
void xgecon (const char *norm, int n, double *A, int lda, double anorm, double *rcond, double *work, int *iwork)
 DGECON wrapper. More...
 
void xgecon (const char *norm, int n, float *A, int lda, float anorm, float *rcond, float *work, int *iwork)
 SGECON wrapper. More...
 
void xgeqrf (int m, int n, double *A, int lda, double *tau, double *work, int lwork)
 DGEQRF wrapper. More...
 
void xgeqrf (int m, int n, float *A, int lda, float *tau, float *work, int lwork)
 SGEQRF wrapper. More...
 
void xorgqr (int m, int n, int k, double *A, int lda, double *tau, double *work, int lwork)
 DORGQR wrapper. More...
 
void xorgqr (int m, int n, int k, float *A, int lda, float *tau, float *work, int lwork)
 SORGQR wrapper. More...
 
void xormqr (const char *side, const char *trans, int m, int n, int k, double *A, int lda, double *tau, double *C, int ldc, double *work, int lwork)
 DORMQR wrapper. More...
 
void xormqr (const char *side, const char *trans, int m, int n, int k, float *A, int lda, float *tau, float *C, int ldc, float *work, int lwork)
 SORMQR wrapper. More...
 
void xgeqp3 (int m, int n, double *A, int lda, int *jpvt, double *tau, double *work, int lwork)
 DGEQP3 wrapper. More...
 
void xgeqp3 (int m, int n, float *A, int lda, int *jpvt, float *tau, float *work, int lwork)
 SGEQP3 wrapper. More...
 
void xgeqp4 (int m, int n, double *A, int lda, int *jpvt, double *tau, double *work, int lwork)
 DGEQP4 wrapper. More...
 
void xgeqp4 (int m, int n, float *A, int lda, int *jpvt, float *tau, float *work, int lwork)
 SGEQP4 wrapper. More...
 
void xgels (const char *trans, int m, int n, int nrhs, double *A, int lda, double *B, int ldb, double *work, int lwork)
 DGELS wrapper. More...
 
void xgels (const char *trans, int m, int n, int nrhs, float *A, int lda, float *B, int ldb, float *work, int lwork)
 SGELS wrapper. More...
 
void xgesdd (const char *jobz, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *work, int lwork, int *iwork)
 DGESDD wrapper. More...
 
void xgesdd (const char *jobz, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *work, int lwork, int *iwork)
 SGESDD wrapper. More...
 
void xstev (const char *jobz, int n, double *D, double *E, double *Z, int ldz, double *work)
 
void xstev (const char *jobz, int n, float *D, float *E, float *Z, int ldz, float *work)
 
void xposv (const char *uplo, int n, int nrhs, double *A, int lda, double *B, int ldb)
 
void xposv (const char *uplo, int n, int nrhs, float *A, int lda, float *B, int ldb)
 
Device * hmlp_get_device_host ()
 
template<typename T >
void im2col (int m, int n, T *packX, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
 
template<int FOLD, bool ZEROPAD = true, typename T >
void pack2Dimg (int m, int n, T *packX, int x0, int y0, int offset, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
 Pack an image into a 2D packed buffer. Notice that here X is d-leading.
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X0, T *X1, int ldx, T gamma, int *xmap, T *packX)
 This is the default packing routine for GKMX, GSKS, GSKNN and STRASSEN.
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X0, T *X1, int ldx, T gamma, T *packX)
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X, int ldx, int *xmap, T *packX)
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X, int ldx, T *packX)
 
template<int PACK_MR, typename TA >
void packA_kcxmc (int m, int k, TA *A, int lda, int *amap, TA *packA)
 
template<int PACK_NR, typename TB >
void packB_kcxnc (int n, int k, TB *B, int ldb, int *bmap, TB *packB)
 
template<int PACK_NR, typename TC >
void packw_rhsxnc (int n, int rhs, TC *w, int ldw, int *wmap, TC *packw)
 
template<int PACK_MR, typename TC >
void packu_rhsxmc (int m, int rhs, TC *u, int ldu, int *umap, TC *packu)
 
void hmlp_msg_dependency_analysis (int key, int p, ReadWriteType type, Task *task)
 
template<typename ARG >
void RecuTaskSubmit (ARG *arg)
 Recursive task submission (base case). More...
 
template<typename ARG , typename TASK , typename... Args>
void RecuTaskSubmit (ARG *arg, TASK &dummy, Args &...dummyargs)
 Recursive task submission. More...
 
template<typename ARG >
void RecuTaskExecute (ARG *arg)
 Recursive task execution (base case). More...
 
template<typename ARG , typename TASK , typename... Args>
void RecuTaskExecute (ARG *arg, TASK &dummy, Args &...dummyargs)
 Recursive task execution. More...
 
ostream & operator<< (ostream &os, const thread_communicator &obj)
 
range GetRange (SchedulePolicy strategy, int beg, int end, int nb, int tid, int nparts)
 
range GetRange (int beg, int end, int nb, int tid, int nparts)
 
range GetRange (int beg, int end, int nb)
 
const char * getErrorString (hmlpError_t error)
 
void handleError (hmlpError_t error, const char *file, int line)
 
template<int ALIGN_SIZE, typename T >
T * hmlp_malloc (int m, int n, int size)
 The default function to allocate memory for HMLP. Memory allocated by this function is aligned. Most of the HMLP primitives require memory alignment.
 
template<int ALIGN_SIZE, typename T >
T * hmlp_malloc (int n)
 Another interface.
 
template<typename T >
void hmlp_free (T *ptr)
 Free the aligned memory.
 
template<typename T >
void hmlp_print_binary (T number)
 
template<typename T >
void hmlp_acquire_mpart (hmlpOperation_t transX, int m, int n, T *src_buff, int lda, int x, int y, int i, int j, T **dst_buff)
 Split into m x n, get the subblock starting from ith row and jth column. (for STRASSEN)
 
template<typename T >
T hmlp_norm (int m, int n, T *A, int lda)
 
template<typename TA , typename TB >
TB hmlp_relative_error (int m, int n, TA *A, int lda, TB *B, int ldb)
 
template<typename TA , typename TB >
TB hmlp_relative_error (int m, int n, TA *A, int lda, int loa, TB *B, int ldb, int lob, int batchSize)
 
template<typename T >
int hmlp_count_error (int m, int n, T *A, int lda, T *B, int ldb)
 
template<typename T >
int hmlp_count_error (int m, int n, T *A, int lda, int loa, T *B, int ldb, int lob, int batchSize)
 
template<bool IGNOREZERO = false, bool COLUMNINDEX = true, typename T >
void hmlp_printmatrix (int m, int n, T *A, int lda)
 
template<typename T >
void swap (T *x, int i, int j)
 A swap function. Just in case we do not have one. (for GSKNN)
 
template<typename T >
void heap_adjust (T *D, int s, int n, int *I)
 This function is called after the root of the heap is replaced by a new candidate. We need to readjust it so that the heap condition is satisfied.
 
template<typename T >
void heap_select (int m, int r, T *x, int *alpha, T *D, int *I)
 
template<typename T >
void HeapAdjust (size_t s, size_t n, std::pair< T, size_t > *NN)
 
template<typename T >
void HeapSelect (size_t n, size_t k, std::pair< T, size_t > *Query, std::pair< T, size_t > *NN)
 
template<typename T >
void bubble_sort (int n, T *D, int *I)
 A bubble sort for reference. More...
 
template<typename T >
void Partition1x2 (View< T > &A, View< T > &A1, View< T > &A2, size_t nb, SideType side)
 
template<typename T >
void Partition2x1 (View< T > &A, View< T > &A1, View< T > &A2, size_t mb, SideType side)
 
template<typename T >
void Partition2x2 (View< T > &A, View< T > &A11, View< T > &A12, View< T > &A21, View< T > &A22, size_t mb, size_t nb, QuadrantType quadrant)
 
template<typename T >
void Repartition1x2To1x3 (View< T > &AL, View< T > &AR, View< T > &A0, View< T > &A1, View< T > &A2, size_t nb, SideType side)
 
template<typename T >
void ContinueWith1x3To1x2 (View< T > &AL, View< T > &AR, View< T > &A0, View< T > &A1, View< T > &A2, SideType side)
 
template<typename T >
void Repartition2x1To3x1 (View< T > &AT, View< T > &A0, View< T > &A1, View< T > &AB, View< T > &A2, size_t mb, SideType side)
 
template<typename T >
void ContinueWith3x1To2x1 (View< T > &AT, View< T > &A0, View< T > &A1, View< T > &AB, View< T > &A2, SideType side)
 
template<typename T >
void Repartition2x2To3x3 (View< T > &ATL, View< T > &ATR, View< T > &A00, View< T > &A01, View< T > &A02, View< T > &A10, View< T > &A11, View< T > &A12, View< T > &ABL, View< T > &ABR, View< T > &A20, View< T > &A21, View< T > &A22, size_t mb, size_t nb, QuadrantType quadrant)
 
template<typename T >
void ContinueWith3x3To2x2 (View< T > &ATL, View< T > &ATR, View< T > &A00, View< T > &A01, View< T > &A02, View< T > &A10, View< T > &A11, View< T > &A12, View< T > &ABL, View< T > &ABR, View< T > &A20, View< T > &A21, View< T > &A22, QuadrantType quadrant)
 
template<typename VIRTUALMATRIX , typename T >
void lanczos (VIRTUALMATRIX &A, size_t n, size_t r, size_t nkrylov, std::vector< T > &Sigma, std::vector< T > &V)
 
template<int KC, typename SEMIRINGKERNEL , typename TA , typename TB , typename TV >
void rank_k_macro_kernel (tci::Comm &Comm3rd, int ic, int jc, int pc, int m, int n, int k, TA *packA, TB *packB, TV *V, int rs_v, int cs_v, SEMIRINGKERNEL semiringkernel)
 Macro kernel contains the 3rd and 2nd loops. Depending on the configuration of the communicator, the 3rd loop may be parallelized. b_next is the prefetch pointer. More...
 
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void rank_k_internal (tci::Comm &Comm6th, int batchId, int m, int n, int k, int k_stra, TA &A, TB &B, TV *V, int rs_v, int cs_v, SEMIRINGKERNEL semiringkernel)
 
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void rank_k (int batchId, int m, int n, int k, TA &A, TB &B, TC &C, SEMIRINGKERNEL semiringkernel)
 

Detailed Description

HMLP (High-Performance Machine Learning Primitives)

Copyright (C) 2014-2017, The University of Texas at Austin

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see the LICENSE file.


PVFMMKernelMatrix<T> uses VirtualMatrix<T> as its base class for GOFMM compatibility.

GOFMM templates


Enumeration Type Documentation


Enumerator (Distribution_t)

CBLK  Elemental MC

RBLK  Elemental MR

CIDS  Distributed according to column ids

RIDS  Distributed according to row ids

USER  Distributed according to user-defined maps

STAR  Elemental STAR

CIRC  Elemental CIRC

SideType values name the 1x2, 2x1, 1x3, and 3x1 partitions; QuadrantType values name the 2x2 and 3x3 partitions.

Function Documentation

template<typename T >
void hmlp::bubble_sort ( int  n,
T *  D,
int *  I 
)

A bubble sort for reference.


template<typename T >
void hmlp::ContinueWith1x3To1x2 ( View< T > &  AL,
View< T > &  AR,
View< T > &  A0,
View< T > &  A1,
View< T > &  A2,
SideType  side 
)
template<typename T >
void hmlp::ContinueWith3x1To2x1 ( View< T > &  AT,
View< T > &  A0,
View< T > &  A1,
View< T > &  AB,
View< T > &  A2,
SideType  side 
)
template<typename T >
void hmlp::ContinueWith3x3To2x2 ( View< T > &  ATL,
View< T > &  ATR,
View< T > &  A00,
View< T > &  A01,
View< T > &  A02,
View< T > &  A10,
View< T > &  A11,
View< T > &  A12,
View< T > &  ABL,
View< T > &  ABR,
View< T > &  A20,
View< T > &  A21,
View< T > &  A22,
QuadrantType  quadrant 
)
const char* hmlp::getErrorString ( hmlpError_t  error)

Translate hmlpError_t to error string.

void hmlp::handleError ( hmlpError_t  error,
const char *  file,
int  line 
)

Handle a runtime error. If the error code signals success, nothing happens; otherwise the error is reported together with the offending file and line.
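
A minimal usage sketch: wrap calls that return hmlpError_t so failures are routed through hmlp::handleError with the call site attached. HMLP_CHECK here is a hypothetical convenience macro, not part of the library.

#define HMLP_CHECK( expr ) hmlp::handleError( (expr), __FILE__, __LINE__ )

/* Example (assuming some_call() returns an hmlpError_t): */
/*   HMLP_CHECK( some_call() );                           */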

template<typename T >
void hmlp::HeapSelect ( size_t  n,
size_t  k,
std::pair< T, size_t > *  Query,
std::pair< T, size_t > *  NN 
)


hmlp::Device * hmlp::hmlp_get_device_host ( )


void hmlp::hmlp_msg_dependency_analysis ( int  key,
int  p,
hmlp::ReadWriteType  type,
hmlp::Task task 
)


template<typename T >
void hmlp::im2col ( int  m,
int  n,
T *  packX,
T *  X,
int  w0,
int  h0,
int  d0,
int  s,
int  p,
int  w1,
int  h1 
)
inline

This is the im2col_gpu() function from

   BVLC/caffe/blob/master/src/caffe/util/im2col.cpp.

   We slightly modify it.

Loop over channels (data_im += channel_size); out-of-range pixels are zero-padded.
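
The sketch below illustrates the standard im2col transform the wrapper is derived from (in the spirit of the Caffe routine cited above). It is self-contained and hedged: the variable names (channels, kernel, stride, pad) and the packed layout are generic and do not necessarily match HMLP's (m, n, w0, h0, d0, s, p, w1, h1) arguments or its exact packing order.

template<typename T>
void im2col_sketch(const T* im, int channels, int height, int width,
                   int kernel, int stride, int pad, T* col)
{
  int out_h = (height + 2 * pad - kernel) / stride + 1;
  int out_w = (width  + 2 * pad - kernel) / stride + 1;
  /* Each row of the packed buffer holds one (channel, kh, kw) position swept over the output. */
  for (int c = 0; c < channels; ++c)
    for (int kh = 0; kh < kernel; ++kh)
      for (int kw = 0; kw < kernel; ++kw)
      {
        int row = (c * kernel + kh) * kernel + kw;
        for (int oh = 0; oh < out_h; ++oh)
          for (int ow = 0; ow < out_w; ++ow)
          {
            int ih = oh * stride - pad + kh;
            int iw = ow * stride - pad + kw;
            /* Out-of-range pixels are zero-padded, matching the note above. */
            col[row * out_h * out_w + oh * out_w + ow] =
                (ih >= 0 && ih < height && iw >= 0 && iw < width)
                ? im[(c * height + ih) * width + iw] : T(0);
          }
      }
}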

template<typename VIRTUALMATRIX , typename T >
void hmlp::lanczos ( VIRTUALMATRIX &  A,
size_t  n,
size_t  r,
size_t  nkrylov,
std::vector< T > &  Sigma,
std::vector< T > &  V 
)

Implement a simple Lanczos algorithm for symmetric eigenpairs.

r <= nkrylov

symmetric tridiagonal matrix

initialize the Krylov subspace

update beta[ 0 ], although we don't use it

normalization

w = A * U( :, 0 )

update alpha[ 0 ]

update w

building the Krylov subspace and form the tridiagonal system

update beta[ iter ] = nrm2( w )

v = w / beta

w = A * U( :, iter )

update alpha[ iter ]

update w

invoke xstev to compute eigenpairs of the tridiagonal system

V' = Z' * U' (V = U * Z( :, (nkrylov - r) ) )

eigenpairs are in ascending order
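
A minimal sketch of the Lanczos steps listed above: build a Krylov basis U, accumulate the tridiagonal coefficients (alpha, beta), and hand the tridiagonal system to hmlp::xstev (documented below). The matvec functional stands in for the VIRTUALMATRIX evaluation, and the sketch omits the re-orthogonalization and the final back-transform V = U * Z that the real hmlp::lanczos performs.

#include <cmath>
#include <functional>
#include <vector>

template<typename T>
void lanczos_sketch(std::function<void(const T*, T*)> matvec,  /* w = A * u, A is n x n symmetric */
                    size_t n, size_t nkrylov,
                    std::vector<T>& alpha, std::vector<T>& beta)
{
  std::vector<T> U(n * nkrylov, 0), w(n, 0);
  alpha.assign(nkrylov, 0);
  beta.assign(nkrylov, 0);
  U[0] = 1;                                            /* initialize the Krylov subspace with e_1 */
  for (size_t j = 0; j < nkrylov; ++j)
  {
    T* u = &U[j * n];
    matvec(u, w.data());                               /* w = A * U(:, j) */
    for (size_t i = 0; i < n; ++i) alpha[j] += u[i] * w[i];       /* update alpha[j] */
    for (size_t i = 0; i < n; ++i)                     /* update w = w - alpha[j]*u_j - beta[j]*u_{j-1} */
      w[i] -= alpha[j] * u[i] + (j ? beta[j] * U[(j - 1) * n + i] : T(0));
    if (j + 1 == nkrylov) break;
    T nrm = 0;
    for (size_t i = 0; i < n; ++i) nrm += w[i] * w[i];
    beta[j + 1] = std::sqrt(nrm);                      /* beta[j+1] = nrm2(w) */
    for (size_t i = 0; i < n; ++i) U[(j + 1) * n + i] = w[i] / beta[j + 1];
  }
  /* Eigenpairs of the tridiagonal system (D = alpha, E = beta[1:]) then follow from
     hmlp::xstev("V", nkrylov, D, E, Z, ldz, work); they are returned in ascending order. */
}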

template<typename T >
bool hmlp::less_first ( const pair< T, size_t > &  a,
const pair< T, size_t > &  b 
)


template<typename T >
void hmlp::MergeNeighbors ( size_t  k,
pair< T, size_t > *  A,
pair< T, size_t > *  B,
vector< pair< T, size_t >> &  aux 
)

Enlarge temporary buffer if it is too small.
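
A sketch of the MergeNeighbors semantics (not necessarily the HMLP implementation): merge two length-k neighbor lists of (distance, gid) pairs, drop candidates with duplicate gids, and keep the k nearest in A. The comparators mirror less_first/equal_second listed above.

#include <algorithm>
#include <utility>
#include <vector>

template<typename T>
void merge_neighbors_sketch(size_t k,
                            std::pair<T, size_t>* A,
                            std::pair<T, size_t>* B,
                            std::vector<std::pair<T, size_t>>& aux)
{
  if (aux.size() < 2 * k) aux.resize(2 * k);   /* enlarge the temporary buffer if it is too small */
  std::copy(A, A + k, aux.begin());
  std::copy(B, B + k, aux.begin() + k);
  /* Remove duplicate gids, then order the survivors by distance. */
  std::sort(aux.begin(), aux.begin() + 2 * k,
            [](auto& a, auto& b) { return a.second < b.second; });
  auto last = std::unique(aux.begin(), aux.begin() + 2 * k,
                          [](auto& a, auto& b) { return a.second == b.second; });
  std::sort(aux.begin(), last, [](auto& a, auto& b) { return a.first < b.first; });
  for (size_t i = 0; i < k; ++i) A[i] = aux[i];  /* keep the k nearest in A */
}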

template<typename T >
void hmlp::MergeNeighbors ( size_t  k,
size_t  n,
vector< pair< T, size_t >> &  A,
vector< pair< T, size_t >> &  B 
)
template<typename T >
void hmlp::Partition1x2 ( View< T > &  A,
View< T > &  A1,
View< T > &  A2,
size_t  nb,
SideType  side 
)


template<typename T >
void hmlp::Partition2x1 ( View< T > &  A,
View< T > &  A1,
View< T > &  A2,
size_t  mb,
SideType  side 
)
template<typename T >
void hmlp::Partition2x2 ( View< T > &  A,
View< T > &  A11,
View< T > &  A12,
View< T > &  A21,
View< T > &  A22,
size_t  mb,
size_t  nb,
QuadrantType  quadrant 
)
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void hmlp::rank_k ( int  batchId,
int  m,
int  n,
int  k,
TA &  A,
TB &  B,
TC &  C,
SEMIRINGKERNEL  semiringkernel 
)

This is the main routine of gkmx. All packing buffers are managed here. The communicator and the parallel section start here.

Early return if possible.

Type C must be MatrixLike.

Now get the pointer, row and column stride.

template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void hmlp::rank_k_internal ( tci::Comm Comm6th,
int  batchId,
int  m,
int  n,
int  k,
int  k_stra,
TA &  A,
TB &  B,
TV *  V,
int  rs_v,
int  cs_v,
SEMIRINGKERNEL  semiringkernel 
)

This function contains the loop body of the 6th to 4th loops, including all packing and unpacking routines. Notice that this function is executed by all threads in the root communicator. To access each thread in different levels of communicators, use their ids.

Get all block sizes.

Create subcommunicators for each loop.

Adjust nc and pack_nc if the 6th loop is parallelized.

Allocate packB (shared over Comm4th, private for each Comm5th gang).

Allocate packA (shared over Comm3rd, private for each Comm4th gang).

Distribute range [0,n) over Comm6th.

Distribute range [k_stra,k) over Comm5th.

Distribute range [0,m) over Comm4th.

Distribute range [0,n) over Comm6th.

Distribute range [k_stra,k) over Comm5th.

Distribute range [0,jb) over Comm4th.

PackB and typecast from TB to TPACKB.

Synchronize all threads in Comm4th.

Distribute range [0,m) over Comm4th.

Distribute range [0,ib) over Comm3rd.

packA and typecast from TA to TPACKA.

Synchronize all threads in Comm3rd.

Otherwise, invoke the semiring rank-k kernel.

Synchronize all threads in Comm3rd.

end 4th loop

end 5th loop

end 6th loop

Free packing buffer.

template<int KC, typename SEMIRINGKERNEL , typename TA , typename TB , typename TV >
void hmlp::rank_k_macro_kernel ( tci::Comm Comm3rd,
int  ic,
int  jc,
int  pc,
int  m,
int  n,
int  k,
TA *  packA,
TB *  packB,
TV *  V,
int  rs_v,
int  cs_v,
SEMIRINGKERNEL  semiringkernel 
)

Macro kernel contains the 3rd and 2nd loops. Depending on the configuration of the communicator, the 3rd loop may be parallelized. b_next is the prefetch pointer.

Get all block sizes

Create subcommunicators for each loop.

Compute loop ranges for each thread

Distribute range [0,n) over Comm3rd (jr loop).

Distribute range [0,m) over Comm2nd (ir loop).

Increase the b_next pointer.

end 2nd loop

end 3rd loop
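
A serial sketch of the loop structure just described: the 3rd (jr) loop walks over packB in NR-wide panels, the 2nd (ir) loop walks over packA in MR-tall panels, and a micro-kernel updates the corresponding MR x NR block of V. The microkernel functor and its argument list are hypothetical stand-ins for the SEMIRINGKERNEL; the real routine additionally distributes the jr/ir ranges over tci::Comm and prefetches b_next.

#include <algorithm>

template<int MR, int NR, typename TA, typename TB, typename TV, typename MICROKERNEL>
void macro_kernel_sketch(int m, int n, int k,
                         const TA* packA, const TB* packB,
                         TV* V, int rs_v, int cs_v,
                         MICROKERNEL microkernel)
{
  for (int jr = 0; jr < n; jr += NR)            /* 3rd loop: NR-wide panels of packB */
    for (int ir = 0; ir < m; ir += MR)          /* 2nd loop: MR-tall panels of packA */
      microkernel(k,
                  packA + ir * k,               /* MR x k packed panel */
                  packB + jr * k,               /* k x NR packed panel */
                  V + ir * rs_v + jr * cs_v,    /* MR x NR block of V  */
                  rs_v, cs_v,
                  std::min(MR, m - ir),         /* edge-case block sizes */
                  std::min(NR, n - jr));
}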

template<typename ARG >
void hmlp::RecuTaskExecute ( ARG *  arg)

Recursive task execution (base case).


do nothing

template<typename ARG , typename TASK , typename... Args>
void hmlp::RecuTaskExecute ( ARG *  arg,
TASK &  dummy,
Args &...  dummyargs 
)

Recursive task execution.

Create the first normal task if it is not a NULLTask.

Now recurse on Args&... args; types are deduced automatically.

template<typename ARG >
void hmlp::RecuTaskSubmit ( ARG *  arg)

Recursive task submission (base case).


do nothing

template<typename ARG , typename TASK , typename... Args>
void hmlp::RecuTaskSubmit ( ARG *  arg,
TASK &  dummy,
Args &...  dummyargs 
)

Recursive task submission.

Create the first normal task if it is not a NULLTask.

Now recurse on Args&... args; types are deduced automatically.
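
A sketch of the variadic recursion pattern behind RecuTaskSubmit/RecuTaskExecute: the base case consumes only the argument, and each recursive overload peels one dummy task type off the pack before recursing on the rest. The do_something() call is a hypothetical placeholder for creating, submitting, or executing a task of type TASK.

template<typename ARG>
void recu_apply(ARG* arg)                        /* base case: do nothing */
{
}

template<typename ARG, typename TASK, typename... Args>
void recu_apply(ARG* arg, TASK& dummy, Args&... dummyargs)
{
  /* Handle the first task type, e.g. do_something<TASK>( arg ), unless it is a NULLTask... */
  /* ...then recurse on the remaining task types; they are deduced automatically. */
  recu_apply(arg, dummyargs...);
}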

template<typename T >
void hmlp::Repartition1x2To1x3 ( View< T > &  AL,
View< T > &  AR,
View< T > &  A0,
View< T > &  A1,
View< T > &  A2,
size_t  nb,
SideType  side 
)
template<typename T >
void hmlp::Repartition2x1To3x1 ( View< T > &  AT,
View< T > &  A0,
View< T > &  A1,
View< T > &  AB,
View< T > &  A2,
size_t  mb,
SideType  side 
)
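
A hedged sketch of the FLAME-style traversal these partitioning helpers support, assuming the HMLP headers declaring View and the partition routines are included, a View<T> A that already wraps an m-by-n matrix, and side flags that follow the usual libflame convention (TOP: the named block is measured from the top; BOTTOM: the exposed block A1 is split off AB). The body comment marks where the per-block work would go.

template<typename T>
void traverse_by_block_rows(hmlp::View<T>& A, size_t m, size_t mb)
{
  hmlp::View<T> AT, AB, A0, A1, A2;
  /* [ AT; AB ] = A, with AT initially empty (0 rows). */
  hmlp::Partition2x1(A, AT,
                        AB, 0, hmlp::TOP);
  for (size_t i = 0; i < m; i += mb)
  {
    /* Expose the next block row A1 (at most mb rows) at the top of AB. */
    hmlp::Repartition2x1To3x1(AT, A0,
                                  A1,
                              AB, A2, mb, hmlp::BOTTOM);
    /* ... operate on the block row A1 here ... */
    /* Fold A1 back into AT and continue with the remainder in AB. */
    hmlp::ContinueWith3x1To2x1(AT, A0,
                                   A1,
                               AB, A2, hmlp::TOP);
  }
}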
template<typename T >
void hmlp::Repartition2x2To3x3 ( View< T > &  ATL,
View< T > &  ATR,
View< T > &  A00,
View< T > &  A01,
View< T > &  A02,
View< T > &  A10,
View< T > &  A11,
View< T > &  A12,
View< T > &  ABL,
View< T > &  ABR,
View< T > &  A20,
View< T > &  A21,
View< T > &  A22,
size_t  mb,
size_t  nb,
QuadrantType  quadrant 
)
double hmlp::xdot ( int  n,
const double *  dx,
int  incx,
const double *  dy,
int  incy 
)

DDOT wrapper.

BLAS level-1 wrappers: DOT, NRM2

float hmlp::xdot ( int  n,
const float *  dx,
int  incx,
const float *  dy,
int  incy 
)

SDOT wrapper.


void hmlp::xgecon ( const char *  norm,
int  n,
double *  A,
int  lda,
double  anorm,
double *  rcond,
double *  work,
int *  iwork 
)

DGECON wrapper.


void hmlp::xgecon ( const char *  norm,
int  n,
float *  A,
int  lda,
float  anorm,
float *  rcond,
float *  work,
int *  iwork 
)

SGECON wrapper.


void hmlp::xgels ( const char *  trans,
int  m,
int  n,
int  nrhs,
double *  A,
int  lda,
double *  B,
int  ldb,
double *  work,
int  lwork 
)

DGELS wrapper.


void hmlp::xgels ( const char *  trans,
int  m,
int  n,
int  nrhs,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  work,
int  lwork 
)

SGELS wrapper.


void hmlp::xgemm ( const char *  transA,
const char *  transB,
int  m,
int  n,
int  k,
double  alpha,
const double *  A,
int  lda,
const double *  B,
int  ldb,
double  beta,
double *  C,
int  ldc 
)

DGEMM wrapper.

BLAS level-3 wrappers: GEMM, TRSM
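
A minimal usage sketch of the DGEMM wrapper: C = alpha * A * B + beta * C on column-major 2x2 matrices. The header name hmlp_blas_lapack.h is an assumption about where the declaration above lives.

#include <vector>
/* #include <hmlp_blas_lapack.h>  -- assumed header providing hmlp::xgemm */

void gemm_example()
{
  int m = 2, n = 2, k = 2;
  std::vector<double> A = { 1.0, 3.0,  2.0, 4.0 };  /* column-major 2x2: [1 2; 3 4] */
  std::vector<double> B = { 5.0, 7.0,  6.0, 8.0 };  /* column-major 2x2: [5 6; 7 8] */
  std::vector<double> C(m * n, 0.0);
  /* C = 1.0 * A * B + 0.0 * C */
  hmlp::xgemm("N", "N", m, n, k, 1.0, A.data(), m, B.data(), k, 0.0, C.data(), m);
  /* C now holds { 19, 43, 22, 50 } in column-major order, i.e. [19 22; 43 50]. */
}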

void hmlp::xgeqp3 ( int  m,
int  n,
double *  A,
int  lda,
int *  jpvt,
double *  tau,
double *  work,
int  lwork 
)

DGEQP3 wrapper.


void hmlp::xgeqp3 ( int  m,
int  n,
float *  A,
int  lda,
int *  jpvt,
float *  tau,
float *  work,
int  lwork 
)

SGEQP3 wrapper.


void hmlp::xgeqp4 ( int  m,
int  n,
double *  A,
int  lda,
int *  jpvt,
double *  tau,
double *  work,
int  lwork 
)

DGEQP4 wrapper.


void hmlp::xgeqp4 ( int  m,
int  n,
float *  A,
int  lda,
int *  jpvt,
float *  tau,
float *  work,
int  lwork 
)

SGEQP4 wrapper.


void hmlp::xgeqrf ( int  m,
int  n,
double *  A,
int  lda,
double *  tau,
double *  work,
int  lwork 
)

DGEQRF wrapper.


QR family

void hmlp::xgeqrf ( int  m,
int  n,
float *  A,
int  lda,
float *  tau,
float *  work,
int  lwork 
)

SGEQRF wrapper.


void hmlp::xgesdd ( const char *  jobz,
int  m,
int  n,
double *  A,
int  lda,
double *  S,
double *  U,
int  ldu,
double *  VT,
int  ldvt,
double *  work,
int  lwork,
int *  iwork 
)

DGESDD wrapper.


void hmlp::xgesdd ( const char *  jobz,
int  m,
int  n,
float *  A,
int  lda,
float *  S,
float *  U,
int  ldu,
float *  VT,
int  ldvt,
float *  work,
int  lwork,
int *  iwork 
)

SGESDD wrapper.


void hmlp::xgetrf ( int  m,
int  n,
double *  A,
int  lda,
int *  ipiv 
)

DGETRF wrapper.


LU family

void hmlp::xgetrf ( int  m,
int  n,
float *  A,
int  lda,
int *  ipiv 
)

SGETRF wrapper.


void hmlp::xgetrs ( const char *  trans,
int  m,
int  nrhs,
double *  A,
int  lda,
int *  ipiv,
double *  B,
int  ldb 
)

DGETRS wrapper.


void hmlp::xgetrs ( const char *  trans,
int  m,
int  nrhs,
float *  A,
int  lda,
int *  ipiv,
float *  B,
int  ldb 
)

SGETRS wrapper.


void hmlp::xlaswp ( int  n,
double *  A,
int  lda,
int  k1,
int  k2,
int *  ipiv,
int  incx 
)

DLASWP wrapper.

LAPACK routine wrappers: POTR(F,S), GETR(F,S), GECON, GEQRF, ORGQR, ORMQR, GEQP3, GELS

void hmlp::xlaswp ( int  n,
float *  A,
int  lda,
int  k1,
int  k2,
int *  ipiv,
int  incx 
)

SLASWP wrapper.


double hmlp::xnrm2 ( int  n,
double *  x,
int  incx 
)

DNRM2 wrapper.


float hmlp::xnrm2 ( int  n,
float *  x,
int  incx 
)

SNRM2 wrapper.


void hmlp::xorgqr ( int  m,
int  n,
int  k,
double *  A,
int  lda,
double *  tau,
double *  work,
int  lwork 
)

DORGQR wrapper.


void hmlp::xorgqr ( int  m,
int  n,
int  k,
float *  A,
int  lda,
float *  tau,
float *  work,
int  lwork 
)

SORGQR wrapper.


void hmlp::xormqr ( const char *  side,
const char *  trans,
int  m,
int  n,
int  k,
double *  A,
int  lda,
double *  tau,
double *  C,
int  ldc,
double *  work,
int  lwork 
)

DORMQR wrapper.


void hmlp::xormqr ( const char *  side,
const char *  trans,
int  m,
int  n,
int  k,
float *  A,
int  lda,
float *  tau,
float *  C,
int  ldc,
float *  work,
int  lwork 
)

SORMQR wrapper.


void hmlp::xpotrf ( const char *  uplo,
int  n,
double *  A,
int  lda 
)

DPOTRF wrapper.


Cholesky family

void hmlp::xpotrf ( const char *  uplo,
int  n,
float *  A,
int  lda 
)

SPOTRF wrapper.


void hmlp::xpotrs ( const char *  uplo,
int  n,
int  nrhs,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DPOTRS wrapper.


void hmlp::xpotrs ( const char *  uplo,
int  n,
int  nrhs,
float *  A,
int  lda,
float *  B,
int  ldb 
)

SPOTRS wrapper.

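
A minimal usage sketch of the Cholesky wrappers: factor a small SPD matrix with xpotrf, then solve A * x = b with xpotrs (column-major storage, lower-triangular factor). The header name hmlp_blas_lapack.h is an assumption about where these declarations live.

#include <vector>
/* #include <hmlp_blas_lapack.h>  -- assumed header providing hmlp::xpotrf and hmlp::xpotrs */

void cholesky_example()
{
  int n = 2, nrhs = 1;
  std::vector<double> A = { 4.0, 2.0,  2.0, 3.0 };  /* SPD 2x2, column-major: [4 2; 2 3] */
  std::vector<double> b = { 6.0, 5.0 };
  hmlp::xpotrf("L", n, A.data(), n);                     /* A = L * L^T, L stored in the lower part */
  hmlp::xpotrs("L", n, nrhs, A.data(), n, b.data(), n);  /* b now holds the solution x */
  /* x = { 1, 1 }, since [4 2; 2 3] * [1; 1] = [6; 5]. */
}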

void hmlp::xstev ( const char *  jobz,
int  n,
double *  D,
double *  E,
double *  Z,
int  ldz,
double *  work 
)


void hmlp::xstev ( const char *  jobz,
int  n,
float *  D,
float *  E,
float *  Z,
int  ldz,
float *  work 
)


void hmlp::xsyrk ( const char *  uplo,
const char *  trans,
int  n,
int  k,
float  alpha,
float *  A,
int  lda,
float  beta,
float *  C,
int  ldc 
)


void hmlp::xtrmm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
double  alpha,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DTRMM wrapper.


void hmlp::xtrmm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
float  alpha,
float *  A,
int  lda,
float *  B,
int  ldb 
)

STRMM wrapper.


void hmlp::xtrsm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
double  alpha,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DTRSM wrapper.


void hmlp::xtrsm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
float  alpha,
float *  A,
int  lda,
float *  B,
int  ldb 
)

STRSM wrapper.
