HMLP: High-performance Machine Learning Primitives
hmlp Namespace Reference

Namespaces

 mpi
 
 tci
 
 tree
 

Classes

class  Cache
 
class  Cache1D
 Cache1D<NSET, NWAY, T> creates a cache layer with NSET sets that directly maps a 1D array. The direct map is [ id % NSET ]. Each set has NWAY ways that are fully associative. More...
 
class  Cache2D
 
class  CacheLine
 
class  Cluster
 
class  CovReduceTask
 
class  CovTask
 
class  Data
 
class  Device
 This class describes devices or accelerators that require a master thread to control. A device can accept tasks from multiple workers. All received tasks are expected to be executed independently in a time-sharing fashion. Whether these tasks are executed in parallel, sequentially, or with some built-in context-switching scheme does not matter. More...
 
class  DeviceMemory
 
class  DistData
 
class  DistData< CIRC, CIRC, T >
 
class  DistData< RBLK, STAR, T >
 Each MPI process owns ( n / size ) rows of A in a cyclic (round-robin) fashion, i.e. with 3 MPI processes rank 0 owns rows 0, 3, 6, .... More...
 
class  DistData< RIDS, STAR, T >
 Each MPI process owns ( rids.size() ) rows of A, and rids denotes the distribution, i.e. rank i owns A(rids[0],:), A(rids[1],:), A(rids[2],:), ... More...
 
class  DistData< STAR, CBLK, T >
 
class  DistData< STAR, CIDS, T >
 Each MPI process owns ( cids.size() ) columns of A, and cids denotes the distribution, i.e. rank i owns A(:,cids[0]), A(:,cids[1]), A(:,cids[2]), ... More...
 
class  DistData< STAR, STAR, T >
 
class  DistData< STAR, USER, T >
 
class  DistDataBase
 
class  DistKernelMatrix
 
class  DistVirtualMatrix
 DistVirtualMatrix is the abstract base class for matrix-free access and operations. Most of the public functions will be virtual. To inherit DistVirtualMatrix, you "must" implement the evaluation operator. Otherwise, the code won't compile. More...
 
class  Event
 Wrapper for omp or pthread mutex. More...
 
class  FunctionBase
 
struct  gsks_ref_mrxnr
 
struct  kernel_s
 
class  KernelMatrix
 
class  Layer
 
class  Layer< FC, T >
 
class  Layer< INPUT, T >
 
class  LayerBase
 
class  ListenerTask
 This task is the abstraction for all tasks handled by Listeners. More...
 
class  Lock
 Wrapper for omp or pthread mutex. More...
 
struct  MatrifyableObject
 
struct  MatrixLike
 
class  MatrixReadWrite
 This class creates 2D grids for 2D matrix partition. More...
 
class  MessageTask
 This task is designed to take care of MPI communications. More...
 
class  MLPGaussNewton
 
class  MortonHelper
 
class  NULLTask
 This is a specific type of task that represents NOP. More...
 
class  OOCCovMatrix
 
class  OOCData
 
class  OOCSPDMatrix
 
struct  pack2D_pbxib
 
struct  pack_pbxib
 
class  PVFMMKernelMatrix
 
class  Range
 
class  range
 
class  ReadWrite
 This class provides the ability to perform dependency analysis. More...
 
class  RecvTask
 
class  Regression
 
class  RunTime
 RunTime is statically created in hmlp_runtime.cpp. More...
 
class  Scheduler
 
class  SendTask
 
class  SparseData
 
class  SPDMatrix
 This class does not need to inherit hmlp::Data<T>, but it should support two interfaces for data fetching. More...
 
class  SPDMatrixMPISupport
 
class  Statistic
 
class  Task
 
class  thread_communicator
 
struct  unpack2D_ibxjb
 
struct  unpack_ibxjb
 
class  View
 
class  VirtualFunction
 
class  VirtualMatrix
 VirtualMatrix is the abstract base class for matrix-free access and operations. Most of the public functions will be virtual. To inherit VirtualMatrix, you "must" implement the evaluation operator. Otherwise, the code won't compile. More...
 
class  VirtualNormalizedGraph
 
class  Worker
 

Enumerations

enum  CacheStatus { CACHE_CLEAN, CACHE_DIRTY }
 
enum  DeviceType { HOST, NVIDIA_GPU, OTHER_GPU, TI_DSP }
 
enum  Distribution_t {
  CBLK, RBLK, CIDS, RIDS,
  USER, STAR, CIRC
}
 
enum  TaskStatus {
  ALLOCATED, NOTREADY, QUEUED, RUNNING,
  EXECUTED, DONE, CANCELLED
}
 
enum  ReadWriteType { R, W, RW }
 
enum  SchedulePolicy { HMLP_SCHEDULE_DEFAULT, HMLP_SCHEDULE_ROUND_ROBIN, HMLP_SCHEDULE_UNIFORM, HMLP_SCHEDULE_HEFT }
 
enum  TransposeType { TRANSPOSE, NOTRANSPOSE }
 
enum  SideType { LEFT, RIGHT, BOTTOM, TOP }
 
enum  QuadrantType { TOPLEFT, TOPRIGHT, BOTTOMLEFT, BOTTOMRIGHT }
 
enum  kernel_type {
  GAUSSIAN, SIGMOID, POLYNOMIAL, LAPLACE,
  GAUSSIAN_VAR_BANDWIDTH, TANH, QUARTIC, MULTIQUADRATIC,
  EPANECHNIKOV, USER_DEFINE
}
 
enum  LayerType { INPUT, FC, CONV2D, POOLING }
 

Functions

template<typename T >
bool less_first (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
bool less_second (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
bool equal_second (const pair< T, size_t > &a, const pair< T, size_t > &b)
 
template<typename T >
void MergeNeighbors (size_t k, pair< T, size_t > *A, pair< T, size_t > *B, vector< pair< T, size_t >> &aux)
 
template<typename T >
void MergeNeighbors (size_t k, size_t n, vector< pair< T, size_t >> &A, vector< pair< T, size_t >> &B)
 
double xdot (int n, const double *dx, int incx, const double *dy, int incy)
 DDOT wrapper. More...
 
float xdot (int n, const float *dx, int incx, const float *dy, int incy)
 SDOT wrapper. More...
 
double xnrm2 (int n, double *x, int incx)
 DNRM2 wrapper. More...
 
float xnrm2 (int n, float *x, int incx)
 SNRM2 wrapper. More...
 
void xgemm (const char *transA, const char *transB, int m, int n, int k, double alpha, const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc)
 DGEMM wrapper. More...
 
void xgemm (const char *transA, const char *transB, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc)
 SGEMM wrapper.
 
void xsyrk (const char *uplo, const char *trans, int n, int k, double alpha, double *A, int lda, double beta, double *C, int ldc)
 
void xsyrk (const char *uplo, const char *trans, int n, int k, float alpha, float *A, int lda, float beta, float *C, int ldc)
 
void xtrsm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, double alpha, double *A, int lda, double *B, int ldb)
 DTRSM wrapper. More...
 
void xtrsm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, float alpha, float *A, int lda, float *B, int ldb)
 STRSM wrapper. More...
 
void xtrmm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, double alpha, double *A, int lda, double *B, int ldb)
 DTRMM wrapper. More...
 
void xtrmm (const char *side, const char *uplo, const char *transA, const char *diag, int m, int n, float alpha, float *A, int lda, float *B, int ldb)
 STRMM wrapper. More...
 
void xlaswp (int n, double *A, int lda, int k1, int k2, int *ipiv, int incx)
 DLASWP wrapper. More...
 
void xlaswp (int n, float *A, int lda, int k1, int k2, int *ipiv, int incx)
 SLASWP wrapper. More...
 
void xpotrf (const char *uplo, int n, double *A, int lda)
 DPOTRF wrapper. More...
 
void xpotrf (const char *uplo, int n, float *A, int lda)
 SPOTRF wrapper. More...
 
void xpotrs (const char *uplo, int n, int nrhs, double *A, int lda, double *B, int ldb)
 DPOTRS wrapper. More...
 
void xpotrs (const char *uplo, int n, int nrhs, float *A, int lda, float *B, int ldb)
 SPOTRS wrapper. More...
 
void xgetrf (int m, int n, double *A, int lda, int *ipiv)
 DGETRF wrapper. More...
 
void xgetrf (int m, int n, float *A, int lda, int *ipiv)
 SGETRF wrapper. More...
 
void xgetrs (const char *trans, int m, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb)
 DGETRS wrapper. More...
 
void xgetrs (const char *trans, int m, int nrhs, float *A, int lda, int *ipiv, float *B, int ldb)
 SGETRS wrapper. More...
 
void xgecon (const char *norm, int n, double *A, int lda, double anorm, double *rcond, double *work, int *iwork)
 DGECON wrapper. More...
 
void xgecon (const char *norm, int n, float *A, int lda, float anorm, float *rcond, float *work, int *iwork)
 SGECON wrapper. More...
 
void xgeqrf (int m, int n, double *A, int lda, double *tau, double *work, int lwork)
 DGEQRF wrapper. More...
 
void xgeqrf (int m, int n, float *A, int lda, float *tau, float *work, int lwork)
 SGEQRF wrapper. More...
 
void xorgqr (int m, int n, int k, double *A, int lda, double *tau, double *work, int lwork)
 DORGQR wrapper. More...
 
void xorgqr (int m, int n, int k, float *A, int lda, float *tau, float *work, int lwork)
 SORGQR wrapper. More...
 
void xormqr (const char *side, const char *trans, int m, int n, int k, double *A, int lda, double *tau, double *C, int ldc, double *work, int lwork)
 DORMQR wrapper. More...
 
void xormqr (const char *side, const char *trans, int m, int n, int k, float *A, int lda, float *tau, float *C, int ldc, float *work, int lwork)
 SORMQR wrapper. More...
 
void xgeqp3 (int m, int n, double *A, int lda, int *jpvt, double *tau, double *work, int lwork)
 DGEQP3 wrapper. More...
 
void xgeqp3 (int m, int n, float *A, int lda, int *jpvt, float *tau, float *work, int lwork)
 SGEQP3 wrapper. More...
 
void xgeqp4 (int m, int n, double *A, int lda, int *jpvt, double *tau, double *work, int lwork)
 DGEQP4 wrapper. More...
 
void xgeqp4 (int m, int n, float *A, int lda, int *jpvt, float *tau, float *work, int lwork)
 SGEQP4 wrapper. More...
 
void xgels (const char *trans, int m, int n, int nrhs, double *A, int lda, double *B, int ldb, double *work, int lwork)
 DGELS wrapper. More...
 
void xgels (const char *trans, int m, int n, int nrhs, float *A, int lda, float *B, int ldb, float *work, int lwork)
 SGELS wrapper. More...
 
void xgesdd (const char *jobz, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *work, int lwork, int *iwork)
 DGESDD wrapper. More...
 
void xgesdd (const char *jobz, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *work, int lwork, int *iwork)
 SGESDD wrapper. More...
 
void xstev (const char *jobz, int n, double *D, double *E, double *Z, int ldz, double *work)
 
void xstev (const char *jobz, int n, float *D, float *E, float *Z, int ldz, float *work)
 
void xposv (const char *uplo, int n, int nrhs, double *A, int lda, double *B, int ldb)
 
void xposv (const char *uplo, int n, int nrhs, float *A, int lda, float *B, int ldb)
 
Device * hmlp_get_device_host ()
 
template<typename T >
void im2col (int m, int n, T *packX, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
 
template<int FOLD, bool ZEROPAD = true, typename T >
void pack2Dimg (int m, int n, T *packX, int x0, int y0, int offset, T *X, int w0, int h0, int d0, int s, int p, int w1, int h1)
 Pack an image into a 2D packed buffer. Notice that here X is d-leading.
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X0, T *X1, int ldx, T gamma, int *xmap, T *packX)
 This is the default packing routine for GKMX, GSKS, GSKNN and STRASSEN.
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X0, T *X1, int ldx, T gamma, T *packX)
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X, int ldx, int *xmap, T *packX)
 
template<bool TRANS, int FOLD, bool ZEROPAD = false, typename T >
void pack2D (int m, int n, T *X, int ldx, T *packX)
 
template<int PACK_MR, typename TA >
void packA_kcxmc (int m, int k, TA *A, int lda, int *amap, TA *packA)
 
template<int PACK_NR, typename TB >
void packB_kcxnc (int n, int k, TB *B, int ldb, int *bmap, TB *packB)
 
template<int PACK_NR, typename TC >
void packw_rhsxnc (int n, int rhs, TC *w, int ldw, int *wmap, TC *packw)
 
template<int PACK_MR, typename TC >
void packu_rhsxmc (int m, int rhs, TC *u, int ldu, int *umap, TC *packu)
 
void hmlp_msg_dependency_analysis (int key, int p, ReadWriteType type, Task *task)
 
template<typename ARG >
void RecuTaskSubmit (ARG *arg)
 Recursive task submission (base case). More...
 
template<typename ARG , typename TASK , typename... Args>
void RecuTaskSubmit (ARG *arg, TASK &dummy, Args &...dummyargs)
 Recursive task submission. More...
 
template<typename ARG >
void RecuTaskExecute (ARG *arg)
 Recursive task execution (base case). More...
 
template<typename ARG , typename TASK , typename... Args>
void RecuTaskExecute (ARG *arg, TASK &dummy, Args &...dummyargs)
 Recursive task execution. More...
 
ostream & operator<< (ostream &os, const thread_communicator &obj)
 
range GetRange (SchedulePolicy strategy, int beg, int end, int nb, int tid, int nparts)
 
range GetRange (int beg, int end, int nb, int tid, int nparts)
 
range GetRange (int beg, int end, int nb)
 
const char * getErrorString (hmlpError_t error)
 
void handleError (hmlpError_t error, const char *file, int line)
 
template<int ALIGN_SIZE, typename T >
T * hmlp_malloc (int m, int n, int size)
 The default function to allocate memory for HMLP. Memory allocated by this function is aligned. Most of the HMLP primitives require memory alignment.
 
template<int ALIGN_SIZE, typename T >
T * hmlp_malloc (int n)
 Another interface.
 
template<typename T >
void hmlp_free (T *ptr)
 Free the aligned memory.
 
template<typename T >
void hmlp_print_binary (T number)
 
template<typename T >
void hmlp_acquire_mpart (hmlpOperation_t transX, int m, int n, T *src_buff, int lda, int x, int y, int i, int j, T **dst_buff)
 Split into m x n, get the subblock starting from ith row and jth column. (for STRASSEN)
 
template<typename T >
T hmlp_norm (int m, int n, T *A, int lda)
 
template<typename TA , typename TB >
TB hmlp_relative_error (int m, int n, TA *A, int lda, TB *B, int ldb)
 
template<typename TA , typename TB >
TB hmlp_relative_error (int m, int n, TA *A, int lda, int loa, TB *B, int ldb, int lob, int batchSize)
 
template<typename T >
int hmlp_count_error (int m, int n, T *A, int lda, T *B, int ldb)
 
template<typename T >
int hmlp_count_error (int m, int n, T *A, int lda, int loa, T *B, int ldb, int lob, int batchSize)
 
template<bool IGNOREZERO = false, bool COLUMNINDEX = true, typename T >
void hmlp_printmatrix (int m, int n, T *A, int lda)
 
template<typename T >
void swap (T *x, int i, int j)
 A swap function. Just in case we do not have one. (for GSKNN)
 
template<typename T >
void heap_adjust (T *D, int s, int n, int *I)
 This function is called after the root of the heap is replaced by a new candidate. We need to readjust it so that the heap condition is satisfied.
 
template<typename T >
void heap_select (int m, int r, T *x, int *alpha, T *D, int *I)
 
template<typename T >
void HeapAdjust (size_t s, size_t n, std::pair< T, size_t > *NN)
 
template<typename T >
void HeapSelect (size_t n, size_t k, std::pair< T, size_t > *Query, std::pair< T, size_t > *NN)
 
template<typename T >
void bubble_sort (int n, T *D, int *I)
 A bubble sort for reference. More...
 
template<typename T >
void Partition1x2 (View< T > &A, View< T > &A1, View< T > &A2, size_t nb, SideType side)
 
template<typename T >
void Partition2x1 (View< T > &A, View< T > &A1, View< T > &A2, size_t mb, SideType side)
 
template<typename T >
void Partition2x2 (View< T > &A, View< T > &A11, View< T > &A12, View< T > &A21, View< T > &A22, size_t mb, size_t nb, QuadrantType quadrant)
 
template<typename T >
void Repartition1x2To1x3 (View< T > &AL, View< T > &AR, View< T > &A0, View< T > &A1, View< T > &A2, size_t nb, SideType side)
 
template<typename T >
void ContinueWith1x3To1x2 (View< T > &AL, View< T > &AR, View< T > &A0, View< T > &A1, View< T > &A2, SideType side)
 
template<typename T >
void Repartition2x1To3x1 (View< T > &AT, View< T > &A0, View< T > &A1, View< T > &AB, View< T > &A2, size_t mb, SideType side)
 
template<typename T >
void ContinueWith3x1To2x1 (View< T > &AT, View< T > &A0, View< T > &A1, View< T > &AB, View< T > &A2, SideType side)
 
template<typename T >
void Repartition2x2To3x3 (View< T > &ATL, View< T > &ATR, View< T > &A00, View< T > &A01, View< T > &A02, View< T > &A10, View< T > &A11, View< T > &A12, View< T > &ABL, View< T > &ABR, View< T > &A20, View< T > &A21, View< T > &A22, size_t mb, size_t nb, QuadrantType quadrant)
 
template<typename T >
void ContinueWith3x3To2x2 (View< T > &ATL, View< T > &ATR, View< T > &A00, View< T > &A01, View< T > &A02, View< T > &A10, View< T > &A11, View< T > &A12, View< T > &ABL, View< T > &ABR, View< T > &A20, View< T > &A21, View< T > &A22, QuadrantType quadrant)
 
template<typename VIRTUALMATRIX , typename T >
void lanczos (VIRTUALMATRIX &A, size_t n, size_t r, size_t nkrylov, std::vector< T > &Sigma, std::vector< T > &V)
 
template<int KC, typename SEMIRINGKERNEL , typename TA , typename TB , typename TV >
void rank_k_macro_kernel (tci::Comm &Comm3rd, int ic, int jc, int pc, int m, int n, int k, TA *packA, TB *packB, TV *V, int rs_v, int cs_v, SEMIRINGKERNEL semiringkernel)
 Macro kernel contains the 3rd and 2nd loops. Depending on the configuration of the communicator, the 3rd loop may be parallelized. b_next is the prefetch pointer. More...
 
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void rank_k_internal (tci::Comm &Comm6th, int batchId, int m, int n, int k, int k_stra, TA &A, TB &B, TV *V, int rs_v, int cs_v, SEMIRINGKERNEL semiringkernel)
 
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void rank_k (int batchId, int m, int n, int k, TA &A, TB &B, TC &C, SEMIRINGKERNEL semiringkernel)
 

Detailed Description

HMLP (High-Performance Machine Learning Primitives)

Copyright (C) 2014-2017, The University of Texas at Austin

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see the LICENSE file.


PVFMMKernelMatrix<T> uses VirtualMatrix<T> as its base class for GOFMM compatibility.

GOFMM templates


Enumeration Type Documentation


Enumerator (Distribution_t)

CBLK  Elemental MC

RBLK  Elemental MR

CIDS  Distributed according to column ids

RIDS  Distributed according to row ids

USER  Distributed according to user-defined maps

STAR  Elemental STAR

CIRC  Elemental CIRC

SideType values name the 1x2, 2x1, 1x3, and 3x1 partitions; QuadrantType values name the 2x2 and 3x3 partitions.

Function Documentation

template<typename T >
void hmlp::bubble_sort ( int  n,
T *  D,
int *  I 
)

A bubble sort for reference.


template<typename T >
void hmlp::ContinueWith1x3To1x2 ( View< T > &  AL,
View< T > &  AR,
View< T > &  A0,
View< T > &  A1,
View< T > &  A2,
SideType  side 
)
template<typename T >
void hmlp::ContinueWith3x1To2x1 ( View< T > &  AT,
View< T > &  A0,
View< T > &  A1,
View< T > &  AB,
View< T > &  A2,
SideType  side 
)
template<typename T >
void hmlp::ContinueWith3x3To2x2 ( View< T > &  ATL,
View< T > &  ATR,
View< T > &  A00,
View< T > &  A01,
View< T > &  A02,
View< T > &  A10,
View< T > &  A11,
View< T > &  A12,
View< T > &  ABL,
View< T > &  ABR,
View< T > &  A20,
View< T > &  A21,
View< T > &  A22,
QuadrantType  quadrant 
)
const char* hmlp::getErrorString ( hmlpError_t  error)

Translate hmlpError_t to error string.

void hmlp::handleError ( hmlpError_t  error,
const char *  file,
int  line 
)

Handle a runtime error. If the error code signals success, nothing happens; otherwise the error is reported together with the offending file and line.
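
A minimal usage sketch: wrap calls that return hmlpError_t so failures are routed through hmlp::handleError with the call site attached. HMLP_CHECK here is a hypothetical convenience macro, not part of the library.

#define HMLP_CHECK( expr ) hmlp::handleError( (expr), __FILE__, __LINE__ )

/* Example (assuming some_call() returns an hmlpError_t): */
/*   HMLP_CHECK( some_call() );                           */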

template<typename T >
void hmlp::HeapSelect ( size_t  n,
size_t  k,
std::pair< T, size_t > *  Query,
std::pair< T, size_t > *  NN 
)


hmlp::Device * hmlp::hmlp_get_device_host ( )


void hmlp::hmlp_msg_dependency_analysis ( int  key,
int  p,
hmlp::ReadWriteType  type,
hmlp::Task task 
)


template<typename T >
void hmlp::im2col ( int  m,
int  n,
T *  packX,
T *  X,
int  w0,
int  h0,
int  d0,
int  s,
int  p,
int  w1,
int  h1 
)
inline

This is the im2col_gpu() function from

   BVLC/caffe/blob/master/src/caffe/util/im2col.cpp.

   We slightly modify it.

Loop over channels (data_im += channel_size); out-of-range pixels are zero-padded.
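
The sketch below illustrates the standard im2col transform the wrapper is derived from (in the spirit of the Caffe routine cited above). It is self-contained and hedged: the variable names (channels, kernel, stride, pad) and the packed layout are generic and do not necessarily match HMLP's (m, n, w0, h0, d0, s, p, w1, h1) arguments or its exact packing order.

template<typename T>
void im2col_sketch(const T* im, int channels, int height, int width,
                   int kernel, int stride, int pad, T* col)
{
  int out_h = (height + 2 * pad - kernel) / stride + 1;
  int out_w = (width  + 2 * pad - kernel) / stride + 1;
  /* Each row of the packed buffer holds one (channel, kh, kw) position swept over the output. */
  for (int c = 0; c < channels; ++c)
    for (int kh = 0; kh < kernel; ++kh)
      for (int kw = 0; kw < kernel; ++kw)
      {
        int row = (c * kernel + kh) * kernel + kw;
        for (int oh = 0; oh < out_h; ++oh)
          for (int ow = 0; ow < out_w; ++ow)
          {
            int ih = oh * stride - pad + kh;
            int iw = ow * stride - pad + kw;
            /* Out-of-range pixels are zero-padded, matching the note above. */
            col[row * out_h * out_w + oh * out_w + ow] =
                (ih >= 0 && ih < height && iw >= 0 && iw < width)
                ? im[(c * height + ih) * width + iw] : T(0);
          }
      }
}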

template<typename VIRTUALMATRIX , typename T >
void hmlp::lanczos ( VIRTUALMATRIX &  A,
size_t  n,
size_t  r,
size_t  nkrylov,
std::vector< T > &  Sigma,
std::vector< T > &  V 
)

Implement a simple Lanczos algorithm for symmetric eigenpairs.

r <= nkrylov

symmetric tridiagonal matrix

initialize the Krylov subspace

update beta[ 0 ], although we don't use it

normalization

w = A * U( :, 0 )

update alpha[ 0 ]

update w

building the Krylov subspace and form the tridiagonal system

update beta[ iter ] = nrm2( w )

v = w / beta

w = A * U( :, iter )

update alpha[ iter ]

update w

invoke xstev to compute eigenpairs of the tridiagonal system

V' = Z' * U' (V = U * Z( :, (nkrylov - r) ) )

eigenpairs are in ascending order
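
A minimal sketch of the Lanczos steps listed above: build a Krylov basis U, accumulate the tridiagonal coefficients (alpha, beta), and hand the tridiagonal system to hmlp::xstev (documented below). The matvec functional stands in for the VIRTUALMATRIX evaluation, and the sketch omits the re-orthogonalization and the final back-transform V = U * Z that the real hmlp::lanczos performs.

#include <cmath>
#include <functional>
#include <vector>

template<typename T>
void lanczos_sketch(std::function<void(const T*, T*)> matvec,  /* w = A * u, A is n x n symmetric */
                    size_t n, size_t nkrylov,
                    std::vector<T>& alpha, std::vector<T>& beta)
{
  std::vector<T> U(n * nkrylov, 0), w(n, 0);
  alpha.assign(nkrylov, 0);
  beta.assign(nkrylov, 0);
  U[0] = 1;                                            /* initialize the Krylov subspace with e_1 */
  for (size_t j = 0; j < nkrylov; ++j)
  {
    T* u = &U[j * n];
    matvec(u, w.data());                               /* w = A * U(:, j) */
    for (size_t i = 0; i < n; ++i) alpha[j] += u[i] * w[i];       /* update alpha[j] */
    for (size_t i = 0; i < n; ++i)                     /* update w = w - alpha[j]*u_j - beta[j]*u_{j-1} */
      w[i] -= alpha[j] * u[i] + (j ? beta[j] * U[(j - 1) * n + i] : T(0));
    if (j + 1 == nkrylov) break;
    T nrm = 0;
    for (size_t i = 0; i < n; ++i) nrm += w[i] * w[i];
    beta[j + 1] = std::sqrt(nrm);                      /* beta[j+1] = nrm2(w) */
    for (size_t i = 0; i < n; ++i) U[(j + 1) * n + i] = w[i] / beta[j + 1];
  }
  /* Eigenpairs of the tridiagonal system (D = alpha, E = beta[1:]) then follow from
     hmlp::xstev("V", nkrylov, D, E, Z, ldz, work); they are returned in ascending order. */
}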

template<typename T >
bool hmlp::less_first ( const pair< T, size_t > &  a,
const pair< T, size_t > &  b 
)


template<typename T >
void hmlp::MergeNeighbors ( size_t  k,
pair< T, size_t > *  A,
pair< T, size_t > *  B,
vector< pair< T, size_t >> &  aux 
)

Enlarge temporary buffer if it is too small.
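
A sketch of the MergeNeighbors semantics (not necessarily the HMLP implementation): merge two length-k neighbor lists of (distance, gid) pairs, drop candidates with duplicate gids, and keep the k nearest in A. The comparators mirror less_first/equal_second listed above.

#include <algorithm>
#include <utility>
#include <vector>

template<typename T>
void merge_neighbors_sketch(size_t k,
                            std::pair<T, size_t>* A,
                            std::pair<T, size_t>* B,
                            std::vector<std::pair<T, size_t>>& aux)
{
  if (aux.size() < 2 * k) aux.resize(2 * k);   /* enlarge the temporary buffer if it is too small */
  std::copy(A, A + k, aux.begin());
  std::copy(B, B + k, aux.begin() + k);
  /* Remove duplicate gids, then order the survivors by distance. */
  std::sort(aux.begin(), aux.begin() + 2 * k,
            [](auto& a, auto& b) { return a.second < b.second; });
  auto last = std::unique(aux.begin(), aux.begin() + 2 * k,
                          [](auto& a, auto& b) { return a.second == b.second; });
  std::sort(aux.begin(), last, [](auto& a, auto& b) { return a.first < b.first; });
  for (size_t i = 0; i < k; ++i) A[i] = aux[i];  /* keep the k nearest in A */
}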

template<typename T >
void hmlp::MergeNeighbors ( size_t  k,
size_t  n,
vector< pair< T, size_t >> &  A,
vector< pair< T, size_t >> &  B 
)
template<typename T >
void hmlp::Partition1x2 ( View< T > &  A,
View< T > &  A1,
View< T > &  A2,
size_t  nb,
SideType  side 
)


template<typename T >
void hmlp::Partition2x1 ( View< T > &  A,
View< T > &  A1,
View< T > &  A2,
size_t  mb,
SideType  side 
)
template<typename T >
void hmlp::Partition2x2 ( View< T > &  A,
View< T > &  A11,
View< T > &  A12,
View< T > &  A21,
View< T > &  A22,
size_t  mb,
size_t  nb,
QuadrantType  quadrant 
)
template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void hmlp::rank_k ( int  batchId,
int  m,
int  n,
int  k,
TA &  A,
TB &  B,
TC &  C,
SEMIRINGKERNEL  semiringkernel 
)

This is the main routine of gkmx. All packing buffers are managed here. The communicator and the parallel section start here.

Early return if possible.

Type C must be MatrixLike.

Now get the pointer, row and column stride.

template<int MC, int NC, int KC, typename TPACKA , typename TPACKB , typename TV , typename TA , typename TB , typename TC , typename SEMIRINGKERNEL >
void hmlp::rank_k_internal ( tci::Comm Comm6th,
int  batchId,
int  m,
int  n,
int  k,
int  k_stra,
TA &  A,
TB &  B,
TV *  V,
int  rs_v,
int  cs_v,
SEMIRINGKERNEL  semiringkernel 
)

This function contains the loop body of the 6th to 4th loops, including all packing and unpacking routines. Notice that this function is executed by all threads in the root communicator. To access each thread in different levels of communicators, use their ids.

Get all block sizes.

Create subcommunicators for each loop.

Adjust nc and pack_nc if the 6th loop is parallelized.

Allocate packB (shared over Comm4th, private for each Comm5th gang).

Allocate packA (shared over Comm3rd, private for each Comm4th gang).

Distribute range [0,n) over Comm6th.

Distribute range [k_stra,k) over Comm5th.

Distribute range [0,m) over Comm4th.

Distribute range [0,n) over Comm6th.

Distribute range [k_stra,k) over Comm5th.

Distribute range [0,jb) over Comm4th.

PackB and typecast from TB to TPACKB.

Synchronize all threads in Comm4th.

Distribute range [0,m) over Comm4th.

Distribute range [0,ib) over Comm3rd.

packA and typecast from TA to TPACKA.

Synchronize all threads in Comm3rd.

Otherwise, invoke the semiring rank-k kernel.

Synchronize all threads in Comm3rd.

end 4th loop

end 5th loop

end 6th loop

Free packing buffer.

template<int KC, typename SEMIRINGKERNEL , typename TA , typename TB , typename TV >
void hmlp::rank_k_macro_kernel ( tci::Comm Comm3rd,
int  ic,
int  jc,
int  pc,
int  m,
int  n,
int  k,
TA *  packA,
TB *  packB,
TV *  V,
int  rs_v,
int  cs_v,
SEMIRINGKERNEL  semiringkernel 
)

Macro kernel contains the 3rd and 2nd loops. Depending on the configuration of the communicator, the 3rd loop may be parallelized. b_next is the prefetch pointer.

Get all block sizes

Create subcommunicators for each loop.

Compute loop ranges for each thread

Distribute range [0,n) over Comm3rd (jr loop).

Distribute range [0,m) over Comm2nd (ir loop).

Increase the b_next pointer.

end 2nd loop

end 3rd loop
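
A serial sketch of the loop structure just described: the 3rd (jr) loop walks over packB in NR-wide panels, the 2nd (ir) loop walks over packA in MR-tall panels, and a micro-kernel updates the corresponding MR x NR block of V. The microkernel functor and its argument list are hypothetical stand-ins for the SEMIRINGKERNEL; the real routine additionally distributes the jr/ir ranges over tci::Comm and prefetches b_next.

#include <algorithm>

template<int MR, int NR, typename TA, typename TB, typename TV, typename MICROKERNEL>
void macro_kernel_sketch(int m, int n, int k,
                         const TA* packA, const TB* packB,
                         TV* V, int rs_v, int cs_v,
                         MICROKERNEL microkernel)
{
  for (int jr = 0; jr < n; jr += NR)            /* 3rd loop: NR-wide panels of packB */
    for (int ir = 0; ir < m; ir += MR)          /* 2nd loop: MR-tall panels of packA */
      microkernel(k,
                  packA + ir * k,               /* MR x k packed panel */
                  packB + jr * k,               /* k x NR packed panel */
                  V + ir * rs_v + jr * cs_v,    /* MR x NR block of V  */
                  rs_v, cs_v,
                  std::min(MR, m - ir),         /* edge-case block sizes */
                  std::min(NR, n - jr));
}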

template<typename ARG >
void hmlp::RecuTaskExecute ( ARG *  arg)

Recursive task execution (base case).


do nothing

template<typename ARG , typename TASK , typename... Args>
void hmlp::RecuTaskExecute ( ARG *  arg,
TASK &  dummy,
Args &...  dummyargs 
)

Recursive task execution.

Create the first normal task if it is not a NULLTask.

Now recurse on Args&... args; types are deduced automatically.

template<typename ARG >
void hmlp::RecuTaskSubmit ( ARG *  arg)

Recursive task submission (base case).


do nothing

template<typename ARG , typename TASK , typename... Args>
void hmlp::RecuTaskSubmit ( ARG *  arg,
TASK &  dummy,
Args &...  dummyargs 
)

Recursive task submission.

Create the first normal task if it is not a NULLTask.

Now recurse on Args&... args; types are deduced automatically.
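
A sketch of the variadic recursion pattern behind RecuTaskSubmit/RecuTaskExecute: the base case consumes only the argument, and each recursive overload peels one dummy task type off the pack before recursing on the rest. The do_something() call is a hypothetical placeholder for creating, submitting, or executing a task of type TASK.

template<typename ARG>
void recu_apply(ARG* arg)                        /* base case: do nothing */
{
}

template<typename ARG, typename TASK, typename... Args>
void recu_apply(ARG* arg, TASK& dummy, Args&... dummyargs)
{
  /* Handle the first task type, e.g. do_something<TASK>( arg ), unless it is a NULLTask... */
  /* ...then recurse on the remaining task types; they are deduced automatically. */
  recu_apply(arg, dummyargs...);
}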

template<typename T >
void hmlp::Repartition1x2To1x3 ( View< T > &  AL,
View< T > &  AR,
View< T > &  A0,
View< T > &  A1,
View< T > &  A2,
size_t  nb,
SideType  side 
)
template<typename T >
void hmlp::Repartition2x1To3x1 ( View< T > &  AT,
View< T > &  A0,
View< T > &  A1,
View< T > &  AB,
View< T > &  A2,
size_t  mb,
SideType  side 
)
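
A hedged sketch of the FLAME-style traversal these partitioning helpers support, assuming the HMLP headers declaring View and the partition routines are included, a View<T> A that already wraps an m-by-n matrix, and side flags that follow the usual libflame convention (TOP: the named block is measured from the top; BOTTOM: the exposed block A1 is split off AB). The body comment marks where the per-block work would go.

template<typename T>
void traverse_by_block_rows(hmlp::View<T>& A, size_t m, size_t mb)
{
  hmlp::View<T> AT, AB, A0, A1, A2;
  /* [ AT; AB ] = A, with AT initially empty (0 rows). */
  hmlp::Partition2x1(A, AT,
                        AB, 0, hmlp::TOP);
  for (size_t i = 0; i < m; i += mb)
  {
    /* Expose the next block row A1 (at most mb rows) at the top of AB. */
    hmlp::Repartition2x1To3x1(AT, A0,
                                  A1,
                              AB, A2, mb, hmlp::BOTTOM);
    /* ... operate on the block row A1 here ... */
    /* Fold A1 back into AT and continue with the remainder in AB. */
    hmlp::ContinueWith3x1To2x1(AT, A0,
                                   A1,
                               AB, A2, hmlp::TOP);
  }
}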
template<typename T >
void hmlp::Repartition2x2To3x3 ( View< T > &  ATL,
View< T > &  ATR,
View< T > &  A00,
View< T > &  A01,
View< T > &  A02,
View< T > &  A10,
View< T > &  A11,
View< T > &  A12,
View< T > &  ABL,
View< T > &  ABR,
View< T > &  A20,
View< T > &  A21,
View< T > &  A22,
size_t  mb,
size_t  nb,
QuadrantType  quadrant 
)
double hmlp::xdot ( int  n,
const double *  dx,
int  incx,
const double *  dy,
int  incy 
)

DDOT wrapper.

BLAS level-1 wrappers: DOT, NRM2

float hmlp::xdot ( int  n,
const float *  dx,
int  incx,
const float *  dy,
int  incy 
)

SDOT wrapper.


void hmlp::xgecon ( const char *  norm,
int  n,
double *  A,
int  lda,
double  anorm,
double *  rcond,
double *  work,
int *  iwork 
)

DGECON wrapper.


void hmlp::xgecon ( const char *  norm,
int  n,
float *  A,
int  lda,
float  anorm,
float *  rcond,
float *  work,
int *  iwork 
)

SGECON wrapper.


void hmlp::xgels ( const char *  trans,
int  m,
int  n,
int  nrhs,
double *  A,
int  lda,
double *  B,
int  ldb,
double *  work,
int  lwork 
)

DGELS wrapper.


void hmlp::xgels ( const char *  trans,
int  m,
int  n,
int  nrhs,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  work,
int  lwork 
)

SGELS wrapper.


void hmlp::xgemm ( const char *  transA,
const char *  transB,
int  m,
int  n,
int  k,
double  alpha,
const double *  A,
int  lda,
const double *  B,
int  ldb,
double  beta,
double *  C,
int  ldc 
)

DGEMM wrapper.

BLAS level-3 wrappers: GEMM, TRSM
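
A minimal usage sketch of the DGEMM wrapper: C = alpha * A * B + beta * C on column-major 2x2 matrices. The header name hmlp_blas_lapack.h is an assumption about where the declaration above lives.

#include <vector>
/* #include <hmlp_blas_lapack.h>  -- assumed header providing hmlp::xgemm */

void gemm_example()
{
  int m = 2, n = 2, k = 2;
  std::vector<double> A = { 1.0, 3.0,  2.0, 4.0 };  /* column-major 2x2: [1 2; 3 4] */
  std::vector<double> B = { 5.0, 7.0,  6.0, 8.0 };  /* column-major 2x2: [5 6; 7 8] */
  std::vector<double> C(m * n, 0.0);
  /* C = 1.0 * A * B + 0.0 * C */
  hmlp::xgemm("N", "N", m, n, k, 1.0, A.data(), m, B.data(), k, 0.0, C.data(), m);
  /* C now holds { 19, 43, 22, 50 } in column-major order, i.e. [19 22; 43 50]. */
}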

void hmlp::xgeqp3 ( int  m,
int  n,
double *  A,
int  lda,
int *  jpvt,
double *  tau,
double *  work,
int  lwork 
)

DGEQP3 wrapper.


void hmlp::xgeqp3 ( int  m,
int  n,
float *  A,
int  lda,
int *  jpvt,
float *  tau,
float *  work,
int  lwork 
)

SGEQP3 wrapper.


void hmlp::xgeqp4 ( int  m,
int  n,
double *  A,
int  lda,
int *  jpvt,
double *  tau,
double *  work,
int  lwork 
)

DGEQP4 wrapper.


void hmlp::xgeqp4 ( int  m,
int  n,
float *  A,
int  lda,
int *  jpvt,
float *  tau,
float *  work,
int  lwork 
)

SGEQP4 wrapper.


void hmlp::xgeqrf ( int  m,
int  n,
double *  A,
int  lda,
double *  tau,
double *  work,
int  lwork 
)

DGEQRF wrapper.


QR family

void hmlp::xgeqrf ( int  m,
int  n,
float *  A,
int  lda,
float *  tau,
float *  work,
int  lwork 
)

SGEQRF wrapper.


void hmlp::xgesdd ( const char *  jobz,
int  m,
int  n,
double *  A,
int  lda,
double *  S,
double *  U,
int  ldu,
double *  VT,
int  ldvt,
double *  work,
int  lwork,
int *  iwork 
)

DGESDD wrapper.


void hmlp::xgesdd ( const char *  jobz,
int  m,
int  n,
float *  A,
int  lda,
float *  S,
float *  U,
int  ldu,
float *  VT,
int  ldvt,
float *  work,
int  lwork,
int *  iwork 
)

SGESDD wrapper.


void hmlp::xgetrf ( int  m,
int  n,
double *  A,
int  lda,
int *  ipiv 
)

DGETRF wrapper.


LU family

void hmlp::xgetrf ( int  m,
int  n,
float *  A,
int  lda,
int *  ipiv 
)

SGETRF wrapper.


void hmlp::xgetrs ( const char *  trans,
int  m,
int  nrhs,
double *  A,
int  lda,
int *  ipiv,
double *  B,
int  ldb 
)

DGETRS wrapper.


void hmlp::xgetrs ( const char *  trans,
int  m,
int  nrhs,
float *  A,
int  lda,
int *  ipiv,
float *  B,
int  ldb 
)

SGETRS wrapper.


void hmlp::xlaswp ( int  n,
double *  A,
int  lda,
int  k1,
int  k2,
int *  ipiv,
int  incx 
)

DLASWP wrapper.

LAPACK routine wrappers: POTR(F,S), GETR(F,S), GECON, GEQRF, ORGQR, ORMQR, GEQP3, GELS

void hmlp::xlaswp ( int  n,
float *  A,
int  lda,
int  k1,
int  k2,
int *  ipiv,
int  incx 
)

SLASWP wrapper.


double hmlp::xnrm2 ( int  n,
double *  x,
int  incx 
)

DNRM2 wrapper.


float hmlp::xnrm2 ( int  n,
float *  x,
int  incx 
)

SNRM2 wrapper.


void hmlp::xorgqr ( int  m,
int  n,
int  k,
double *  A,
int  lda,
double *  tau,
double *  work,
int  lwork 
)

DORGQR wrapper.


void hmlp::xorgqr ( int  m,
int  n,
int  k,
float *  A,
int  lda,
float *  tau,
float *  work,
int  lwork 
)

SORGQR wrapper.


void hmlp::xormqr ( const char *  side,
const char *  trans,
int  m,
int  n,
int  k,
double *  A,
int  lda,
double *  tau,
double *  C,
int  ldc,
double *  work,
int  lwork 
)

DORMQR wrapper.


void hmlp::xormqr ( const char *  side,
const char *  trans,
int  m,
int  n,
int  k,
float *  A,
int  lda,
float *  tau,
float *  C,
int  ldc,
float *  work,
int  lwork 
)

SORMQR wrapper.


void hmlp::xpotrf ( const char *  uplo,
int  n,
double *  A,
int  lda 
)

DPOTRF wrapper.


Cholesky family

void hmlp::xpotrf ( const char *  uplo,
int  n,
float *  A,
int  lda 
)

SPOTRF wrapper.


void hmlp::xpotrs ( const char *  uplo,
int  n,
int  nrhs,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DPOTRS wrapper.


void hmlp::xpotrs ( const char *  uplo,
int  n,
int  nrhs,
float *  A,
int  lda,
float *  B,
int  ldb 
)

SPOTRS wrapper.

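
A minimal usage sketch of the Cholesky wrappers: factor a small SPD matrix with xpotrf, then solve A * x = b with xpotrs (column-major storage, lower-triangular factor). The header name hmlp_blas_lapack.h is an assumption about where these declarations live.

#include <vector>
/* #include <hmlp_blas_lapack.h>  -- assumed header providing hmlp::xpotrf and hmlp::xpotrs */

void cholesky_example()
{
  int n = 2, nrhs = 1;
  std::vector<double> A = { 4.0, 2.0,  2.0, 3.0 };  /* SPD 2x2, column-major: [4 2; 2 3] */
  std::vector<double> b = { 6.0, 5.0 };
  hmlp::xpotrf("L", n, A.data(), n);                     /* A = L * L^T, L stored in the lower part */
  hmlp::xpotrs("L", n, nrhs, A.data(), n, b.data(), n);  /* b now holds the solution x */
  /* x = { 1, 1 }, since [4 2; 2 3] * [1; 1] = [6; 5]. */
}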

void hmlp::xstev ( const char *  jobz,
int  n,
double *  D,
double *  E,
double *  Z,
int  ldz,
double *  work 
)


void hmlp::xstev ( const char *  jobz,
int  n,
float *  D,
float *  E,
float *  Z,
int  ldz,
float *  work 
)


void hmlp::xsyrk ( const char *  uplo,
const char *  trans,
int  n,
int  k,
float  alpha,
float *  A,
int  lda,
float  beta,
float *  C,
int  ldc 
)


void hmlp::xtrmm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
double  alpha,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DTRMM wrapper.


void hmlp::xtrmm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
float  alpha,
float *  A,
int  lda,
float *  B,
int  ldb 
)

STRMM wrapper.


void hmlp::xtrsm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
double  alpha,
double *  A,
int  lda,
double *  B,
int  ldb 
)

DTRSM wrapper.


void hmlp::xtrsm ( const char *  side,
const char *  uplo,
const char *  transA,
const char *  diag,
int  m,
int  n,
float  alpha,
float *  A,
int  lda,
float *  B,
int  ldb 
)

STRSM wrapper.
