HMLP: High-performance Machine Learning Primitives
hmlp.h
1 
21 #ifndef HMLP_H
22 #define HMLP_H
23 
24 /* Use STL templates. */
25 #include <vector>
26 /* External prototypes. */
27 #include <external/blas_lapack_prototypes.h>
28 #include <external/mpi_prototypes.h>
29 
30 /* Use STL namespace */
31 using namespace std;
32 
33 
/*
 * Operation selector; it is the type of the transA / transB / transX
 * arguments of the routines declared in this header.
 * The numeric values are the ones the compiler assigns implicitly,
 * spelled out here.
 * NOTE(review): the _N / _T suffixes presumably mean "no transpose" and
 * "transpose" (BLAS convention) -- confirm against the implementation.
 */
typedef enum
{
  HMLP_OP_N = 0,
  HMLP_OP_T = 1
} hmlpOperation_t;
39 
/*
 * Status code returned by the HMLP runtime API declared in this header.
 * HMLP_ERROR_SUCCESS is the zero value; the remaining values follow in
 * declaration order (the same numbering the compiler assigns implicitly,
 * spelled out here).
 */
typedef enum
{
  HMLP_ERROR_SUCCESS          = 0,
  HMLP_ERROR_NOT_INITIALIZED  = 1,
  HMLP_ERROR_ALLOC_FAILED     = 2,
  HMLP_ERROR_INVALID_VALUE    = 3,
  HMLP_ERROR_EXECUTION_FAILED = 4,
  HMLP_ERROR_NOT_SUPPORTED    = 5,
  HMLP_ERROR_INTERNAL_ERROR   = 6
} hmlpError_t;
50 
51 /* HMLP runtime API. */
52 hmlpError_t hmlp_init( int *argc, char ***argv );
53 hmlpError_t hmlp_init( int *argc, char ***argv, MPI_Comm comm );
54 hmlpError_t hmlp_init();
55 hmlpError_t hmlp_init( MPI_Comm comm );
56 hmlpError_t hmlp_set_num_workers( int n_worker );
57 hmlpError_t hmlp_run();
58 hmlpError_t hmlp_finalize();
59 
/*
 * Runtime state queries. Each takes no arguments and returns an int.
 * NOTE(review): hmlp_is_in_epoch_session presumably returns a flag
 * (non-zero = inside an epoch session); the other two presumably return
 * the MPI rank and communicator size -- confirm against the definitions.
 */
int hmlp_is_in_epoch_session();

int hmlp_get_mpi_rank();

int hmlp_get_mpi_size();
63 
64 
65 
66 void gkmx_sfma
67 (
68  hmlpOperation_t transA, hmlpOperation_t transB,
69  int m, int n, int k,
70  float *A, int lda,
71  float *B, int ldb,
72  float *C, int ldc
73 );
74 
75 
76 void gkmx_dfma
77 (
78  hmlpOperation_t transA, hmlpOperation_t transB,
79  int m, int n, int k,
80  double *A, int lda,
81  double *B, int ldb,
82  double *C, int ldc
83 );
84 
85 
86 void gkmx_dfma_simple
87 (
88  hmlpOperation_t transA, hmlpOperation_t transB,
89  int m, int n, int k,
90  double *A, int lda,
91  double *B, int ldb,
92  double *C, int ldc
93 );
94 
95 
96 void gkmx_dconv_relu_pool
97 (
98  hmlpOperation_t transA, hmlpOperation_t transB,
99  int m, int n, int k,
100  double *A, int lda,
101  double *B, int ldb,
102  double *C, int ldc
103 );
104 
105 void dstrassen(
106  hmlpOperation_t transA, hmlpOperation_t transB,
107  int m, int n, int k,
108  double *A, int lda,
109  double *B, int ldb,
110  double *C, int ldc
111 );
112 
/*
 * sconv2d -- float (single-precision) variant; declaration only.
 *
 * NOTE(review): w0/h0/d0 and w1/h1/d1 look like the width/height/depth of
 * two tensors, and s / p like stride and padding. Which of A and B is the
 * input versus the filter, and the layout of C, are not visible here --
 * confirm against the implementation.
 */
void sconv2d(
    int w0, int h0, int d0, int s, int p, int batchSize,
    float *B,
    int w1, int h1, int d1,
    float *A,
    float *C
);
121 
/*
 * dconv2d -- double-precision counterpart of sconv2d (same argument list
 * with double buffers); declaration only.
 *
 * NOTE(review): w0/h0/d0 and w1/h1/d1 look like the width/height/depth of
 * two tensors, and s / p like stride and padding; the roles of A, B and C
 * are not visible here -- confirm against the implementation.
 */
void dconv2d(
    int w0, int h0, int d0, int s, int p, int batchSize,
    double *B,
    int w1, int h1, int d1,
    double *A,
    double *C
);
130 
/*
 * sconv2d_ref -- takes exactly the same arguments as sconv2d;
 * declaration only.
 *
 * NOTE(review): the "_ref" suffix suggests a reference implementation
 * used to validate sconv2d -- confirm.
 */
void sconv2d_ref(
    int w0, int h0, int d0, int s, int p, int batchSize,
    float *B,
    int w1, int h1, int d1,
    float *A,
    float *C
);
139 
/*
 * dconv2d_ref -- takes exactly the same arguments as dconv2d;
 * declaration only.
 *
 * NOTE(review): the "_ref" suffix suggests a reference implementation
 * used to validate dconv2d -- confirm.
 */
void dconv2d_ref(
    int w0, int h0, int d0, int s, int p, int batchSize,
    double *B,
    int w1, int h1, int d1,
    double *A,
    double *C
);
148 
149 
150 
151 
152 //void gsks
153 //(
154 // kernel_s<double> *kernel,
155 // int m, int n, int k,
156 // double *u, int *umap,
157 // double *A, double *A2, int *amap,
158 // double *B, double *B2, int *bmap,
159 // double *w, int *wmap
160 //);
161 //
162 //void gsks
163 //(
164 // kernel_s<float> *kernel,
165 // int m, int n, int k,
166 // float *u, int *umap,
167 // float *A, float *A2, int *amap,
168 // float *B, float *B2, int *bmap,
169 // float *w, int *wmap
170 //);
171 //
172 //void dgsks
173 //(
174 // kernel_s<double> *kernel,
175 // int m, int n, int k,
176 // double *u, int *umap,
177 // double *A, double *A2, int *amap,
178 // double *B, double *B2, int *bmap,
179 // double *w, int *wmap
180 //);
181 //
182 //void sgsks
183 //(
184 // kernel_s<float> *kernel,
185 // int m, int n, int k,
186 // float *u, int *umap,
187 // float *A, float *A2, int *amap,
188 // float *B, float *B2, int *bmap,
189 // float *w, int *wmap
190 //);
191 //
192 //void dgsks_ref
193 //(
194 // kernel_s<double> *kernel,
195 // int m, int n, int k,
196 // double *u, int *umap,
197 // double *A, double *A2, int *amap,
198 // double *B, double *B2, int *bmap,
199 // double *w, int *wmap
200 //);
201 //
/*
 * dgsknn -- double-precision routine; declaration only.
 *
 * Arguments: four sizes (m, n, k, r), then for each of A and B a data
 * pointer, a second double buffer (A2 / B2) and an int map, and finally a
 * double buffer D and an int buffer I.
 * NOTE(review): presumably a k-nearest-neighbour search where A2/B2 hold
 * squared norms, amap/bmap are index maps, and D / I receive distances
 * and neighbour indices -- confirm against the implementation.
 */
void dgsknn(
    int m, int n, int k, int r,
    double *A, double *A2, int *amap,
    double *B, double *B2, int *bmap,
    double *D, int *I
);
209 
/*
 * gsknn -- takes exactly the same arguments as dgsknn (double buffers);
 * declaration only.
 *
 * NOTE(review): presumably an unprefixed alias/overload entry point for
 * the same nearest-neighbour search as dgsknn -- confirm.
 */
void gsknn(
    int m, int n, int k, int r,
    double *A, double *A2, int *amap,
    double *B, double *B2, int *bmap,
    double *D, int *I
);
217 
/*
 * dgsknn_ref -- takes exactly the same arguments as dgsknn;
 * declaration only.
 *
 * NOTE(review): the "_ref" suffix suggests a reference implementation
 * used to validate dgsknn -- confirm.
 */
void dgsknn_ref(
    int m, int n, int k, int r,
    double *A, double *A2, int *amap,
    double *B, double *B2, int *bmap,
    double *D, int *I
);
225 
226 #ifdef HMLP_USE_CUDA
227 #include <cuda_runtime.h>
228 #include <cublas_v2.h>
229 #include <cusparse_v2.h>
230 #include <thrust/pair.h>
231 
232 
233 void dsq2nrm
234 (
235  hmlpOperation_t transX,
236  int d, int n,
237  double* X2array[], const double* Xarray[], double* X, int ldx,
238  int batchSize
239 );
240 
241 void gkmm_dfma
242 (
243  cudaStream_t stream,
244  hmlpOperation_t transA, hmlpOperation_t transB,
245  int m, int n, int k,
246  const double *Aarray[], int lda,
247  const double *Barray[], int ldb,
248  double *Carray[], int ldc,
249  int batchSize
250 );
251 
252 void gkmm_dfma
253 (
254  cudaStream_t stream,
255  hmlpOperation_t transA, hmlpOperation_t transB,
256  int m, int n, int k,
257  const double *Aarray, int lda, int loa,
258  const double *Barray, int ldb, int lob,
259  double *Carray, int ldc, int loc,
260  int batchSize
261 );
262 
263 void gkmm_mixfma
264 (
265  cudaStream_t stream,
266  hmlpOperation_t transA, hmlpOperation_t transB,
267  int m, int n, int k,
268  const double *Aarray, int lda, int loa,
269  const double *Barray, int ldb, int lob,
270  float *Carray, int ldc, int loc,
271  int batchSize
272 );
273 
274 void gkrm_dkmeans
275 (
276  cudaStream_t stream,
277  hmlpOperation_t transA, hmlpOperation_t transB,
278  int m, int n, int k,
279  double *Aarray[], double *A2array[], int lda,
280  double *Barray[], double *B2array[], int ldb,
281  thrust::pair<double,int> *Carray[], int ldc,
282  int batchSize
283 );
284 
285 void dkmeans
286 (
287  cudaStream_t stream,
288  int m, int n, int k,
289  double *Aarray[], double *A2array[], int lda,
290  double *Barray[], double *B2array[], int ldb,
291  thrust::pair<double,int> *Carray[], int ldc,
292  int batchSize
293 );
294 
295 void dstrassen
296 (
297  cudaStream_t stream,
298  hmlpOperation_t transA, hmlpOperation_t transB,
299  int m, int n, int k,
300  const double *Aarray[], int lda,
301  const double *Barray[], int ldb,
302  double *Carray[], int ldc,
303  int batchSize
304 );
305 
306 void dstrassen
307 (
308  cudaStream_t stream,
309  hmlpOperation_t transA, hmlpOperation_t transB,
310  int m, int n, int k,
311  const double *Aarray, int lda, int loa,
312  const double *Barray, int ldb, int lob,
313  double *Carray, int ldc, int loc,
314  int batchSize
315 );
316 #endif // end ifdef HMLP_USE_CUDA
317 
318 #endif // define HMLP_H