GCC Code Coverage Report
Directory: . Exec Total Coverage
File: frame/external/dgeqp4.c Lines: 0 284 0.0 %
Date: 2019-01-14 Branches: 0 118 0.0 %

Line Exec Source
1
/*
2
===============================================================================
3
Authors
4
===============================================================================
5
6
Per-Gunnar Martinsson
7
  Dept. of Applied Mathematics,
8
  University of Colorado at Boulder,
9
  526 UCB, Boulder, CO 80309-0526, USA
10
11
Gregorio Quintana-Orti
12
  Depto. de Ingenieria y Ciencia de Computadores,
13
  Universitat Jaume I,
14
  12.071 Castellon, Spain
15
16
Nathan Heavner
17
  Dept. of Applied Mathematics,
18
  University of Colorado at Boulder,
19
  526 UCB, Boulder, CO 80309-0526, USA
20
21
Robert van de Geijn
22
  Dept. of Computer Science and Institute for Computational Engineering and
23
  Sciences,
24
  The University of Texas at Austin
25
  Austin, TX.
26
27
===============================================================================
28
Copyright
29
===============================================================================
30
31
Copyright (C) 2016,
32
  Universitat Jaume I,
33
  University of Colorado at Boulder,
34
  The University of Texas at Austin.
35
36
===============================================================================
37
Disclaimer
38
===============================================================================
39
40
This code is distributed in the hope that it will be useful, but
41
WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED.
42
43
*/
44
45
#include <omp.h>
46
47
#include <stdlib.h>
48
#include <stdio.h>
49
#include <math.h>
50
#include "blas_lapack_prototypes.h"
51
#include "dgeqp4.h"
52
53
54
// Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed
55
// with LAPACK's routine dgeqpf.
56
// Matrices with dimensions between THRESHOLD_FOR_DGEQPF and
57
// THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3.
58
// Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed
59
// with the new HQRRP code.
60
#define THRESHOLD_FOR_DGEQPF    0
61
#define THRESHOLD_FOR_DGEQP3  500
62
63
64
// ============================================================================
65
// Definition of macros.
66
67
#define max( a, b )  ( (a) > (b) ? (a) : (b) )
68
#define min( a, b )  ( (a) > (b) ? (b) : (a) )
69
#define dabs( a )    ( (a) >= 0.0 ? (a) : -(a) )
70
71
// ============================================================================
72
// Compilation declarations.
73
74
#undef CHECK_DOWNDATING_OF_Y
75
76
77
// ============================================================================
78
// Declaration of local prototypes.
79
80
static int dgeqp4_Normal_random_matrix( int m_A, int n_A,
81
               double * buff_A, int ldim_A );
82
83
static double dgeqp4_Normal_random_number( double mu, double sigma );
84
85
static int dgeqp4_Downdate_Y(
86
               int m_U11, int n_U11, double * buff_U11, int ldim_U11,
87
               int m_U21, int n_U21, double * buff_U21, int ldim_U21,
88
               int m_A12, int n_A12, double * buff_A12, int ldim_A12,
89
               int m_T, int n_T, double * buff_T, int ldim_T,
90
               int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2,
91
               int m_G1, int n_G1, double * buff_G1, int ldim_G1,
92
               int m_G2, int n_G2, double * buff_G2, int ldim_G2 );
93
94
static int dgeqp4_Apply_Q_WY_lhfc_blk_var4(
95
               int m_U, int n_U, double * buff_U, int ldim_U,
96
               int m_T, int n_T, double * buff_T, int ldim_T,
97
               int m_B, int n_B, double * buff_B, int ldim_B );
98
99
static int dgeqp4_Apply_Q_WY_rnfc_blk_var4(
100
               int m_U, int n_U, double * buff_U, int ldim_U,
101
               int m_T, int n_T, double * buff_T, int ldim_T,
102
               int m_B, int n_B, double * buff_B, int ldim_B );
103
104
static int dgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages,
105
               int m_A, int n_A, double * buff_A, int ldim_A,
106
               int * buff_p, double * buff_t,
107
               int pivot_B, int m_B, double * buff_B, int ldim_B,
108
               int pivot_C, int m_C, double * buff_C, int ldim_C,
109
               int build_T, double * buff_T, int ldim_T );
110
111
static int dgeqp4_QRP_compute_norms(
112
               int m_A, int n_A, double * buff_A, int ldim_A,
113
               double * buff_d, double * buff_e );
114
115
static int dgeqp4_QRP_downdate_partial_norms( int m_A, int n_A,
116
               double * buff_d,  int st_d,
117
               double * buff_e,  int st_e,
118
               double * buff_wt, int st_wt,
119
               double * buff_A,  int ldim_A );
120
121
static int dgeqp4_QRP_pivot_G_B_C( int j_max_col,
122
               int m_G, double * buff_G, int ldim_G,
123
               int pivot_B, int m_B, double * buff_B, int ldim_B,
124
               int pivot_C, int m_C, double * buff_C, int ldim_C,
125
               int * buff_p,
126
               double * buff_d, double * buff_e );
127
128
129
// ============================================================================
130
void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau,
131
        double * work, int * lwork, int * info ) {
132
//
133
// This routine is plug compatible with LAPACK's routine dgeqp3.
134
// It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3.
135
// It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in
136
// constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3.
137
//
138
  int     INB = 1;
139
  int     i_one = 1, i_minus_one = -1,
140
          m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols,
141
          minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp;
142
  int     * previous_jpvt;
143
  int     ilaenv_();
144
145
  // Some initializations.
146
  m_A    = * m;
147
  n_A    = * n;
148
  mn_A   = min( m_A, n_A );
149
  ldim_A = * lda;
150
151
  // Check input arguments.
152
  * info = 0;
153
  lquery = ( * lwork == -1 );
154
  if( m_A < 0 ) {
155
     * info = -1;
156
  } else if ( n_A < 0 ) {
157
     * info = -2;
158
  } else if ( ldim_A < max( 1, m_A ) ) {
159
     * info = -4;
160
  }
161
162
  if( *info == 0 ) {
163
    if( mn_A == 0 ) {
164
      iws    = 1;
165
      lwkopt = 1;
166
    } else {
167
      iws    = 3 * n_A + 1;
168
      nb     = ilaenv_( & INB, "DGEQRF", " ", & m_A, & n_A, & i_minus_one,
169
                        & i_minus_one );
170
      lwkopt = 2 * n_A + ( n_A + 1 ) * nb;
171
    }
172
    work[ 0 ] = ( double ) lwkopt;
173
174
    if ( ( * lwork < iws )&&( ! lquery ) ) {
175
      * info = -8;
176
    }
177
  }
178
179
  if( * info != 0 ) {
180
    minus_info = - * info;
181
    xerbla_( "DGEQP3", & minus_info );
182
    return;
183
  } else if( lquery ) {
184
    return;
185
  }
186
187
  // Quick return if possible.
188
  if( mn_A == 0 ) {
189
    return;
190
  }
191
192
  // Use LAPACK's DGEQPF or DGEQP3 for small matrices.
193
  //if( mn_A < THRESHOLD_FOR_DGEQPF ) {
194
  //  // Call to LAPACK routine.
195
  //  //// printf( "Calling dgeqpf\n" );
196
  //  dgeqpf_( m, n, A, lda, jpvt, tau, work, info );
197
  //  return;
198
  //} else
199
200
201
  // Use LAPACK's DGEQP3 for small matrices.
202
  if( mn_A < THRESHOLD_FOR_DGEQP3 ) {
203
    //// printf( "Calling dgeqp3\n" );
204
    dgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info );
205
    return;
206
  }
207
208
  // Move initial columns up front.
209
  num_fixed_cols = 0;
210
  for( j = 0; j < n_A; j++ ) {
211
    if( jpvt[ j ] != 0 ) {
212
      if( j != num_fixed_cols ) {
213
        //// printf( "Swapping columns: %d %d \n", j, num_fixed_cols );
214
        dswap_( & m_A, & A[ 0 + j              * ldim_A ], & i_one,
215
                       & A[ 0 + num_fixed_cols * ldim_A ], & i_one );
216
        jpvt[ j ] = jpvt[ num_fixed_cols ];
217
        jpvt[ num_fixed_cols ] = j + 1;
218
      } else {
219
        jpvt[ j ] = j + 1 ;
220
      }
221
      num_fixed_cols++;
222
    } else {
223
      jpvt[ j ] = j + 1 ;
224
    }
225
  }
226
227
  // Factorize fixed columns at the front.
228
  num_factorized_fixed_cols = min( m_A, num_fixed_cols );
229
  if( num_factorized_fixed_cols > 0 ) {
230
    dgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork,
231
             info );
232
    if( * info != 0 ) {
233
      fprintf( stderr, "ERROR in dgeqrf: Info: %d \n", * info );
234
    }
235
    iws = max( iws, ( int ) work[ 0 ] );
236
    if( num_factorized_fixed_cols < n_A ) {
237
      n_rest = n_A - num_factorized_fixed_cols;
238
      dormqr_( "Left", "Transpose",
239
               & m_A, & n_rest, & num_factorized_fixed_cols,
240
               A, & ldim_A, tau,
241
               & A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A,
242
               work, lwork, info );
243
      if( * info != 0 ) {
244
        fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info );
245
      }
246
247
      iws = max( iws, ( int ) work[ 0 ] );
248
    }
249
  }
250
251
  // Create intermediate jpvt vector.
252
  previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) );
253
254
  // Save a copy of jpvt vector.
255
  if( num_factorized_fixed_cols > 0 ) {
256
    // Copy vector.
257
    for( j = 0; j < n_A; j++ ) {
258
      previous_jpvt[ j ] = jpvt[ j ];
259
    }
260
  }
261
262
  // Factorize free columns at the bottom with default values:
263
  // nb_alg = 64, pp = 10, panel_pivoting = 1.
264
  if( num_factorized_fixed_cols < mn_A ) {
265
    * info = dgeqp4_HQRRP_WY_blk_var4(
266
        m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols,
267
        & A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ],
268
            ldim_A,
269
        & jpvt[ num_factorized_fixed_cols ],
270
        & tau[ num_factorized_fixed_cols ],
271
        64, 10, 1 );
272
  }
273
274
  // Pivot block above factorized block by dgeqp4_HQRRP.
275
  if( num_factorized_fixed_cols > 0 ) {
276
    // Pivot block above factorized block.
277
    for( j = num_factorized_fixed_cols; j < n_A; j++ ) {
278
      //// printf( "%% Processing j: %d \n", j );
279
      for( k = j; k < n_A; k++ ) {
280
        if( jpvt[ j ] == previous_jpvt[ k ] ) {
281
          //// printf( "%%   Found j: %d  k: %d \n", j, k );
282
          break;
283
        }
284
      }
285
      // Swap vector previous_jpvt and block above factorized block.
286
      if( k != j ) {
287
        // Swap elements in previous_jpvt.
288
        //// printf( "%%   Swapping  j: %d  k: %d \n", j, k );
289
        itmp = previous_jpvt[ j ];
290
        previous_jpvt[ j ] = previous_jpvt[ k ];
291
        previous_jpvt[ k ] = itmp;
292
293
        // Swap columns in block above factorized block.
294
        dswap_( & num_factorized_fixed_cols,
295
                & A[ 0 + j * ldim_A ], & i_one,
296
                & A[ 0 + k * ldim_A ], & i_one );
297
      }
298
    }
299
  }
300
301
  // Remove intermediate jpvt vector.
302
  free( previous_jpvt );
303
304
  // Return workspace length required.
305
  work[ 0 ] = iws;
306
  return;
307
}
308
309
// ============================================================================
310
int dgeqp4_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A,
311
        int * buff_jpvt, double * buff_tau,
312
        int nb_alg, int pp, int panel_pivoting ) {
313
//
314
// HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A.
315
// This routine is almost compatible with LAPACK's dgeqp3.
316
// The main difference is that this routine does not manage fixed columns.
317
//
318
// Main features:
319
//   * BLAS-3 based.
320
//   * Norm downdating method by Drmac.
321
//   * Downdating for computing Y.
322
//   * No use of libflame.
323
//   * Compact WY transformations are used instead of UT transformations.
324
//   * LAPACK's routine dlarfb is used to apply block transformations.
325
//
326
// Arguments:
327
// ----------
328
// m_A:            Number of rows of matrix A.
329
// n_A:            Number of columns of matrix A.
330
// buff_A:         Address/pointer of/to data in matrix A. Matrix A must be
331
//                 stored in column-order.
332
// ldim_A:         Leading dimension of matrix A.
333
// buff_jpvt:      Input/output vector with the pivots.
334
// buff_tau:       Output vector with the tau values of the Householder factors.
335
// nb_alg:         Block size.
336
//                 Usual values for nb_alg are 32, 64, etc.
337
// pp:             Oversampling size.
338
//                 Usual values for pp are 5, 10, etc.
339
// panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to
340
//                 factorize the panels of matrix A. Otherwise, QR without
341
//                 pivoting is used. Usual value for panel_pivoting is 1.
342
// Final comments:
343
// ---------------
344
// This code has been created from a libflame code. Hence, you can find some
345
// commented calls to libflame routines. We have left them to make it easier
346
// to interpret the meaning of the C code.
347
//
348
  int     b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V,
349
          m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T,
350
          m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22,
351
          m_G, n_G, ldim_G;
352
  double  * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR,
353
          * buff_s, * buff_sB, * buff_s1,
354
          * buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T,
355
          * buff_A11, * buff_A21, * buff_A12,
356
          * buff_Y2, * buff_G, * buff_G1, * buff_G2;
357
  int     * buff_p, * buff_pB, * buff_p1;
358
  double  d_zero = 0.0;
359
  double  d_one  = 1.0;
360
361
362
  double down_t = 0.0, unb_t = 0.0, lhfc_t = 0.0;
363
364
365
366
  // Executable Statements.
367
  //// printf( "%% dgeqp4_HQRRP_WY_blk_var4.\n" );
368
369
  // Check arguments.
370
  if( m_A < 0 ) {
371
    fprintf( stderr,
372
             "ERROR in dgeqp4_HQRRP_WY_blk_var4: m_A is < 0.\n" );
373
  } if( n_A < 0 ) {
374
    fprintf( stderr,
375
             "ERROR in dgeqp4_HQRRP_WY_blk_var4: n_A is < 0.\n" );
376
  } if( ldim_A < max( 1, m_A ) ) {
377
    fprintf( stderr,
378
             "ERROR in dgeqp4_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" );
379
  }
380
381
  // Some initializations.
382
  mn_A   = min( m_A, n_A );
383
  buff_p = buff_jpvt;
384
  buff_s = buff_tau;
385
386
  // Quick return.
387
  if( mn_A == 0 ) {
388
    return 0;
389
  }
390
391
  // Initialize the seed for the generator of random numbers.
392
  srand( 12 );
393
394
  // Create auxiliary objects.
395
  m_Y     = nb_alg + pp;
396
  n_Y     = n_A;
397
  buff_Y  = ( double * ) malloc( m_Y * n_Y * sizeof( double ) );
398
  ldim_Y  = m_Y;
399
400
  m_V     = nb_alg + pp;
401
  n_V     = n_A;
402
  buff_V  = ( double * ) malloc( m_V * n_V * sizeof( double ) );
403
  ldim_V  = m_V;
404
405
  m_W     = nb_alg;
406
  n_W     = n_A;
407
  buff_W  = ( double * ) malloc( m_W * n_W * sizeof( double ) );
408
  ldim_W  = m_W;
409
410
  m_G     = nb_alg + pp;
411
  n_G     = m_A;
412
  buff_G  = ( double * ) malloc( m_G * n_G * sizeof( double ) );
413
  ldim_G  = m_G;
414
415
  double beg = omp_get_wtime();
416
  // Initialize matrices G and Y.
417
  dgeqp4_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G );
418
  double rand_t = omp_get_wtime() - beg;
419
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
420
  ////           FLA_ONE, G, A, FLA_ZERO, Y );
421
422
423
  beg = omp_get_wtime();
424
  dgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A,
425
          & d_one, buff_G, & ldim_G, buff_A, & ldim_A,
426
          & d_zero, buff_Y, & ldim_Y );
427
  double gemm_proj_t = omp_get_wtime() - beg;
428
429
  //printf( "rand_t %lf gemm_proj_t %lf\n", rand_t, gemm_proj_t ); fflush( stdout );
430
431
432
433
  // Main Loop.
434
  for( j = 0; j < mn_A; j += nb_alg ) {
435
    b = min( nb_alg, min( n_A - j, m_A - j ) );
436
437
    // Check whether it is the last iteration.
438
    last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 );
439
440
    // Some initializations for the iteration of this loop.
441
    n_VR = n_V - j;
442
    buff_VR = & buff_V[ 0 + j * ldim_V ];
443
    buff_YR = & buff_Y[ 0 + j * ldim_Y ];
444
    buff_pB = & buff_p[ j ];
445
    buff_sB = & buff_s[ j ];
446
    buff_AR = & buff_A[ 0 + j * ldim_A ];
447
448
    m_AB1     = m_A - j;
449
    n_AB1     = b;
450
    buff_AB1  = & buff_A[ j + j * ldim_A ];
451
    buff_p1   = & buff_p[ j ];
452
    buff_s1   = & buff_s[ j ];
453
    buff_A01  = & buff_A[ 0 + j * ldim_A ];
454
    buff_Y1   = & buff_Y[ 0 + j * ldim_Y ];
455
    buff_T1_T = & buff_W[ 0 + j * ldim_W ];
456
    ldim_T1_T = ldim_W;
457
458
    buff_A11 = & buff_A[ j + j * ldim_A ];
459
    m_A11 = b;
460
    n_A11 = b;
461
462
    buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ];
463
    m_A21 = max( 0, m_A - j - b );
464
    n_A21 = b;
465
466
    buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ];
467
    m_A12 = b;
468
    n_A12 = max( 0, n_A - j - b );
469
470
    //// buff_A22 = & buff_A[ min( m_A - 1, j + b ) +
471
    ////                      min( n_A - 1, j + b ) * ldim_A ];
472
    m_A22 = max( 0, m_A - j - b );
473
    //// n_A22 = max( 0, n_A - j - b );
474
475
    buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ];
476
    buff_G1 = & buff_G[ 0 + j * ldim_G ];
477
    buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ];
478
479
#ifdef CHECK_DOWNDATING_OF_Y
480
    // Check downdating of matrix Y: Compare downdated matrix Y with
481
    // matrix Y computed from scratch.
482
    int     m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj;
483
    double  * buff_cyr, aux, sum;
484
485
    m_cyr    = m_Y;
486
    n_cyr    = n_Y - j;
487
    ldim_cyr = m_cyr;
488
    m_ABR    = m_A - j;
489
    buff_cyr = ( double * ) malloc( m_cyr * n_cyr * sizeof( double ) );
490
491
    //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
492
    ////           FLA_ONE, GR, ABR, FLA_ZERO, CYR );
493
    dgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR,
494
            & d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G,
495
                     & buff_A[ j + j * ldim_A ], & ldim_A,
496
            & d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr );
497
498
    //// print_double_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr );
499
    //// print_double_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y );
500
    sum = 0.0;
501
    for( jj = 0; jj < n_cyr; jj++ ) {
502
      for( ii = 0; ii < m_cyr; ii++ ) {
503
        aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] -
504
              buff_cyr[ ii + jj * ldim_cyr ];
505
        sum += aux * aux;
506
      }
507
    }
508
    sum = sqrt( sum );
509
    printf( "%%  diff between Y and downdated Y: %le\n", sum );
510
511
    free( buff_cyr );
512
#endif
513
514
    if( last_iter == 0 ) {
515
      // Compute QRP of YR, and apply permutations to matrix AR.
516
      // A copy of YR is made into VR, and permutations are applied to YR.
517
      //// FLA_Merge_2x1( ATR,
518
      ////                ABR,   & AR );
519
      //// FLA_Copy( YR, VR );
520
      //// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None );
521
522
      dlacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y,
523
                                     buff_VR, & ldim_V );
524
      dgeqp4_QRPmod_WY_unb_var4( 1, b,
525
          m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB,
526
          1, m_A, buff_AR, ldim_A,
527
          1, m_Y, buff_YR, ldim_Y,
528
          0, buff_Y, ldim_Y );
529
    }
530
531
    //
532
    // Compute QRP of panel AB1 = [ A11; A21 ].
533
    // Apply same permutations to A01 and Y1, and build T1_T.
534
    //
535
    //// FLA_Part_2x1( W1,   & T1_T,
536
    ////                     & None,    b, FLA_TOP );
537
    //// FLA_Merge_2x1( A11,
538
    ////                A21,   & AB1 );
539
    //// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1,
540
    ////                         1, A01, 1, Y1, 1, T1_T );
541
542
    beg = omp_get_wtime();
543
    dgeqp4_QRPmod_WY_unb_var4( panel_pivoting, -1,
544
        m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1,
545
        1, j, buff_A01, ldim_A,
546
        1, m_Y, buff_Y1, ldim_Y,
547
        1, buff_T1_T, ldim_W );
548
    unb_t += omp_get_wtime() - beg;
549
550
    //
551
    // Update the rest of the matrix.
552
    //
553
    if ( ( j + b ) < n_A ) {
554
      // Apply the Householder transforms associated with AB1 = [ A11; A21 ]
555
      // and T1_T to [ A12; A22 ]:
556
      //   / A12 \ := QB1' / A12 \
557
      //   \ A22 /         \ A22 /
558
      // where QB1 is formed from AB1 and T1_T.
559
      //// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 );
560
561
      beg = omp_get_wtime();
562
      dgeqp4_Apply_Q_WY_lhfc_blk_var4(
563
          m_A11 + m_A21, n_A11, buff_A11, ldim_A,
564
          b, b, buff_T1_T, ldim_W,
565
          m_A12 + m_A22, n_A12, buff_A12, ldim_A );
566
      lhfc_t += omp_get_wtime() - beg;
567
    }
568
569
    //
570
    // Downdate matrix Y.
571
    //
572
    if ( ! last_iter ) {
573
      //// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 );
574
575
576
      beg = omp_get_wtime();
577
578
      dgeqp4_Downdate_Y(
579
          m_A11, n_A11, buff_A11, ldim_A,
580
          m_A21, n_A21, buff_A21, ldim_A,
581
          m_A12, n_A12, buff_A12, ldim_A,
582
          b, b, buff_T1_T, ldim_T1_T,
583
          m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y,
584
          m_G, b, buff_G1, ldim_G,
585
          m_G, max( 0, n_G - j - b ), buff_G2, ldim_G );
586
587
      down_t += omp_get_wtime() - beg;
588
589
    }
590
  }
591
592
593
  //printf( "down_t %lf, unb_t %lf, lhfc %lf\n", down_t, unb_t, lhfc_t ); fflush( stdout );
594
595
596
  // Remove auxiliary objects.
597
  //// FLA_Obj_free( & G );
598
  //// FLA_Obj_free( & Y );
599
  //// FLA_Obj_free( & V );
600
  //// FLA_Obj_free( & W );
601
  free( buff_G );
602
  free( buff_Y );
603
  free( buff_V );
604
  free( buff_W );
605
606
  return 0;
607
}
608
609
610
// ============================================================================
611
static int dgeqp4_Normal_random_matrix( int m_A, int n_A,
612
               double * buff_A, int ldim_A ) {
613
//
614
// It generates a random matrix with normal distribution.
615
//
616
  int  i, j;
617
618
  // Main loop.
619
  for ( j = 0; j < n_A; j++ ) {
620
    for ( i = 0; i < m_A; i++ ) {
621
      buff_A[ i + j * ldim_A ] = dgeqp4_Normal_random_number( 0.0, 1.0 );
622
    }
623
  }
624
625
  return 0;
626
}
627
628
// ============================================================================
629
static double dgeqp4_Normal_random_number( double mu, double sigma ) {
630
//
631
// It computes and returns a normal random number.
632
//
633
  static int     alternate_calls = 0;
634
  static double  b1, b2;
635
  double         c1, c2, a, factor;
636
637
  // Quick return.
638
  if( alternate_calls == 1 ) {
639
    alternate_calls = ! alternate_calls;
640
    return( mu + sigma * b2 );
641
  }
642
  // Main loop.
643
  do {
644
    c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
645
    c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
646
    a = c1 * c1 + c2 * c2;
647
  } while ( ( a == 0 )||( a >= 1 ) );
648
  factor = sqrt( ( -2 * log( a ) ) / a );
649
  b1 = c1 * factor;
650
  b2 = c2 * factor;
651
  alternate_calls = ! alternate_calls;
652
  return( mu + sigma * b1 );
653
}
654
655
// ============================================================================
656
static int dgeqp4_Downdate_Y(
657
               int m_U11, int n_U11, double * buff_U11, int ldim_U11,
658
               int m_U21, int n_U21, double * buff_U21, int ldim_U21,
659
               int m_A12, int n_A12, double * buff_A12, int ldim_A12,
660
               int m_T, int n_T, double * buff_T, int ldim_T,
661
               int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2,
662
               int m_G1, int n_G1, double * buff_G1, int ldim_G1,
663
               int m_G2, int n_G2, double * buff_G2, int ldim_G2 ) {
664
//
665
// It downdates matrix Y, and updates matrix G.
666
// Only Y2 of Y is updated.
667
// Only G1 and G2 of G are updated.
668
//
669
// Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12.
670
//
671
  int    i, j;
672
  double * buff_B;
673
  double d_one       = 1.0;
674
  double d_minus_one = -1.0;
675
  int    m_B         = m_G1;
676
  int    n_B         = n_G1;
677
  int    ldim_B      = m_G1;
678
679
  // Create object B.
680
  //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B );
681
  buff_B = ( double * ) malloc( m_B * n_B * sizeof( double ) );
682
683
  // B = G1.
684
  //// FLA_Copy( G1, B );
685
  dlacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1,
686
                                  buff_B, & ldim_B );
687
688
  // B = B * U11.
689
  //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
690
  ////           FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
691
  ////           FLA_ONE, U11, B );
692
  dtrmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B,
693
          & d_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
694
695
  // B = B + G2 * U21.
696
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
697
  ////           FLA_ONE, G2, U21, FLA_ONE, B );
698
  dgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21,
699
          & d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21,
700
          & d_one, buff_B, & ldim_B );
701
702
  // B = B * inv( T11 ).
703
  //// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
704
  ////           FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
705
  ////           FLA_ONE, T, B );
706
  //// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
707
  ////         & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
708
  // Used dtrmm instead of dtrsm because of using compact WY instead of UT.
709
  dtrmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
710
          & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
711
712
  // B = - B * U11^H.
713
  //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
714
  ////           FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
715
  ////           FLA_MINUS_ONE, U11, B );
716
  dtrmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B,
717
          & d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
718
719
  // B = G1 + B.
720
  //// FLA_Axpy( FLA_ONE, G1, B );
721
  for( j = 0; j < n_B; j++ ) {
722
    for( i = 0; i < m_B; i++ ) {
723
      buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ];
724
    }
725
  }
726
727
  // Y2 = Y2 - B * R12.
728
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
729
  ////           FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 );
730
  dgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12,
731
          & d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12,
732
          & d_one, buff_Y2, & ldim_Y2 );
733
734
  //
735
  // GR = GR * Q
736
  //
737
  dgeqp4_Apply_Q_WY_rnfc_blk_var4(
738
          m_U11 + m_U21, n_U11, buff_U11, ldim_U11,
739
          m_T, n_T, buff_T, ldim_T,
740
          m_G1, n_G1 + n_G2, buff_G1, ldim_G1 );
741
742
  // Remove object B.
743
  //// FLA_Obj_free( & B );
744
  free( buff_B );
745
746
  return 0;
747
}
748
749
// ============================================================================
750
static int dgeqp4_Apply_Q_WY_lhfc_blk_var4(
751
               int m_U, int n_U, double * buff_U, int ldim_U,
752
               int m_T, int n_T, double * buff_T, int ldim_T,
753
               int m_B, int n_B, double * buff_B, int ldim_B ) {
754
//
755
// It applies the transpose of a block transformation Q to a matrix B from
756
// the left:
757
//   B := Q' * B
758
// where:
759
//   Q = I - U * T' * U'.
760
//
761
  double  * buff_W;
762
  int     ldim_W;
763
764
  // Create auxiliary object.
765
  //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W );
766
  buff_W = ( double * ) malloc( n_B * n_U * sizeof( double ) );
767
  ldim_W = max( 1, n_B );
768
769
  // Apply the block transformation.
770
  dlarfb_( "Left", "Transpose", "Forward", "Columnwise",
771
           & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T,
772
           buff_B, & ldim_B, buff_W, & ldim_W );
773
774
  // Remove auxiliary object.
775
  //// FLA_Obj_free( & W );
776
  free( buff_W );
777
778
  return 0;
779
}
780
781
// ============================================================================
782
static int dgeqp4_Apply_Q_WY_rnfc_blk_var4(
783
               int m_U, int n_U, double * buff_U, int ldim_U,
784
               int m_T, int n_T, double * buff_T, int ldim_T,
785
               int m_B, int n_B, double * buff_B, int ldim_B ) {
786
//
787
// It applies a block transformation Q to a matrix B from the right:
788
//   B = B * Q
789
// where:
790
//   Q = I - U * T' * U'.
791
//
792
  double  * buff_W;
793
  int     ldim_W;
794
795
  // Create auxiliary object.
796
  //// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W );
797
  buff_W = ( double * ) malloc( m_B * n_U * sizeof( double ) );
798
  ldim_W = max( 1, m_B );
799
800
  // Apply the block transformation.
801
  dlarfb_( "Right", "No transpose", "Forward", "Columnwise",
802
           & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T,
803
           buff_B, & ldim_B, buff_W, & ldim_W );
804
805
  // Remove auxiliary object.
806
  //// FLA_Obj_free( & W );
807
  free( buff_W );
808
809
  return 0;
810
}
811
812
// ============================================================================
813
static int dgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages,
814
               int m_A, int n_A, double * buff_A, int ldim_A,
815
               int * buff_p, double * buff_t,
816
               int pivot_B, int m_B, double * buff_B, int ldim_B,
817
               int pivot_C, int m_C, double * buff_C, int ldim_C,
818
               int build_T, double * buff_T, int ldim_T ) {
819
//
820
// It computes an unblocked QR factorization of matrix A with or without
821
// pivoting. Matrices B and C are optionally pivoted, and matrix T is
822
// optionally built.
823
//
824
// Arguments:
825
// "pivoting": If pivoting==1, then QR factorization with pivoting is used.
826
// "numstages": It tells the number of columns that are factorized.
827
//   If "num_stages" is negative, the whole matrix A is factorized.
828
//   If "num_stages" is positive, only the first "num_stages" are factorized.
829
// "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too.
830
// "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too.
831
// "build_T": if "build_T" is true, matrix "T" is built.
832
//
833
  int     j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col,
834
          i_one = 1, n_house_vector, m_rest;
835
  double  * buff_d, * buff_e, * buff_workspace, diag;
836
  int     idamax_();
837
838
  //// printf( "dgeqp4_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting );
839
840
  // Some initializations.
841
  mn_A    = min( m_A, n_A );
842
843
  // Set the number of stages, if needed.
844
  if( num_stages < 0 ) {
845
    num_stages = mn_A;
846
  }
847
848
  // Create auxiliary vectors.
849
  buff_d         = ( double * ) malloc( n_A * sizeof( double ) );
850
  buff_e         = ( double * ) malloc( n_A * sizeof( double ) );
851
  buff_workspace = ( double * ) malloc( n_A * sizeof( double ) );
852
853
  if( pivoting == 1 ) {
854
    // Compute initial norms of A into d and e.
855
    dgeqp4_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e );
856
  }
857
858
  // Main Loop.
859
  for( j = 0; j < num_stages; j++ ) {
860
    n_dB  = n_A - j;
861
    m_a21 = m_A - j - 1;
862
    m_A22 = m_A - j - 1;
863
    n_A22 = n_A - j - 1;
864
865
    if( pivoting == 1 ) {
866
      // Obtain the index of the column with largest 2-norm.
867
      idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1;
868
869
      // Swap columns of A, B, C, pivots, and norms vectors.
870
      dgeqp4_QRP_pivot_G_B_C( idx_max_col,
871
          m_A, & buff_A[ 0 + j * ldim_A ], ldim_A,
872
          pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B,
873
          pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C,
874
          & buff_p[ j ],
875
          & buff_d[ j ],
876
          & buff_e[ j ] );
877
    }
878
879
    // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
880
    // determine a Householder transform H such that applying H from the
881
    // left to the column vector consisting of alpha11 and a21 annihilates
882
    // the entries in a21 (and updates alpha11).
883
    n_house_vector = m_a21 + 1;
884
    dlarfg_( & n_house_vector,
885
             & buff_A[ j + j * ldim_A ],
886
             & buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one,
887
             & buff_t[ j ] );
888
889
    // / a12t \ =  H / a12t \
890
    // \ A22  /      \ A22  /
891
    //
892
    // where H is formed from tau1 and u21.
893
    diag = buff_A[ j + j * ldim_A ];
894
    buff_A[ j + j * ldim_A ] = 1.0;
895
    m_rest = m_A22 + 1;
896
    dlarf_( "Left", & m_rest, & n_A22,
897
        & buff_A[ j + j * ldim_A ], & i_one,
898
        & buff_t[ j ],
899
        & buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A,
900
        buff_workspace );
901
    buff_A[ j + j * ldim_A ] = diag;
902
903
    if( pivoting == 1 ) {
904
      // Update partial column norms.
905
      dgeqp4_QRP_downdate_partial_norms( m_A22, n_A22,
906
          & buff_d[ j+1 ], 1,
907
          & buff_e[ j+1 ], 1,
908
          & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A,
909
          & buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A );
910
    }
911
  }
912
913
  // Build T.
914
  if( build_T ) {
915
    dlarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A,
916
             buff_t, buff_T, & ldim_T );
917
  }
918
919
  // Remove auxiliary vectors.
920
  free( buff_d );
921
  free( buff_e );
922
  free( buff_workspace );
923
924
  return 0;
925
}
926
927
// ============================================================================
928
static int dgeqp4_QRP_compute_norms(
929
               int m_A, int n_A, double * buff_A, int ldim_A,
930
               double * buff_d, double * buff_e ) {
931
//
932
// It computes the column norms of matrix A. The norms are stored into
933
// vectors d and e.
934
//
935
  int     j, i_one = 1;
936
  double  dnrm2_();
937
938
  // Main loop.
939
  for( j = 0; j < n_A; j++ ) {
940
    * buff_d = dnrm2_( & m_A, buff_A, & i_one );
941
    * buff_e = * buff_d;
942
    buff_A += ldim_A;
943
    buff_d++;
944
    buff_e++;
945
  }
946
947
  return 0;
948
}
949
950
// ============================================================================
951
static int dgeqp4_QRP_downdate_partial_norms( int m_A, int n_A,
952
               double * buff_d,  int st_d,
953
               double * buff_e,  int st_e,
954
               double * buff_wt, int st_wt,
955
               double * buff_A,  int ldim_A ) {
956
//
957
// It updates (downdates) the column norms of matrix A. It uses Drmac's method.
958
//
959
  int     j, i_one = 1;
960
  double  * ptr_d, * ptr_e, * ptr_wt, * ptr_A;
961
  double  temp, temp2, temp5, tol3z;
962
  double  dnrm2_(), dlamch_();
963
964
  /*
965
*
966
*           Update partial column norms
967
*
968
          DO 30 J = I + 1, N
969
             IF( WORK( J ).NE.ZERO ) THEN
970
*
971
*                 NOTE: The following 4 lines follow from the analysis in
972
*                 Lapack Working Note 176.
973
*
974
                TEMP = ABS( A( I, J ) ) / WORK( J )
975
                TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
976
                TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2
977
                IF( TEMP2 .LE. TOL3Z ) THEN
978
                   IF( M-I.GT.0 ) THEN
979
                      WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 )
980
                      WORK( N+J ) = WORK( J )
981
                   ELSE
982
                      WORK( J ) = ZERO
983
                      WORK( N+J ) = ZERO
984
                   END IF
985
                ELSE
986
                   WORK( J ) = WORK( J )*SQRT( TEMP )
987
                END IF
988
             END IF
989
 30       CONTINUE
990
  */
991
992
  // Some initializations.
993
  tol3z = sqrt( dlamch_( "Epsilon" ) );
994
  ptr_d  = buff_d;
995
  ptr_e  = buff_e;
996
  ptr_wt = buff_wt;
997
  ptr_A  = buff_A;
998
999
  // Main loop.
1000
  for( j = 0; j < n_A; j++ ) {
1001
    if( * ptr_d != 0.0 ) {
1002
      temp = dabs( * ptr_wt ) / * ptr_d;
1003
      temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) );
1004
      temp5 = * ptr_d / * ptr_e;
1005
      temp2 = temp * temp5 * temp5;
1006
      if( temp2 <= tol3z ) {
1007
        if( m_A > 0 ) {
1008
          * ptr_d = dnrm2_( & m_A, ptr_A, & i_one );
1009
          * ptr_e = *ptr_d;
1010
        } else {
1011
          * ptr_d = 0.0;
1012
          * ptr_e = 0.0;
1013
        }
1014
      } else {
1015
        * ptr_d = * ptr_d * sqrt( temp );
1016
      }
1017
    }
1018
    ptr_A  += ldim_A;
1019
    ptr_d  += st_d;
1020
    ptr_e  += st_e;
1021
    ptr_wt += st_wt;
1022
  }
1023
1024
  return 0;
1025
}
1026
1027
1028
// ============================================================================
1029
static int dgeqp4_QRP_pivot_G_B_C( int j_max_col,
1030
               int m_G, double * buff_G, int ldim_G,
1031
               int pivot_B, int m_B, double * buff_B, int ldim_B,
1032
               int pivot_C, int m_C, double * buff_C, int ldim_C,
1033
               int * buff_p,
1034
               double * buff_d, double * buff_e ) {
1035
//
1036
// It pivots matrix G, pivot vector p, and norms vectors d and e.
1037
// Matrices B and C are optionally pivoted.
1038
//
1039
  int     ival, i_one = 1;
1040
  double  * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2;
1041
1042
  // Swap columns of G, pivots, and norms.
1043
  if( j_max_col != 0 ) {
1044
1045
    // Swap full column 0 and column "j_max_col" of G.
1046
    ptr_g1 = & buff_G[ 0 + 0         * ldim_G ];
1047
    ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ];
1048
    dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one );
1049
1050
    // Swap full column 0 and column "j_max_col" of B.
1051
    if( pivot_B ) {
1052
      ptr_b1 = & buff_B[ 0 + 0         * ldim_B ];
1053
      ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ];
1054
      dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one );
1055
    }
1056
1057
    // Swap full column 0 and column "j_max_col" of C.
1058
    if( pivot_C ) {
1059
      ptr_c1 = & buff_C[ 0 + 0         * ldim_C ];
1060
      ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ];
1061
      dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one );
1062
    }
1063
1064
    // Swap element 0 and element "j_max_col" of pivot vector "p".
1065
    ival = buff_p[ j_max_col ];
1066
    buff_p[ j_max_col ] = buff_p[ 0 ];
1067
    buff_p[ 0 ] = ival;
1068
1069
    // Copy norms of column 0 to column "j_max_col".
1070
    buff_d[ j_max_col ] = buff_d[ 0 ];
1071
    buff_e[ j_max_col ] = buff_e[ 0 ];
1072
  }
1073
1074
  return 0;
1075
}
1076