GCC Code Coverage Report
Directory: . Exec Total Coverage
File: frame/external/sgeqp4.c Lines: 0 272 0.0 %
Date: 2019-01-14 Branches: 0 118 0.0 %

Line Exec Source
1
/*
2
===============================================================================
3
Authors
4
===============================================================================
5
6
Per-Gunnar Martinsson
7
  Dept. of Applied Mathematics,
8
  University of Colorado at Boulder,
9
  526 UCB, Boulder, CO 80309-0526, USA
10
11
Gregorio Quintana-Orti
12
  Depto. de Ingenieria y Ciencia de Computadores,
13
  Universitat Jaume I,
14
  12.071 Castellon, Spain
15
16
Nathan Heavner
17
  Dept. of Applied Mathematics,
18
  University of Colorado at Boulder,
19
  526 UCB, Boulder, CO 80309-0526, USA
20
21
Robert van de Geijn
22
  Dept. of Computer Science and Institute for Computational Engineering and
23
  Sciences,
24
  The University of Texas at Austin
25
  Austin, TX.
26
27
===============================================================================
28
Copyright
29
===============================================================================
30
31
Copyright (C) 2016,
32
  Universitat Jaume I,
33
  University of Colorado at Boulder,
34
  The University of Texas at Austin.
35
36
===============================================================================
37
Disclaimer
38
===============================================================================
39
40
This code is distributed in the hope that it will be useful, but
41
WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED.
42
43
===============================================================================
44
Disclaimer
45
===============================================================================
46
47
This copy of HQRRP in single precision is modified by Chenhan D. Yu.
48
49
*/
50
51
#include <stdlib.h>
52
#include <stdio.h>
53
#include <math.h>
54
#include "blas_lapack_prototypes.h"
55
#include "sgeqp4.h"
56
57
58
// Matrices with dimensions smaller than THRESHOLD_FOR_SGEQPF are processed
59
// with LAPACK's routine dgeqpf.
60
// Matrices with dimensions between THRESHOLD_FOR_SGEQPF and
61
// THRESHOLD_FOR_SGEQP3 are processed with LAPACK's routine dgeqp3.
62
// Matrices with dimensions larger than THRESHOLD_FOR_SGEQP3 are processed
63
// with the new HQRRP code.
64
#define THRESHOLD_FOR_SGEQPF    0
65
#define THRESHOLD_FOR_SGEQP3  500
66
67
68
// ============================================================================
69
// Definition of macros.
70
71
#define max( a, b )  ( (a) > (b) ? (a) : (b) )
72
#define min( a, b )  ( (a) > (b) ? (b) : (a) )
73
#define dabs( a )    ( (a) >= 0.0 ? (a) : -(a) )
74
75
// ============================================================================
76
// Compilation declarations.
77
78
#undef CHECK_DOWNDATING_OF_Y
79
80
81
// ============================================================================
82
// Declaration of local prototypes.
83
84
static int sgeqp4_Normal_random_matrix( int m_A, int n_A,
85
               float * buff_A, int ldim_A );
86
87
static float sgeqp4_Normal_random_number( float mu, float sigma );
88
89
static int sgeqp4_Downdate_Y(
90
               int m_U11, int n_U11, float * buff_U11, int ldim_U11,
91
               int m_U21, int n_U21, float * buff_U21, int ldim_U21,
92
               int m_A12, int n_A12, float * buff_A12, int ldim_A12,
93
               int m_T, int n_T, float * buff_T, int ldim_T,
94
               int m_Y2, int n_Y2, float * buff_Y2, int ldim_Y2,
95
               int m_G1, int n_G1, float * buff_G1, int ldim_G1,
96
               int m_G2, int n_G2, float * buff_G2, int ldim_G2 );
97
98
static int sgeqp4_Apply_Q_WY_lhfc_blk_var4(
99
               int m_U, int n_U, float * buff_U, int ldim_U,
100
               int m_T, int n_T, float * buff_T, int ldim_T,
101
               int m_B, int n_B, float * buff_B, int ldim_B );
102
103
static int sgeqp4_Apply_Q_WY_rnfc_blk_var4(
104
               int m_U, int n_U, float * buff_U, int ldim_U,
105
               int m_T, int n_T, float * buff_T, int ldim_T,
106
               int m_B, int n_B, float * buff_B, int ldim_B );
107
108
static int sgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages,
109
               int m_A, int n_A, float * buff_A, int ldim_A,
110
               int * buff_p, float * buff_t,
111
               int pivot_B, int m_B, float * buff_B, int ldim_B,
112
               int pivot_C, int m_C, float * buff_C, int ldim_C,
113
               int build_T, float * buff_T, int ldim_T );
114
115
static int sgeqp4_QRP_compute_norms(
116
               int m_A, int n_A, float * buff_A, int ldim_A,
117
               float * buff_d, float * buff_e );
118
119
static int sgeqp4_QRP_downdate_partial_norms( int m_A, int n_A,
120
               float * buff_d,  int st_d,
121
               float * buff_e,  int st_e,
122
               float * buff_wt, int st_wt,
123
               float * buff_A,  int ldim_A );
124
125
static int sgeqp4_QRP_pivot_G_B_C( int j_max_col,
126
               int m_G, float * buff_G, int ldim_G,
127
               int pivot_B, int m_B, float * buff_B, int ldim_B,
128
               int pivot_C, int m_C, float * buff_C, int ldim_C,
129
               int * buff_p,
130
               float * buff_d, float * buff_e );
131
132
133
// ============================================================================
134
void sgeqp4( int * m, int * n, float * A, int * lda, int * jpvt, float * tau,
135
        float * work, int * lwork, int * info ) {
136
//
137
// This routine is plug compatible with LAPACK's routine sgeqp3.
138
// It computes the new HQRRP while keeping the same header as LAPACK's sgeqp3.
139
// It uses dgeqpf or sgeqp3 for small matrices. The thresholds are defined in
140
// constants THRESHOLD_FOR_SGEQPF and THRESHOLD_FOR_SGEQP3.
141
//
142
  int     INB = 1;
143
  int     i_one = 1, i_minus_one = -1,
144
          m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols,
145
          minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp;
146
  int     * previous_jpvt;
147
  int     ilaenv_();
148
149
  // Some initializations.
150
  m_A    = * m;
151
  n_A    = * n;
152
  mn_A   = min( m_A, n_A );
153
  ldim_A = * lda;
154
155
  // Check input arguments.
156
  * info = 0;
157
  lquery = ( * lwork == -1 );
158
  if( m_A < 0 ) {
159
     * info = -1;
160
  } else if ( n_A < 0 ) {
161
     * info = -2;
162
  } else if ( ldim_A < max( 1, m_A ) ) {
163
     * info = -4;
164
  }
165
166
  if( *info == 0 ) {
167
    if( mn_A == 0 ) {
168
      iws    = 1;
169
      lwkopt = 1;
170
    } else {
171
      iws    = 3 * n_A + 1;
172
      nb     = ilaenv_( & INB, "SGEQRF", " ", & m_A, & n_A, & i_minus_one,
173
                        & i_minus_one );
174
      lwkopt = 2 * n_A + ( n_A + 1 ) * nb;
175
    }
176
    work[ 0 ] = ( float ) lwkopt;
177
178
    if ( ( * lwork < iws )&&( ! lquery ) ) {
179
      * info = -8;
180
    }
181
  }
182
183
  if( * info != 0 ) {
184
    minus_info = - * info;
185
    xerbla_( "SGEQP3", & minus_info );
186
    return;
187
  } else if( lquery ) {
188
    return;
189
  }
190
191
  // Quick return if possible.
192
  if( mn_A == 0 ) {
193
    return;
194
  }
195
196
  // Use LAPACK's SGEQPF or SGEQP3 for small matrices.
197
  //if( mn_A < THRESHOLD_FOR_SGEQPF ) {
198
  //  // Call to LAPACK routine.
199
  //  //// printf( "Calling dgeqpf\n" );
200
  //  sgeqpf_( m, n, A, lda, jpvt, tau, work, info );
201
  //  return;
202
  //} else
203
204
205
  // Use LAPACK's SGEQP3 for small matrices.
206
  if( mn_A < THRESHOLD_FOR_SGEQP3 )
207
  {
208
    //// printf( "Calling dgeqp3\n" );
209
    sgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info );
210
    return;
211
  }
212
213
  // Move initial columns up front.
214
  num_fixed_cols = 0;
215
  for( j = 0; j < n_A; j++ ) {
216
    if( jpvt[ j ] != 0 ) {
217
      if( j != num_fixed_cols ) {
218
        //// printf( "Swapping columns: %d %d \n", j, num_fixed_cols );
219
        sswap_( & m_A, & A[ 0 + j              * ldim_A ], & i_one,
220
                       & A[ 0 + num_fixed_cols * ldim_A ], & i_one );
221
        jpvt[ j ] = jpvt[ num_fixed_cols ];
222
        jpvt[ num_fixed_cols ] = j + 1;
223
      } else {
224
        jpvt[ j ] = j + 1 ;
225
      }
226
      num_fixed_cols++;
227
    } else {
228
      jpvt[ j ] = j + 1 ;
229
    }
230
  }
231
232
  // Factorize fixed columns at the front.
233
  num_factorized_fixed_cols = min( m_A, num_fixed_cols );
234
  if( num_factorized_fixed_cols > 0 ) {
235
    sgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork,
236
             info );
237
    if( * info != 0 ) {
238
      fprintf( stderr, "ERROR in sgeqrf: Info: %d \n", * info );
239
    }
240
    iws = max( iws, ( int ) work[ 0 ] );
241
    if( num_factorized_fixed_cols < n_A ) {
242
      n_rest = n_A - num_factorized_fixed_cols;
243
      sormqr_( "Left", "Transpose",
244
               & m_A, & n_rest, & num_factorized_fixed_cols,
245
               A, & ldim_A, tau,
246
               & A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A,
247
               work, lwork, info );
248
      if( * info != 0 ) {
249
        fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info );
250
      }
251
252
      iws = max( iws, ( int ) work[ 0 ] );
253
    }
254
  }
255
256
  // Create intermediate jpvt vector.
257
  previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) );
258
259
  // Save a copy of jpvt vector.
260
  if( num_factorized_fixed_cols > 0 ) {
261
    // Copy vector.
262
    for( j = 0; j < n_A; j++ ) {
263
      previous_jpvt[ j ] = jpvt[ j ];
264
    }
265
  }
266
267
  // Factorize free columns at the bottom with default values:
268
  // nb_alg = 64, pp = 10, panel_pivoting = 1.
269
  if( num_factorized_fixed_cols < mn_A ) {
270
    * info = sgeqp4_HQRRP_WY_blk_var4(
271
        m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols,
272
        & A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ],
273
            ldim_A,
274
        & jpvt[ num_factorized_fixed_cols ],
275
        & tau[ num_factorized_fixed_cols ],
276
        64, 10, 1 );
277
  }
278
279
  // Pivot block above factorized block by sgeqp4_HQRRP.
280
  if( num_factorized_fixed_cols > 0 ) {
281
    // Pivot block above factorized block.
282
    for( j = num_factorized_fixed_cols; j < n_A; j++ ) {
283
      //// printf( "%% Processing j: %d \n", j );
284
      for( k = j; k < n_A; k++ ) {
285
        if( jpvt[ j ] == previous_jpvt[ k ] ) {
286
          //// printf( "%%   Found j: %d  k: %d \n", j, k );
287
          break;
288
        }
289
      }
290
      // Swap vector previous_jpvt and block above factorized block.
291
      if( k != j ) {
292
        // Swap elements in previous_jpvt.
293
        //// printf( "%%   Swapping  j: %d  k: %d \n", j, k );
294
        itmp = previous_jpvt[ j ];
295
        previous_jpvt[ j ] = previous_jpvt[ k ];
296
        previous_jpvt[ k ] = itmp;
297
298
        // Swap columns in block above factorized block.
299
        sswap_( & num_factorized_fixed_cols,
300
                & A[ 0 + j * ldim_A ], & i_one,
301
                & A[ 0 + k * ldim_A ], & i_one );
302
      }
303
    }
304
  }
305
306
  // Remove intermediate jpvt vector.
307
  free( previous_jpvt );
308
309
  // Return workspace length required.
310
  work[ 0 ] = iws;
311
  return;
312
}
313
314
// ============================================================================
315
int sgeqp4_HQRRP_WY_blk_var4( int m_A, int n_A, float * buff_A, int ldim_A,
316
        int * buff_jpvt, float * buff_tau,
317
        int nb_alg, int pp, int panel_pivoting ) {
318
//
319
// HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A.
320
// This routine is almost compatible with LAPACK's dgeqp3.
321
// The main difference is that this routine does not manage fixed columns.
322
//
323
// Main features:
324
//   * BLAS-3 based.
325
//   * Norm downdating method by Drmac.
326
//   * Downdating for computing Y.
327
//   * No use of libflame.
328
//   * Compact WY transformations are used instead of UT transformations.
329
//   * LAPACK's routine dlarfb is used to apply block transformations.
330
//
331
// Arguments:
332
// ----------
333
// m_A:            Number of rows of matrix A.
334
// n_A:            Number of columns of matrix A.
335
// buff_A:         Address/pointer of/to data in matrix A. Matrix A must be
336
//                 stored in column-order.
337
// ldim_A:         Leading dimension of matrix A.
338
// buff_jpvt:      Input/output vector with the pivots.
339
// buff_tau:       Output vector with the tau values of the Householder factors.
340
// nb_alg:         Block size.
341
//                 Usual values for nb_alg are 32, 64, etc.
342
// pp:             Oversampling size.
343
//                 Usual values for pp are 5, 10, etc.
344
// panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to
345
//                 factorize the panels of matrix A. Otherwise, QR without
346
//                 pivoting is used. Usual value for panel_pivoting is 1.
347
// Final comments:
348
// ---------------
349
// This code has been created from a libflame code. Hence, you can find some
350
// commented calls to libflame routines. We have left them to make it easier
351
// to interpret the meaning of the C code.
352
//
353
  int     b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V,
354
          m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T,
355
          m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22,
356
          m_G, n_G, ldim_G;
357
  float  * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR,
358
          * buff_s, * buff_sB, * buff_s1,
359
          * buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T,
360
          * buff_A11, * buff_A21, * buff_A12,
361
          * buff_Y2, * buff_G, * buff_G1, * buff_G2;
362
  int     * buff_p, * buff_pB, * buff_p1;
363
  float  d_zero = 0.0;
364
  float  d_one  = 1.0;
365
366
  // Executable Statements.
367
  //// printf( "%% sgeqp4_HQRRP_WY_blk_var4.\n" );
368
369
  // Check arguments.
370
  if( m_A < 0 ) {
371
    fprintf( stderr,
372
             "ERROR in sgeqp4_HQRRP_WY_blk_var4: m_A is < 0.\n" );
373
  } if( n_A < 0 ) {
374
    fprintf( stderr,
375
             "ERROR in sgeqp4_HQRRP_WY_blk_var4: n_A is < 0.\n" );
376
  } if( ldim_A < max( 1, m_A ) ) {
377
    fprintf( stderr,
378
             "ERROR in sgeqp4_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" );
379
  }
380
381
  // Some initializations.
382
  mn_A   = min( m_A, n_A );
383
  buff_p = buff_jpvt;
384
  buff_s = buff_tau;
385
386
  // Quick return.
387
  if( mn_A == 0 ) {
388
    return 0;
389
  }
390
391
  // Initialize the seed for the generator of random numbers.
392
  srand( 12 );
393
394
  // Create auxiliary objects.
395
  m_Y     = nb_alg + pp;
396
  n_Y     = n_A;
397
  buff_Y  = ( float * ) malloc( m_Y * n_Y * sizeof( float ) );
398
  ldim_Y  = m_Y;
399
400
  m_V     = nb_alg + pp;
401
  n_V     = n_A;
402
  buff_V  = ( float * ) malloc( m_V * n_V * sizeof( float ) );
403
  ldim_V  = m_V;
404
405
  m_W     = nb_alg;
406
  n_W     = n_A;
407
  buff_W  = ( float * ) malloc( m_W * n_W * sizeof( float ) );
408
  ldim_W  = m_W;
409
410
  m_G     = nb_alg + pp;
411
  n_G     = m_A;
412
  buff_G  = ( float * ) malloc( m_G * n_G * sizeof( float ) );
413
  ldim_G  = m_G;
414
415
  // Initialize matrices G and Y.
416
  sgeqp4_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G );
417
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
418
  ////           FLA_ONE, G, A, FLA_ZERO, Y );
419
  sgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A,
420
          & d_one, buff_G, & ldim_G, buff_A, & ldim_A,
421
          & d_zero, buff_Y, & ldim_Y );
422
423
  // Main Loop.
424
  for( j = 0; j < mn_A; j += nb_alg ) {
425
    b = min( nb_alg, min( n_A - j, m_A - j ) );
426
427
    // Check whether it is the last iteration.
428
    last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 );
429
430
    // Some initializations for the iteration of this loop.
431
    n_VR = n_V - j;
432
    buff_VR = & buff_V[ 0 + j * ldim_V ];
433
    buff_YR = & buff_Y[ 0 + j * ldim_Y ];
434
    buff_pB = & buff_p[ j ];
435
    buff_sB = & buff_s[ j ];
436
    buff_AR = & buff_A[ 0 + j * ldim_A ];
437
438
    m_AB1     = m_A - j;
439
    n_AB1     = b;
440
    buff_AB1  = & buff_A[ j + j * ldim_A ];
441
    buff_p1   = & buff_p[ j ];
442
    buff_s1   = & buff_s[ j ];
443
    buff_A01  = & buff_A[ 0 + j * ldim_A ];
444
    buff_Y1   = & buff_Y[ 0 + j * ldim_Y ];
445
    buff_T1_T = & buff_W[ 0 + j * ldim_W ];
446
    ldim_T1_T = ldim_W;
447
448
    buff_A11 = & buff_A[ j + j * ldim_A ];
449
    m_A11 = b;
450
    n_A11 = b;
451
452
    buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ];
453
    m_A21 = max( 0, m_A - j - b );
454
    n_A21 = b;
455
456
    buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ];
457
    m_A12 = b;
458
    n_A12 = max( 0, n_A - j - b );
459
460
    //// buff_A22 = & buff_A[ min( m_A - 1, j + b ) +
461
    ////                      min( n_A - 1, j + b ) * ldim_A ];
462
    m_A22 = max( 0, m_A - j - b );
463
    //// n_A22 = max( 0, n_A - j - b );
464
465
    buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ];
466
    buff_G1 = & buff_G[ 0 + j * ldim_G ];
467
    buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ];
468
469
#ifdef CHECK_DOWNDATING_OF_Y
470
    // Check downdating of matrix Y: Compare downdated matrix Y with
471
    // matrix Y computed from scratch.
472
    int     m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj;
473
    float  * buff_cyr, aux, sum;
474
475
    m_cyr    = m_Y;
476
    n_cyr    = n_Y - j;
477
    ldim_cyr = m_cyr;
478
    m_ABR    = m_A - j;
479
    buff_cyr = ( float * ) malloc( m_cyr * n_cyr * sizeof( float ) );
480
481
    //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
482
    ////           FLA_ONE, GR, ABR, FLA_ZERO, CYR );
483
    sgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR,
484
            & d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G,
485
                     & buff_A[ j + j * ldim_A ], & ldim_A,
486
            & d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr );
487
488
    //// print_float_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr );
489
    //// print_float_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y );
490
    sum = 0.0;
491
    for( jj = 0; jj < n_cyr; jj++ ) {
492
      for( ii = 0; ii < m_cyr; ii++ ) {
493
        aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] -
494
              buff_cyr[ ii + jj * ldim_cyr ];
495
        sum += aux * aux;
496
      }
497
    }
498
    sum = sqrt( sum );
499
    printf( "%%  diff between Y and downdated Y: %le\n", sum );
500
501
    free( buff_cyr );
502
#endif
503
504
    if( last_iter == 0 ) {
505
      // Compute QRP of YR, and apply permutations to matrix AR.
506
      // A copy of YR is made into VR, and permutations are applied to YR.
507
      //// FLA_Merge_2x1( ATR,
508
      ////                ABR,   & AR );
509
      //// FLA_Copy( YR, VR );
510
      //// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None );
511
512
      slacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y,
513
                                     buff_VR, & ldim_V );
514
      sgeqp4_QRPmod_WY_unb_var4( 1, b,
515
          m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB,
516
          1, m_A, buff_AR, ldim_A,
517
          1, m_Y, buff_YR, ldim_Y,
518
          0, buff_Y, ldim_Y );
519
    }
520
521
    //
522
    // Compute QRP of panel AB1 = [ A11; A21 ].
523
    // Apply same permutations to A01 and Y1, and build T1_T.
524
    //
525
    //// FLA_Part_2x1( W1,   & T1_T,
526
    ////                     & None,    b, FLA_TOP );
527
    //// FLA_Merge_2x1( A11,
528
    ////                A21,   & AB1 );
529
    //// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1,
530
    ////                         1, A01, 1, Y1, 1, T1_T );
531
532
    sgeqp4_QRPmod_WY_unb_var4( panel_pivoting, -1,
533
        m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1,
534
        1, j, buff_A01, ldim_A,
535
        1, m_Y, buff_Y1, ldim_Y,
536
        1, buff_T1_T, ldim_W );
537
538
    //
539
    // Update the rest of the matrix.
540
    //
541
    if ( ( j + b ) < n_A ) {
542
      // Apply the Householder transforms associated with AB1 = [ A11; A21 ]
543
      // and T1_T to [ A12; A22 ]:
544
      //   / A12 \ := QB1' / A12 \
545
      //   \ A22 /         \ A22 /
546
      // where QB1 is formed from AB1 and T1_T.
547
      //// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 );
548
549
      sgeqp4_Apply_Q_WY_lhfc_blk_var4(
550
          m_A11 + m_A21, n_A11, buff_A11, ldim_A,
551
          b, b, buff_T1_T, ldim_W,
552
          m_A12 + m_A22, n_A12, buff_A12, ldim_A );
553
    }
554
555
    //
556
    // Downdate matrix Y.
557
    //
558
    if ( ! last_iter ) {
559
      //// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 );
560
561
      sgeqp4_Downdate_Y(
562
          m_A11, n_A11, buff_A11, ldim_A,
563
          m_A21, n_A21, buff_A21, ldim_A,
564
          m_A12, n_A12, buff_A12, ldim_A,
565
          b, b, buff_T1_T, ldim_T1_T,
566
          m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y,
567
          m_G, b, buff_G1, ldim_G,
568
          m_G, max( 0, n_G - j - b ), buff_G2, ldim_G );
569
    }
570
  }
571
572
  // Remove auxiliary objects.
573
  //// FLA_Obj_free( & G );
574
  //// FLA_Obj_free( & Y );
575
  //// FLA_Obj_free( & V );
576
  //// FLA_Obj_free( & W );
577
  free( buff_G );
578
  free( buff_Y );
579
  free( buff_V );
580
  free( buff_W );
581
582
  return 0;
583
}
584
585
586
// ============================================================================
587
static int sgeqp4_Normal_random_matrix( int m_A, int n_A,
588
               float * buff_A, int ldim_A ) {
589
//
590
// It generates a random matrix with normal distribution.
591
//
592
  int  i, j;
593
594
  // Main loop.
595
  for ( j = 0; j < n_A; j++ ) {
596
    for ( i = 0; i < m_A; i++ ) {
597
      buff_A[ i + j * ldim_A ] = sgeqp4_Normal_random_number( 0.0, 1.0 );
598
    }
599
  }
600
601
  return 0;
602
}
603
604
// ============================================================================
605
static float sgeqp4_Normal_random_number( float mu, float sigma ) {
606
//
607
// It computes and returns a normal random number.
608
//
609
  static int     alternate_calls = 0;
610
  static float  b1, b2;
611
  float         c1, c2, a, factor;
612
613
  // Quick return.
614
  if( alternate_calls == 1 ) {
615
    alternate_calls = ! alternate_calls;
616
    return( mu + sigma * b2 );
617
  }
618
  // Main loop.
619
  do {
620
    c1 = -1.0 + 2.0 * ( (float) rand() / RAND_MAX );
621
    c2 = -1.0 + 2.0 * ( (float) rand() / RAND_MAX );
622
    a = c1 * c1 + c2 * c2;
623
  } while ( ( a == 0 )||( a >= 1 ) );
624
  factor = sqrt( ( -2 * log( a ) ) / a );
625
  b1 = c1 * factor;
626
  b2 = c2 * factor;
627
  alternate_calls = ! alternate_calls;
628
  return( mu + sigma * b1 );
629
}
630
631
// ============================================================================
632
static int sgeqp4_Downdate_Y(
633
               int m_U11, int n_U11, float * buff_U11, int ldim_U11,
634
               int m_U21, int n_U21, float * buff_U21, int ldim_U21,
635
               int m_A12, int n_A12, float * buff_A12, int ldim_A12,
636
               int m_T, int n_T, float * buff_T, int ldim_T,
637
               int m_Y2, int n_Y2, float * buff_Y2, int ldim_Y2,
638
               int m_G1, int n_G1, float * buff_G1, int ldim_G1,
639
               int m_G2, int n_G2, float * buff_G2, int ldim_G2 ) {
640
//
641
// It downdates matrix Y, and updates matrix G.
642
// Only Y2 of Y is updated.
643
// Only G1 and G2 of G are updated.
644
//
645
// Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12.
646
//
647
  int    i, j;
648
  float * buff_B;
649
  float d_one       = 1.0;
650
  float d_minus_one = -1.0;
651
  int    m_B         = m_G1;
652
  int    n_B         = n_G1;
653
  int    ldim_B      = m_G1;
654
655
  // Create object B.
656
  //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B );
657
  buff_B = ( float * ) malloc( m_B * n_B * sizeof( float ) );
658
659
  // B = G1.
660
  //// FLA_Copy( G1, B );
661
  slacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1,
662
                                  buff_B, & ldim_B );
663
664
  // B = B * U11.
665
  //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
666
  ////           FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
667
  ////           FLA_ONE, U11, B );
668
  strmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B,
669
          & d_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
670
671
  // B = B + G2 * U21.
672
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
673
  ////           FLA_ONE, G2, U21, FLA_ONE, B );
674
  sgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21,
675
          & d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21,
676
          & d_one, buff_B, & ldim_B );
677
678
  // B = B * inv( T11 ).
679
  //// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
680
  ////           FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
681
  ////           FLA_ONE, T, B );
682
  //// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
683
  ////         & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
684
  // Used dtrmm instead of dtrsm because of using compact WY instead of UT.
685
  strmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
686
          & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
687
688
  // B = - B * U11^H.
689
  //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
690
  ////           FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
691
  ////           FLA_MINUS_ONE, U11, B );
692
  strmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B,
693
          & d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
694
695
  // B = G1 + B.
696
  //// FLA_Axpy( FLA_ONE, G1, B );
697
  for( j = 0; j < n_B; j++ ) {
698
    for( i = 0; i < m_B; i++ ) {
699
      buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ];
700
    }
701
  }
702
703
  // Y2 = Y2 - B * R12.
704
  //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
705
  ////           FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 );
706
  sgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12,
707
          & d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12,
708
          & d_one, buff_Y2, & ldim_Y2 );
709
710
  //
711
  // GR = GR * Q
712
  //
713
  sgeqp4_Apply_Q_WY_rnfc_blk_var4(
714
          m_U11 + m_U21, n_U11, buff_U11, ldim_U11,
715
          m_T, n_T, buff_T, ldim_T,
716
          m_G1, n_G1 + n_G2, buff_G1, ldim_G1 );
717
718
  // Remove object B.
719
  //// FLA_Obj_free( & B );
720
  free( buff_B );
721
722
  return 0;
723
}
724
725
// ============================================================================
726
static int sgeqp4_Apply_Q_WY_lhfc_blk_var4(
727
               int m_U, int n_U, float * buff_U, int ldim_U,
728
               int m_T, int n_T, float * buff_T, int ldim_T,
729
               int m_B, int n_B, float * buff_B, int ldim_B ) {
730
//
731
// It applies the transpose of a block transformation Q to a matrix B from
732
// the left:
733
//   B := Q' * B
734
// where:
735
//   Q = I - U * T' * U'.
736
//
737
  float  * buff_W;
738
  int     ldim_W;
739
740
  // Create auxiliary object.
741
  //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W );
742
  buff_W = ( float * ) malloc( n_B * n_U * sizeof( float ) );
743
  ldim_W = max( 1, n_B );
744
745
  // Apply the block transformation.
746
  slarfb_( "Left", "Transpose", "Forward", "Columnwise",
747
           & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T,
748
           buff_B, & ldim_B, buff_W, & ldim_W );
749
750
  // Remove auxiliary object.
751
  //// FLA_Obj_free( & W );
752
  free( buff_W );
753
754
  return 0;
755
}
756
757
// ============================================================================
758
static int sgeqp4_Apply_Q_WY_rnfc_blk_var4(
759
               int m_U, int n_U, float * buff_U, int ldim_U,
760
               int m_T, int n_T, float * buff_T, int ldim_T,
761
               int m_B, int n_B, float * buff_B, int ldim_B ) {
762
//
763
// It applies a block transformation Q to a matrix B from the right:
764
//   B = B * Q
765
// where:
766
//   Q = I - U * T' * U'.
767
//
768
  float  * buff_W;
769
  int     ldim_W;
770
771
  // Create auxiliary object.
772
  //// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W );
773
  buff_W = ( float * ) malloc( m_B * n_U * sizeof( float ) );
774
  ldim_W = max( 1, m_B );
775
776
  // Apply the block transformation.
777
  slarfb_( "Right", "No transpose", "Forward", "Columnwise",
778
           & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T,
779
           buff_B, & ldim_B, buff_W, & ldim_W );
780
781
  // Remove auxiliary object.
782
  //// FLA_Obj_free( & W );
783
  free( buff_W );
784
785
  return 0;
786
}
787
788
// ============================================================================
789
static int sgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages,
790
               int m_A, int n_A, float * buff_A, int ldim_A,
791
               int * buff_p, float * buff_t,
792
               int pivot_B, int m_B, float * buff_B, int ldim_B,
793
               int pivot_C, int m_C, float * buff_C, int ldim_C,
794
               int build_T, float * buff_T, int ldim_T ) {
795
//
796
// It computes an unblocked QR factorization of matrix A with or without
797
// pivoting. Matrices B and C are optionally pivoted, and matrix T is
798
// optionally built.
799
//
800
// Arguments:
801
// "pivoting": If pivoting==1, then QR factorization with pivoting is used.
802
// "numstages": It tells the number of columns that are factorized.
803
//   If "num_stages" is negative, the whole matrix A is factorized.
804
//   If "num_stages" is positive, only the first "num_stages" are factorized.
805
// "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too.
806
// "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too.
807
// "build_T": if "build_T" is true, matrix "T" is built.
808
//
809
  int     j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col,
810
          i_one = 1, n_house_vector, m_rest;
811
  float  * buff_d, * buff_e, * buff_workspace, diag;
812
  int     isamax_();
813
814
  //// printf( "sgeqp4_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting );
815
816
  // Some initializations.
817
  mn_A    = min( m_A, n_A );
818
819
  // Set the number of stages, if needed.
820
  if( num_stages < 0 ) {
821
    num_stages = mn_A;
822
  }
823
824
  // Create auxiliary vectors.
825
  buff_d         = ( float * ) malloc( n_A * sizeof( float ) );
826
  buff_e         = ( float * ) malloc( n_A * sizeof( float ) );
827
  buff_workspace = ( float * ) malloc( n_A * sizeof( float ) );
828
829
  if( pivoting == 1 ) {
830
    // Compute initial norms of A into d and e.
831
    sgeqp4_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e );
832
  }
833
834
  // Main Loop.
835
  for( j = 0; j < num_stages; j++ ) {
836
    n_dB  = n_A - j;
837
    m_a21 = m_A - j - 1;
838
    m_A22 = m_A - j - 1;
839
    n_A22 = n_A - j - 1;
840
841
    if( pivoting == 1 ) {
842
      // Obtain the index of the column with largest 2-norm.
843
      idx_max_col = isamax_( & n_dB, & buff_d[ j ], & i_one ) - 1;
844
845
      // Swap columns of A, B, C, pivots, and norms vectors.
846
      sgeqp4_QRP_pivot_G_B_C( idx_max_col,
847
          m_A, & buff_A[ 0 + j * ldim_A ], ldim_A,
848
          pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B,
849
          pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C,
850
          & buff_p[ j ],
851
          & buff_d[ j ],
852
          & buff_e[ j ] );
853
    }
854
855
    // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
856
    // determine a Householder transform H such that applying H from the
857
    // left to the column vector consisting of alpha11 and a21 annihilates
858
    // the entries in a21 (and updates alpha11).
859
    n_house_vector = m_a21 + 1;
860
    slarfg_( & n_house_vector,
861
             & buff_A[ j + j * ldim_A ],
862
             & buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one,
863
             & buff_t[ j ] );
864
865
    // / a12t \ =  H / a12t \
866
    // \ A22  /      \ A22  /
867
    //
868
    // where H is formed from tau1 and u21.
869
    diag = buff_A[ j + j * ldim_A ];
870
    buff_A[ j + j * ldim_A ] = 1.0;
871
    m_rest = m_A22 + 1;
872
    slarf_( "Left", & m_rest, & n_A22,
873
        & buff_A[ j + j * ldim_A ], & i_one,
874
        & buff_t[ j ],
875
        & buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A,
876
        buff_workspace );
877
    buff_A[ j + j * ldim_A ] = diag;
878
879
    if( pivoting == 1 ) {
880
      // Update partial column norms.
881
      sgeqp4_QRP_downdate_partial_norms( m_A22, n_A22,
882
          & buff_d[ j+1 ], 1,
883
          & buff_e[ j+1 ], 1,
884
          & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A,
885
          & buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A );
886
    }
887
  }
888
889
  // Build T.
890
  if( build_T ) {
891
    slarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A,
892
             buff_t, buff_T, & ldim_T );
893
  }
894
895
  // Remove auxiliary vectors.
896
  free( buff_d );
897
  free( buff_e );
898
  free( buff_workspace );
899
900
  return 0;
901
}
902
903
// ============================================================================
904
static int sgeqp4_QRP_compute_norms(
905
               int m_A, int n_A, float * buff_A, int ldim_A,
906
               float * buff_d, float * buff_e ) {
907
//
908
// It computes the column norms of matrix A. The norms are stored into
909
// vectors d and e.
910
//
911
  int     j, i_one = 1;
912
  float  snrm2_();
913
914
  // Main loop.
915
  for( j = 0; j < n_A; j++ ) {
916
    * buff_d = snrm2_( & m_A, buff_A, & i_one );
917
    * buff_e = * buff_d;
918
    buff_A += ldim_A;
919
    buff_d++;
920
    buff_e++;
921
  }
922
923
  return 0;
924
}
925
926
// ============================================================================
927
static int sgeqp4_QRP_downdate_partial_norms( int m_A, int n_A,
928
               float * buff_d,  int st_d,
929
               float * buff_e,  int st_e,
930
               float * buff_wt, int st_wt,
931
               float * buff_A,  int ldim_A ) {
932
//
933
// It updates (downdates) the column norms of matrix A. It uses Drmac's method.
934
//
935
  int     j, i_one = 1;
936
  float  * ptr_d, * ptr_e, * ptr_wt, * ptr_A;
937
  float  temp, temp2, temp5, tol3z;
938
  float  snrm2_(), slamch_();
939
940
  /*
941
*
942
*           Update partial column norms
943
*
944
          DO 30 J = I + 1, N
945
             IF( WORK( J ).NE.ZERO ) THEN
946
*
947
*                 NOTE: The following 4 lines follow from the analysis in
948
*                 Lapack Working Note 176.
949
*
950
                TEMP = ABS( A( I, J ) ) / WORK( J )
951
                TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
952
                TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2
953
                IF( TEMP2 .LE. TOL3Z ) THEN
954
                   IF( M-I.GT.0 ) THEN
955
                      WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 )
956
                      WORK( N+J ) = WORK( J )
957
                   ELSE
958
                      WORK( J ) = ZERO
959
                      WORK( N+J ) = ZERO
960
                   END IF
961
                ELSE
962
                   WORK( J ) = WORK( J )*SQRT( TEMP )
963
                END IF
964
             END IF
965
 30       CONTINUE
966
  */
967
968
  // Some initializations.
969
  tol3z = sqrt( slamch_( "Epsilon" ) );
970
  ptr_d  = buff_d;
971
  ptr_e  = buff_e;
972
  ptr_wt = buff_wt;
973
  ptr_A  = buff_A;
974
975
  // Main loop.
976
  for( j = 0; j < n_A; j++ ) {
977
    if( * ptr_d != 0.0 ) {
978
      temp = dabs( * ptr_wt ) / * ptr_d;
979
      temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) );
980
      temp5 = * ptr_d / * ptr_e;
981
      temp2 = temp * temp5 * temp5;
982
      if( temp2 <= tol3z ) {
983
        if( m_A > 0 ) {
984
          * ptr_d = snrm2_( & m_A, ptr_A, & i_one );
985
          * ptr_e = *ptr_d;
986
        } else {
987
          * ptr_d = 0.0;
988
          * ptr_e = 0.0;
989
        }
990
      } else {
991
        * ptr_d = * ptr_d * sqrt( temp );
992
      }
993
    }
994
    ptr_A  += ldim_A;
995
    ptr_d  += st_d;
996
    ptr_e  += st_e;
997
    ptr_wt += st_wt;
998
  }
999
1000
  return 0;
1001
}
1002
1003
1004
// ============================================================================
1005
static int sgeqp4_QRP_pivot_G_B_C( int j_max_col,
1006
               int m_G, float * buff_G, int ldim_G,
1007
               int pivot_B, int m_B, float * buff_B, int ldim_B,
1008
               int pivot_C, int m_C, float * buff_C, int ldim_C,
1009
               int * buff_p,
1010
               float * buff_d, float * buff_e ) {
1011
//
1012
// It pivots matrix G, pivot vector p, and norms vectors d and e.
1013
// Matrices B and C are optionally pivoted.
1014
//
1015
  int     ival, i_one = 1;
1016
  float  * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2;
1017
1018
  // Swap columns of G, pivots, and norms.
1019
  if( j_max_col != 0 ) {
1020
1021
    // Swap full column 0 and column "j_max_col" of G.
1022
    ptr_g1 = & buff_G[ 0 + 0         * ldim_G ];
1023
    ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ];
1024
    sswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one );
1025
1026
    // Swap full column 0 and column "j_max_col" of B.
1027
    if( pivot_B ) {
1028
      ptr_b1 = & buff_B[ 0 + 0         * ldim_B ];
1029
      ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ];
1030
      sswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one );
1031
    }
1032
1033
    // Swap full column 0 and column "j_max_col" of C.
1034
    if( pivot_C ) {
1035
      ptr_c1 = & buff_C[ 0 + 0         * ldim_C ];
1036
      ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ];
1037
      sswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one );
1038
    }
1039
1040
    // Swap element 0 and element "j_max_col" of pivot vector "p".
1041
    ival = buff_p[ j_max_col ];
1042
    buff_p[ j_max_col ] = buff_p[ 0 ];
1043
    buff_p[ 0 ] = ival;
1044
1045
    // Copy norms of column 0 to column "j_max_col".
1046
    buff_d[ j_max_col ] = buff_d[ 0 ];
1047
    buff_e[ j_max_col ] = buff_e[ 0 ];
1048
  }
1049
1050
  return 0;
1051
}
1052