1 |
|
/* |
2 |
|
=============================================================================== |
3 |
|
Authors |
4 |
|
=============================================================================== |
5 |
|
|
6 |
|
Per-Gunnar Martinsson |
7 |
|
Dept. of Applied Mathematics, |
8 |
|
University of Colorado at Boulder, |
9 |
|
526 UCB, Boulder, CO 80309-0526, USA |
10 |
|
|
11 |
|
Gregorio Quintana-Orti |
12 |
|
Depto. de Ingenieria y Ciencia de Computadores, |
13 |
|
Universitat Jaume I, |
14 |
|
12.071 Castellon, Spain |
15 |
|
|
16 |
|
Nathan Heavner |
17 |
|
Dept. of Applied Mathematics, |
18 |
|
University of Colorado at Boulder, |
19 |
|
526 UCB, Boulder, CO 80309-0526, USA |
20 |
|
|
21 |
|
Robert van de Geijn |
22 |
|
Dept. of Computer Science and Institute for Computational Engineering and |
23 |
|
Sciences, |
24 |
|
The University of Texas at Austin |
25 |
|
Austin, TX. |
26 |
|
|
27 |
|
=============================================================================== |
28 |
|
Copyright |
29 |
|
=============================================================================== |
30 |
|
|
31 |
|
Copyright (C) 2016, |
32 |
|
Universitat Jaume I, |
33 |
|
University of Colorado at Boulder, |
34 |
|
The University of Texas at Austin. |
35 |
|
|
36 |
|
=============================================================================== |
37 |
|
Disclaimer |
38 |
|
=============================================================================== |
39 |
|
|
40 |
|
This code is distributed in the hope that it will be useful, but |
41 |
|
WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. |
42 |
|
|
43 |
|
*/ |
44 |
|
|
45 |
|
#include <omp.h> |
46 |
|
|
47 |
|
#include <stdlib.h> |
48 |
|
#include <stdio.h> |
49 |
|
#include <math.h> |
50 |
|
#include "blas_lapack_prototypes.h" |
51 |
|
#include "dgeqp4.h" |
52 |
|
|
53 |
|
|
54 |
|
// Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed |
55 |
|
// with LAPACK's routine dgeqpf. |
56 |
|
// Matrices with dimensions between THRESHOLD_FOR_DGEQPF and |
57 |
|
// THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3. |
58 |
|
// Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed |
59 |
|
// with the new HQRRP code. |
60 |
|
#define THRESHOLD_FOR_DGEQPF 0 |
61 |
|
#define THRESHOLD_FOR_DGEQP3 500 |
62 |
|
|
63 |
|
|
64 |
|
// ============================================================================ |
65 |
|
// Definition of macros. |
66 |
|
|
67 |
|
#define max( a, b ) ( (a) > (b) ? (a) : (b) ) |
68 |
|
#define min( a, b ) ( (a) > (b) ? (b) : (a) ) |
69 |
|
#define dabs( a ) ( (a) >= 0.0 ? (a) : -(a) ) |
70 |
|
|
71 |
|
// ============================================================================ |
72 |
|
// Compilation declarations. |
73 |
|
|
74 |
|
#undef CHECK_DOWNDATING_OF_Y |
75 |
|
|
76 |
|
|
77 |
|
// ============================================================================ |
78 |
|
// Declaration of local prototypes. |
79 |
|
|
80 |
|
static int dgeqp4_Normal_random_matrix( int m_A, int n_A, |
81 |
|
double * buff_A, int ldim_A ); |
82 |
|
|
83 |
|
static double dgeqp4_Normal_random_number( double mu, double sigma ); |
84 |
|
|
85 |
|
static int dgeqp4_Downdate_Y( |
86 |
|
int m_U11, int n_U11, double * buff_U11, int ldim_U11, |
87 |
|
int m_U21, int n_U21, double * buff_U21, int ldim_U21, |
88 |
|
int m_A12, int n_A12, double * buff_A12, int ldim_A12, |
89 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
90 |
|
int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2, |
91 |
|
int m_G1, int n_G1, double * buff_G1, int ldim_G1, |
92 |
|
int m_G2, int n_G2, double * buff_G2, int ldim_G2 ); |
93 |
|
|
94 |
|
static int dgeqp4_Apply_Q_WY_lhfc_blk_var4( |
95 |
|
int m_U, int n_U, double * buff_U, int ldim_U, |
96 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
97 |
|
int m_B, int n_B, double * buff_B, int ldim_B ); |
98 |
|
|
99 |
|
static int dgeqp4_Apply_Q_WY_rnfc_blk_var4( |
100 |
|
int m_U, int n_U, double * buff_U, int ldim_U, |
101 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
102 |
|
int m_B, int n_B, double * buff_B, int ldim_B ); |
103 |
|
|
104 |
|
static int dgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages, |
105 |
|
int m_A, int n_A, double * buff_A, int ldim_A, |
106 |
|
int * buff_p, double * buff_t, |
107 |
|
int pivot_B, int m_B, double * buff_B, int ldim_B, |
108 |
|
int pivot_C, int m_C, double * buff_C, int ldim_C, |
109 |
|
int build_T, double * buff_T, int ldim_T ); |
110 |
|
|
111 |
|
static int dgeqp4_QRP_compute_norms( |
112 |
|
int m_A, int n_A, double * buff_A, int ldim_A, |
113 |
|
double * buff_d, double * buff_e ); |
114 |
|
|
115 |
|
static int dgeqp4_QRP_downdate_partial_norms( int m_A, int n_A, |
116 |
|
double * buff_d, int st_d, |
117 |
|
double * buff_e, int st_e, |
118 |
|
double * buff_wt, int st_wt, |
119 |
|
double * buff_A, int ldim_A ); |
120 |
|
|
121 |
|
static int dgeqp4_QRP_pivot_G_B_C( int j_max_col, |
122 |
|
int m_G, double * buff_G, int ldim_G, |
123 |
|
int pivot_B, int m_B, double * buff_B, int ldim_B, |
124 |
|
int pivot_C, int m_C, double * buff_C, int ldim_C, |
125 |
|
int * buff_p, |
126 |
|
double * buff_d, double * buff_e ); |
127 |
|
|
128 |
|
|
129 |
|
// ============================================================================ |
130 |
|
void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau, |
131 |
|
double * work, int * lwork, int * info ) { |
132 |
|
// |
133 |
|
// This routine is plug compatible with LAPACK's routine dgeqp3. |
134 |
|
// It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3. |
135 |
|
// It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in |
136 |
|
// constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3. |
137 |
|
// |
138 |
|
int INB = 1; |
139 |
|
int i_one = 1, i_minus_one = -1, |
140 |
|
m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols, |
141 |
|
minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp; |
142 |
|
int * previous_jpvt; |
143 |
|
int ilaenv_(); |
144 |
|
|
145 |
|
// Some initializations. |
146 |
|
m_A = * m; |
147 |
|
n_A = * n; |
148 |
|
mn_A = min( m_A, n_A ); |
149 |
|
ldim_A = * lda; |
150 |
|
|
151 |
|
// Check input arguments. |
152 |
|
* info = 0; |
153 |
|
lquery = ( * lwork == -1 ); |
154 |
|
if( m_A < 0 ) { |
155 |
|
* info = -1; |
156 |
|
} else if ( n_A < 0 ) { |
157 |
|
* info = -2; |
158 |
|
} else if ( ldim_A < max( 1, m_A ) ) { |
159 |
|
* info = -4; |
160 |
|
} |
161 |
|
|
162 |
|
if( *info == 0 ) { |
163 |
|
if( mn_A == 0 ) { |
164 |
|
iws = 1; |
165 |
|
lwkopt = 1; |
166 |
|
} else { |
167 |
|
iws = 3 * n_A + 1; |
168 |
|
nb = ilaenv_( & INB, "DGEQRF", " ", & m_A, & n_A, & i_minus_one, |
169 |
|
& i_minus_one ); |
170 |
|
lwkopt = 2 * n_A + ( n_A + 1 ) * nb; |
171 |
|
} |
172 |
|
work[ 0 ] = ( double ) lwkopt; |
173 |
|
|
174 |
|
if ( ( * lwork < iws )&&( ! lquery ) ) { |
175 |
|
* info = -8; |
176 |
|
} |
177 |
|
} |
178 |
|
|
179 |
|
if( * info != 0 ) { |
180 |
|
minus_info = - * info; |
181 |
|
xerbla_( "DGEQP3", & minus_info ); |
182 |
|
return; |
183 |
|
} else if( lquery ) { |
184 |
|
return; |
185 |
|
} |
186 |
|
|
187 |
|
// Quick return if possible. |
188 |
|
if( mn_A == 0 ) { |
189 |
|
return; |
190 |
|
} |
191 |
|
|
192 |
|
// Use LAPACK's DGEQPF or DGEQP3 for small matrices. |
193 |
|
//if( mn_A < THRESHOLD_FOR_DGEQPF ) { |
194 |
|
// // Call to LAPACK routine. |
195 |
|
// //// printf( "Calling dgeqpf\n" ); |
196 |
|
// dgeqpf_( m, n, A, lda, jpvt, tau, work, info ); |
197 |
|
// return; |
198 |
|
//} else |
199 |
|
|
200 |
|
|
201 |
|
// Use LAPACK's DGEQP3 for small matrices. |
202 |
|
if( mn_A < THRESHOLD_FOR_DGEQP3 ) { |
203 |
|
//// printf( "Calling dgeqp3\n" ); |
204 |
|
dgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info ); |
205 |
|
return; |
206 |
|
} |
207 |
|
|
208 |
|
// Move initial columns up front. |
209 |
|
num_fixed_cols = 0; |
210 |
|
for( j = 0; j < n_A; j++ ) { |
211 |
|
if( jpvt[ j ] != 0 ) { |
212 |
|
if( j != num_fixed_cols ) { |
213 |
|
//// printf( "Swapping columns: %d %d \n", j, num_fixed_cols ); |
214 |
|
dswap_( & m_A, & A[ 0 + j * ldim_A ], & i_one, |
215 |
|
& A[ 0 + num_fixed_cols * ldim_A ], & i_one ); |
216 |
|
jpvt[ j ] = jpvt[ num_fixed_cols ]; |
217 |
|
jpvt[ num_fixed_cols ] = j + 1; |
218 |
|
} else { |
219 |
|
jpvt[ j ] = j + 1 ; |
220 |
|
} |
221 |
|
num_fixed_cols++; |
222 |
|
} else { |
223 |
|
jpvt[ j ] = j + 1 ; |
224 |
|
} |
225 |
|
} |
226 |
|
|
227 |
|
// Factorize fixed columns at the front. |
228 |
|
num_factorized_fixed_cols = min( m_A, num_fixed_cols ); |
229 |
|
if( num_factorized_fixed_cols > 0 ) { |
230 |
|
dgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork, |
231 |
|
info ); |
232 |
|
if( * info != 0 ) { |
233 |
|
fprintf( stderr, "ERROR in dgeqrf: Info: %d \n", * info ); |
234 |
|
} |
235 |
|
iws = max( iws, ( int ) work[ 0 ] ); |
236 |
|
if( num_factorized_fixed_cols < n_A ) { |
237 |
|
n_rest = n_A - num_factorized_fixed_cols; |
238 |
|
dormqr_( "Left", "Transpose", |
239 |
|
& m_A, & n_rest, & num_factorized_fixed_cols, |
240 |
|
A, & ldim_A, tau, |
241 |
|
& A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A, |
242 |
|
work, lwork, info ); |
243 |
|
if( * info != 0 ) { |
244 |
|
fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info ); |
245 |
|
} |
246 |
|
|
247 |
|
iws = max( iws, ( int ) work[ 0 ] ); |
248 |
|
} |
249 |
|
} |
250 |
|
|
251 |
|
// Create intermediate jpvt vector. |
252 |
|
previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) ); |
253 |
|
|
254 |
|
// Save a copy of jpvt vector. |
255 |
|
if( num_factorized_fixed_cols > 0 ) { |
256 |
|
// Copy vector. |
257 |
|
for( j = 0; j < n_A; j++ ) { |
258 |
|
previous_jpvt[ j ] = jpvt[ j ]; |
259 |
|
} |
260 |
|
} |
261 |
|
|
262 |
|
// Factorize free columns at the bottom with default values: |
263 |
|
// nb_alg = 64, pp = 10, panel_pivoting = 1. |
264 |
|
if( num_factorized_fixed_cols < mn_A ) { |
265 |
|
* info = dgeqp4_HQRRP_WY_blk_var4( |
266 |
|
m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols, |
267 |
|
& A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ], |
268 |
|
ldim_A, |
269 |
|
& jpvt[ num_factorized_fixed_cols ], |
270 |
|
& tau[ num_factorized_fixed_cols ], |
271 |
|
64, 10, 1 ); |
272 |
|
} |
273 |
|
|
274 |
|
// Pivot block above factorized block by dgeqp4_HQRRP. |
275 |
|
if( num_factorized_fixed_cols > 0 ) { |
276 |
|
// Pivot block above factorized block. |
277 |
|
for( j = num_factorized_fixed_cols; j < n_A; j++ ) { |
278 |
|
//// printf( "%% Processing j: %d \n", j ); |
279 |
|
for( k = j; k < n_A; k++ ) { |
280 |
|
if( jpvt[ j ] == previous_jpvt[ k ] ) { |
281 |
|
//// printf( "%% Found j: %d k: %d \n", j, k ); |
282 |
|
break; |
283 |
|
} |
284 |
|
} |
285 |
|
// Swap vector previous_jpvt and block above factorized block. |
286 |
|
if( k != j ) { |
287 |
|
// Swap elements in previous_jpvt. |
288 |
|
//// printf( "%% Swapping j: %d k: %d \n", j, k ); |
289 |
|
itmp = previous_jpvt[ j ]; |
290 |
|
previous_jpvt[ j ] = previous_jpvt[ k ]; |
291 |
|
previous_jpvt[ k ] = itmp; |
292 |
|
|
293 |
|
// Swap columns in block above factorized block. |
294 |
|
dswap_( & num_factorized_fixed_cols, |
295 |
|
& A[ 0 + j * ldim_A ], & i_one, |
296 |
|
& A[ 0 + k * ldim_A ], & i_one ); |
297 |
|
} |
298 |
|
} |
299 |
|
} |
300 |
|
|
301 |
|
// Remove intermediate jpvt vector. |
302 |
|
free( previous_jpvt ); |
303 |
|
|
304 |
|
// Return workspace length required. |
305 |
|
work[ 0 ] = iws; |
306 |
|
return; |
307 |
|
} |
308 |
|
|
309 |
|
// ============================================================================ |
310 |
|
int dgeqp4_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A, |
311 |
|
int * buff_jpvt, double * buff_tau, |
312 |
|
int nb_alg, int pp, int panel_pivoting ) { |
313 |
|
// |
314 |
|
// HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A. |
315 |
|
// This routine is almost compatible with LAPACK's dgeqp3. |
316 |
|
// The main difference is that this routine does not manage fixed columns. |
317 |
|
// |
318 |
|
// Main features: |
319 |
|
// * BLAS-3 based. |
320 |
|
// * Norm downdating method by Drmac. |
321 |
|
// * Downdating for computing Y. |
322 |
|
// * No use of libflame. |
323 |
|
// * Compact WY transformations are used instead of UT transformations. |
324 |
|
// * LAPACK's routine dlarfb is used to apply block transformations. |
325 |
|
// |
326 |
|
// Arguments: |
327 |
|
// ---------- |
328 |
|
// m_A: Number of rows of matrix A. |
329 |
|
// n_A: Number of columns of matrix A. |
330 |
|
// buff_A: Address/pointer of/to data in matrix A. Matrix A must be |
331 |
|
// stored in column-order. |
332 |
|
// ldim_A: Leading dimension of matrix A. |
333 |
|
// buff_jpvt: Input/output vector with the pivots. |
334 |
|
// buff_tau: Output vector with the tau values of the Householder factors. |
335 |
|
// nb_alg: Block size. |
336 |
|
// Usual values for nb_alg are 32, 64, etc. |
337 |
|
// pp: Oversampling size. |
338 |
|
// Usual values for pp are 5, 10, etc. |
339 |
|
// panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to |
340 |
|
// factorize the panels of matrix A. Otherwise, QR without |
341 |
|
// pivoting is used. Usual value for panel_pivoting is 1. |
342 |
|
// Final comments: |
343 |
|
// --------------- |
344 |
|
// This code has been created from a libflame code. Hence, you can find some |
345 |
|
// commented calls to libflame routines. We have left them to make it easier |
346 |
|
// to interpret the meaning of the C code. |
347 |
|
// |
348 |
|
int b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V, |
349 |
|
m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T, |
350 |
|
m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22, |
351 |
|
m_G, n_G, ldim_G; |
352 |
|
double * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR, |
353 |
|
* buff_s, * buff_sB, * buff_s1, |
354 |
|
* buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T, |
355 |
|
* buff_A11, * buff_A21, * buff_A12, |
356 |
|
* buff_Y2, * buff_G, * buff_G1, * buff_G2; |
357 |
|
int * buff_p, * buff_pB, * buff_p1; |
358 |
|
double d_zero = 0.0; |
359 |
|
double d_one = 1.0; |
360 |
|
|
361 |
|
|
362 |
|
double down_t = 0.0, unb_t = 0.0, lhfc_t = 0.0; |
363 |
|
|
364 |
|
|
365 |
|
|
366 |
|
// Executable Statements. |
367 |
|
//// printf( "%% dgeqp4_HQRRP_WY_blk_var4.\n" ); |
368 |
|
|
369 |
|
// Check arguments. |
370 |
|
if( m_A < 0 ) { |
371 |
|
fprintf( stderr, |
372 |
|
"ERROR in dgeqp4_HQRRP_WY_blk_var4: m_A is < 0.\n" ); |
373 |
|
} if( n_A < 0 ) { |
374 |
|
fprintf( stderr, |
375 |
|
"ERROR in dgeqp4_HQRRP_WY_blk_var4: n_A is < 0.\n" ); |
376 |
|
} if( ldim_A < max( 1, m_A ) ) { |
377 |
|
fprintf( stderr, |
378 |
|
"ERROR in dgeqp4_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" ); |
379 |
|
} |
380 |
|
|
381 |
|
// Some initializations. |
382 |
|
mn_A = min( m_A, n_A ); |
383 |
|
buff_p = buff_jpvt; |
384 |
|
buff_s = buff_tau; |
385 |
|
|
386 |
|
// Quick return. |
387 |
|
if( mn_A == 0 ) { |
388 |
|
return 0; |
389 |
|
} |
390 |
|
|
391 |
|
// Initialize the seed for the generator of random numbers. |
392 |
|
srand( 12 ); |
393 |
|
|
394 |
|
// Create auxiliary objects. |
395 |
|
m_Y = nb_alg + pp; |
396 |
|
n_Y = n_A; |
397 |
|
buff_Y = ( double * ) malloc( m_Y * n_Y * sizeof( double ) ); |
398 |
|
ldim_Y = m_Y; |
399 |
|
|
400 |
|
m_V = nb_alg + pp; |
401 |
|
n_V = n_A; |
402 |
|
buff_V = ( double * ) malloc( m_V * n_V * sizeof( double ) ); |
403 |
|
ldim_V = m_V; |
404 |
|
|
405 |
|
m_W = nb_alg; |
406 |
|
n_W = n_A; |
407 |
|
buff_W = ( double * ) malloc( m_W * n_W * sizeof( double ) ); |
408 |
|
ldim_W = m_W; |
409 |
|
|
410 |
|
m_G = nb_alg + pp; |
411 |
|
n_G = m_A; |
412 |
|
buff_G = ( double * ) malloc( m_G * n_G * sizeof( double ) ); |
413 |
|
ldim_G = m_G; |
414 |
|
|
415 |
|
double beg = omp_get_wtime(); |
416 |
|
// Initialize matrices G and Y. |
417 |
|
dgeqp4_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G ); |
418 |
|
double rand_t = omp_get_wtime() - beg; |
419 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
420 |
|
//// FLA_ONE, G, A, FLA_ZERO, Y ); |
421 |
|
|
422 |
|
|
423 |
|
beg = omp_get_wtime(); |
424 |
|
dgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A, |
425 |
|
& d_one, buff_G, & ldim_G, buff_A, & ldim_A, |
426 |
|
& d_zero, buff_Y, & ldim_Y ); |
427 |
|
double gemm_proj_t = omp_get_wtime() - beg; |
428 |
|
|
429 |
|
//printf( "rand_t %lf gemm_proj_t %lf\n", rand_t, gemm_proj_t ); fflush( stdout ); |
430 |
|
|
431 |
|
|
432 |
|
|
433 |
|
// Main Loop. |
434 |
|
for( j = 0; j < mn_A; j += nb_alg ) { |
435 |
|
b = min( nb_alg, min( n_A - j, m_A - j ) ); |
436 |
|
|
437 |
|
// Check whether it is the last iteration. |
438 |
|
last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 ); |
439 |
|
|
440 |
|
// Some initializations for the iteration of this loop. |
441 |
|
n_VR = n_V - j; |
442 |
|
buff_VR = & buff_V[ 0 + j * ldim_V ]; |
443 |
|
buff_YR = & buff_Y[ 0 + j * ldim_Y ]; |
444 |
|
buff_pB = & buff_p[ j ]; |
445 |
|
buff_sB = & buff_s[ j ]; |
446 |
|
buff_AR = & buff_A[ 0 + j * ldim_A ]; |
447 |
|
|
448 |
|
m_AB1 = m_A - j; |
449 |
|
n_AB1 = b; |
450 |
|
buff_AB1 = & buff_A[ j + j * ldim_A ]; |
451 |
|
buff_p1 = & buff_p[ j ]; |
452 |
|
buff_s1 = & buff_s[ j ]; |
453 |
|
buff_A01 = & buff_A[ 0 + j * ldim_A ]; |
454 |
|
buff_Y1 = & buff_Y[ 0 + j * ldim_Y ]; |
455 |
|
buff_T1_T = & buff_W[ 0 + j * ldim_W ]; |
456 |
|
ldim_T1_T = ldim_W; |
457 |
|
|
458 |
|
buff_A11 = & buff_A[ j + j * ldim_A ]; |
459 |
|
m_A11 = b; |
460 |
|
n_A11 = b; |
461 |
|
|
462 |
|
buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ]; |
463 |
|
m_A21 = max( 0, m_A - j - b ); |
464 |
|
n_A21 = b; |
465 |
|
|
466 |
|
buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ]; |
467 |
|
m_A12 = b; |
468 |
|
n_A12 = max( 0, n_A - j - b ); |
469 |
|
|
470 |
|
//// buff_A22 = & buff_A[ min( m_A - 1, j + b ) + |
471 |
|
//// min( n_A - 1, j + b ) * ldim_A ]; |
472 |
|
m_A22 = max( 0, m_A - j - b ); |
473 |
|
//// n_A22 = max( 0, n_A - j - b ); |
474 |
|
|
475 |
|
buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ]; |
476 |
|
buff_G1 = & buff_G[ 0 + j * ldim_G ]; |
477 |
|
buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ]; |
478 |
|
|
479 |
|
#ifdef CHECK_DOWNDATING_OF_Y |
480 |
|
// Check downdating of matrix Y: Compare downdated matrix Y with |
481 |
|
// matrix Y computed from scratch. |
482 |
|
int m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj; |
483 |
|
double * buff_cyr, aux, sum; |
484 |
|
|
485 |
|
m_cyr = m_Y; |
486 |
|
n_cyr = n_Y - j; |
487 |
|
ldim_cyr = m_cyr; |
488 |
|
m_ABR = m_A - j; |
489 |
|
buff_cyr = ( double * ) malloc( m_cyr * n_cyr * sizeof( double ) ); |
490 |
|
|
491 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
492 |
|
//// FLA_ONE, GR, ABR, FLA_ZERO, CYR ); |
493 |
|
dgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR, |
494 |
|
& d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G, |
495 |
|
& buff_A[ j + j * ldim_A ], & ldim_A, |
496 |
|
& d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr ); |
497 |
|
|
498 |
|
//// print_double_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr ); |
499 |
|
//// print_double_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y ); |
500 |
|
sum = 0.0; |
501 |
|
for( jj = 0; jj < n_cyr; jj++ ) { |
502 |
|
for( ii = 0; ii < m_cyr; ii++ ) { |
503 |
|
aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] - |
504 |
|
buff_cyr[ ii + jj * ldim_cyr ]; |
505 |
|
sum += aux * aux; |
506 |
|
} |
507 |
|
} |
508 |
|
sum = sqrt( sum ); |
509 |
|
printf( "%% diff between Y and downdated Y: %le\n", sum ); |
510 |
|
|
511 |
|
free( buff_cyr ); |
512 |
|
#endif |
513 |
|
|
514 |
|
if( last_iter == 0 ) { |
515 |
|
// Compute QRP of YR, and apply permutations to matrix AR. |
516 |
|
// A copy of YR is made into VR, and permutations are applied to YR. |
517 |
|
//// FLA_Merge_2x1( ATR, |
518 |
|
//// ABR, & AR ); |
519 |
|
//// FLA_Copy( YR, VR ); |
520 |
|
//// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None ); |
521 |
|
|
522 |
|
dlacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y, |
523 |
|
buff_VR, & ldim_V ); |
524 |
|
dgeqp4_QRPmod_WY_unb_var4( 1, b, |
525 |
|
m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB, |
526 |
|
1, m_A, buff_AR, ldim_A, |
527 |
|
1, m_Y, buff_YR, ldim_Y, |
528 |
|
0, buff_Y, ldim_Y ); |
529 |
|
} |
530 |
|
|
531 |
|
// |
532 |
|
// Compute QRP of panel AB1 = [ A11; A21 ]. |
533 |
|
// Apply same permutations to A01 and Y1, and build T1_T. |
534 |
|
// |
535 |
|
//// FLA_Part_2x1( W1, & T1_T, |
536 |
|
//// & None, b, FLA_TOP ); |
537 |
|
//// FLA_Merge_2x1( A11, |
538 |
|
//// A21, & AB1 ); |
539 |
|
//// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1, |
540 |
|
//// 1, A01, 1, Y1, 1, T1_T ); |
541 |
|
|
542 |
|
beg = omp_get_wtime(); |
543 |
|
dgeqp4_QRPmod_WY_unb_var4( panel_pivoting, -1, |
544 |
|
m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1, |
545 |
|
1, j, buff_A01, ldim_A, |
546 |
|
1, m_Y, buff_Y1, ldim_Y, |
547 |
|
1, buff_T1_T, ldim_W ); |
548 |
|
unb_t += omp_get_wtime() - beg; |
549 |
|
|
550 |
|
// |
551 |
|
// Update the rest of the matrix. |
552 |
|
// |
553 |
|
if ( ( j + b ) < n_A ) { |
554 |
|
// Apply the Householder transforms associated with AB1 = [ A11; A21 ] |
555 |
|
// and T1_T to [ A12; A22 ]: |
556 |
|
// / A12 \ := QB1' / A12 \ |
557 |
|
// \ A22 / \ A22 / |
558 |
|
// where QB1 is formed from AB1 and T1_T. |
559 |
|
//// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 ); |
560 |
|
|
561 |
|
beg = omp_get_wtime(); |
562 |
|
dgeqp4_Apply_Q_WY_lhfc_blk_var4( |
563 |
|
m_A11 + m_A21, n_A11, buff_A11, ldim_A, |
564 |
|
b, b, buff_T1_T, ldim_W, |
565 |
|
m_A12 + m_A22, n_A12, buff_A12, ldim_A ); |
566 |
|
lhfc_t += omp_get_wtime() - beg; |
567 |
|
} |
568 |
|
|
569 |
|
// |
570 |
|
// Downdate matrix Y. |
571 |
|
// |
572 |
|
if ( ! last_iter ) { |
573 |
|
//// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 ); |
574 |
|
|
575 |
|
|
576 |
|
beg = omp_get_wtime(); |
577 |
|
|
578 |
|
dgeqp4_Downdate_Y( |
579 |
|
m_A11, n_A11, buff_A11, ldim_A, |
580 |
|
m_A21, n_A21, buff_A21, ldim_A, |
581 |
|
m_A12, n_A12, buff_A12, ldim_A, |
582 |
|
b, b, buff_T1_T, ldim_T1_T, |
583 |
|
m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y, |
584 |
|
m_G, b, buff_G1, ldim_G, |
585 |
|
m_G, max( 0, n_G - j - b ), buff_G2, ldim_G ); |
586 |
|
|
587 |
|
down_t += omp_get_wtime() - beg; |
588 |
|
|
589 |
|
} |
590 |
|
} |
591 |
|
|
592 |
|
|
593 |
|
//printf( "down_t %lf, unb_t %lf, lhfc %lf\n", down_t, unb_t, lhfc_t ); fflush( stdout ); |
594 |
|
|
595 |
|
|
596 |
|
// Remove auxiliary objects. |
597 |
|
//// FLA_Obj_free( & G ); |
598 |
|
//// FLA_Obj_free( & Y ); |
599 |
|
//// FLA_Obj_free( & V ); |
600 |
|
//// FLA_Obj_free( & W ); |
601 |
|
free( buff_G ); |
602 |
|
free( buff_Y ); |
603 |
|
free( buff_V ); |
604 |
|
free( buff_W ); |
605 |
|
|
606 |
|
return 0; |
607 |
|
} |
608 |
|
|
609 |
|
|
610 |
|
// ============================================================================ |
611 |
|
static int dgeqp4_Normal_random_matrix( int m_A, int n_A, |
612 |
|
double * buff_A, int ldim_A ) { |
613 |
|
// |
614 |
|
// It generates a random matrix with normal distribution. |
615 |
|
// |
616 |
|
int i, j; |
617 |
|
|
618 |
|
// Main loop. |
619 |
|
for ( j = 0; j < n_A; j++ ) { |
620 |
|
for ( i = 0; i < m_A; i++ ) { |
621 |
|
buff_A[ i + j * ldim_A ] = dgeqp4_Normal_random_number( 0.0, 1.0 ); |
622 |
|
} |
623 |
|
} |
624 |
|
|
625 |
|
return 0; |
626 |
|
} |
627 |
|
|
628 |
|
// ============================================================================ |
629 |
|
static double dgeqp4_Normal_random_number( double mu, double sigma ) { |
630 |
|
// |
631 |
|
// It computes and returns a normal random number. |
632 |
|
// |
633 |
|
static int alternate_calls = 0; |
634 |
|
static double b1, b2; |
635 |
|
double c1, c2, a, factor; |
636 |
|
|
637 |
|
// Quick return. |
638 |
|
if( alternate_calls == 1 ) { |
639 |
|
alternate_calls = ! alternate_calls; |
640 |
|
return( mu + sigma * b2 ); |
641 |
|
} |
642 |
|
// Main loop. |
643 |
|
do { |
644 |
|
c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); |
645 |
|
c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); |
646 |
|
a = c1 * c1 + c2 * c2; |
647 |
|
} while ( ( a == 0 )||( a >= 1 ) ); |
648 |
|
factor = sqrt( ( -2 * log( a ) ) / a ); |
649 |
|
b1 = c1 * factor; |
650 |
|
b2 = c2 * factor; |
651 |
|
alternate_calls = ! alternate_calls; |
652 |
|
return( mu + sigma * b1 ); |
653 |
|
} |
654 |
|
|
655 |
|
// ============================================================================ |
656 |
|
static int dgeqp4_Downdate_Y( |
657 |
|
int m_U11, int n_U11, double * buff_U11, int ldim_U11, |
658 |
|
int m_U21, int n_U21, double * buff_U21, int ldim_U21, |
659 |
|
int m_A12, int n_A12, double * buff_A12, int ldim_A12, |
660 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
661 |
|
int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2, |
662 |
|
int m_G1, int n_G1, double * buff_G1, int ldim_G1, |
663 |
|
int m_G2, int n_G2, double * buff_G2, int ldim_G2 ) { |
664 |
|
// |
665 |
|
// It downdates matrix Y, and updates matrix G. |
666 |
|
// Only Y2 of Y is updated. |
667 |
|
// Only G1 and G2 of G are updated. |
668 |
|
// |
669 |
|
// Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12. |
670 |
|
// |
671 |
|
int i, j; |
672 |
|
double * buff_B; |
673 |
|
double d_one = 1.0; |
674 |
|
double d_minus_one = -1.0; |
675 |
|
int m_B = m_G1; |
676 |
|
int n_B = n_G1; |
677 |
|
int ldim_B = m_G1; |
678 |
|
|
679 |
|
// Create object B. |
680 |
|
//// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B ); |
681 |
|
buff_B = ( double * ) malloc( m_B * n_B * sizeof( double ) ); |
682 |
|
|
683 |
|
// B = G1. |
684 |
|
//// FLA_Copy( G1, B ); |
685 |
|
dlacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1, |
686 |
|
buff_B, & ldim_B ); |
687 |
|
|
688 |
|
// B = B * U11. |
689 |
|
//// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, |
690 |
|
//// FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, |
691 |
|
//// FLA_ONE, U11, B ); |
692 |
|
dtrmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B, |
693 |
|
& d_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); |
694 |
|
|
695 |
|
// B = B + G2 * U21. |
696 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
697 |
|
//// FLA_ONE, G2, U21, FLA_ONE, B ); |
698 |
|
dgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21, |
699 |
|
& d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21, |
700 |
|
& d_one, buff_B, & ldim_B ); |
701 |
|
|
702 |
|
// B = B * inv( T11 ). |
703 |
|
//// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, |
704 |
|
//// FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, |
705 |
|
//// FLA_ONE, T, B ); |
706 |
|
//// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, |
707 |
|
//// & d_one, buff_T, & ldim_T, buff_B, & ldim_B ); |
708 |
|
// Used dtrmm instead of dtrsm because of using compact WY instead of UT. |
709 |
|
dtrmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, |
710 |
|
& d_one, buff_T, & ldim_T, buff_B, & ldim_B ); |
711 |
|
|
712 |
|
// B = - B * U11^H. |
713 |
|
//// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, |
714 |
|
//// FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, |
715 |
|
//// FLA_MINUS_ONE, U11, B ); |
716 |
|
dtrmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B, |
717 |
|
& d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); |
718 |
|
|
719 |
|
// B = G1 + B. |
720 |
|
//// FLA_Axpy( FLA_ONE, G1, B ); |
721 |
|
for( j = 0; j < n_B; j++ ) { |
722 |
|
for( i = 0; i < m_B; i++ ) { |
723 |
|
buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ]; |
724 |
|
} |
725 |
|
} |
726 |
|
|
727 |
|
// Y2 = Y2 - B * R12. |
728 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
729 |
|
//// FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 ); |
730 |
|
dgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12, |
731 |
|
& d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12, |
732 |
|
& d_one, buff_Y2, & ldim_Y2 ); |
733 |
|
|
734 |
|
// |
735 |
|
// GR = GR * Q |
736 |
|
// |
737 |
|
dgeqp4_Apply_Q_WY_rnfc_blk_var4( |
738 |
|
m_U11 + m_U21, n_U11, buff_U11, ldim_U11, |
739 |
|
m_T, n_T, buff_T, ldim_T, |
740 |
|
m_G1, n_G1 + n_G2, buff_G1, ldim_G1 ); |
741 |
|
|
742 |
|
// Remove object B. |
743 |
|
//// FLA_Obj_free( & B ); |
744 |
|
free( buff_B ); |
745 |
|
|
746 |
|
return 0; |
747 |
|
} |
748 |
|
|
749 |
|
// ============================================================================ |
750 |
|
static int dgeqp4_Apply_Q_WY_lhfc_blk_var4( |
751 |
|
int m_U, int n_U, double * buff_U, int ldim_U, |
752 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
753 |
|
int m_B, int n_B, double * buff_B, int ldim_B ) { |
754 |
|
// |
755 |
|
// It applies the transpose of a block transformation Q to a matrix B from |
756 |
|
// the left: |
757 |
|
// B := Q' * B |
758 |
|
// where: |
759 |
|
// Q = I - U * T' * U'. |
760 |
|
// |
761 |
|
double * buff_W; |
762 |
|
int ldim_W; |
763 |
|
|
764 |
|
// Create auxiliary object. |
765 |
|
//// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W ); |
766 |
|
buff_W = ( double * ) malloc( n_B * n_U * sizeof( double ) ); |
767 |
|
ldim_W = max( 1, n_B ); |
768 |
|
|
769 |
|
// Apply the block transformation. |
770 |
|
dlarfb_( "Left", "Transpose", "Forward", "Columnwise", |
771 |
|
& m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, |
772 |
|
buff_B, & ldim_B, buff_W, & ldim_W ); |
773 |
|
|
774 |
|
// Remove auxiliary object. |
775 |
|
//// FLA_Obj_free( & W ); |
776 |
|
free( buff_W ); |
777 |
|
|
778 |
|
return 0; |
779 |
|
} |
780 |
|
|
781 |
|
// ============================================================================ |
782 |
|
static int dgeqp4_Apply_Q_WY_rnfc_blk_var4( |
783 |
|
int m_U, int n_U, double * buff_U, int ldim_U, |
784 |
|
int m_T, int n_T, double * buff_T, int ldim_T, |
785 |
|
int m_B, int n_B, double * buff_B, int ldim_B ) { |
786 |
|
// |
787 |
|
// It applies a block transformation Q to a matrix B from the right: |
788 |
|
// B = B * Q |
789 |
|
// where: |
790 |
|
// Q = I - U * T' * U'. |
791 |
|
// |
792 |
|
double * buff_W; |
793 |
|
int ldim_W; |
794 |
|
|
795 |
|
// Create auxiliary object. |
796 |
|
//// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W ); |
797 |
|
buff_W = ( double * ) malloc( m_B * n_U * sizeof( double ) ); |
798 |
|
ldim_W = max( 1, m_B ); |
799 |
|
|
800 |
|
// Apply the block transformation. |
801 |
|
dlarfb_( "Right", "No transpose", "Forward", "Columnwise", |
802 |
|
& m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, |
803 |
|
buff_B, & ldim_B, buff_W, & ldim_W ); |
804 |
|
|
805 |
|
// Remove auxiliary object. |
806 |
|
//// FLA_Obj_free( & W ); |
807 |
|
free( buff_W ); |
808 |
|
|
809 |
|
return 0; |
810 |
|
} |
811 |
|
|
812 |
|
// ============================================================================ |
813 |
|
static int dgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages, |
814 |
|
int m_A, int n_A, double * buff_A, int ldim_A, |
815 |
|
int * buff_p, double * buff_t, |
816 |
|
int pivot_B, int m_B, double * buff_B, int ldim_B, |
817 |
|
int pivot_C, int m_C, double * buff_C, int ldim_C, |
818 |
|
int build_T, double * buff_T, int ldim_T ) { |
819 |
|
// |
820 |
|
// It computes an unblocked QR factorization of matrix A with or without |
821 |
|
// pivoting. Matrices B and C are optionally pivoted, and matrix T is |
822 |
|
// optionally built. |
823 |
|
// |
824 |
|
// Arguments: |
825 |
|
// "pivoting": If pivoting==1, then QR factorization with pivoting is used. |
826 |
|
// "numstages": It tells the number of columns that are factorized. |
827 |
|
// If "num_stages" is negative, the whole matrix A is factorized. |
828 |
|
// If "num_stages" is positive, only the first "num_stages" are factorized. |
829 |
|
// "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too. |
830 |
|
// "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too. |
831 |
|
// "build_T": if "build_T" is true, matrix "T" is built. |
832 |
|
// |
833 |
|
int j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, |
834 |
|
i_one = 1, n_house_vector, m_rest; |
835 |
|
double * buff_d, * buff_e, * buff_workspace, diag; |
836 |
|
int idamax_(); |
837 |
|
|
838 |
|
//// printf( "dgeqp4_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting ); |
839 |
|
|
840 |
|
// Some initializations. |
841 |
|
mn_A = min( m_A, n_A ); |
842 |
|
|
843 |
|
// Set the number of stages, if needed. |
844 |
|
if( num_stages < 0 ) { |
845 |
|
num_stages = mn_A; |
846 |
|
} |
847 |
|
|
848 |
|
// Create auxiliary vectors. |
849 |
|
buff_d = ( double * ) malloc( n_A * sizeof( double ) ); |
850 |
|
buff_e = ( double * ) malloc( n_A * sizeof( double ) ); |
851 |
|
buff_workspace = ( double * ) malloc( n_A * sizeof( double ) ); |
852 |
|
|
853 |
|
if( pivoting == 1 ) { |
854 |
|
// Compute initial norms of A into d and e. |
855 |
|
dgeqp4_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e ); |
856 |
|
} |
857 |
|
|
858 |
|
// Main Loop. |
859 |
|
for( j = 0; j < num_stages; j++ ) { |
860 |
|
n_dB = n_A - j; |
861 |
|
m_a21 = m_A - j - 1; |
862 |
|
m_A22 = m_A - j - 1; |
863 |
|
n_A22 = n_A - j - 1; |
864 |
|
|
865 |
|
if( pivoting == 1 ) { |
866 |
|
// Obtain the index of the column with largest 2-norm. |
867 |
|
idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1; |
868 |
|
|
869 |
|
// Swap columns of A, B, C, pivots, and norms vectors. |
870 |
|
dgeqp4_QRP_pivot_G_B_C( idx_max_col, |
871 |
|
m_A, & buff_A[ 0 + j * ldim_A ], ldim_A, |
872 |
|
pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B, |
873 |
|
pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C, |
874 |
|
& buff_p[ j ], |
875 |
|
& buff_d[ j ], |
876 |
|
& buff_e[ j ] ); |
877 |
|
} |
878 |
|
|
879 |
|
// Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 |
880 |
|
// determine a Householder transform H such that applying H from the |
881 |
|
// left to the column vector consisting of alpha11 and a21 annihilates |
882 |
|
// the entries in a21 (and updates alpha11). |
883 |
|
n_house_vector = m_a21 + 1; |
884 |
|
dlarfg_( & n_house_vector, |
885 |
|
& buff_A[ j + j * ldim_A ], |
886 |
|
& buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one, |
887 |
|
& buff_t[ j ] ); |
888 |
|
|
889 |
|
// / a12t \ = H / a12t \ |
890 |
|
// \ A22 / \ A22 / |
891 |
|
// |
892 |
|
// where H is formed from tau1 and u21. |
893 |
|
diag = buff_A[ j + j * ldim_A ]; |
894 |
|
buff_A[ j + j * ldim_A ] = 1.0; |
895 |
|
m_rest = m_A22 + 1; |
896 |
|
dlarf_( "Left", & m_rest, & n_A22, |
897 |
|
& buff_A[ j + j * ldim_A ], & i_one, |
898 |
|
& buff_t[ j ], |
899 |
|
& buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A, |
900 |
|
buff_workspace ); |
901 |
|
buff_A[ j + j * ldim_A ] = diag; |
902 |
|
|
903 |
|
if( pivoting == 1 ) { |
904 |
|
// Update partial column norms. |
905 |
|
dgeqp4_QRP_downdate_partial_norms( m_A22, n_A22, |
906 |
|
& buff_d[ j+1 ], 1, |
907 |
|
& buff_e[ j+1 ], 1, |
908 |
|
& buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, |
909 |
|
& buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A ); |
910 |
|
} |
911 |
|
} |
912 |
|
|
913 |
|
// Build T. |
914 |
|
if( build_T ) { |
915 |
|
dlarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A, |
916 |
|
buff_t, buff_T, & ldim_T ); |
917 |
|
} |
918 |
|
|
919 |
|
// Remove auxiliary vectors. |
920 |
|
free( buff_d ); |
921 |
|
free( buff_e ); |
922 |
|
free( buff_workspace ); |
923 |
|
|
924 |
|
return 0; |
925 |
|
} |
926 |
|
|
927 |
|
// ============================================================================ |
928 |
|
static int dgeqp4_QRP_compute_norms( |
929 |
|
int m_A, int n_A, double * buff_A, int ldim_A, |
930 |
|
double * buff_d, double * buff_e ) { |
931 |
|
// |
932 |
|
// It computes the column norms of matrix A. The norms are stored into |
933 |
|
// vectors d and e. |
934 |
|
// |
935 |
|
int j, i_one = 1; |
936 |
|
double dnrm2_(); |
937 |
|
|
938 |
|
// Main loop. |
939 |
|
for( j = 0; j < n_A; j++ ) { |
940 |
|
* buff_d = dnrm2_( & m_A, buff_A, & i_one ); |
941 |
|
* buff_e = * buff_d; |
942 |
|
buff_A += ldim_A; |
943 |
|
buff_d++; |
944 |
|
buff_e++; |
945 |
|
} |
946 |
|
|
947 |
|
return 0; |
948 |
|
} |
949 |
|
|
950 |
|
// ============================================================================ |
951 |
|
static int dgeqp4_QRP_downdate_partial_norms( int m_A, int n_A, |
952 |
|
double * buff_d, int st_d, |
953 |
|
double * buff_e, int st_e, |
954 |
|
double * buff_wt, int st_wt, |
955 |
|
double * buff_A, int ldim_A ) { |
956 |
|
// |
957 |
|
// It updates (downdates) the column norms of matrix A. It uses Drmac's method. |
958 |
|
// |
959 |
|
int j, i_one = 1; |
960 |
|
double * ptr_d, * ptr_e, * ptr_wt, * ptr_A; |
961 |
|
double temp, temp2, temp5, tol3z; |
962 |
|
double dnrm2_(), dlamch_(); |
963 |
|
|
964 |
|
/* |
965 |
|
* |
966 |
|
* Update partial column norms |
967 |
|
* |
968 |
|
DO 30 J = I + 1, N |
969 |
|
IF( WORK( J ).NE.ZERO ) THEN |
970 |
|
* |
971 |
|
* NOTE: The following 4 lines follow from the analysis in |
972 |
|
* Lapack Working Note 176. |
973 |
|
* |
974 |
|
TEMP = ABS( A( I, J ) ) / WORK( J ) |
975 |
|
TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) |
976 |
|
TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2 |
977 |
|
IF( TEMP2 .LE. TOL3Z ) THEN |
978 |
|
IF( M-I.GT.0 ) THEN |
979 |
|
WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 ) |
980 |
|
WORK( N+J ) = WORK( J ) |
981 |
|
ELSE |
982 |
|
WORK( J ) = ZERO |
983 |
|
WORK( N+J ) = ZERO |
984 |
|
END IF |
985 |
|
ELSE |
986 |
|
WORK( J ) = WORK( J )*SQRT( TEMP ) |
987 |
|
END IF |
988 |
|
END IF |
989 |
|
30 CONTINUE |
990 |
|
*/ |
991 |
|
|
992 |
|
// Some initializations. |
993 |
|
tol3z = sqrt( dlamch_( "Epsilon" ) ); |
994 |
|
ptr_d = buff_d; |
995 |
|
ptr_e = buff_e; |
996 |
|
ptr_wt = buff_wt; |
997 |
|
ptr_A = buff_A; |
998 |
|
|
999 |
|
// Main loop. |
1000 |
|
for( j = 0; j < n_A; j++ ) { |
1001 |
|
if( * ptr_d != 0.0 ) { |
1002 |
|
temp = dabs( * ptr_wt ) / * ptr_d; |
1003 |
|
temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) ); |
1004 |
|
temp5 = * ptr_d / * ptr_e; |
1005 |
|
temp2 = temp * temp5 * temp5; |
1006 |
|
if( temp2 <= tol3z ) { |
1007 |
|
if( m_A > 0 ) { |
1008 |
|
* ptr_d = dnrm2_( & m_A, ptr_A, & i_one ); |
1009 |
|
* ptr_e = *ptr_d; |
1010 |
|
} else { |
1011 |
|
* ptr_d = 0.0; |
1012 |
|
* ptr_e = 0.0; |
1013 |
|
} |
1014 |
|
} else { |
1015 |
|
* ptr_d = * ptr_d * sqrt( temp ); |
1016 |
|
} |
1017 |
|
} |
1018 |
|
ptr_A += ldim_A; |
1019 |
|
ptr_d += st_d; |
1020 |
|
ptr_e += st_e; |
1021 |
|
ptr_wt += st_wt; |
1022 |
|
} |
1023 |
|
|
1024 |
|
return 0; |
1025 |
|
} |
1026 |
|
|
1027 |
|
|
1028 |
|
// ============================================================================ |
1029 |
|
static int dgeqp4_QRP_pivot_G_B_C( int j_max_col, |
1030 |
|
int m_G, double * buff_G, int ldim_G, |
1031 |
|
int pivot_B, int m_B, double * buff_B, int ldim_B, |
1032 |
|
int pivot_C, int m_C, double * buff_C, int ldim_C, |
1033 |
|
int * buff_p, |
1034 |
|
double * buff_d, double * buff_e ) { |
1035 |
|
// |
1036 |
|
// It pivots matrix G, pivot vector p, and norms vectors d and e. |
1037 |
|
// Matrices B and C are optionally pivoted. |
1038 |
|
// |
1039 |
|
int ival, i_one = 1; |
1040 |
|
double * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2; |
1041 |
|
|
1042 |
|
// Swap columns of G, pivots, and norms. |
1043 |
|
if( j_max_col != 0 ) { |
1044 |
|
|
1045 |
|
// Swap full column 0 and column "j_max_col" of G. |
1046 |
|
ptr_g1 = & buff_G[ 0 + 0 * ldim_G ]; |
1047 |
|
ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ]; |
1048 |
|
dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one ); |
1049 |
|
|
1050 |
|
// Swap full column 0 and column "j_max_col" of B. |
1051 |
|
if( pivot_B ) { |
1052 |
|
ptr_b1 = & buff_B[ 0 + 0 * ldim_B ]; |
1053 |
|
ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ]; |
1054 |
|
dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one ); |
1055 |
|
} |
1056 |
|
|
1057 |
|
// Swap full column 0 and column "j_max_col" of C. |
1058 |
|
if( pivot_C ) { |
1059 |
|
ptr_c1 = & buff_C[ 0 + 0 * ldim_C ]; |
1060 |
|
ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ]; |
1061 |
|
dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one ); |
1062 |
|
} |
1063 |
|
|
1064 |
|
// Swap element 0 and element "j_max_col" of pivot vector "p". |
1065 |
|
ival = buff_p[ j_max_col ]; |
1066 |
|
buff_p[ j_max_col ] = buff_p[ 0 ]; |
1067 |
|
buff_p[ 0 ] = ival; |
1068 |
|
|
1069 |
|
// Copy norms of column 0 to column "j_max_col". |
1070 |
|
buff_d[ j_max_col ] = buff_d[ 0 ]; |
1071 |
|
buff_e[ j_max_col ] = buff_e[ 0 ]; |
1072 |
|
} |
1073 |
|
|
1074 |
|
return 0; |
1075 |
|
} |
1076 |
|
|