1 |
|
/* |
2 |
|
=============================================================================== |
3 |
|
Authors |
4 |
|
=============================================================================== |
5 |
|
|
6 |
|
Per-Gunnar Martinsson |
7 |
|
Dept. of Applied Mathematics, |
8 |
|
University of Colorado at Boulder, |
9 |
|
526 UCB, Boulder, CO 80309-0526, USA |
10 |
|
|
11 |
|
Gregorio Quintana-Orti |
12 |
|
Depto. de Ingenieria y Ciencia de Computadores, |
13 |
|
Universitat Jaume I, |
14 |
|
12.071 Castellon, Spain |
15 |
|
|
16 |
|
Nathan Heavner |
17 |
|
Dept. of Applied Mathematics, |
18 |
|
University of Colorado at Boulder, |
19 |
|
526 UCB, Boulder, CO 80309-0526, USA |
20 |
|
|
21 |
|
Robert van de Geijn |
22 |
|
Dept. of Computer Science and Institute for Computational Engineering and |
23 |
|
Sciences, |
24 |
|
The University of Texas at Austin |
25 |
|
Austin, TX. |
26 |
|
|
27 |
|
=============================================================================== |
28 |
|
Copyright |
29 |
|
=============================================================================== |
30 |
|
|
31 |
|
Copyright (C) 2016, |
32 |
|
Universitat Jaume I, |
33 |
|
University of Colorado at Boulder, |
34 |
|
The University of Texas at Austin. |
35 |
|
|
36 |
|
=============================================================================== |
37 |
|
Disclaimer |
38 |
|
=============================================================================== |
39 |
|
|
40 |
|
This code is distributed in the hope that it will be useful, but |
41 |
|
WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. |
42 |
|
|
43 |
|
=============================================================================== |
44 |
|
Disclaimer |
45 |
|
=============================================================================== |
46 |
|
|
47 |
|
This copy of HQRRP in single precision is modified by Chenhan D. Yu. |
48 |
|
|
49 |
|
*/ |
50 |
|
|
51 |
|
#include <stdlib.h> |
52 |
|
#include <stdio.h> |
53 |
|
#include <math.h> |
54 |
|
#include "blas_lapack_prototypes.h" |
55 |
|
#include "sgeqp4.h" |
56 |
|
|
57 |
|
|
58 |
|
// Matrices with dimensions smaller than THRESHOLD_FOR_SGEQPF are processed |
59 |
|
// with LAPACK's routine dgeqpf. |
60 |
|
// Matrices with dimensions between THRESHOLD_FOR_SGEQPF and |
61 |
|
// THRESHOLD_FOR_SGEQP3 are processed with LAPACK's routine dgeqp3. |
62 |
|
// Matrices with dimensions larger than THRESHOLD_FOR_SGEQP3 are processed |
63 |
|
// with the new HQRRP code. |
64 |
|
#define THRESHOLD_FOR_SGEQPF 0 |
65 |
|
#define THRESHOLD_FOR_SGEQP3 500 |
66 |
|
|
67 |
|
|
68 |
|
// ============================================================================ |
69 |
|
// Definition of macros. |
70 |
|
|
71 |
|
#define max( a, b ) ( (a) > (b) ? (a) : (b) ) |
72 |
|
#define min( a, b ) ( (a) > (b) ? (b) : (a) ) |
73 |
|
#define dabs( a ) ( (a) >= 0.0 ? (a) : -(a) ) |
74 |
|
|
75 |
|
// ============================================================================ |
76 |
|
// Compilation declarations. |
77 |
|
|
78 |
|
#undef CHECK_DOWNDATING_OF_Y |
79 |
|
|
80 |
|
|
81 |
|
// ============================================================================ |
82 |
|
// Declaration of local prototypes. |
83 |
|
|
84 |
|
static int sgeqp4_Normal_random_matrix( int m_A, int n_A, |
85 |
|
float * buff_A, int ldim_A ); |
86 |
|
|
87 |
|
static float sgeqp4_Normal_random_number( float mu, float sigma ); |
88 |
|
|
89 |
|
static int sgeqp4_Downdate_Y( |
90 |
|
int m_U11, int n_U11, float * buff_U11, int ldim_U11, |
91 |
|
int m_U21, int n_U21, float * buff_U21, int ldim_U21, |
92 |
|
int m_A12, int n_A12, float * buff_A12, int ldim_A12, |
93 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
94 |
|
int m_Y2, int n_Y2, float * buff_Y2, int ldim_Y2, |
95 |
|
int m_G1, int n_G1, float * buff_G1, int ldim_G1, |
96 |
|
int m_G2, int n_G2, float * buff_G2, int ldim_G2 ); |
97 |
|
|
98 |
|
static int sgeqp4_Apply_Q_WY_lhfc_blk_var4( |
99 |
|
int m_U, int n_U, float * buff_U, int ldim_U, |
100 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
101 |
|
int m_B, int n_B, float * buff_B, int ldim_B ); |
102 |
|
|
103 |
|
static int sgeqp4_Apply_Q_WY_rnfc_blk_var4( |
104 |
|
int m_U, int n_U, float * buff_U, int ldim_U, |
105 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
106 |
|
int m_B, int n_B, float * buff_B, int ldim_B ); |
107 |
|
|
108 |
|
static int sgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages, |
109 |
|
int m_A, int n_A, float * buff_A, int ldim_A, |
110 |
|
int * buff_p, float * buff_t, |
111 |
|
int pivot_B, int m_B, float * buff_B, int ldim_B, |
112 |
|
int pivot_C, int m_C, float * buff_C, int ldim_C, |
113 |
|
int build_T, float * buff_T, int ldim_T ); |
114 |
|
|
115 |
|
static int sgeqp4_QRP_compute_norms( |
116 |
|
int m_A, int n_A, float * buff_A, int ldim_A, |
117 |
|
float * buff_d, float * buff_e ); |
118 |
|
|
119 |
|
static int sgeqp4_QRP_downdate_partial_norms( int m_A, int n_A, |
120 |
|
float * buff_d, int st_d, |
121 |
|
float * buff_e, int st_e, |
122 |
|
float * buff_wt, int st_wt, |
123 |
|
float * buff_A, int ldim_A ); |
124 |
|
|
125 |
|
static int sgeqp4_QRP_pivot_G_B_C( int j_max_col, |
126 |
|
int m_G, float * buff_G, int ldim_G, |
127 |
|
int pivot_B, int m_B, float * buff_B, int ldim_B, |
128 |
|
int pivot_C, int m_C, float * buff_C, int ldim_C, |
129 |
|
int * buff_p, |
130 |
|
float * buff_d, float * buff_e ); |
131 |
|
|
132 |
|
|
133 |
|
// ============================================================================ |
134 |
|
void sgeqp4( int * m, int * n, float * A, int * lda, int * jpvt, float * tau, |
135 |
|
float * work, int * lwork, int * info ) { |
136 |
|
// |
137 |
|
// This routine is plug compatible with LAPACK's routine sgeqp3. |
138 |
|
// It computes the new HQRRP while keeping the same header as LAPACK's sgeqp3. |
139 |
|
// It uses dgeqpf or sgeqp3 for small matrices. The thresholds are defined in |
140 |
|
// constants THRESHOLD_FOR_SGEQPF and THRESHOLD_FOR_SGEQP3. |
141 |
|
// |
142 |
|
int INB = 1; |
143 |
|
int i_one = 1, i_minus_one = -1, |
144 |
|
m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols, |
145 |
|
minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp; |
146 |
|
int * previous_jpvt; |
147 |
|
int ilaenv_(); |
148 |
|
|
149 |
|
// Some initializations. |
150 |
|
m_A = * m; |
151 |
|
n_A = * n; |
152 |
|
mn_A = min( m_A, n_A ); |
153 |
|
ldim_A = * lda; |
154 |
|
|
155 |
|
// Check input arguments. |
156 |
|
* info = 0; |
157 |
|
lquery = ( * lwork == -1 ); |
158 |
|
if( m_A < 0 ) { |
159 |
|
* info = -1; |
160 |
|
} else if ( n_A < 0 ) { |
161 |
|
* info = -2; |
162 |
|
} else if ( ldim_A < max( 1, m_A ) ) { |
163 |
|
* info = -4; |
164 |
|
} |
165 |
|
|
166 |
|
if( *info == 0 ) { |
167 |
|
if( mn_A == 0 ) { |
168 |
|
iws = 1; |
169 |
|
lwkopt = 1; |
170 |
|
} else { |
171 |
|
iws = 3 * n_A + 1; |
172 |
|
nb = ilaenv_( & INB, "SGEQRF", " ", & m_A, & n_A, & i_minus_one, |
173 |
|
& i_minus_one ); |
174 |
|
lwkopt = 2 * n_A + ( n_A + 1 ) * nb; |
175 |
|
} |
176 |
|
work[ 0 ] = ( float ) lwkopt; |
177 |
|
|
178 |
|
if ( ( * lwork < iws )&&( ! lquery ) ) { |
179 |
|
* info = -8; |
180 |
|
} |
181 |
|
} |
182 |
|
|
183 |
|
if( * info != 0 ) { |
184 |
|
minus_info = - * info; |
185 |
|
xerbla_( "SGEQP3", & minus_info ); |
186 |
|
return; |
187 |
|
} else if( lquery ) { |
188 |
|
return; |
189 |
|
} |
190 |
|
|
191 |
|
// Quick return if possible. |
192 |
|
if( mn_A == 0 ) { |
193 |
|
return; |
194 |
|
} |
195 |
|
|
196 |
|
// Use LAPACK's SGEQPF or SGEQP3 for small matrices. |
197 |
|
//if( mn_A < THRESHOLD_FOR_SGEQPF ) { |
198 |
|
// // Call to LAPACK routine. |
199 |
|
// //// printf( "Calling dgeqpf\n" ); |
200 |
|
// sgeqpf_( m, n, A, lda, jpvt, tau, work, info ); |
201 |
|
// return; |
202 |
|
//} else |
203 |
|
|
204 |
|
|
205 |
|
// Use LAPACK's SGEQP3 for small matrices. |
206 |
|
if( mn_A < THRESHOLD_FOR_SGEQP3 ) |
207 |
|
{ |
208 |
|
//// printf( "Calling dgeqp3\n" ); |
209 |
|
sgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info ); |
210 |
|
return; |
211 |
|
} |
212 |
|
|
213 |
|
// Move initial columns up front. |
214 |
|
num_fixed_cols = 0; |
215 |
|
for( j = 0; j < n_A; j++ ) { |
216 |
|
if( jpvt[ j ] != 0 ) { |
217 |
|
if( j != num_fixed_cols ) { |
218 |
|
//// printf( "Swapping columns: %d %d \n", j, num_fixed_cols ); |
219 |
|
sswap_( & m_A, & A[ 0 + j * ldim_A ], & i_one, |
220 |
|
& A[ 0 + num_fixed_cols * ldim_A ], & i_one ); |
221 |
|
jpvt[ j ] = jpvt[ num_fixed_cols ]; |
222 |
|
jpvt[ num_fixed_cols ] = j + 1; |
223 |
|
} else { |
224 |
|
jpvt[ j ] = j + 1 ; |
225 |
|
} |
226 |
|
num_fixed_cols++; |
227 |
|
} else { |
228 |
|
jpvt[ j ] = j + 1 ; |
229 |
|
} |
230 |
|
} |
231 |
|
|
232 |
|
// Factorize fixed columns at the front. |
233 |
|
num_factorized_fixed_cols = min( m_A, num_fixed_cols ); |
234 |
|
if( num_factorized_fixed_cols > 0 ) { |
235 |
|
sgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork, |
236 |
|
info ); |
237 |
|
if( * info != 0 ) { |
238 |
|
fprintf( stderr, "ERROR in sgeqrf: Info: %d \n", * info ); |
239 |
|
} |
240 |
|
iws = max( iws, ( int ) work[ 0 ] ); |
241 |
|
if( num_factorized_fixed_cols < n_A ) { |
242 |
|
n_rest = n_A - num_factorized_fixed_cols; |
243 |
|
sormqr_( "Left", "Transpose", |
244 |
|
& m_A, & n_rest, & num_factorized_fixed_cols, |
245 |
|
A, & ldim_A, tau, |
246 |
|
& A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A, |
247 |
|
work, lwork, info ); |
248 |
|
if( * info != 0 ) { |
249 |
|
fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info ); |
250 |
|
} |
251 |
|
|
252 |
|
iws = max( iws, ( int ) work[ 0 ] ); |
253 |
|
} |
254 |
|
} |
255 |
|
|
256 |
|
// Create intermediate jpvt vector. |
257 |
|
previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) ); |
258 |
|
|
259 |
|
// Save a copy of jpvt vector. |
260 |
|
if( num_factorized_fixed_cols > 0 ) { |
261 |
|
// Copy vector. |
262 |
|
for( j = 0; j < n_A; j++ ) { |
263 |
|
previous_jpvt[ j ] = jpvt[ j ]; |
264 |
|
} |
265 |
|
} |
266 |
|
|
267 |
|
// Factorize free columns at the bottom with default values: |
268 |
|
// nb_alg = 64, pp = 10, panel_pivoting = 1. |
269 |
|
if( num_factorized_fixed_cols < mn_A ) { |
270 |
|
* info = sgeqp4_HQRRP_WY_blk_var4( |
271 |
|
m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols, |
272 |
|
& A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ], |
273 |
|
ldim_A, |
274 |
|
& jpvt[ num_factorized_fixed_cols ], |
275 |
|
& tau[ num_factorized_fixed_cols ], |
276 |
|
64, 10, 1 ); |
277 |
|
} |
278 |
|
|
279 |
|
// Pivot block above factorized block by sgeqp4_HQRRP. |
280 |
|
if( num_factorized_fixed_cols > 0 ) { |
281 |
|
// Pivot block above factorized block. |
282 |
|
for( j = num_factorized_fixed_cols; j < n_A; j++ ) { |
283 |
|
//// printf( "%% Processing j: %d \n", j ); |
284 |
|
for( k = j; k < n_A; k++ ) { |
285 |
|
if( jpvt[ j ] == previous_jpvt[ k ] ) { |
286 |
|
//// printf( "%% Found j: %d k: %d \n", j, k ); |
287 |
|
break; |
288 |
|
} |
289 |
|
} |
290 |
|
// Swap vector previous_jpvt and block above factorized block. |
291 |
|
if( k != j ) { |
292 |
|
// Swap elements in previous_jpvt. |
293 |
|
//// printf( "%% Swapping j: %d k: %d \n", j, k ); |
294 |
|
itmp = previous_jpvt[ j ]; |
295 |
|
previous_jpvt[ j ] = previous_jpvt[ k ]; |
296 |
|
previous_jpvt[ k ] = itmp; |
297 |
|
|
298 |
|
// Swap columns in block above factorized block. |
299 |
|
sswap_( & num_factorized_fixed_cols, |
300 |
|
& A[ 0 + j * ldim_A ], & i_one, |
301 |
|
& A[ 0 + k * ldim_A ], & i_one ); |
302 |
|
} |
303 |
|
} |
304 |
|
} |
305 |
|
|
306 |
|
// Remove intermediate jpvt vector. |
307 |
|
free( previous_jpvt ); |
308 |
|
|
309 |
|
// Return workspace length required. |
310 |
|
work[ 0 ] = iws; |
311 |
|
return; |
312 |
|
} |
313 |
|
|
314 |
|
// ============================================================================ |
315 |
|
int sgeqp4_HQRRP_WY_blk_var4( int m_A, int n_A, float * buff_A, int ldim_A, |
316 |
|
int * buff_jpvt, float * buff_tau, |
317 |
|
int nb_alg, int pp, int panel_pivoting ) { |
318 |
|
// |
319 |
|
// HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A. |
320 |
|
// This routine is almost compatible with LAPACK's dgeqp3. |
321 |
|
// The main difference is that this routine does not manage fixed columns. |
322 |
|
// |
323 |
|
// Main features: |
324 |
|
// * BLAS-3 based. |
325 |
|
// * Norm downdating method by Drmac. |
326 |
|
// * Downdating for computing Y. |
327 |
|
// * No use of libflame. |
328 |
|
// * Compact WY transformations are used instead of UT transformations. |
329 |
|
// * LAPACK's routine dlarfb is used to apply block transformations. |
330 |
|
// |
331 |
|
// Arguments: |
332 |
|
// ---------- |
333 |
|
// m_A: Number of rows of matrix A. |
334 |
|
// n_A: Number of columns of matrix A. |
335 |
|
// buff_A: Address/pointer of/to data in matrix A. Matrix A must be |
336 |
|
// stored in column-order. |
337 |
|
// ldim_A: Leading dimension of matrix A. |
338 |
|
// buff_jpvt: Input/output vector with the pivots. |
339 |
|
// buff_tau: Output vector with the tau values of the Householder factors. |
340 |
|
// nb_alg: Block size. |
341 |
|
// Usual values for nb_alg are 32, 64, etc. |
342 |
|
// pp: Oversampling size. |
343 |
|
// Usual values for pp are 5, 10, etc. |
344 |
|
// panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to |
345 |
|
// factorize the panels of matrix A. Otherwise, QR without |
346 |
|
// pivoting is used. Usual value for panel_pivoting is 1. |
347 |
|
// Final comments: |
348 |
|
// --------------- |
349 |
|
// This code has been created from a libflame code. Hence, you can find some |
350 |
|
// commented calls to libflame routines. We have left them to make it easier |
351 |
|
// to interpret the meaning of the C code. |
352 |
|
// |
353 |
|
int b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V, |
354 |
|
m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T, |
355 |
|
m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22, |
356 |
|
m_G, n_G, ldim_G; |
357 |
|
float * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR, |
358 |
|
* buff_s, * buff_sB, * buff_s1, |
359 |
|
* buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T, |
360 |
|
* buff_A11, * buff_A21, * buff_A12, |
361 |
|
* buff_Y2, * buff_G, * buff_G1, * buff_G2; |
362 |
|
int * buff_p, * buff_pB, * buff_p1; |
363 |
|
float d_zero = 0.0; |
364 |
|
float d_one = 1.0; |
365 |
|
|
366 |
|
// Executable Statements. |
367 |
|
//// printf( "%% sgeqp4_HQRRP_WY_blk_var4.\n" ); |
368 |
|
|
369 |
|
// Check arguments. |
370 |
|
if( m_A < 0 ) { |
371 |
|
fprintf( stderr, |
372 |
|
"ERROR in sgeqp4_HQRRP_WY_blk_var4: m_A is < 0.\n" ); |
373 |
|
} if( n_A < 0 ) { |
374 |
|
fprintf( stderr, |
375 |
|
"ERROR in sgeqp4_HQRRP_WY_blk_var4: n_A is < 0.\n" ); |
376 |
|
} if( ldim_A < max( 1, m_A ) ) { |
377 |
|
fprintf( stderr, |
378 |
|
"ERROR in sgeqp4_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" ); |
379 |
|
} |
380 |
|
|
381 |
|
// Some initializations. |
382 |
|
mn_A = min( m_A, n_A ); |
383 |
|
buff_p = buff_jpvt; |
384 |
|
buff_s = buff_tau; |
385 |
|
|
386 |
|
// Quick return. |
387 |
|
if( mn_A == 0 ) { |
388 |
|
return 0; |
389 |
|
} |
390 |
|
|
391 |
|
// Initialize the seed for the generator of random numbers. |
392 |
|
srand( 12 ); |
393 |
|
|
394 |
|
// Create auxiliary objects. |
395 |
|
m_Y = nb_alg + pp; |
396 |
|
n_Y = n_A; |
397 |
|
buff_Y = ( float * ) malloc( m_Y * n_Y * sizeof( float ) ); |
398 |
|
ldim_Y = m_Y; |
399 |
|
|
400 |
|
m_V = nb_alg + pp; |
401 |
|
n_V = n_A; |
402 |
|
buff_V = ( float * ) malloc( m_V * n_V * sizeof( float ) ); |
403 |
|
ldim_V = m_V; |
404 |
|
|
405 |
|
m_W = nb_alg; |
406 |
|
n_W = n_A; |
407 |
|
buff_W = ( float * ) malloc( m_W * n_W * sizeof( float ) ); |
408 |
|
ldim_W = m_W; |
409 |
|
|
410 |
|
m_G = nb_alg + pp; |
411 |
|
n_G = m_A; |
412 |
|
buff_G = ( float * ) malloc( m_G * n_G * sizeof( float ) ); |
413 |
|
ldim_G = m_G; |
414 |
|
|
415 |
|
// Initialize matrices G and Y. |
416 |
|
sgeqp4_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G ); |
417 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
418 |
|
//// FLA_ONE, G, A, FLA_ZERO, Y ); |
419 |
|
sgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A, |
420 |
|
& d_one, buff_G, & ldim_G, buff_A, & ldim_A, |
421 |
|
& d_zero, buff_Y, & ldim_Y ); |
422 |
|
|
423 |
|
// Main Loop. |
424 |
|
for( j = 0; j < mn_A; j += nb_alg ) { |
425 |
|
b = min( nb_alg, min( n_A - j, m_A - j ) ); |
426 |
|
|
427 |
|
// Check whether it is the last iteration. |
428 |
|
last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 ); |
429 |
|
|
430 |
|
// Some initializations for the iteration of this loop. |
431 |
|
n_VR = n_V - j; |
432 |
|
buff_VR = & buff_V[ 0 + j * ldim_V ]; |
433 |
|
buff_YR = & buff_Y[ 0 + j * ldim_Y ]; |
434 |
|
buff_pB = & buff_p[ j ]; |
435 |
|
buff_sB = & buff_s[ j ]; |
436 |
|
buff_AR = & buff_A[ 0 + j * ldim_A ]; |
437 |
|
|
438 |
|
m_AB1 = m_A - j; |
439 |
|
n_AB1 = b; |
440 |
|
buff_AB1 = & buff_A[ j + j * ldim_A ]; |
441 |
|
buff_p1 = & buff_p[ j ]; |
442 |
|
buff_s1 = & buff_s[ j ]; |
443 |
|
buff_A01 = & buff_A[ 0 + j * ldim_A ]; |
444 |
|
buff_Y1 = & buff_Y[ 0 + j * ldim_Y ]; |
445 |
|
buff_T1_T = & buff_W[ 0 + j * ldim_W ]; |
446 |
|
ldim_T1_T = ldim_W; |
447 |
|
|
448 |
|
buff_A11 = & buff_A[ j + j * ldim_A ]; |
449 |
|
m_A11 = b; |
450 |
|
n_A11 = b; |
451 |
|
|
452 |
|
buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ]; |
453 |
|
m_A21 = max( 0, m_A - j - b ); |
454 |
|
n_A21 = b; |
455 |
|
|
456 |
|
buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ]; |
457 |
|
m_A12 = b; |
458 |
|
n_A12 = max( 0, n_A - j - b ); |
459 |
|
|
460 |
|
//// buff_A22 = & buff_A[ min( m_A - 1, j + b ) + |
461 |
|
//// min( n_A - 1, j + b ) * ldim_A ]; |
462 |
|
m_A22 = max( 0, m_A - j - b ); |
463 |
|
//// n_A22 = max( 0, n_A - j - b ); |
464 |
|
|
465 |
|
buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ]; |
466 |
|
buff_G1 = & buff_G[ 0 + j * ldim_G ]; |
467 |
|
buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ]; |
468 |
|
|
469 |
|
#ifdef CHECK_DOWNDATING_OF_Y |
470 |
|
// Check downdating of matrix Y: Compare downdated matrix Y with |
471 |
|
// matrix Y computed from scratch. |
472 |
|
int m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj; |
473 |
|
float * buff_cyr, aux, sum; |
474 |
|
|
475 |
|
m_cyr = m_Y; |
476 |
|
n_cyr = n_Y - j; |
477 |
|
ldim_cyr = m_cyr; |
478 |
|
m_ABR = m_A - j; |
479 |
|
buff_cyr = ( float * ) malloc( m_cyr * n_cyr * sizeof( float ) ); |
480 |
|
|
481 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
482 |
|
//// FLA_ONE, GR, ABR, FLA_ZERO, CYR ); |
483 |
|
sgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR, |
484 |
|
& d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G, |
485 |
|
& buff_A[ j + j * ldim_A ], & ldim_A, |
486 |
|
& d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr ); |
487 |
|
|
488 |
|
//// print_float_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr ); |
489 |
|
//// print_float_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y ); |
490 |
|
sum = 0.0; |
491 |
|
for( jj = 0; jj < n_cyr; jj++ ) { |
492 |
|
for( ii = 0; ii < m_cyr; ii++ ) { |
493 |
|
aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] - |
494 |
|
buff_cyr[ ii + jj * ldim_cyr ]; |
495 |
|
sum += aux * aux; |
496 |
|
} |
497 |
|
} |
498 |
|
sum = sqrt( sum ); |
499 |
|
printf( "%% diff between Y and downdated Y: %le\n", sum ); |
500 |
|
|
501 |
|
free( buff_cyr ); |
502 |
|
#endif |
503 |
|
|
504 |
|
if( last_iter == 0 ) { |
505 |
|
// Compute QRP of YR, and apply permutations to matrix AR. |
506 |
|
// A copy of YR is made into VR, and permutations are applied to YR. |
507 |
|
//// FLA_Merge_2x1( ATR, |
508 |
|
//// ABR, & AR ); |
509 |
|
//// FLA_Copy( YR, VR ); |
510 |
|
//// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None ); |
511 |
|
|
512 |
|
slacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y, |
513 |
|
buff_VR, & ldim_V ); |
514 |
|
sgeqp4_QRPmod_WY_unb_var4( 1, b, |
515 |
|
m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB, |
516 |
|
1, m_A, buff_AR, ldim_A, |
517 |
|
1, m_Y, buff_YR, ldim_Y, |
518 |
|
0, buff_Y, ldim_Y ); |
519 |
|
} |
520 |
|
|
521 |
|
// |
522 |
|
// Compute QRP of panel AB1 = [ A11; A21 ]. |
523 |
|
// Apply same permutations to A01 and Y1, and build T1_T. |
524 |
|
// |
525 |
|
//// FLA_Part_2x1( W1, & T1_T, |
526 |
|
//// & None, b, FLA_TOP ); |
527 |
|
//// FLA_Merge_2x1( A11, |
528 |
|
//// A21, & AB1 ); |
529 |
|
//// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1, |
530 |
|
//// 1, A01, 1, Y1, 1, T1_T ); |
531 |
|
|
532 |
|
sgeqp4_QRPmod_WY_unb_var4( panel_pivoting, -1, |
533 |
|
m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1, |
534 |
|
1, j, buff_A01, ldim_A, |
535 |
|
1, m_Y, buff_Y1, ldim_Y, |
536 |
|
1, buff_T1_T, ldim_W ); |
537 |
|
|
538 |
|
// |
539 |
|
// Update the rest of the matrix. |
540 |
|
// |
541 |
|
if ( ( j + b ) < n_A ) { |
542 |
|
// Apply the Householder transforms associated with AB1 = [ A11; A21 ] |
543 |
|
// and T1_T to [ A12; A22 ]: |
544 |
|
// / A12 \ := QB1' / A12 \ |
545 |
|
// \ A22 / \ A22 / |
546 |
|
// where QB1 is formed from AB1 and T1_T. |
547 |
|
//// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 ); |
548 |
|
|
549 |
|
sgeqp4_Apply_Q_WY_lhfc_blk_var4( |
550 |
|
m_A11 + m_A21, n_A11, buff_A11, ldim_A, |
551 |
|
b, b, buff_T1_T, ldim_W, |
552 |
|
m_A12 + m_A22, n_A12, buff_A12, ldim_A ); |
553 |
|
} |
554 |
|
|
555 |
|
// |
556 |
|
// Downdate matrix Y. |
557 |
|
// |
558 |
|
if ( ! last_iter ) { |
559 |
|
//// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 ); |
560 |
|
|
561 |
|
sgeqp4_Downdate_Y( |
562 |
|
m_A11, n_A11, buff_A11, ldim_A, |
563 |
|
m_A21, n_A21, buff_A21, ldim_A, |
564 |
|
m_A12, n_A12, buff_A12, ldim_A, |
565 |
|
b, b, buff_T1_T, ldim_T1_T, |
566 |
|
m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y, |
567 |
|
m_G, b, buff_G1, ldim_G, |
568 |
|
m_G, max( 0, n_G - j - b ), buff_G2, ldim_G ); |
569 |
|
} |
570 |
|
} |
571 |
|
|
572 |
|
// Remove auxiliary objects. |
573 |
|
//// FLA_Obj_free( & G ); |
574 |
|
//// FLA_Obj_free( & Y ); |
575 |
|
//// FLA_Obj_free( & V ); |
576 |
|
//// FLA_Obj_free( & W ); |
577 |
|
free( buff_G ); |
578 |
|
free( buff_Y ); |
579 |
|
free( buff_V ); |
580 |
|
free( buff_W ); |
581 |
|
|
582 |
|
return 0; |
583 |
|
} |
584 |
|
|
585 |
|
|
586 |
|
// ============================================================================ |
587 |
|
static int sgeqp4_Normal_random_matrix( int m_A, int n_A, |
588 |
|
float * buff_A, int ldim_A ) { |
589 |
|
// |
590 |
|
// It generates a random matrix with normal distribution. |
591 |
|
// |
592 |
|
int i, j; |
593 |
|
|
594 |
|
// Main loop. |
595 |
|
for ( j = 0; j < n_A; j++ ) { |
596 |
|
for ( i = 0; i < m_A; i++ ) { |
597 |
|
buff_A[ i + j * ldim_A ] = sgeqp4_Normal_random_number( 0.0, 1.0 ); |
598 |
|
} |
599 |
|
} |
600 |
|
|
601 |
|
return 0; |
602 |
|
} |
603 |
|
|
604 |
|
// ============================================================================ |
605 |
|
static float sgeqp4_Normal_random_number( float mu, float sigma ) { |
606 |
|
// |
607 |
|
// It computes and returns a normal random number. |
608 |
|
// |
609 |
|
static int alternate_calls = 0; |
610 |
|
static float b1, b2; |
611 |
|
float c1, c2, a, factor; |
612 |
|
|
613 |
|
// Quick return. |
614 |
|
if( alternate_calls == 1 ) { |
615 |
|
alternate_calls = ! alternate_calls; |
616 |
|
return( mu + sigma * b2 ); |
617 |
|
} |
618 |
|
// Main loop. |
619 |
|
do { |
620 |
|
c1 = -1.0 + 2.0 * ( (float) rand() / RAND_MAX ); |
621 |
|
c2 = -1.0 + 2.0 * ( (float) rand() / RAND_MAX ); |
622 |
|
a = c1 * c1 + c2 * c2; |
623 |
|
} while ( ( a == 0 )||( a >= 1 ) ); |
624 |
|
factor = sqrt( ( -2 * log( a ) ) / a ); |
625 |
|
b1 = c1 * factor; |
626 |
|
b2 = c2 * factor; |
627 |
|
alternate_calls = ! alternate_calls; |
628 |
|
return( mu + sigma * b1 ); |
629 |
|
} |
630 |
|
|
631 |
|
// ============================================================================ |
632 |
|
static int sgeqp4_Downdate_Y( |
633 |
|
int m_U11, int n_U11, float * buff_U11, int ldim_U11, |
634 |
|
int m_U21, int n_U21, float * buff_U21, int ldim_U21, |
635 |
|
int m_A12, int n_A12, float * buff_A12, int ldim_A12, |
636 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
637 |
|
int m_Y2, int n_Y2, float * buff_Y2, int ldim_Y2, |
638 |
|
int m_G1, int n_G1, float * buff_G1, int ldim_G1, |
639 |
|
int m_G2, int n_G2, float * buff_G2, int ldim_G2 ) { |
640 |
|
// |
641 |
|
// It downdates matrix Y, and updates matrix G. |
642 |
|
// Only Y2 of Y is updated. |
643 |
|
// Only G1 and G2 of G are updated. |
644 |
|
// |
645 |
|
// Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12. |
646 |
|
// |
647 |
|
int i, j; |
648 |
|
float * buff_B; |
649 |
|
float d_one = 1.0; |
650 |
|
float d_minus_one = -1.0; |
651 |
|
int m_B = m_G1; |
652 |
|
int n_B = n_G1; |
653 |
|
int ldim_B = m_G1; |
654 |
|
|
655 |
|
// Create object B. |
656 |
|
//// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B ); |
657 |
|
buff_B = ( float * ) malloc( m_B * n_B * sizeof( float ) ); |
658 |
|
|
659 |
|
// B = G1. |
660 |
|
//// FLA_Copy( G1, B ); |
661 |
|
slacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1, |
662 |
|
buff_B, & ldim_B ); |
663 |
|
|
664 |
|
// B = B * U11. |
665 |
|
//// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, |
666 |
|
//// FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, |
667 |
|
//// FLA_ONE, U11, B ); |
668 |
|
strmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B, |
669 |
|
& d_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); |
670 |
|
|
671 |
|
// B = B + G2 * U21. |
672 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
673 |
|
//// FLA_ONE, G2, U21, FLA_ONE, B ); |
674 |
|
sgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21, |
675 |
|
& d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21, |
676 |
|
& d_one, buff_B, & ldim_B ); |
677 |
|
|
678 |
|
// B = B * inv( T11 ). |
679 |
|
//// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, |
680 |
|
//// FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, |
681 |
|
//// FLA_ONE, T, B ); |
682 |
|
//// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, |
683 |
|
//// & d_one, buff_T, & ldim_T, buff_B, & ldim_B ); |
684 |
|
// Used dtrmm instead of dtrsm because of using compact WY instead of UT. |
685 |
|
strmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, |
686 |
|
& d_one, buff_T, & ldim_T, buff_B, & ldim_B ); |
687 |
|
|
688 |
|
// B = - B * U11^H. |
689 |
|
//// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, |
690 |
|
//// FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, |
691 |
|
//// FLA_MINUS_ONE, U11, B ); |
692 |
|
strmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B, |
693 |
|
& d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); |
694 |
|
|
695 |
|
// B = G1 + B. |
696 |
|
//// FLA_Axpy( FLA_ONE, G1, B ); |
697 |
|
for( j = 0; j < n_B; j++ ) { |
698 |
|
for( i = 0; i < m_B; i++ ) { |
699 |
|
buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ]; |
700 |
|
} |
701 |
|
} |
702 |
|
|
703 |
|
// Y2 = Y2 - B * R12. |
704 |
|
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, |
705 |
|
//// FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 ); |
706 |
|
sgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12, |
707 |
|
& d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12, |
708 |
|
& d_one, buff_Y2, & ldim_Y2 ); |
709 |
|
|
710 |
|
// |
711 |
|
// GR = GR * Q |
712 |
|
// |
713 |
|
sgeqp4_Apply_Q_WY_rnfc_blk_var4( |
714 |
|
m_U11 + m_U21, n_U11, buff_U11, ldim_U11, |
715 |
|
m_T, n_T, buff_T, ldim_T, |
716 |
|
m_G1, n_G1 + n_G2, buff_G1, ldim_G1 ); |
717 |
|
|
718 |
|
// Remove object B. |
719 |
|
//// FLA_Obj_free( & B ); |
720 |
|
free( buff_B ); |
721 |
|
|
722 |
|
return 0; |
723 |
|
} |
724 |
|
|
725 |
|
// ============================================================================ |
726 |
|
static int sgeqp4_Apply_Q_WY_lhfc_blk_var4( |
727 |
|
int m_U, int n_U, float * buff_U, int ldim_U, |
728 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
729 |
|
int m_B, int n_B, float * buff_B, int ldim_B ) { |
730 |
|
// |
731 |
|
// It applies the transpose of a block transformation Q to a matrix B from |
732 |
|
// the left: |
733 |
|
// B := Q' * B |
734 |
|
// where: |
735 |
|
// Q = I - U * T' * U'. |
736 |
|
// |
737 |
|
float * buff_W; |
738 |
|
int ldim_W; |
739 |
|
|
740 |
|
// Create auxiliary object. |
741 |
|
//// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W ); |
742 |
|
buff_W = ( float * ) malloc( n_B * n_U * sizeof( float ) ); |
743 |
|
ldim_W = max( 1, n_B ); |
744 |
|
|
745 |
|
// Apply the block transformation. |
746 |
|
slarfb_( "Left", "Transpose", "Forward", "Columnwise", |
747 |
|
& m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, |
748 |
|
buff_B, & ldim_B, buff_W, & ldim_W ); |
749 |
|
|
750 |
|
// Remove auxiliary object. |
751 |
|
//// FLA_Obj_free( & W ); |
752 |
|
free( buff_W ); |
753 |
|
|
754 |
|
return 0; |
755 |
|
} |
756 |
|
|
757 |
|
// ============================================================================ |
758 |
|
static int sgeqp4_Apply_Q_WY_rnfc_blk_var4( |
759 |
|
int m_U, int n_U, float * buff_U, int ldim_U, |
760 |
|
int m_T, int n_T, float * buff_T, int ldim_T, |
761 |
|
int m_B, int n_B, float * buff_B, int ldim_B ) { |
762 |
|
// |
763 |
|
// It applies a block transformation Q to a matrix B from the right: |
764 |
|
// B = B * Q |
765 |
|
// where: |
766 |
|
// Q = I - U * T' * U'. |
767 |
|
// |
768 |
|
float * buff_W; |
769 |
|
int ldim_W; |
770 |
|
|
771 |
|
// Create auxiliary object. |
772 |
|
//// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W ); |
773 |
|
buff_W = ( float * ) malloc( m_B * n_U * sizeof( float ) ); |
774 |
|
ldim_W = max( 1, m_B ); |
775 |
|
|
776 |
|
// Apply the block transformation. |
777 |
|
slarfb_( "Right", "No transpose", "Forward", "Columnwise", |
778 |
|
& m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, |
779 |
|
buff_B, & ldim_B, buff_W, & ldim_W ); |
780 |
|
|
781 |
|
// Remove auxiliary object. |
782 |
|
//// FLA_Obj_free( & W ); |
783 |
|
free( buff_W ); |
784 |
|
|
785 |
|
return 0; |
786 |
|
} |
787 |
|
|
788 |
|
// ============================================================================ |
789 |
|
static int sgeqp4_QRPmod_WY_unb_var4( int pivoting, int num_stages, |
790 |
|
int m_A, int n_A, float * buff_A, int ldim_A, |
791 |
|
int * buff_p, float * buff_t, |
792 |
|
int pivot_B, int m_B, float * buff_B, int ldim_B, |
793 |
|
int pivot_C, int m_C, float * buff_C, int ldim_C, |
794 |
|
int build_T, float * buff_T, int ldim_T ) { |
795 |
|
// |
796 |
|
// It computes an unblocked QR factorization of matrix A with or without |
797 |
|
// pivoting. Matrices B and C are optionally pivoted, and matrix T is |
798 |
|
// optionally built. |
799 |
|
// |
800 |
|
// Arguments: |
801 |
|
// "pivoting": If pivoting==1, then QR factorization with pivoting is used. |
802 |
|
// "numstages": It tells the number of columns that are factorized. |
803 |
|
// If "num_stages" is negative, the whole matrix A is factorized. |
804 |
|
// If "num_stages" is positive, only the first "num_stages" are factorized. |
805 |
|
// "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too. |
806 |
|
// "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too. |
807 |
|
// "build_T": if "build_T" is true, matrix "T" is built. |
808 |
|
// |
809 |
|
int j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, |
810 |
|
i_one = 1, n_house_vector, m_rest; |
811 |
|
float * buff_d, * buff_e, * buff_workspace, diag; |
812 |
|
int isamax_(); |
813 |
|
|
814 |
|
//// printf( "sgeqp4_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting ); |
815 |
|
|
816 |
|
// Some initializations. |
817 |
|
mn_A = min( m_A, n_A ); |
818 |
|
|
819 |
|
// Set the number of stages, if needed. |
820 |
|
if( num_stages < 0 ) { |
821 |
|
num_stages = mn_A; |
822 |
|
} |
823 |
|
|
824 |
|
// Create auxiliary vectors. |
825 |
|
buff_d = ( float * ) malloc( n_A * sizeof( float ) ); |
826 |
|
buff_e = ( float * ) malloc( n_A * sizeof( float ) ); |
827 |
|
buff_workspace = ( float * ) malloc( n_A * sizeof( float ) ); |
828 |
|
|
829 |
|
if( pivoting == 1 ) { |
830 |
|
// Compute initial norms of A into d and e. |
831 |
|
sgeqp4_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e ); |
832 |
|
} |
833 |
|
|
834 |
|
// Main Loop. |
835 |
|
for( j = 0; j < num_stages; j++ ) { |
836 |
|
n_dB = n_A - j; |
837 |
|
m_a21 = m_A - j - 1; |
838 |
|
m_A22 = m_A - j - 1; |
839 |
|
n_A22 = n_A - j - 1; |
840 |
|
|
841 |
|
if( pivoting == 1 ) { |
842 |
|
// Obtain the index of the column with largest 2-norm. |
843 |
|
idx_max_col = isamax_( & n_dB, & buff_d[ j ], & i_one ) - 1; |
844 |
|
|
845 |
|
// Swap columns of A, B, C, pivots, and norms vectors. |
846 |
|
sgeqp4_QRP_pivot_G_B_C( idx_max_col, |
847 |
|
m_A, & buff_A[ 0 + j * ldim_A ], ldim_A, |
848 |
|
pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B, |
849 |
|
pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C, |
850 |
|
& buff_p[ j ], |
851 |
|
& buff_d[ j ], |
852 |
|
& buff_e[ j ] ); |
853 |
|
} |
854 |
|
|
855 |
|
// Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 |
856 |
|
// determine a Householder transform H such that applying H from the |
857 |
|
// left to the column vector consisting of alpha11 and a21 annihilates |
858 |
|
// the entries in a21 (and updates alpha11). |
859 |
|
n_house_vector = m_a21 + 1; |
860 |
|
slarfg_( & n_house_vector, |
861 |
|
& buff_A[ j + j * ldim_A ], |
862 |
|
& buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one, |
863 |
|
& buff_t[ j ] ); |
864 |
|
|
865 |
|
// / a12t \ = H / a12t \ |
866 |
|
// \ A22 / \ A22 / |
867 |
|
// |
868 |
|
// where H is formed from tau1 and u21. |
869 |
|
diag = buff_A[ j + j * ldim_A ]; |
870 |
|
buff_A[ j + j * ldim_A ] = 1.0; |
871 |
|
m_rest = m_A22 + 1; |
872 |
|
slarf_( "Left", & m_rest, & n_A22, |
873 |
|
& buff_A[ j + j * ldim_A ], & i_one, |
874 |
|
& buff_t[ j ], |
875 |
|
& buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A, |
876 |
|
buff_workspace ); |
877 |
|
buff_A[ j + j * ldim_A ] = diag; |
878 |
|
|
879 |
|
if( pivoting == 1 ) { |
880 |
|
// Update partial column norms. |
881 |
|
sgeqp4_QRP_downdate_partial_norms( m_A22, n_A22, |
882 |
|
& buff_d[ j+1 ], 1, |
883 |
|
& buff_e[ j+1 ], 1, |
884 |
|
& buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, |
885 |
|
& buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A ); |
886 |
|
} |
887 |
|
} |
888 |
|
|
889 |
|
// Build T. |
890 |
|
if( build_T ) { |
891 |
|
slarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A, |
892 |
|
buff_t, buff_T, & ldim_T ); |
893 |
|
} |
894 |
|
|
895 |
|
// Remove auxiliary vectors. |
896 |
|
free( buff_d ); |
897 |
|
free( buff_e ); |
898 |
|
free( buff_workspace ); |
899 |
|
|
900 |
|
return 0; |
901 |
|
} |
902 |
|
|
903 |
|
// ============================================================================ |
904 |
|
static int sgeqp4_QRP_compute_norms( |
905 |
|
int m_A, int n_A, float * buff_A, int ldim_A, |
906 |
|
float * buff_d, float * buff_e ) { |
907 |
|
// |
908 |
|
// It computes the column norms of matrix A. The norms are stored into |
909 |
|
// vectors d and e. |
910 |
|
// |
911 |
|
int j, i_one = 1; |
912 |
|
float snrm2_(); |
913 |
|
|
914 |
|
// Main loop. |
915 |
|
for( j = 0; j < n_A; j++ ) { |
916 |
|
* buff_d = snrm2_( & m_A, buff_A, & i_one ); |
917 |
|
* buff_e = * buff_d; |
918 |
|
buff_A += ldim_A; |
919 |
|
buff_d++; |
920 |
|
buff_e++; |
921 |
|
} |
922 |
|
|
923 |
|
return 0; |
924 |
|
} |
925 |
|
|
926 |
|
// ============================================================================ |
927 |
|
static int sgeqp4_QRP_downdate_partial_norms( int m_A, int n_A, |
928 |
|
float * buff_d, int st_d, |
929 |
|
float * buff_e, int st_e, |
930 |
|
float * buff_wt, int st_wt, |
931 |
|
float * buff_A, int ldim_A ) { |
932 |
|
// |
933 |
|
// It updates (downdates) the column norms of matrix A. It uses Drmac's method. |
934 |
|
// |
935 |
|
int j, i_one = 1; |
936 |
|
float * ptr_d, * ptr_e, * ptr_wt, * ptr_A; |
937 |
|
float temp, temp2, temp5, tol3z; |
938 |
|
float snrm2_(), slamch_(); |
939 |
|
|
940 |
|
/* |
941 |
|
* |
942 |
|
* Update partial column norms |
943 |
|
* |
944 |
|
DO 30 J = I + 1, N |
945 |
|
IF( WORK( J ).NE.ZERO ) THEN |
946 |
|
* |
947 |
|
* NOTE: The following 4 lines follow from the analysis in |
948 |
|
* Lapack Working Note 176. |
949 |
|
* |
950 |
|
TEMP = ABS( A( I, J ) ) / WORK( J ) |
951 |
|
TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) |
952 |
|
TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2 |
953 |
|
IF( TEMP2 .LE. TOL3Z ) THEN |
954 |
|
IF( M-I.GT.0 ) THEN |
955 |
|
WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 ) |
956 |
|
WORK( N+J ) = WORK( J ) |
957 |
|
ELSE |
958 |
|
WORK( J ) = ZERO |
959 |
|
WORK( N+J ) = ZERO |
960 |
|
END IF |
961 |
|
ELSE |
962 |
|
WORK( J ) = WORK( J )*SQRT( TEMP ) |
963 |
|
END IF |
964 |
|
END IF |
965 |
|
30 CONTINUE |
966 |
|
*/ |
967 |
|
|
968 |
|
// Some initializations. |
969 |
|
tol3z = sqrt( slamch_( "Epsilon" ) ); |
970 |
|
ptr_d = buff_d; |
971 |
|
ptr_e = buff_e; |
972 |
|
ptr_wt = buff_wt; |
973 |
|
ptr_A = buff_A; |
974 |
|
|
975 |
|
// Main loop. |
976 |
|
for( j = 0; j < n_A; j++ ) { |
977 |
|
if( * ptr_d != 0.0 ) { |
978 |
|
temp = dabs( * ptr_wt ) / * ptr_d; |
979 |
|
temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) ); |
980 |
|
temp5 = * ptr_d / * ptr_e; |
981 |
|
temp2 = temp * temp5 * temp5; |
982 |
|
if( temp2 <= tol3z ) { |
983 |
|
if( m_A > 0 ) { |
984 |
|
* ptr_d = snrm2_( & m_A, ptr_A, & i_one ); |
985 |
|
* ptr_e = *ptr_d; |
986 |
|
} else { |
987 |
|
* ptr_d = 0.0; |
988 |
|
* ptr_e = 0.0; |
989 |
|
} |
990 |
|
} else { |
991 |
|
* ptr_d = * ptr_d * sqrt( temp ); |
992 |
|
} |
993 |
|
} |
994 |
|
ptr_A += ldim_A; |
995 |
|
ptr_d += st_d; |
996 |
|
ptr_e += st_e; |
997 |
|
ptr_wt += st_wt; |
998 |
|
} |
999 |
|
|
1000 |
|
return 0; |
1001 |
|
} |
1002 |
|
|
1003 |
|
|
1004 |
|
// ============================================================================ |
1005 |
|
static int sgeqp4_QRP_pivot_G_B_C( int j_max_col, |
1006 |
|
int m_G, float * buff_G, int ldim_G, |
1007 |
|
int pivot_B, int m_B, float * buff_B, int ldim_B, |
1008 |
|
int pivot_C, int m_C, float * buff_C, int ldim_C, |
1009 |
|
int * buff_p, |
1010 |
|
float * buff_d, float * buff_e ) { |
1011 |
|
// |
1012 |
|
// It pivots matrix G, pivot vector p, and norms vectors d and e. |
1013 |
|
// Matrices B and C are optionally pivoted. |
1014 |
|
// |
1015 |
|
int ival, i_one = 1; |
1016 |
|
float * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2; |
1017 |
|
|
1018 |
|
// Swap columns of G, pivots, and norms. |
1019 |
|
if( j_max_col != 0 ) { |
1020 |
|
|
1021 |
|
// Swap full column 0 and column "j_max_col" of G. |
1022 |
|
ptr_g1 = & buff_G[ 0 + 0 * ldim_G ]; |
1023 |
|
ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ]; |
1024 |
|
sswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one ); |
1025 |
|
|
1026 |
|
// Swap full column 0 and column "j_max_col" of B. |
1027 |
|
if( pivot_B ) { |
1028 |
|
ptr_b1 = & buff_B[ 0 + 0 * ldim_B ]; |
1029 |
|
ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ]; |
1030 |
|
sswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one ); |
1031 |
|
} |
1032 |
|
|
1033 |
|
// Swap full column 0 and column "j_max_col" of C. |
1034 |
|
if( pivot_C ) { |
1035 |
|
ptr_c1 = & buff_C[ 0 + 0 * ldim_C ]; |
1036 |
|
ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ]; |
1037 |
|
sswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one ); |
1038 |
|
} |
1039 |
|
|
1040 |
|
// Swap element 0 and element "j_max_col" of pivot vector "p". |
1041 |
|
ival = buff_p[ j_max_col ]; |
1042 |
|
buff_p[ j_max_col ] = buff_p[ 0 ]; |
1043 |
|
buff_p[ 0 ] = ival; |
1044 |
|
|
1045 |
|
// Copy norms of column 0 to column "j_max_col". |
1046 |
|
buff_d[ j_max_col ] = buff_d[ 0 ]; |
1047 |
|
buff_e[ j_max_col ] = buff_e[ 0 ]; |
1048 |
|
} |
1049 |
|
|
1050 |
|
return 0; |
1051 |
|
} |
1052 |
|
|