Solves a system of linear equations A X = B, where A is a N-by-N general matrix and X and B are N-by-NRHS matrices. The matrix A is factorized using the LU descomposition with tiled-pivoting. The matrix A is descomposed as:
where P is a permutation matrix, L is a lower triangular matrix with unit diagonal elements and U is an upper triangular matrix.
NRHS int. NRHS specifies the number of right-hand-sides (number of columns of B). NRHS >= 0.
100 int ni, mi, nni, ki, nrhsi;
107 int current_position;
108 enum LASS_RETURN ret;
112 if ( N % TILE_SIZE == 0 )
118 nt = ( N / TILE_SIZE ) + 1;
121 if ( NRHS % TILE_SIZE == 0 )
123 nrhst = NRHS / TILE_SIZE;
127 nrhst = ( NRHS / TILE_SIZE ) + 1;
134 double ( *TILE_A )[nt][TILE_SIZE * TILE_SIZE] = malloc ( nt * nt *
135 TILE_SIZE * TILE_SIZE *
sizeof(
double ) );
137 if ( TILE_A == NULL )
139 fprintf( stderr,
"Failure in kdtpgesv for matrix TILE_A\n" );
143 double ( *TILE_B )[nrhst][TILE_SIZE * TILE_SIZE] = malloc(
144 nt * nrhst * TILE_SIZE * TILE_SIZE *
sizeof(
double ) );
146 if ( TILE_B == NULL )
148 fprintf( stderr,
"Failure in kdtpgesv for matrix TILE_B\n" );
152 ipiv_local = (
int* ) malloc( N *
sizeof(
int ) );
154 if ( ipiv_local == NULL )
156 fprintf( stderr,
"Failure in kdtpgesv for ipiv_local\n" );
174 current_position = 0;
176 for ( ki = 0; ki < nt; ki++ )
180 #pragma oss task inout( TILE_A[ki][ki] ) \ 181 inout( ipiv_local ) \ 182 inout( current_position ) \ 183 out( local_ipiv_size ) \ 185 firstprivate( ki, i ) \ 188 LAPACKE_dgetrf( CblasRowMajor,
189 tile_size_k, tile_size_k,
190 TILE_A[ki][ki], tile_size_k,
191 ipiv_local + current_position );
194 local_ipiv_size = tile_size_k;
195 for ( i = 0; i < local_ipiv_size; i++ )
197 ( IPIV + current_position )[i] =
198 ( ipiv_local + current_position )[i] + current_position;
200 current_position += local_ipiv_size;
202 for ( mi = ki + 1; mi < nt; mi++ )
206 #pragma oss task in( TILE_A[ki][ki] ) \ 207 inout(TILE_A[mi][ki]) \ 209 firstprivate( mi, ki ) \ 211 cblas_dtrsm( CblasRowMajor,
212 ( CBLAS_SIDE ) Right, ( CBLAS_UPLO ) Upper,
213 ( CBLAS_TRANSPOSE ) NoTrans, ( CBLAS_DIAG ) NonUnit,
214 tile_size_m, tile_size_k,
215 1.0, TILE_A[ki][ki], tile_size_k,
216 TILE_A[mi][ki], tile_size_k );
218 for ( ni = 0; ni < ki; ni++ )
222 #pragma oss task in ( ipiv_local ) \ 223 in( current_position ) \ 224 in( local_ipiv_size ) \ 225 inout( TILE_A[ki][ni] ) \ 227 firstprivate( ki, ni ) \ 229 LAPACKE_dlaswp( CblasRowMajor,
230 tile_size_n, TILE_A[ki][ni], tile_size_n,
232 ipiv_local + current_position - local_ipiv_size,
236 for ( nrhsi = 0; nrhsi < nrhst; nrhsi++ )
240 #pragma oss task in ( ipiv_local ) \ 241 in( current_position ) \ 242 in( local_ipiv_size ) \ 243 inout( TILE_B[ki][nrhsi] ) \ 245 firstprivate( ki, nrhsi ) \ 247 LAPACKE_dlaswp( CblasRowMajor,
248 tile_size_nrhs, TILE_B[ki][nrhsi], tile_size_nrhs,
250 ipiv_local + current_position - local_ipiv_size,
254 for ( ni = ki + 1; ni < nt; ni++ )
258 #pragma oss task in ( ipiv_local ) \ 259 in( current_position ) \ 260 in( local_ipiv_size ) \ 261 inout( TILE_A[ki][ni] ) \ 263 firstprivate( ki, ni ) \ 264 label( dlaswp_right ) 265 LAPACKE_dlaswp( CblasRowMajor,
266 tile_size_n, TILE_A[ki][ni], tile_size_n,
268 ipiv_local + current_position - local_ipiv_size,
271 #pragma oss task in( TILE_A[ki][ki] ) \ 272 inout(TILE_A[ki][ni]) \ 274 firstprivate( ni, ki ) \ 276 cblas_dtrsm( CblasRowMajor,
277 ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Lower,
278 ( CBLAS_TRANSPOSE ) NoTrans, ( CBLAS_DIAG ) Unit,
279 tile_size_k, tile_size_n,
280 1.0, TILE_A[ki][ki], tile_size_k,
281 TILE_A[ki][ni], tile_size_n );
283 for ( nni = ki + 1; nni < nt; nni++ )
287 #pragma oss task in( TILE_A[nni][ki] ) \ 288 in( TILE_A[ki][ni] ) \ 289 inout( TILE_A[nni][ni] ) \ 291 firstprivate( ni, nni, ki ) \ 293 cblas_dgemm( CblasRowMajor,
294 ( CBLAS_TRANSPOSE ) NoTrans,
295 ( CBLAS_TRANSPOSE ) NoTrans,
299 -1.0, TILE_A[nni][ki], tile_size_k,
300 TILE_A[ki][ni], tile_size_n,
301 1.0, TILE_A[nni][ni], tile_size_n );
311 for ( ki = 0; ki < nt; ki++ )
315 for ( ni = 0; ni < nrhst; ni++ )
319 #pragma oss task in( TILE_A[ki][ki] ) \ 320 inout( TILE_B[ki][ni] ) \ 321 shared( TILE_A, TILE_B ) \ 322 firstprivate( ki, ni ) \ 323 label( dtrsm_dtrsm1 ) 324 cblas_dtrsm( CblasRowMajor,
325 ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Lower,
326 ( CBLAS_TRANSPOSE ) NoTrans,
330 1.0, TILE_A[ki][ki], tile_size_k,
331 TILE_B[ki][ni], tile_size_n );
333 for ( nni = 0; nni < nrhst; nni++ )
337 for ( mi = ki + 1; mi < nt; mi++ )
340 #pragma oss task in( TILE_A[mi][ki] ) \ 341 in( TILE_B[ki][nni] ) \ 342 inout( TILE_B[mi][nni] ) \ 343 shared( TILE_A, TILE_B ) \ 344 firstprivate( ki, mi, nni ) \ 345 label( dtrsm_dgemm1 ) 346 cblas_dgemm( CblasRowMajor,
347 CblasNoTrans, CblasNoTrans,
351 -1.0, TILE_A[mi][ki], tile_size_k,
352 TILE_B[ki][nni], tile_size_nn,
353 1.0, TILE_B[mi][nni], tile_size_nn );
360 for ( ki = nt - 1; ki >= 0; ki-- )
364 for ( ni = 0; ni < nrhst; ni++ )
368 #pragma oss task in( TILE_A[ki][ki] ) \ 369 inout( TILE_B[ki][ni] ) \ 370 shared( TILE_A, TILE_B ) \ 371 firstprivate( ki, ni ) \ 372 label( dtrsmi_dtrsm2 ) 373 cblas_dtrsm( CblasRowMajor,
374 ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Upper,
375 ( CBLAS_TRANSPOSE ) NoTrans,
376 ( CBLAS_DIAG ) NonUnit,
379 1.0, TILE_A[ki][ki], tile_size_k,
380 TILE_B[ki][ni], tile_size_n );
383 for ( mi = ki - 1; mi >= 0; mi-- )
387 for ( nni = 0; nni < nrhst; nni++ )
391 #pragma oss task in( TILE_A[mi][ki] ) \ 392 in( TILE_B[ki][nni] ) \ 393 inout( TILE_B[mi][nni] ) \ 394 shared( TILE_A, TILE_B ) \ 395 firstprivate( ki, mi, nni ) \ 396 label( dtrsm_dgemm2 ) 397 cblas_dgemm( CblasRowMajor,
398 CblasNoTrans, CblasNoTrans,
402 -1.0, TILE_A[mi][ki], tile_size_k,
403 TILE_B[ki][nni], tile_size_nn,
404 1.0, TILE_B[mi][nni], tile_size_nn );
void ddss_dflat2tiled(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
void ddss_dtiled2flat(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
int ddss_tile_size(int M, int MT)