1 #include "../include/lass.h" 123 enum LASS_RETURN
kdtrsm(
enum DDSS_SIDE SIDE,
enum DDSS_UPLO UPLO,
124 enum DDSS_TRANS TRANS_A,
enum DDSS_DIAG DIAG,
126 const double ALPHA,
double *A,
int LDA,
133 int ki, ni, mi, nni, mmi;
144 if ( M % TILE_SIZE == 0 )
150 mt = ( M / TILE_SIZE ) + 1;
153 if ( N % TILE_SIZE == 0 )
159 nt = ( N / TILE_SIZE ) + 1;
184 double ( *TILE_A )[An][TILE_SIZE * TILE_SIZE] = malloc(
185 Am * An * TILE_SIZE * TILE_SIZE *
sizeof(
double ) );
187 if ( TILE_A == NULL )
189 fprintf( stderr,
"Failure in ddss_dtile_alloc for matrix TILE_A\n" );
193 double ( *TILE_B )[Bn][TILE_SIZE * TILE_SIZE] = malloc(
194 Bm * Bn * TILE_SIZE * TILE_SIZE *
sizeof(
double ) );
196 if ( TILE_B == NULL )
198 fprintf( stderr,
"Failure in ddss_dtile_alloc for matrix TILE_B\n" );
221 if ( TRANS_A == NoTrans )
223 for ( ki = mt - 1; ki >= 0; ki-- )
235 for ( ni = 0; ni < nt; ni++ )
239 #pragma oss task in( TILE_A[ki][ki] ) \ 240 inout( TILE_B[ki][ni] ) \ 241 shared( TILE_A, TILE_B ) \ 242 firstprivate( ki, ni ) \ 244 cblas_dtrsm( CblasRowMajor,
245 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
246 ( CBLAS_TRANSPOSE ) TRANS_A,
250 alpha, TILE_A[ki][ki], tile_size_k,
251 TILE_B[ki][ni], tile_size_n );
254 for ( mi = ki - 1; mi >= 0; mi-- )
257 for ( nni = 0; nni < nt; nni++ )
261 #pragma oss task in( TILE_A[mi][ki] ) \ 262 in( TILE_B[ki][nni] ) \ 263 inout( TILE_B[mi][nni] ) \ 264 shared( TILE_A, TILE_B ) \ 265 firstprivate( ki, mi, nni ) \ 267 cblas_dgemm( CblasRowMajor,
268 CblasNoTrans, CblasNoTrans,
272 -1.0, TILE_A[mi][ki], tile_size_k,
273 TILE_B[ki][nni], tile_size_nn,
274 alpha, TILE_B[mi][nni], tile_size_nn );
283 for ( ki = 0; ki < mt; ki++ )
295 for ( ni = 0; ni < nt; ni++ )
299 #pragma oss task in( TILE_A[ki][ki] ) \ 300 inout( TILE_B[ki][ni] ) \ 301 shared( TILE_A, TILE_B ) \ 302 firstprivate( ki, ni ) \ 304 cblas_dtrsm( CblasRowMajor,
305 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
306 ( CBLAS_TRANSPOSE ) TRANS_A,
310 alpha, TILE_A[ki][ki], tile_size_k,
311 TILE_B[ki][ni], tile_size_n );
314 for ( mi = ki + 1; mi < mt; mi++ )
317 for ( nni = 0; nni < nt; nni++ )
321 #pragma oss task in( TILE_A[ki][mi]) \ 322 in( TILE_B[ki][nni] ) \ 323 inout( TILE_B[mi][nni] ) \ 324 shared( TILE_A, TILE_B ) \ 325 firstprivate( ki, mi, nni ) \ 327 cblas_dgemm( CblasRowMajor,
328 CblasTrans, CblasNoTrans,
332 -1.0, TILE_A[ki][mi], tile_size_m,
333 TILE_B[ki][nni], tile_size_nn,
334 alpha, TILE_B[mi][nni], tile_size_nn );
344 if ( TRANS_A == NoTrans )
346 for ( ki = 0; ki < mt; ki++ )
358 for ( ni = 0; ni < nt; ni++ )
362 #pragma oss task in( TILE_A[ki][ki] ) \ 363 inout( TILE_B[ki][ni] ) \ 364 shared( TILE_A, TILE_B ) \ 365 firstprivate( ki, ni ) \ 367 cblas_dtrsm( CblasRowMajor,
368 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
369 ( CBLAS_TRANSPOSE ) TRANS_A,
373 alpha, TILE_A[ki][ki], tile_size_k,
374 TILE_B[ki][ni], tile_size_n );
377 for ( nni = 0; nni < nt; nni++ )
381 for ( mi = ki + 1; mi < mt; mi++ )
386 #pragma oss task in( TILE_A[mi][ki] ) \ 387 in( TILE_B[ki][nni] ) \ 388 inout( TILE_B[mi][nni] ) \ 389 shared( TILE_A, TILE_B ) \ 390 firstprivate( ki, mi, nni ) \ 392 cblas_dgemm( CblasRowMajor,
393 CblasNoTrans, CblasNoTrans,
397 -1.0, TILE_A[mi][ki], tile_size_k,
398 TILE_B[ki][nni], tile_size_nn,
399 alpha, TILE_B[mi][nni], tile_size_nn );
408 for ( ki = mt - 1; ki >= 0; ki-- )
420 for ( ni = 0; ni < nt; ni++ )
424 #pragma oss task in( TILE_A[ki][ki] ) \ 425 inout( TILE_B[ki][ni] ) \ 426 shared( TILE_A, TILE_B ) \ 427 firstprivate( ki, ni ) \ 429 cblas_dtrsm( CblasRowMajor,
430 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
431 ( CBLAS_TRANSPOSE ) TRANS_A,
435 alpha, TILE_A[ki][ki], tile_size_k,
436 TILE_B[ki][ni], tile_size_n );
439 for ( mi = ki - 1; mi >= 0; mi-- )
442 for ( nni = 0; nni < nt; nni++ )
446 #pragma oss task in( TILE_A[ki][mi] ) \ 447 in( TILE_B[ki][nni] ) \ 448 inout( TILE_B[mi][nni] ) \ 449 shared( TILE_A, TILE_B ) \ 450 firstprivate( ki, mi, nni ) \ 452 cblas_dgemm( CblasRowMajor,
453 CblasTrans, CblasNoTrans,
457 -1.0, TILE_A[ki][mi], tile_size_m,
458 TILE_B[ki][nni], tile_size_nn,
459 alpha, TILE_B[mi][nni], tile_size_nn );
472 if ( TRANS_A == NoTrans )
474 for ( ki = 0; ki < nt; ki++ )
486 for ( mi = 0; mi < mt; mi++ )
490 #pragma oss task in( TILE_A[ki][ki] ) \ 491 inout( TILE_B[mi][ki] ) \ 492 shared( TILE_A, TILE_B ) \ 493 firstprivate( ki, mi ) \ 495 cblas_dtrsm( CblasRowMajor,
496 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
497 ( CBLAS_TRANSPOSE ) TRANS_A,
501 alpha, TILE_A[ki][ki], tile_size_k,
502 TILE_B[mi][ki], tile_size_k );
505 for ( ni = ki + 1; ni < nt; ni++ )
508 for ( mmi = 0; mmi < mt; mmi++ )
512 #pragma oss task in( TILE_A[ki][ni] ) \ 513 in( TILE_B[mmi][ki] ) \ 514 inout( TILE_B[mmi][ni] ) \ 515 shared( TILE_A, TILE_B ) \ 516 firstprivate( ki, ni, mmi ) \ 518 cblas_dgemm( CblasRowMajor,
519 CblasNoTrans, CblasNoTrans,
523 -1.0, TILE_B[mmi][ki], tile_size_k,
524 TILE_A[ki][ni], tile_size_n,
525 alpha, TILE_B[mmi][ni], tile_size_n );
533 for ( ki = nt - 1; ki >= 0; ki-- )
545 for ( mi = 0; mi < mt; mi++ )
549 #pragma oss task in( TILE_A[ki][ki] ) \ 550 inout( TILE_B[mi][ki] ) \ 551 shared( TILE_A, TILE_B ) \ 552 firstprivate( ki, mi ) \ 554 cblas_dtrsm( CblasRowMajor,
555 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
556 ( CBLAS_TRANSPOSE ) TRANS_A,
560 alpha, TILE_A[ki][ki], tile_size_k,
561 TILE_B[mi][ki], tile_size_k );
564 for ( ni = ki - 1; ni >= 0; ni-- )
567 for ( mmi = 0; mmi < mt; mmi++ )
571 #pragma oss task in( TILE_A[ni][ki] ) \ 572 in( TILE_B[mmi][ki] ) \ 573 inout( TILE_B[mmi][ni] ) \ 574 shared( TILE_A, TILE_B ) \ 575 firstprivate( ki, ni, mmi ) \ 577 cblas_dgemm( CblasRowMajor,
578 CblasNoTrans, CblasTrans,
582 -1.0, TILE_B[mmi][ki], tile_size_k,
583 TILE_A[ni][ki], tile_size_k,
584 alpha, TILE_B[mmi][ni], tile_size_n );
594 if ( TRANS_A == NoTrans )
596 for ( ki = nt - 1; ki >= 0; ki-- )
608 for ( mi = 0; mi < mt; mi++ )
612 #pragma oss task in( TILE_A[ki][ki] ) \ 613 inout( TILE_B[mi][ki] ) \ 614 shared( TILE_A, TILE_B ) \ 615 firstprivate( ki, mi ) \ 617 cblas_dtrsm( CblasRowMajor,
618 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
619 ( CBLAS_TRANSPOSE ) TRANS_A,
623 alpha, TILE_A[ki][ki], tile_size_k,
624 TILE_B[mi][ki], tile_size_k );
627 for ( ni = ki - 1; ni >= 0; ni-- )
630 for ( mmi = 0; mmi < mt; mmi++ )
634 #pragma oss task in( TILE_A[ki][ni] ) \ 635 in( TILE_B[mmi][ki] ) \ 636 inout( TILE_B[mmi][ni] ) \ 637 shared( TILE_A, TILE_B ) \ 638 firstprivate( ki, ni, mmi ) \ 640 cblas_dgemm( CblasRowMajor,
641 CblasNoTrans, CblasNoTrans,
645 -1.0, TILE_B[mmi][ki], tile_size_k,
646 TILE_A[ki][ni], tile_size_n,
647 alpha, TILE_B[mmi][ni], tile_size_n );
656 for ( ki = 0; ki < nt; ki++ )
668 for ( mi = 0; mi < mt; mi++ )
672 #pragma oss task in( TILE_A[ki][ki] ) \ 673 inout( TILE_B[mi][ki] ) \ 674 shared( TILE_A,TILE_B ) \ 675 firstprivate( ki, mi ) \ 677 cblas_dtrsm( CblasRowMajor,
678 ( CBLAS_SIDE ) SIDE, ( CBLAS_UPLO ) UPLO,
679 ( CBLAS_TRANSPOSE ) TRANS_A,
683 alpha, TILE_A[ki][ki], tile_size_k,
684 TILE_B[mi][ki], tile_size_k );
687 for ( ni = ki + 1; ni < nt; ni++ )
690 for ( mmi = 0; mmi < mt; mmi++ )
694 #pragma oss task in( TILE_A[ni][ki] ) \ 695 in( TILE_B[mmi][ki] ) \ 696 inout( TILE_B[mmi][ni] ) \ 697 shared( TILE_A, TILE_B ) \ 698 firstprivate( ki, ni, mmi ) \ 700 cblas_dgemm( CblasRowMajor,
701 CblasNoTrans, CblasTrans,
705 -1.0, TILE_B[mmi][ki], tile_size_k,
706 TILE_A[ni][ki], tile_size_k,
707 alpha, TILE_B[mmi][ni], tile_size_n );
void ddss_dflat2tiled(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
void ddss_dsymflat2tiled(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE], enum DDSS_UPLO UPLO)
enum LASS_RETURN kdtrsm(enum DDSS_SIDE SIDE, enum DDSS_UPLO UPLO, enum DDSS_TRANS TRANS_A, enum DDSS_DIAG DIAG, int M, int N, const double ALPHA, double *A, int LDA, double *B, int LDB)
void ddss_dtiled2flat(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
int ddss_tile_size(int M, int MT)