LASs - Linear Algebra Routines on OmpSs  1.0.0
LASs
kdtpgesv.c
Go to the documentation of this file.
1 #include "../include/lass.h"
2 
3 /**
4  *
5  * @file kdtpgesv.c
6  *
7  * @brief LASs-DDSs kdtpgesv routine.
8  *
9  * LASs-DDSs is a software package provided by:
10  * Barcelona Supercomputing Center - Centro Nacional de Supercomputacion
11  *
12  * @author Pedro Valero-Lara pedro.valero@bsc.es
13  * @date 2018-09-20
14  * @reviewer
15  * @modified
16  *
17  **/
18 
19 /**
20  *
21  * @ingroup DDSS
22  *
23  * Solves a system of linear equations A \times X = B, where A is a N-by-N
24  * general matrix and X and B are N-by-NRHS matrices. The matrix A is
25  * factorized using the LU descomposition with tiled-pivoting.
26  * The matrix A is descomposed as:
27  *
28  * A = P * L * U
29  *
30  * where P is a permutation matrix, L is a lower triangular matrix with unit
31  * diagonal elements and U is an upper triangular matrix.
32  *
33 **/
34 
35 /**
36  *
37  * @param[in]
38  * N int.
39  * N specifies the order of the square matrix A. N >= 0.
40  *
41  * NRHS int.
42  * NRHS specifies the number of right-hand-sides (number
43  * of columns of B). NRHS >= 0.
44  *
45  * @param[in,out]
46  * A double *.
47  * A is a pointer to a regular matrix of dimension N-by-LDA.
48  * On exit, if return value is Success, the matrix A is overwriten by
49  * the factors L and U. The unit diagonal elements of L are not stored.
50  *
51  * @param[in]
52  * LDA int.
53  * LDA specifies the number of columns of A ( row-major order ).
54  * LDA must be at least max( 1, N ).
55  *
56  * @param[out]
57  * IPIV int *.
58  * ipiv is a pointer to an array of dimesion at least
59  * max( 1, min ( M, N ) ). ipiv( i ) = j, 1 <= i <= min( M, N ) implies
60  * that rows i and j have been interchanged.
61  *
62  * @param[in,out]
63  * B double *.
64  * B is a pointer to a matrix of dimension N by NRHS, which stores the
65  * right-hand-sides of the systems of linear equations.
66  * (row-major order). On exit, if return value is Success, the matrix B
67  * is overwriten by the solution matrix X.
68  *
69  * @param[in]
70  * LDB int.
71  * LDB specifies the number of columns of B ( row-major order ).
72  * LDB must be at least max( 1, NRHS ).
73  *
74  **/
75 
76 /**
77  *
78  * @retval Success successful exit
79  * @retval NoSuccess unsuccessful exit
80  *
81  **/
82 
83 /**
84  *
85  * @sa ddss_tile
86  * @sa ddss_flat2tiled
87  * @sa ddss_tiled2flat
88  *
89  **/
90 
91 enum LASS_RETURN kdtpgesv( int N, int NRHS,
92  double *A, int LDA,
93  int *IPIV,
94  double *B, int LDB )
95 {
96 
97  // Local variables
98  int i;
99  int nt, nrhst;
100  int ni, mi, nni, ki, nrhsi;
101  int tile_size_n;
102  int tile_size_m;
103  int tile_size_nn;
104  int tile_size_nrhs;
105  int tile_size_k;
106  int local_ipiv_size;
107  int current_position;
108  enum LASS_RETURN ret;
109  int *ipiv_local;
110 
111  // Number of tiles
112  if ( N % TILE_SIZE == 0 )
113  {
114  nt = N / TILE_SIZE;
115  }
116  else
117  {
118  nt = ( N / TILE_SIZE ) + 1;
119  }
120 
121  if ( NRHS % TILE_SIZE == 0 )
122  {
123  nrhst = NRHS / TILE_SIZE;
124  }
125  else
126  {
127  nrhst = ( NRHS / TILE_SIZE ) + 1;
128  }
129 
130  /***************************
131  --Tile matrices allocation--
132  ***************************/
133 
134  double ( *TILE_A )[nt][TILE_SIZE * TILE_SIZE] = malloc ( nt * nt *
135  TILE_SIZE * TILE_SIZE * sizeof( double ) );
136 
137  if ( TILE_A == NULL )
138  {
139  fprintf( stderr, "Failure in kdtpgesv for matrix TILE_A\n" );
140  return NoSuccess;
141  }
142 
143  double ( *TILE_B )[nrhst][TILE_SIZE * TILE_SIZE] = malloc(
144  nt * nrhst * TILE_SIZE * TILE_SIZE * sizeof( double ) );
145 
146  if ( TILE_B == NULL )
147  {
148  fprintf( stderr, "Failure in kdtpgesv for matrix TILE_B\n" );
149  return NoSuccess;
150  }
151 
152  ipiv_local = ( int* ) malloc( N * sizeof( int ) );
153 
154  if ( ipiv_local == NULL )
155  {
156  fprintf( stderr, "Failure in kdtpgesv for ipiv_local\n" );
157  return NoSuccess;
158  }
159 
160  /************************************************
161  // --From flat data layout to tiled data layout--
162  ************************************************/
163 
164  // From flat matrix A to tile matrix TILE_A
165  ddss_dflat2tiled( N, N, A, LDA, nt, nt, TILE_A );
166 
167  // From flat matrix B to tile matrix TILE_B
168  ddss_dflat2tiled( N, NRHS, B, LDB, nt, nrhst, TILE_B );
169 
170  /****************
171  --DGETRF tile--
172  ****************/
173 
174  current_position = 0;
175  i = 0;
176  for ( ki = 0; ki < nt; ki++ )
177  {
178  tile_size_k = ddss_tile_size( N, ki );
179 
180  #pragma oss task inout( TILE_A[ki][ki] ) \
181  inout( ipiv_local ) \
182  inout( current_position ) \
183  out( local_ipiv_size ) \
184  shared( TILE_A ) \
185  firstprivate( ki, i ) \
186  label( dgetrf )
187  {
188  LAPACKE_dgetrf( CblasRowMajor,
189  tile_size_k, tile_size_k,
190  TILE_A[ki][ki], tile_size_k,
191  ipiv_local + current_position );
192 
193  // Update the global ipiv array
194  local_ipiv_size = tile_size_k;
195  for ( i = 0; i < local_ipiv_size; i++ )
196  {
197  ( IPIV + current_position )[i] =
198  ( ipiv_local + current_position )[i] + current_position;
199  }
200  current_position += local_ipiv_size;
201  }
202  for ( mi = ki + 1; mi < nt; mi++ )
203  {
204  tile_size_m = ddss_tile_size( N, mi );
205 
206  #pragma oss task in( TILE_A[ki][ki] ) \
207  inout(TILE_A[mi][ki]) \
208  shared( TILE_A ) \
209  firstprivate( mi, ki ) \
210  label( dtrsm_below )
211  cblas_dtrsm( CblasRowMajor,
212  ( CBLAS_SIDE ) Right, ( CBLAS_UPLO ) Upper,
213  ( CBLAS_TRANSPOSE ) NoTrans, ( CBLAS_DIAG ) NonUnit,
214  tile_size_m, tile_size_k,
215  1.0, TILE_A[ki][ki], tile_size_k,
216  TILE_A[mi][ki], tile_size_k );
217  }
218  for ( ni = 0; ni < ki; ni++ )
219  {
220  tile_size_n = ddss_tile_size( N, ni );
221  // Swap the tiles to the left of A[ki][ki]
222  #pragma oss task in ( ipiv_local ) \
223  in( current_position ) \
224  in( local_ipiv_size ) \
225  inout( TILE_A[ki][ni] ) \
226  shared( TILE_A ) \
227  firstprivate( ki, ni ) \
228  label( dlaswp_left )
229  LAPACKE_dlaswp( CblasRowMajor,
230  tile_size_n, TILE_A[ki][ni], tile_size_n,
231  1, local_ipiv_size,
232  ipiv_local + current_position - local_ipiv_size,
233  1 );
234 
235  }
236  for ( nrhsi = 0; nrhsi < nrhst; nrhsi++ )
237  {
238  tile_size_nrhs = ddss_tile_size( NRHS, nrhsi );
239  // Swap on the B[ki][nrhsi] tile
240  #pragma oss task in ( ipiv_local ) \
241  in( current_position ) \
242  in( local_ipiv_size ) \
243  inout( TILE_B[ki][nrhsi] ) \
244  shared( TILE_B ) \
245  firstprivate( ki, nrhsi ) \
246  label( dlaswp_on_B )
247  LAPACKE_dlaswp( CblasRowMajor,
248  tile_size_nrhs, TILE_B[ki][nrhsi], tile_size_nrhs,
249  1, local_ipiv_size,
250  ipiv_local + current_position - local_ipiv_size,
251  1 );
252 
253  }
254  for ( ni = ki + 1; ni < nt; ni++ )
255  {
256  tile_size_n = ddss_tile_size( N, ni );
257  // Swap the tiles to the right of A[ki][ki]
258  #pragma oss task in ( ipiv_local ) \
259  in( current_position ) \
260  in( local_ipiv_size ) \
261  inout( TILE_A[ki][ni] ) \
262  shared( TILE_A ) \
263  firstprivate( ki, ni ) \
264  label( dlaswp_right )
265  LAPACKE_dlaswp( CblasRowMajor,
266  tile_size_n, TILE_A[ki][ni], tile_size_n,
267  1, local_ipiv_size,
268  ipiv_local + current_position - local_ipiv_size,
269  1 );
270 
271  #pragma oss task in( TILE_A[ki][ki] ) \
272  inout(TILE_A[ki][ni]) \
273  shared( TILE_A ) \
274  firstprivate( ni, ki ) \
275  label( dtrsm_right )
276  cblas_dtrsm( CblasRowMajor,
277  ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Lower,
278  ( CBLAS_TRANSPOSE ) NoTrans, ( CBLAS_DIAG ) Unit,
279  tile_size_k, tile_size_n,
280  1.0, TILE_A[ki][ki], tile_size_k,
281  TILE_A[ki][ni], tile_size_n );
282 
283  for ( nni = ki + 1; nni < nt; nni++ )
284  {
285  tile_size_nn = ddss_tile_size( N, nni );
286 
287  #pragma oss task in( TILE_A[nni][ki] ) \
288  in( TILE_A[ki][ni] ) \
289  inout( TILE_A[nni][ni] ) \
290  shared( TILE_A ) \
291  firstprivate( ni, nni, ki ) \
292  label( dgemm )
293  cblas_dgemm( CblasRowMajor,
294  ( CBLAS_TRANSPOSE ) NoTrans,
295  ( CBLAS_TRANSPOSE ) NoTrans,
296  tile_size_nn,
297  tile_size_n,
298  TILE_SIZE,
299  -1.0, TILE_A[nni][ki], tile_size_k,
300  TILE_A[ki][ni], tile_size_n,
301  1.0, TILE_A[nni][ni], tile_size_n );
302  }
303  }
304  }
305 
306  // --From tile data layout to flat data layout--
307  ddss_dtiled2flat( N, N, A, LDA, nt, nt, TILE_A );
308 
309  // Triandular solve
310  // --SIDE = Left & UPLO = Lower & TRANS_A = NoTrans & Unit--
311  for ( ki = 0; ki < nt; ki++ )
312  {
313  tile_size_k = ddss_tile_size( N, ki );
314 
315  for ( ni = 0; ni < nrhst; ni++ )
316  {
317  tile_size_n = ddss_tile_size( NRHS, ni );
318 
319  #pragma oss task in( TILE_A[ki][ki] ) \
320  inout( TILE_B[ki][ni] ) \
321  shared( TILE_A, TILE_B ) \
322  firstprivate( ki, ni ) \
323  label( dtrsm_dtrsm1 )
324  cblas_dtrsm( CblasRowMajor,
325  ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Lower,
326  ( CBLAS_TRANSPOSE ) NoTrans,
327  ( CBLAS_DIAG ) Unit,
328  tile_size_k,
329  tile_size_n,
330  1.0, TILE_A[ki][ki], tile_size_k,
331  TILE_B[ki][ni], tile_size_n );
332  }
333  for ( nni = 0; nni < nrhst; nni++ )
334  {
335  tile_size_nn = ddss_tile_size( NRHS, nni );
336 
337  for ( mi = ki + 1; mi < nt; mi++ )
338  {
339 
340  #pragma oss task in( TILE_A[mi][ki] ) \
341  in( TILE_B[ki][nni] ) \
342  inout( TILE_B[mi][nni] ) \
343  shared( TILE_A, TILE_B ) \
344  firstprivate( ki, mi, nni ) \
345  label( dtrsm_dgemm1 )
346  cblas_dgemm( CblasRowMajor,
347  CblasNoTrans, CblasNoTrans,
348  tile_size_k,
349  tile_size_nn,
350  tile_size_k,
351  -1.0, TILE_A[mi][ki], tile_size_k,
352  TILE_B[ki][nni], tile_size_nn,
353  1.0, TILE_B[mi][nni], tile_size_nn );
354  }
355  }
356  }
357 
358  // Triandular solve
359  // --SIDE = Left & UPLO = Upper & TRANS_A = NoTrans & NonUnit--
360  for ( ki = nt - 1; ki >= 0; ki-- )
361  {
362  tile_size_k = ddss_tile_size( N, ki );
363 
364  for ( ni = 0; ni < nrhst; ni++ )
365  {
366  tile_size_n = ddss_tile_size( NRHS, ni );
367 
368  #pragma oss task in( TILE_A[ki][ki] ) \
369  inout( TILE_B[ki][ni] ) \
370  shared( TILE_A, TILE_B ) \
371  firstprivate( ki, ni ) \
372  label( dtrsmi_dtrsm2 )
373  cblas_dtrsm( CblasRowMajor,
374  ( CBLAS_SIDE ) Left, ( CBLAS_UPLO ) Upper,
375  ( CBLAS_TRANSPOSE ) NoTrans,
376  ( CBLAS_DIAG ) NonUnit,
377  tile_size_k,
378  tile_size_n,
379  1.0, TILE_A[ki][ki], tile_size_k,
380  TILE_B[ki][ni], tile_size_n );
381  }
382 
383  for ( mi = ki - 1; mi >= 0; mi-- )
384  {
385  tile_size_m = ddss_tile_size( N, mi );
386 
387  for ( nni = 0; nni < nrhst; nni++ )
388  {
389  tile_size_nn = ddss_tile_size( NRHS, nni );
390 
391  #pragma oss task in( TILE_A[mi][ki] ) \
392  in( TILE_B[ki][nni] ) \
393  inout( TILE_B[mi][nni] ) \
394  shared( TILE_A, TILE_B ) \
395  firstprivate( ki, mi, nni ) \
396  label( dtrsm_dgemm2 )
397  cblas_dgemm( CblasRowMajor,
398  CblasNoTrans, CblasNoTrans,
399  tile_size_m,
400  tile_size_nn,
401  tile_size_k,
402  -1.0, TILE_A[mi][ki], tile_size_k,
403  TILE_B[ki][nni], tile_size_nn,
404  1.0, TILE_B[mi][nni], tile_size_nn );
405  }
406  }
407  }
408 
409  /*********************************************
410  --From tiled data layout to flat data layout--
411  *********************************************/
412  ddss_dtiled2flat( N, NRHS, B, LDB, nt, nrhst, TILE_B );
413 
414  // --Free--
415  free( TILE_A );
416  free( TILE_B );
417  free( ipiv_local );
418 
419  return Success;
420 
421 }
void ddss_dflat2tiled(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
enum LASS_RETURN kdtpgesv(int N, int NRHS, double *A, int LDA, int *IPIV, double *B, int LDB)
Definition: kdtpgesv.c:91
void ddss_dtiled2flat(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
int ddss_tile_size(int M, int MT)
Definition: ddss_tile.c:52