LASs - Linear Algebra Routines on OmpSs  1.0.0
LASs
kdgemm.c
Go to the documentation of this file.
1 #include "../include/lass.h"
2 
3 /**
4  *
5  * @file kdgemm.c
6  *
7  * @brief LASs-DDSs kdgemm routine.
8  *
9  * LASs-DDSs is a software package provided by:
10  * Barcelona Supercomputing Center - Centro Nacional de Supercomputacion
11  *
12  * @author Pedro Valero-Lara pedro.valero@bsc.es
13  * @date 2017-01-05
14  * @reviewer
15  * @modified
16  *
17  **/
18 
19 /**
20  *
21  * @ingroup DDSs
22  *
23  * Performs the matrix-matrix operation:
24  *
25  * C = ALPHA * op( A ) * op( B ) + BETA * C
26  *
27  * where op( X ) is one of:
28  *
29  * op( X ) = X or
30  * op( X ) = X**T
31  *
32  * ALPHA and BETA are scalars, and A, B and C are matrices,
33  * with op( A ) an M by K matrix, op( B ) a K by N matrix and C
34  * an M by N matrix.
35  *
36 **/
37 
38 /**
39  *
40  * @param[in]
41  * TRANS_A enum DDSS_TRANS.
42  * TRANS_A specifies the form of op( A ) to be used in
43  * the matrix multiplication as follows:
44  * - NoTrans: op( A ) = A.
45  * - Trans: op( A ) = A**T.
46  *
47  * @param[in]
48  * TRANS_B enum DDSS_TRANS.
49  * TRANS_B specifies the form of op( B ) to be used in
50  * the matrix multiplication as follows:
51  * - NoTrans: op( B ) = B.
52  * - Trans: op( B ) = B**T.
53  *
54  * @param[in]
55  * M int.
56  * M specifies the number of rows of the matrix A
57  * and of the matrix C.
58  * M must be greater than zero.
59  *
60  * @param[in]
61  * N int.
62  * N specifies the number of columns of the matrix B
63  * and the number of columns of the matrix C.
64  * N must be greater than zero.
65  *
66  * @param[in]
67  * K int.
68  * K specifies the number of columns of the matrix A
69  * and the number of rows of the matrix B.
70  * K must be greater than zero.
71  *
72  * @param[in]
73  * ALPHA double.
74  *
75  * @param[in]
76  * A double *.
77  * A is a pointer to a matrix of dimension Ma ( rows ) by Ka
78  * ( columns ), where Ma is M and Ka is K when TRANS_A = NoTrans,
79  * and Ma is K and Ka is M otherwise.
80  *
81  * @param[in]
82  * LDA int.
83  * LDA specifies the number of columns of A ( row-major order ).
84  * When TRANS_A = NoTrans then LDA must be at least max( 1, K ),
85  * otherwise LDA must be at least max( 1, M ).
86  *
87  * @param[in]
88  * B double *.
89  * B is a pointer to a matrix of dimension Kb ( rows ) by Nb
90  * ( columns ), where Kb is K and Nb is N when TRANS_B = NoTrans,
91  * and Kb is N and Nb is K otherwise.
92  *
93  * @param[in]
94  * LDB int.
95  * LDB specifies the number of columns of B ( row-major order ).
96  * When TRANS_B = NoTrans then LDB must be at least max( 1, N ),
97  * otherwise LDB must be at least max( 1, K ).
98  *
99  * @param[in]
100  * BETA double.
101  *
102  * @param[in,out]
103  * C double *.
104  * C is a pointer to a matrix of dimension M ( rows ) by N ( columns ).
105  * On exit, C is overwritten by the M by N
106  * matrix ( ALPHA*op( A )*op( B ) + BETA*C ).
107  *
108  * @param[in]
109  * LDC int.
110  * LDC specifies the number of columns of C ( row-major order ).
111  * LDC must be at least max( 1, N ).
112  *
113  **/
114 
115 /**
116  *
117  * @retval Success successful exit
118  * @retval NoSuccess unsuccessful exit
119  *
120  **/
121 
122 /**
123  *
124  * @sa ddss_dgemm
125  * @sa ddss_tile
126  * @sa ddss_flat2tiled
127  * @sa ddss_tiled2flat
128  *
129  **/
130 
131 enum LASS_RETURN kdgemm( enum DDSS_TRANS TRANS_A, enum DDSS_TRANS TRANS_B,
132  int M, int N, int K,
133  const double ALPHA, double *A, int LDA,
134  double *B, int LDB,
135  const double BETA, double *C, int LDC )
136 {
137 
138  // Local variables
139  int mt, kt, nt;
140  int mi, ki, ni;
141  int Am, An;
142  int Bm, Bn;
143  int tile_size_m;
144  int tile_size_n;
145  int tile_size_k;
146  int k_check;
147  double betat;
148 
149  // Number of tiles
150  if ( M % TILE_SIZE == 0 )
151  {
152  mt = M / TILE_SIZE;
153  }
154  else
155  {
156  mt = ( M / TILE_SIZE ) + 1;
157  }
158 
159  if ( K % TILE_SIZE == 0 )
160  {
161  kt = K / TILE_SIZE;
162  }
163  else
164  {
165  kt = ( K / TILE_SIZE ) + 1;
166  }
167 
168  if ( N % TILE_SIZE == 0 )
169  {
170  nt = N / TILE_SIZE;
171  }
172  else
173  {
174  nt = ( N / TILE_SIZE ) + 1;
175  }
176 
177  /****************************
178  --Tile matrices declaration--
179  ****************************/
180 
181  if ( TRANS_A == NoTrans )
182  {
183  Am = mt;
184  An = kt;
185  }
186  else
187  {
188  Am = kt;
189  An = mt;
190  }
191 
192  if ( TRANS_B == NoTrans )
193  {
194  Bm = kt;
195  Bn = nt;
196  }
197  else
198  {
199  Bm = nt;
200  Bn = kt;
201  }
202 
203  /***************************
204  --Tile matrices allocation--
205  ***************************/
206 
207  double (*TILE_A)[An][TILE_SIZE * TILE_SIZE] = malloc ( Am * An *
208  TILE_SIZE * TILE_SIZE * sizeof( double ) );
209 
210  if ( TILE_A == NULL)
211  {
212  fprintf( stderr, "Failure in kdgemm for matrix TILE_A\n" );
213  return NoSuccess;
214  }
215 
216  double (*TILE_B)[Bn][TILE_SIZE * TILE_SIZE] = malloc ( Bm * Bn *
217  TILE_SIZE * TILE_SIZE * sizeof( double ) );
218 
219  if ( TILE_B == NULL)
220  {
221  fprintf( stderr, "Failure in kdgemm for matrix TILE_B\n" );
222  return NoSuccess;
223  }
224 
225  double (*TILE_C)[nt][TILE_SIZE * TILE_SIZE] = malloc ( mt * nt *
226  TILE_SIZE * TILE_SIZE * sizeof( double ) );
227 
228  if ( TILE_C == NULL)
229  {
230  fprintf( stderr, "Failure in kdgemm for matrix TILE_C\n" );
231  return NoSuccess;
232  }
233 
234  /*********************************************
235  --From flat data layout to tiled data layout--
236  *********************************************/
237 
238  // From flat matrix A to tile matrix TILE_A
239  if ( TRANS_A == NoTrans )
240  {
241  ddss_dflat2tiled( M, K, A, LDA, mt, kt, TILE_A );
242  }
243  else
244  {
245  ddss_dflat2tiled( K, M, A, LDA, kt, mt, TILE_A );
246  }
247 
248  // From flat matrix B to tile matrix TILE_B
249  if ( TRANS_B == NoTrans )
250  {
251  ddss_dflat2tiled( K, N, B, LDB, kt, nt, TILE_B );
252  }
253  else
254  {
255  ddss_dflat2tiled( N, K, B, LDB, nt, kt, TILE_B );
256  }
257 
258  // From flat matrix C to tile matrix TILE_C
259  ddss_dflat2tiled( M, N, C, LDC, mt, nt, TILE_C );
260 
261  /*************
262  --DGEMM tile--
263  *************/
264 
265  for ( mi = 0; mi < mt; mi++ )
266  {
267  tile_size_m = ddss_tile_size( M, mi );
268  for ( ni = 0; ni < nt; ni++ )
269  {
270  tile_size_n = ddss_tile_size( N, ni );
271  if ( TRANS_A == NoTrans )
272  {
273  k_check = K;
274  }
275  else
276  {
277  k_check = M;
278  }
279  // --Scale on C ( C = BETA * C )--
280  if ( ( ALPHA == 0.0 ) || ( k_check == 0 ) )
281  {
282  #pragma oss task inout( TILE_C[mi][ni] ) \
283  shared( TILE_A, TILE_B, TILE_C ) \
284  firstprivate( mi, ni ) \
285  no_copy_deps
286  cblas_dgemm( CblasRowMajor,
287  ( CBLAS_TRANSPOSE ) TRANS_A,
288  ( CBLAS_TRANSPOSE ) TRANS_B,
289  tile_size_m,
290  tile_size_n,
291  0,
292  ALPHA, TILE_A[0][0], 1,
293  TILE_B[0][0], 1,
294  BETA, TILE_C[mi][ni], tile_size_n );
295  }
296  else if ( TRANS_A == NoTrans )
297  {
298  // --TRANS_A = NoTrans & TRANS_B = NoTrans--
299  if ( TRANS_B == NoTrans )
300  {
301  for ( ki = 0; ki < kt; ki++ )
302  {
303  tile_size_k = ddss_tile_size( K, ki );
304  if (ki == 0)
305  {
306  betat = BETA;
307  }
308  else
309  {
310  betat = 1.0;
311  }
312 
313  #pragma oss task in( TILE_A[mi][ki] ) \
314  in( TILE_B[ki][ni] ) \
315  inout( TILE_C[mi][ni] ) \
316  shared( TILE_A, TILE_B, TILE_C ) \
317  firstprivate( mi, ni, ki, betat ) \
318  no_copy_deps
319  cblas_dgemm( CblasRowMajor,
320  ( CBLAS_TRANSPOSE ) TRANS_A,
321  ( CBLAS_TRANSPOSE ) TRANS_B,
322  tile_size_m,
323  tile_size_n,
324  tile_size_k,
325  ALPHA, TILE_A[mi][ki], tile_size_k,
326  TILE_B[ki][ni], tile_size_n,
327  betat, TILE_C[mi][ni], tile_size_n );
328  }
329  }
330  // --TRANS_A = NoTrans & TRANS_B = Trans--
331  else
332  {
333  for ( ki = 0; ki < kt; ki++ )
334  {
335  tile_size_k = ddss_tile_size( K, ki );
336  if (ki == 0)
337  {
338  betat = BETA;
339  }
340  else
341  {
342  betat = 1.0;
343  }
344 
345  #pragma oss task in( TILE_A[mi][ki] ) \
346  in( TILE_B[ni][ki] ) \
347  inout( TILE_C[mi][ni] ) \
348  shared( TILE_A, TILE_B, TILE_C ) \
349  firstprivate( mi, ni, ki, betat )
350  cblas_dgemm( CblasRowMajor,
351  ( CBLAS_TRANSPOSE ) TRANS_A,
352  ( CBLAS_TRANSPOSE ) TRANS_B,
353  tile_size_m,
354  tile_size_n,
355  tile_size_k,
356  ALPHA, TILE_A[mi][ki], tile_size_k,
357  TILE_B[ni][ki], tile_size_k,
358  betat, TILE_C[mi][ni], tile_size_n );
359  }
360  }
361  }
362  else
363  {
364  // --TRANS_A = Trans & TRANS_B = NoTrans--
365  if ( TRANS_B == NoTrans )
366  {
367  for ( ki = 0; ki < kt; ki++ )
368  {
369  tile_size_k = ddss_tile_size( K, ki );
370  if (ki == 0)
371  {
372  betat = BETA;
373  }
374  else
375  {
376  betat = 1.0;
377  }
378 
379  #pragma oss task in( TILE_A[ki][mi] ) \
380  in( TILE_B[ki][ni] ) \
381  inout( TILE_C[mi][ni] ) \
382  shared( TILE_A, TILE_B, TILE_C ) \
383  firstprivate( mi, ni, ki, betat )
384  cblas_dgemm( CblasRowMajor,
385  ( CBLAS_TRANSPOSE ) TRANS_A,
386  ( CBLAS_TRANSPOSE ) TRANS_B,
387  tile_size_m,
388  tile_size_n,
389  tile_size_k,
390  ALPHA, TILE_A[ki][mi], tile_size_m,
391  TILE_B[ki][ni], tile_size_n,
392  betat, TILE_C[mi][ni], tile_size_n );
393  }
394  }
395  // --TRANS_A = Trans & TRANS_B = Trans--
396  else
397  {
398  for ( ki = 0; ki < kt; ki++ )
399  {
400  tile_size_k = ddss_tile_size( K, ki );
401  if (ki == 0)
402  {
403  betat = BETA;
404  }
405  else
406  {
407  betat = 1.0;
408  }
409 
410  #pragma oss task in( TILE_A[ki][mi] ) \
411  in( TILE_B[ni][ki] ) \
412  inout( TILE_C[mi][ni] ) \
413  shared( TILE_A, TILE_B, TILE_C ) \
414  firstprivate( mi, ni, ki, betat )
415  cblas_dgemm( CblasRowMajor,
416  ( CBLAS_TRANSPOSE ) TRANS_A,
417  ( CBLAS_TRANSPOSE ) TRANS_B,
418  tile_size_m,
419  tile_size_n,
420  tile_size_k,
421  ALPHA, TILE_A[ki][mi], tile_size_m,
422  TILE_B[ni][ki], tile_size_k,
423  betat, TILE_C[mi][ni], tile_size_n );
424  }
425  }
426  }
427  }
428  }
429 
430  /************************************************
431  // --From tiled data layout to flat data layout--
432  ************************************************/
433 
434  // From tile matrix TILE_C to flat matrix C
435  ddss_dtiled2flat( M, N, C, LDC, mt, nt, TILE_C );
436 
437  // --Tile matrices free--
438  free( TILE_A );
439  free( TILE_B );
440  free( TILE_C );
441 
442  return Success;
443 
444 }
void ddss_dflat2tiled(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
enum LASS_RETURN kdgemm(enum DDSS_TRANS TRANS_A, enum DDSS_TRANS TRANS_B, int M, int N, int K, const double ALPHA, double *A, int LDA, double *B, int LDB, const double BETA, double *C, int LDC)
Definition: kdgemm.c:131
void ddss_dtiled2flat(int M, int N, double *A, int LDA, int MT, int NT, double(*TILE_A)[NT][TILE_SIZE *TILE_SIZE])
int ddss_tile_size(int M, int MT)
Definition: ddss_tile.c:52