GCC Code Coverage Report


Directory: src/
File: src/talp/perf_metrics.c
Date: 2025-11-21 10:34:40
             Exec   Total   Coverage
Lines:        133     181      73.5%
Functions:      4       5      80.0%
Branches:      11      23      47.8%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/perf_metrics.h"
21
22 #include "LB_core/spd.h"
23 #include "apis/dlb_talp.h"
24 #include "support/debug.h"
25 #ifdef MPI_LIB
26 #include "mpi/mpi_core.h"
27 #endif
28
29 #include <stddef.h>
30 #include <stdio.h>
31
32 /*********************************************************************************/
33 /* POP metrics - pure MPI model */
34 /*********************************************************************************/
35
36 /* Compute POP metrics for the MPI model
37 * (This function is actually not used anywhere) */
38 static inline void perf_metrics__compute_mpi_model(
39 perf_metrics_mpi_t *metrics,
40 int num_cpus,
41 int num_nodes,
42 int64_t elapsed_time,
43 int64_t elapsed_useful,
44 int64_t app_sum_useful,
45 int64_t node_sum_useful) __attribute__((unused));
46 static inline void perf_metrics__compute_mpi_model(
47 perf_metrics_mpi_t *metrics,
48 int num_cpus,
49 int num_nodes,
50 int64_t elapsed_time,
51 int64_t elapsed_useful,
52 int64_t app_sum_useful,
53 int64_t node_sum_useful) {
54
55 if (elapsed_time > 0) {
56 *metrics = (const perf_metrics_mpi_t) {
57 .parallel_efficiency = (float)app_sum_useful / (elapsed_time * num_cpus),
58 .communication_efficiency = (float)elapsed_useful / elapsed_time,
59 .load_balance = (float)app_sum_useful / (elapsed_useful * num_cpus),
60 .lb_in = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus),
61 .lb_out = (float)app_sum_useful / (node_sum_useful * num_nodes),
62 };
63 } else {
64 *metrics = (const perf_metrics_mpi_t) {};
65 }
66 }
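
The definitions above factor in the usual POP way: parallel_efficiency = communication_efficiency x load_balance, and load_balance = lb_in x lb_out. A minimal, self-contained sketch that recomputes the same formulas outside of DLB, with made-up numbers (not taken from any real trace), purely to illustrate the arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Hypothetical inputs: 4 CPUs over 2 nodes, times in arbitrary units */
        int num_cpus = 4, num_nodes = 2;
        int64_t elapsed_time = 100, elapsed_useful = 90;
        int64_t app_sum_useful = 320, node_sum_useful = 170;

        /* Same formulas as perf_metrics__compute_mpi_model() above */
        double pe     = (double)app_sum_useful / (elapsed_time * num_cpus);   /* 0.800 */
        double ce     = (double)elapsed_useful / elapsed_time;                /* 0.900 */
        double lb     = (double)app_sum_useful / (elapsed_useful * num_cpus); /* ~0.889 */
        double lb_in  = (double)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus);
        double lb_out = (double)app_sum_useful / (node_sum_useful * num_nodes);

        printf("PE=%.3f  CE*LB=%.3f  LB=%.3f  LB_in*LB_out=%.3f\n",
               pe, ce * lb, lb, lb_in * lb_out);  /* PE == CE*LB, LB == LB_in*LB_out */
        return 0;
    }
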
67
68 /* Compute POP metrics for the MPI model, but with some inferred values:
69 * (Only useful for node metrics) */
70 6 void perf_metrics__infer_mpi_model(
71 perf_metrics_mpi_t *metrics,
72 int processes_per_node,
73 int64_t node_sum_useful,
74 int64_t node_sum_mpi,
75 int64_t max_useful_time) {
76
77 6 int64_t elapsed_time = (node_sum_useful + node_sum_mpi) / processes_per_node;
78
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (elapsed_time > 0) {
79 6 *metrics = (const perf_metrics_mpi_t) {
80 6 .parallel_efficiency = (float)node_sum_useful / (node_sum_useful + node_sum_mpi),
81 6 .communication_efficiency = (float)max_useful_time / elapsed_time,
82 6 .load_balance = ((float)node_sum_useful / processes_per_node) / max_useful_time,
83 };
84 } else {
85 *metrics = (const perf_metrics_mpi_t) {};
86 }
87 6 }
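
For instance, with made-up node totals (not from a real run): processes_per_node = 2, node_sum_useful = 160, node_sum_mpi = 40 and max_useful_time = 90 give an inferred elapsed_time of (160 + 40) / 2 = 100, hence parallel_efficiency = 160 / 200 = 0.80, communication_efficiency = 90 / 100 = 0.90 and load_balance = (160 / 2) / 90 ≈ 0.89; parallel_efficiency = communication_efficiency x load_balance still holds for the inferred node metrics.
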
88
89
90 /*********************************************************************************/
91 /* POP metrics - hybrid MPI + OpenMP model */
92 /*********************************************************************************/
93
94 /* Computed efficiency metrics for the POP hybrid model */
95 typedef struct perf_metrics_hybrid_t {
96 float parallel_efficiency;
97 float mpi_parallel_efficiency;
98 float mpi_communication_efficiency;
99 float mpi_load_balance;
100 float mpi_load_balance_in;
101 float mpi_load_balance_out;
102 float omp_parallel_efficiency;
103 float omp_load_balance;
104 float omp_scheduling_efficiency;
105 float omp_serialization_efficiency;
106 float device_offload_efficiency;
107 float gpu_parallel_efficiency;
108 float gpu_load_balance;
109 float gpu_communication_efficiency;
110 float gpu_orchestration_efficiency;
111 } perf_metrics_hybrid_t;
112
113
114 /* Compute POP metrics for the hybrid MPI + OpenMP model
115 * (Ver. 1: All metrics are multiplicative, but some of them are > 1) */
116 static inline void perf_metrics__compute_hybrid_model_v1(
117 perf_metrics_hybrid_t *metrics,
118 const pop_base_metrics_t *base_metrics) {
119
120 int num_cpus = base_metrics->num_cpus;
121 int num_gpus = base_metrics->num_gpus;
122 int64_t elapsed_time = base_metrics->elapsed_time;
123 int64_t useful_time = base_metrics->useful_time;
124 int64_t mpi_time = base_metrics->mpi_time;
125 int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time;
126 int64_t omp_scheduling_time = base_metrics->omp_scheduling_time;
127 int64_t omp_serialization_time = base_metrics->omp_serialization_time;
128 int64_t gpu_runtime_time = base_metrics->gpu_runtime_time;
129 double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc;
130 double min_mpi_normd_node = base_metrics->min_mpi_normd_node;
131 int64_t gpu_useful_time = base_metrics->gpu_useful_time;
132 int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time;
133 int64_t max_gpu_active_time = base_metrics->max_gpu_active_time;
134
135 /* Active is the union of all times (while the CPU is not disabled) */
136 int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time +
137 omp_scheduling_time + omp_serialization_time + gpu_runtime_time;
138
139 /* Equivalent to all CPU time if OMP was not present */
140 int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time;
141
142 /* Equivalent to all CPU time if GPU was not present */
143 int64_t sum_active_non_gpu = sum_active - gpu_runtime_time;
144
145 /* MPI time normalized at application level */
146 double mpi_normd_app = (double)mpi_time / num_cpus;
147
148 /* Non-MPI time normalized at application level */
149 double non_mpi_normd_app = elapsed_time - mpi_normd_app;
150
151 /* Max value of non-MPI times normalized at process level */
152 double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc;
153
154 /* Max value of non-MPI times normalized at node level */
155 double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node;
156
157 /* All Device time */
158 int64_t sum_device_time = elapsed_time * num_gpus;
159
160 /* Compute output metrics */
161 *metrics = (const perf_metrics_hybrid_t) {
162 .parallel_efficiency = (float)useful_time / sum_active,
163 .mpi_parallel_efficiency = (float)useful_time / (useful_time + mpi_time),
164 .mpi_communication_efficiency =
165 max_non_mpi_normd_proc / (non_mpi_normd_app + mpi_normd_app),
166 .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc,
167 .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc,
168 .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node,
169 .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active,
170 .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time)
171 / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time),
172 .omp_scheduling_efficiency =
173 (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time)
174 / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time
175 + omp_scheduling_time),
176 .omp_serialization_efficiency = (float)sum_active_non_omp
177 / (sum_active_non_omp + omp_serialization_time),
178 .device_offload_efficiency = (float)sum_active_non_gpu / sum_active,
179 .gpu_parallel_efficiency = sum_device_time == 0 ? 0
180 : (float)gpu_useful_time / sum_device_time,
181 .gpu_load_balance = sum_device_time == 0 ? 0
182 : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus),
183 .gpu_communication_efficiency = sum_device_time == 0 ? 0
184 : (float)max_gpu_useful_time / max_gpu_active_time,
185 .gpu_orchestration_efficiency = sum_device_time == 0 ? 0
186 : (float)max_gpu_active_time / elapsed_time,
187 };
188 }
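
One way to read the version note: when gpu_runtime_time is 0 (so sum_active_non_gpu equals sum_active and device_offload_efficiency is 1), the v1 definitions factor exactly, since mpi_parallel_efficiency x omp_parallel_efficiency = [useful_time / (useful_time + mpi_time)] x [(useful_time + mpi_time) / sum_active] = useful_time / sum_active = parallel_efficiency. The price, as the comment above warns, is that some of the individual factors are not bounded by 1.
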
189
190 /* Compute POP metrics for the hybrid MPI + OpenMP model (Ver. 2: PE != MPE * OPE) */
191 15 static inline void perf_metrics__compute_hybrid_model_v2(
192 perf_metrics_hybrid_t *metrics,
193 const pop_base_metrics_t *base_metrics) {
194
195 15 int num_cpus = base_metrics->num_cpus;
196 15 int num_gpus = base_metrics->num_gpus;
197 15 int64_t elapsed_time = base_metrics->elapsed_time;
198 15 int64_t useful_time = base_metrics->useful_time;
199 15 int64_t mpi_time = base_metrics->mpi_time;
200 15 int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time;
201 15 int64_t omp_scheduling_time = base_metrics->omp_scheduling_time;
202 15 int64_t omp_serialization_time = base_metrics->omp_serialization_time;
203 15 int64_t gpu_runtime_time = base_metrics->gpu_runtime_time;
204 15 double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc;
205 15 double min_mpi_normd_node = base_metrics->min_mpi_normd_node;
206 15 int64_t gpu_useful_time = base_metrics->gpu_useful_time;
207 15 int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time;
208 15 int64_t max_gpu_active_time = base_metrics->max_gpu_active_time;
209
210 /* Active is the union of all times (while the CPU is not disabled) */
211 15 int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time +
212 15 omp_scheduling_time + omp_serialization_time + gpu_runtime_time;
213
214 /* Equivalent to all CPU time if OMP was not present */
215 15 int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time;
216
217 /* CPU time of OpenMP not useful */
218 15 int64_t sum_omp_not_useful = omp_load_imbalance_time + omp_scheduling_time +
219 omp_serialization_time;
220
221 /* MPI time normalized at application level */
222 15 double mpi_normd_app = (double)mpi_time / num_cpus;
223
224 /* Non-MPI time normalized at application level */
225 15 double non_mpi_normd_app = elapsed_time - mpi_normd_app;
226
227 /* Max value of non-MPI times normalized at process level */
228 15 double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc;
229
230 /* Max value of non-MPI times normalized at node level */
231 15 double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node;
232
233 /* All Device time */
234 15 int64_t sum_device_time = elapsed_time * num_gpus;
235
236 /* Compute output metrics */
237 15 *metrics = (const perf_metrics_hybrid_t) {
238 15 .parallel_efficiency = (float)useful_time / sum_active,
239 15 .mpi_parallel_efficiency = non_mpi_normd_app / elapsed_time,
240 15 .mpi_communication_efficiency = max_non_mpi_normd_proc / elapsed_time,
241 15 .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc,
242 15 .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc,
243 15 .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node,
244 15 .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active,
245 15 .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time)
246 15 / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time),
247 .omp_scheduling_efficiency =
248 15 (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time)
249 15 / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time
250 15 + omp_scheduling_time),
251 15 .omp_serialization_efficiency = (float)sum_active_non_omp
252 15 / (sum_active_non_omp + omp_serialization_time),
253 15 .device_offload_efficiency = (float)(useful_time + sum_omp_not_useful)
254 15 / (useful_time + sum_omp_not_useful + gpu_runtime_time),
255 .gpu_parallel_efficiency = sum_device_time == 0 ? 0
256
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 14 times.
15 : (float)gpu_useful_time / sum_device_time,
257 .gpu_load_balance = sum_device_time == 0 ? 0
258
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 14 times.
15 : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus),
259 .gpu_communication_efficiency = sum_device_time == 0 ? 0
260
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 14 times.
15 : (float)max_gpu_useful_time / max_gpu_active_time,
261 .gpu_orchestration_efficiency = sum_device_time == 0 ? 0
262
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 14 times.
15 : (float)max_gpu_active_time / elapsed_time,
263 };
264 15 }
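
In the v2 definitions the MPI sub-metrics still factor among themselves: mpi_parallel_efficiency = mpi_communication_efficiency x mpi_load_balance and mpi_load_balance = mpi_load_balance_in x mpi_load_balance_out (the intermediate normalized terms cancel). However, parallel_efficiency is computed independently from sum_active, which also includes the OpenMP and GPU-runtime overheads; this is the sense of the "PE != MPE * OPE" note above.
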
265
266 #ifdef MPI_LIB
267
268 /* The following node and app reductions are needed to compute POP metrics: */
269
270 /*** Node reduction ***/
271
272 /* Data type to reduce among processes in node */
273 typedef struct node_reduction_t {
274 bool node_used;
275 int cpus_node;
276 int64_t mpi_time;
277 } node_reduction_t;
278
279 /* Function called in the MPI node reduction */
280 static void mpi_node_reduction_fn(void *invec, void *inoutvec, int *len,
281 MPI_Datatype *datatype) {
282 node_reduction_t *in = invec;
283 node_reduction_t *inout = inoutvec;
284
285 int _len = *len;
286 for (int i = 0; i < _len; ++i) {
287 if (in[i].node_used) {
288 inout[i].node_used = true;
289 inout[i].cpus_node += in[i].cpus_node;
290 inout[i].mpi_time += in[i].mpi_time;
291 }
292 }
293 }
294
295 /* Function to perform the reduction at node level */
296 static void reduce_pop_metrics_node_reduction(node_reduction_t *node_reduction,
297 const dlb_monitor_t *monitor) {
298
299 const node_reduction_t node_reduction_send = {
300 .node_used = monitor->num_measurements > 0,
301 .cpus_node = monitor->num_cpus,
302 .mpi_time = monitor->mpi_time,
303 };
304
305 /* MPI type: int64_t */
306 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
307
308 /* MPI struct type: node_reduction_t */
309 MPI_Datatype mpi_node_reduction_type;
310 {
311 int count = 3;
312 int blocklengths[] = {1, 1, 1};
313 MPI_Aint displacements[] = {
314 offsetof(node_reduction_t, node_used),
315 offsetof(node_reduction_t, cpus_node),
316 offsetof(node_reduction_t, mpi_time)};
317 MPI_Datatype types[] = {MPI_C_BOOL, MPI_INT, mpi_int64_type};
318 MPI_Datatype tmp_type;
319 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
320 PMPI_Type_create_resized(tmp_type, 0, sizeof(node_reduction_t),
321 &mpi_node_reduction_type);
322 PMPI_Type_commit(&mpi_node_reduction_type);
323 }
324
325 /* Define MPI operation */
326 MPI_Op node_reduction_op;
327 PMPI_Op_create(mpi_node_reduction_fn, true, &node_reduction_op);
328
329 /* MPI reduction */
330 PMPI_Reduce(&node_reduction_send, node_reduction, 1,
331 mpi_node_reduction_type, node_reduction_op,
332 0, getNodeComm());
333
334 /* Free MPI types */
335 PMPI_Type_free(&mpi_node_reduction_type);
336 PMPI_Op_free(&node_reduction_op);
337 }
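
A brief note on the datatype construction above (standard MPI behavior, not DLB-specific): PMPI_Type_create_resized sets the extent of the struct type to sizeof(node_reduction_t), so that if more than one element were ever reduced, consecutive elements would use the same stride as a C array, including any trailing padding the compiler inserts. The same pattern is repeated for app_reduction_t below.
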
338
339 /*** App reduction ***/
340
341 /* Data type to reduce among processes in application */
342 typedef struct app_reduction_t {
343 /* Resources */
344 int num_cpus;
345 int num_nodes;
346 float avg_cpus;
347 int num_gpus;
348 /* Hardware Counters */
349 double cycles;
350 double instructions;
351 /* Statistics */
352 int64_t num_measurements;
353 int64_t num_mpi_calls;
354 int64_t num_omp_parallels;
355 int64_t num_omp_tasks;
356 int64_t num_gpu_runtime_calls;
357 /* Host Times */
358 int64_t elapsed_time;
359 int64_t useful_time;
360 int64_t mpi_time;
361 int64_t omp_load_imbalance_time;
362 int64_t omp_scheduling_time;
363 int64_t omp_serialization_time;
364 int64_t gpu_runtime_time;
365 /* Host Normalized Times */
366 double min_mpi_normd_proc;
367 double min_mpi_normd_node;
368 /* Device Times */
369 int64_t gpu_useful_time;
370 int64_t gpu_communication_time;
371 int64_t gpu_inactive_time;
372 /* Device Max Times */
373 int64_t max_gpu_useful_time;
374 int64_t max_gpu_active_time;
375 } app_reduction_t;
376
377 /* Function called in the MPI app reduction */
378 static void mpi_reduction_fn(void *invec, void *inoutvec, int *len,
379 MPI_Datatype *datatype) {
380 app_reduction_t *in = invec;
381 app_reduction_t *inout = inoutvec;
382
383 int _len = *len;
384 for (int i = 0; i < _len; ++i) {
385 /* Resources */
386 inout[i].num_cpus += in[i].num_cpus;
387 inout[i].num_nodes += in[i].num_nodes;
388 inout[i].avg_cpus += in[i].avg_cpus;
389 inout[i].num_gpus += in[i].num_gpus;
390 /* Hardware Counters */
391 inout[i].cycles += in[i].cycles;
392 inout[i].instructions += in[i].instructions;
393 /* Statistics */
394 inout[i].num_measurements += in[i].num_measurements;
395 inout[i].num_mpi_calls += in[i].num_mpi_calls;
396 inout[i].num_omp_parallels += in[i].num_omp_parallels;
397 inout[i].num_omp_tasks += in[i].num_omp_tasks;
398 inout[i].num_gpu_runtime_calls += in[i].num_gpu_runtime_calls;
399 /* Host Times */
400 inout[i].elapsed_time = max_int64(inout[i].elapsed_time, in[i].elapsed_time);
401 inout[i].useful_time += in[i].useful_time;
402 inout[i].mpi_time += in[i].mpi_time;
403 inout[i].omp_load_imbalance_time += in[i].omp_load_imbalance_time;
404 inout[i].omp_scheduling_time += in[i].omp_scheduling_time;
405 inout[i].omp_serialization_time += in[i].omp_serialization_time;
406 inout[i].gpu_runtime_time += in[i].gpu_runtime_time;
407
408 /* Host Normalized Times */
409 inout[i].min_mpi_normd_proc =
410 min_double_non_zero(inout[i].min_mpi_normd_proc, in[i].min_mpi_normd_proc);
411 inout[i].min_mpi_normd_node =
412 min_double_non_zero(inout[i].min_mpi_normd_node, in[i].min_mpi_normd_node);
413
414 /* Device Times */
415 inout[i].gpu_useful_time += in[i].gpu_useful_time;
416 inout[i].gpu_communication_time += in[i].gpu_communication_time;
417 inout[i].gpu_inactive_time += in[i].gpu_inactive_time;
418
419 /* Device Max Times */
420 inout[i].max_gpu_useful_time =
421 max_int64(inout[i].max_gpu_useful_time, in[i].max_gpu_useful_time);
422 inout[i].max_gpu_active_time =
423 max_int64(inout[i].max_gpu_active_time, in[i].max_gpu_active_time);
424 }
425 }
426
427 /* Function to perform the reduction at application level */
428 static void reduce_pop_metrics_app_reduction(app_reduction_t *app_reduction,
429 const node_reduction_t *node_reduction, const dlb_monitor_t *monitor,
430 bool all_to_all) {
431
432 double min_mpi_normd_proc = monitor->num_cpus == 0 ? 0.0
433 : (double)monitor->mpi_time / monitor->num_cpus;
434 double min_mpi_normd_node = _process_id != 0 ? 0.0
435 : node_reduction->cpus_node == 0 ? 0.0
436 : (double)node_reduction->mpi_time / node_reduction->cpus_node;
437
438 bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0);
439
440 const app_reduction_t app_reduction_send = {
441 /* Resources */
442 .num_cpus = monitor->num_cpus,
443 .num_nodes = _process_id == 0 && node_reduction->node_used ? 1 : 0,
444 .avg_cpus = monitor->avg_cpus,
445 .num_gpus = have_gpus ? 1 : 0,
446 /* Hardware Counters */
447 .cycles = (double)monitor->cycles,
448 .instructions = (double)monitor->instructions,
449 /* Statistics */
450 .num_measurements = monitor->num_measurements,
451 .num_mpi_calls = monitor->num_mpi_calls,
452 .num_omp_parallels = monitor->num_omp_parallels,
453 .num_omp_tasks = monitor->num_omp_tasks,
454 .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls,
455 /* Host Times */
456 .elapsed_time = monitor->elapsed_time,
457 .useful_time = monitor->useful_time,
458 .mpi_time = monitor->mpi_time,
459 .omp_load_imbalance_time = monitor->omp_load_imbalance_time,
460 .omp_scheduling_time = monitor->omp_scheduling_time,
461 .omp_serialization_time = monitor->omp_serialization_time,
462 .gpu_runtime_time = monitor->gpu_runtime_time,
463 /* Host Normalized Times */
464 .min_mpi_normd_proc = min_mpi_normd_proc,
465 .min_mpi_normd_node = min_mpi_normd_node,
466 /* Device Times */
467 .gpu_useful_time = monitor->gpu_useful_time,
468 .gpu_communication_time = monitor->gpu_communication_time,
469 .gpu_inactive_time = monitor->gpu_inactive_time,
470 /* Device Max Times */
471 .max_gpu_useful_time = monitor->gpu_useful_time,
472 .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time,
473 };
474
475 /* MPI type: int64_t */
476 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
477
478 /* MPI struct type: app_reduction_t */
479 MPI_Datatype mpi_app_reduction_type;
480 {
481 int blocklengths[] = {
482 1, 1, 1, 1, /* Resources */
483 1, 1, /* Hardware Counters */
484 1, 1, 1, 1, 1, /* Statistics */
485 1, 1, 1, 1, 1, 1, 1, /* Host Times */
486 1, 1, /* Host Normalized Times */
487 1, 1, 1, /* Device Times */
488 1, 1}; /* Device Max Times */
489
490 enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])};
491
492 MPI_Aint displacements[] = {
493 /* Resources */
494 offsetof(app_reduction_t, num_cpus),
495 offsetof(app_reduction_t, num_nodes),
496 offsetof(app_reduction_t, avg_cpus),
497 offsetof(app_reduction_t, num_gpus),
498 /* Hardware Counters */
499 offsetof(app_reduction_t, cycles),
500 offsetof(app_reduction_t, instructions),
501 /* Statistics */
502 offsetof(app_reduction_t, num_measurements),
503 offsetof(app_reduction_t, num_mpi_calls),
504 offsetof(app_reduction_t, num_omp_parallels),
505 offsetof(app_reduction_t, num_omp_tasks),
506 offsetof(app_reduction_t, num_gpu_runtime_calls),
507 /* Host Times */
508 offsetof(app_reduction_t, elapsed_time),
509 offsetof(app_reduction_t, useful_time),
510 offsetof(app_reduction_t, mpi_time),
511 offsetof(app_reduction_t, omp_load_imbalance_time),
512 offsetof(app_reduction_t, omp_scheduling_time),
513 offsetof(app_reduction_t, omp_serialization_time),
514 offsetof(app_reduction_t, gpu_runtime_time),
515 /* Normalized Times */
516 offsetof(app_reduction_t, min_mpi_normd_proc),
517 offsetof(app_reduction_t, min_mpi_normd_node),
518 /* Device Times */
519 offsetof(app_reduction_t, gpu_useful_time),
520 offsetof(app_reduction_t, gpu_communication_time),
521 offsetof(app_reduction_t, gpu_inactive_time),
522 /* Device Max Times */
523 offsetof(app_reduction_t, max_gpu_useful_time),
524 offsetof(app_reduction_t, max_gpu_active_time),
525 };
526
527 MPI_Datatype types[] = {
528 MPI_INT, MPI_INT, MPI_FLOAT, MPI_INT, /* Resources */
529 MPI_DOUBLE, MPI_DOUBLE, /* Hardware Counters */
530 mpi_int64_type, mpi_int64_type,
531 mpi_int64_type, mpi_int64_type,
532 mpi_int64_type, /* Statistics */
533 mpi_int64_type, mpi_int64_type,
534 mpi_int64_type, mpi_int64_type,
535 mpi_int64_type, mpi_int64_type,
536 mpi_int64_type, /* Host Times */
537 MPI_DOUBLE, MPI_DOUBLE, /* Host Normalized Times */
538 mpi_int64_type, mpi_int64_type,
539 mpi_int64_type, /* Device Times */
540 mpi_int64_type, mpi_int64_type, /* Device Max Times */
541 };
542
543 MPI_Datatype tmp_type;
544 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
545 PMPI_Type_create_resized(tmp_type, 0, sizeof(app_reduction_t),
546 &mpi_app_reduction_type);
547 PMPI_Type_commit(&mpi_app_reduction_type);
548
549 static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count,
550 "blocklengths size mismatch");
551 static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count,
552 "displacements size mismatch");
553 static_ensure(sizeof(types)/sizeof(types[0]) == count,
554 "types size mismatch");
555 }
556
557 /* Define MPI operation */
558 MPI_Op app_reduction_op;
559 PMPI_Op_create(mpi_reduction_fn, true, &app_reduction_op);
560
561 /* MPI reduction */
562 if (!all_to_all) {
563 PMPI_Reduce(&app_reduction_send, app_reduction, 1,
564 mpi_app_reduction_type, app_reduction_op,
565 0, getWorldComm());
566 } else {
567 PMPI_Allreduce(&app_reduction_send, app_reduction, 1,
568 mpi_app_reduction_type, app_reduction_op,
569 getWorldComm());
570 }
571
572 /* Free MPI types */
573 PMPI_Type_free(&mpi_app_reduction_type);
574 PMPI_Op_free(&app_reduction_op);
575 }
576
577 #endif
578
579
580
581 #ifdef MPI_LIB
582 /* Construct a base metrics struct out of a monitor reduced via MPI */
583 void perf_metrics__reduce_monitor_into_base_metrics(pop_base_metrics_t *base_metrics,
584 const dlb_monitor_t *monitor, bool all_to_all) {
585
586 /* First, reduce some values among processes in the node,
587 * needed to compute POP metrics */
588 node_reduction_t node_reduction = {0};
589 reduce_pop_metrics_node_reduction(&node_reduction, monitor);
590
591 /* With the node reduction, reduce again among all processes */
592 app_reduction_t app_reduction = {0};
593 reduce_pop_metrics_app_reduction(&app_reduction, &node_reduction,
594 monitor, all_to_all);
595
596 /* Finally, fill output base_metrics... */
597
598 int num_mpi_ranks;
599 PMPI_Comm_size(getWorldComm(), &num_mpi_ranks);
600
601 *base_metrics = (const pop_base_metrics_t) {
602 .num_cpus = app_reduction.num_cpus,
603 .num_mpi_ranks = num_mpi_ranks,
604 .num_nodes = app_reduction.num_nodes,
605 .avg_cpus = app_reduction.avg_cpus,
606 .num_gpus = app_reduction.num_gpus,
607 .cycles = app_reduction.cycles,
608 .instructions = app_reduction.instructions,
609 .num_measurements = app_reduction.num_measurements,
610 .num_mpi_calls = app_reduction.num_mpi_calls,
611 .num_omp_parallels = app_reduction.num_omp_parallels,
612 .num_omp_tasks = app_reduction.num_omp_tasks,
613 .num_gpu_runtime_calls = app_reduction.num_gpu_runtime_calls,
614 .elapsed_time = app_reduction.elapsed_time,
615 .useful_time = app_reduction.useful_time,
616 .mpi_time = app_reduction.mpi_time,
617 .omp_load_imbalance_time = app_reduction.omp_load_imbalance_time,
618 .omp_scheduling_time = app_reduction.omp_scheduling_time,
619 .omp_serialization_time = app_reduction.omp_serialization_time,
620 .gpu_runtime_time = app_reduction.gpu_runtime_time,
621 .min_mpi_normd_proc = app_reduction.min_mpi_normd_proc,
622 .min_mpi_normd_node = app_reduction.min_mpi_normd_node,
623 .gpu_useful_time = app_reduction.gpu_useful_time,
624 .gpu_communication_time = app_reduction.gpu_communication_time,
625 .gpu_inactive_time = app_reduction.gpu_inactive_time,
626 .max_gpu_useful_time = app_reduction.max_gpu_useful_time,
627 .max_gpu_active_time = app_reduction.max_gpu_active_time,
628 };
629 }
630 #endif
631
632
633 /* Construct a base metrics struct out of a single monitor */
634 15 void perf_metrics__local_monitor_into_base_metrics(pop_base_metrics_t *base_metrics,
635 const dlb_monitor_t *monitor) {
636
637 15 bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0);
638
639 15 *base_metrics = (const pop_base_metrics_t){
640 15 .num_cpus = monitor->num_cpus,
641 .num_mpi_ranks = 0,
642 .num_nodes = 1,
643 15 .avg_cpus = monitor->avg_cpus,
644 15 .num_gpus = have_gpus ? 1 : 0,
645 15 .cycles = (double)monitor->cycles,
646 15 .instructions = (double)monitor->instructions,
647 15 .num_measurements = monitor->num_measurements,
648 15 .num_mpi_calls = monitor->num_mpi_calls,
649 15 .num_omp_parallels = monitor->num_omp_parallels,
650 15 .num_omp_tasks = monitor->num_omp_tasks,
651 15 .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls,
652 15 .elapsed_time = monitor->elapsed_time,
653 15 .useful_time = monitor->useful_time,
654 15 .mpi_time = monitor->mpi_time,
655 15 .omp_load_imbalance_time = monitor->omp_load_imbalance_time,
656 15 .omp_scheduling_time = monitor->omp_scheduling_time,
657 15 .omp_serialization_time = monitor->omp_serialization_time,
658 15 .gpu_runtime_time = monitor->gpu_runtime_time,
659 15 .min_mpi_normd_proc = (double)monitor->mpi_time / monitor->num_cpus,
660 15 .min_mpi_normd_node = (double)monitor->mpi_time / monitor->num_cpus,
661 15 .gpu_useful_time = monitor->gpu_useful_time,
662 15 .gpu_communication_time = monitor->gpu_communication_time,
663 15 .gpu_inactive_time = monitor->gpu_inactive_time,
664 15 .max_gpu_useful_time = monitor->gpu_useful_time,
665 15 .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time,
666 };
667 15 }
668
669 /* Compute POP metrics out of a base metrics struct */
670 15 void perf_metrics__base_to_pop_metrics(const char *monitor_name,
671 const pop_base_metrics_t *base_metrics, dlb_pop_metrics_t *pop_metrics) {
672
673 /* Compute POP metrics */
674 15 perf_metrics_hybrid_t metrics = {0};
675
676
1/2
✓ Branch 0 taken 15 times.
✗ Branch 1 not taken.
15 if (base_metrics->useful_time > 0) {
677
678
1/3
✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
✗ Branch 2 not taken.
15 switch(thread_spd->options.talp_model) {
679 case TALP_MODEL_HYBRID_V1:
680 perf_metrics__compute_hybrid_model_v1(&metrics, base_metrics);
681 break;
682 15 case TALP_MODEL_HYBRID_V2:
683 15 perf_metrics__compute_hybrid_model_v2(&metrics, base_metrics);
684 15 break;
685 };
686 }
687
688 /* Initialize structure */
689 15 *pop_metrics = (const dlb_pop_metrics_t) {
690 15 .num_cpus = base_metrics->num_cpus,
691 15 .num_mpi_ranks = base_metrics->num_mpi_ranks,
692 15 .num_nodes = base_metrics->num_nodes,
693 15 .avg_cpus = base_metrics->avg_cpus,
694 15 .num_gpus = base_metrics->num_gpus,
695 15 .cycles = base_metrics->cycles,
696 15 .instructions = base_metrics->instructions,
697 15 .num_measurements = base_metrics->num_measurements,
698 15 .num_mpi_calls = base_metrics->num_mpi_calls,
699 15 .num_omp_parallels = base_metrics->num_omp_parallels,
700 15 .num_omp_tasks = base_metrics->num_omp_tasks,
701 15 .num_gpu_runtime_calls = base_metrics->num_gpu_runtime_calls,
702 15 .elapsed_time = base_metrics->elapsed_time,
703 15 .useful_time = base_metrics->useful_time,
704 15 .mpi_time = base_metrics->mpi_time,
705 15 .omp_load_imbalance_time = base_metrics->omp_load_imbalance_time,
706 15 .omp_scheduling_time = base_metrics->omp_scheduling_time,
707 15 .omp_serialization_time = base_metrics->omp_serialization_time,
708 15 .gpu_runtime_time = base_metrics->gpu_runtime_time,
709 15 .min_mpi_normd_proc = base_metrics->min_mpi_normd_proc,
710 15 .min_mpi_normd_node = base_metrics->min_mpi_normd_node,
711 15 .gpu_useful_time = base_metrics->gpu_useful_time,
712 15 .gpu_communication_time = base_metrics->gpu_communication_time,
713 15 .gpu_inactive_time = base_metrics->gpu_inactive_time,
714 15 .max_gpu_useful_time = base_metrics->max_gpu_useful_time,
715 15 .max_gpu_active_time = base_metrics->max_gpu_active_time,
716 15 .parallel_efficiency = metrics.parallel_efficiency,
717 15 .mpi_parallel_efficiency = metrics.mpi_parallel_efficiency,
718 15 .mpi_communication_efficiency = metrics.mpi_communication_efficiency,
719 15 .mpi_load_balance = metrics.mpi_load_balance,
720 15 .mpi_load_balance_in = metrics.mpi_load_balance_in,
721 15 .mpi_load_balance_out = metrics.mpi_load_balance_out,
722 15 .omp_parallel_efficiency = metrics.omp_parallel_efficiency,
723 15 .omp_load_balance = metrics.omp_load_balance,
724 15 .omp_scheduling_efficiency = metrics.omp_scheduling_efficiency,
725 15 .omp_serialization_efficiency = metrics.omp_serialization_efficiency,
726 15 .device_offload_efficiency = metrics.device_offload_efficiency,
727 15 .gpu_parallel_efficiency = metrics.gpu_parallel_efficiency,
728 15 .gpu_load_balance = metrics.gpu_load_balance,
729 15 .gpu_communication_efficiency = metrics.gpu_communication_efficiency,
730 15 .gpu_orchestration_efficiency = metrics.gpu_orchestration_efficiency,
731 };
732 15 snprintf(pop_metrics->name, DLB_MONITOR_NAME_MAX, "%s", monitor_name);
733 15 }
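
For reference, a hedged usage sketch of the serial (non-MPI) path exercised above. It assumes the DLB internal headers are available, that DLB/TALP has been initialized (perf_metrics__base_to_pop_metrics reads thread_spd->options.talp_model) and that the monitor has already been populated by TALP; the helper name report_local_pop_metrics and the region name are placeholders, not part of the library.

    #include "talp/perf_metrics.h"
    #include "apis/dlb_talp.h"
    #include <stdio.h>

    /* Turn one TALP monitor into POP efficiencies without any MPI reduction. */
    static void report_local_pop_metrics(const dlb_monitor_t *monitor) {
        pop_base_metrics_t base_metrics = {0};
        perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);

        dlb_pop_metrics_t pop_metrics = {0};
        perf_metrics__base_to_pop_metrics("example-region" /* placeholder name */,
                                          &base_metrics, &pop_metrics);

        printf("%s: PE=%.2f MPI=%.2f OpenMP=%.2f offload=%.2f\n",
               pop_metrics.name,
               pop_metrics.parallel_efficiency,
               pop_metrics.mpi_parallel_efficiency,
               pop_metrics.omp_parallel_efficiency,
               pop_metrics.device_offload_efficiency);
    }
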
734