GCC Code Coverage Report


Directory: src/
File: src/support/perf_metrics.h
Date: 2024-11-22 17:07:10
            Exec   Total   Coverage
Lines:        77     108      71.3%
Functions:     3       4      75.0%
Branches:      3       7      42.9%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2024 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifndef PERF_METRICS_H
21 #define PERF_METRICS_H
22
23 #include "LB_core/spd.h"
24 #include "support/options.h"
25
26 #include <stdint.h>
27
28
29 /*********************************************************************************/
30 /* POP metrics - pure MPI model */
31 /*********************************************************************************/
32
33 /* POP metrics for pure MPI executions */
34 typedef struct perf_metrics_mpi_t {
35 float parallel_efficiency;
36 float communication_efficiency;
37 float load_balance;
38 float lb_in;
39 float lb_out;
40 } perf_metrics_mpi_t;
41
42 /* Compute POP metrics for the MPI model
43 * (This function is actually not used anywhere) */
44 static inline void perf_metrics__compute_mpi_model(
45 perf_metrics_mpi_t *metrics,
46 int num_cpus,
47 int num_nodes,
48 int64_t elapsed_time,
49 int64_t elapsed_useful,
50 int64_t app_sum_useful,
51 int64_t node_sum_useful) {
52
53 if (elapsed_time > 0) {
54 *metrics = (const perf_metrics_mpi_t) {
55 .parallel_efficiency = (float)app_sum_useful / (elapsed_time * num_cpus),
56 .communication_efficiency = (float)elapsed_useful / elapsed_time,
57 .load_balance = (float)app_sum_useful / (elapsed_useful * num_cpus),
58 .lb_in = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus),
59 .lb_out = (float)app_sum_useful / (node_sum_useful * num_nodes),
60 };
61 } else {
62 *metrics = (const perf_metrics_mpi_t) {};
63 }
64 }
65
66 /* Compute POP metrics for the MPI model, but with some inferred values
67 * (only useful for node metrics) */
68 5 static inline void perf_metrics__infer_mpi_model(
69 perf_metrics_mpi_t *metrics,
70 int processes_per_node,
71 int64_t node_sum_useful,
72 int64_t node_sum_mpi,
73 int64_t max_useful_time) {
74
75 5 int64_t elapsed_time = (node_sum_useful + node_sum_mpi) / processes_per_node;
76 1/2 5 if (elapsed_time > 0) {
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
77 5 *metrics = (const perf_metrics_mpi_t) {
78 5 .parallel_efficiency = (float)node_sum_useful / (node_sum_useful + node_sum_mpi),
79 5 .communication_efficiency = (float)max_useful_time / elapsed_time,
80 5 .load_balance = ((float)node_sum_useful / processes_per_node) / max_useful_time,
81 };
82 } else {
83 *metrics = (const perf_metrics_mpi_t) {};
84 }
85 5 }
86
87
88 /*********************************************************************************/
89 /* POP metrics - hybrid MPI + OpenMP model */
90 /*********************************************************************************/
91
92 /* Internal struct containing everything needed to construct a
93 * dlb_pop_metrics_t. It can be thought of as an abstraction of the app_reduction_t,
94 * combining serial and parallel program flow. Everything in here comes
95 * directly from measurement or is computed in an MPI reduction. */
96 typedef struct pop_base_metrics_t {
97 /* Resources */
98 int num_cpus;
99 int num_mpi_ranks;
100 int num_nodes;
101 float avg_cpus;
102 /* Hardware counters */
103 double cycles;
104 double instructions;
105 /* Statistics */
106 int64_t num_measurements;
107 int64_t num_mpi_calls;
108 int64_t num_omp_parallels;
109 int64_t num_omp_tasks;
110 /* Sum of times among all processes */
111 int64_t elapsed_time;
112 int64_t useful_time;
113 int64_t mpi_time;
114 int64_t omp_load_imbalance_time;
115 int64_t omp_scheduling_time;
116 int64_t omp_serialization_time;
117 /* Normalized times by the number of assigned CPUs */
118 double useful_normd_app; /* Useful time normalized by num. CPUs at application level */
119 double mpi_normd_app; /* MPI time normalized by num. CPUs at application level */
120 double max_useful_normd_proc; /* Max value of useful times normalized at process level */
121 double max_useful_normd_node; /* Max value of useful times normalized at node level */
122 double mpi_normd_of_max_useful; /* MPI time normalized at process level of the process with
123 the max useful time */
124 } pop_base_metrics_t;
125
126 /* Computed efficiency metrics for the POP hybrid model */
127 typedef struct perf_metrics_hybrid_t {
128 float parallel_efficiency;
129 float mpi_parallel_efficiency;
130 float mpi_communication_efficiency;
131 float mpi_load_balance;
132 float mpi_load_balance_in;
133 float mpi_load_balance_out;
134 float omp_parallel_efficiency;
135 float omp_load_balance;
136 float omp_scheduling_efficiency;
137 float omp_serialization_efficiency;
138 } perf_metrics_hybrid_t;
139
140 /* Compute POP metrics for the hybrid MPI + OpenMP model
141 * (Ver. 1: All metrics are multiplicative, but some of them are > 1) */
142 static inline void perf_metrics__compute_hybrid_model_v1(
143 perf_metrics_hybrid_t *metrics,
144 const pop_base_metrics_t *base_metrics) {
145
146 int64_t useful_time = base_metrics->useful_time;
147 int64_t mpi_time = base_metrics->mpi_time;
148 int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time;
149 int64_t omp_scheduling_time = base_metrics->omp_scheduling_time;
150 int64_t omp_serialization_time = base_metrics->omp_serialization_time;
151 double useful_normd_app = base_metrics->useful_normd_app;
152 double mpi_normd_app = base_metrics->mpi_normd_app;
153 double max_useful_normd_proc = base_metrics->max_useful_normd_proc;
154 double max_useful_normd_node = base_metrics->max_useful_normd_node;
155
156 /* Active is the union of all times (CPU not disabled) */
157 int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time +
158 omp_scheduling_time + omp_serialization_time;
159
160 /* Equivalent to all CPU time if OMP was not present */
161 int64_t sum_active_not_omp = useful_time + mpi_time;
162
163 /* Compute output metrics */
164 *metrics = (const perf_metrics_hybrid_t) {
165 .parallel_efficiency = (float)useful_time / sum_active,
166 .mpi_parallel_efficiency = (float)useful_time / sum_active_not_omp,
167 .mpi_communication_efficiency =
168 max_useful_normd_proc / (useful_normd_app + mpi_normd_app),
169 .mpi_load_balance = useful_normd_app / max_useful_normd_proc,
170 .mpi_load_balance_in = max_useful_normd_node / max_useful_normd_proc,
171 .mpi_load_balance_out = useful_normd_app / max_useful_normd_node,
172 .omp_parallel_efficiency = (float)sum_active_not_omp / sum_active,
173 .omp_load_balance = (float)(sum_active_not_omp + omp_serialization_time)
174 / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time),
175 .omp_scheduling_efficiency =
176 (float)(sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time)
177 / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time
178 + omp_scheduling_time),
179 .omp_serialization_efficiency = (float)sum_active_not_omp
180 / (sum_active_not_omp + omp_serialization_time),
181 };
182 }
183
184 /* Compute POP metrics for the hybrid MPI + OpenMP model (Ver. 2: PE != MPE * OPE) */
185 8 static inline void perf_metrics__compute_hybrid_model_v2(
186 perf_metrics_hybrid_t *metrics,
187 const pop_base_metrics_t *base_metrics) {
188
189 8 int64_t useful_time = base_metrics->useful_time;
190 8 int64_t mpi_time = base_metrics->mpi_time;
191 8 int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time;
192 8 int64_t omp_scheduling_time = base_metrics->omp_scheduling_time;
193 8 int64_t omp_serialization_time = base_metrics->omp_serialization_time;
194 8 double useful_normd_app = base_metrics->useful_normd_app;
195 8 double max_useful_normd_proc = base_metrics->max_useful_normd_proc;
196 8 double max_useful_normd_node = base_metrics->max_useful_normd_node;
197 8 double mpi_normd_of_max_useful = base_metrics->mpi_normd_of_max_useful;
198
199 /* Active is the union of all times (CPU not disabled) */
200 8 int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time +
201 omp_scheduling_time + omp_serialization_time;
202
203 /* Equivalent to all CPU time if OMP was not present */
204 8 int64_t sum_active_not_omp = useful_time + mpi_time;
205
206 /* Compute output metrics */
207 8 *metrics = (const perf_metrics_hybrid_t) {
208 8 .parallel_efficiency = (float)useful_time / sum_active,
209 .mpi_parallel_efficiency =
210 8 useful_normd_app / (max_useful_normd_proc + mpi_normd_of_max_useful),
211 .mpi_communication_efficiency =
212 8 max_useful_normd_proc / (max_useful_normd_proc + mpi_normd_of_max_useful),
213 8 .mpi_load_balance = useful_normd_app / max_useful_normd_proc,
214 8 .mpi_load_balance_in = max_useful_normd_node / max_useful_normd_proc,
215 8 .mpi_load_balance_out = useful_normd_app / max_useful_normd_node,
216 8 .omp_parallel_efficiency = (float)sum_active_not_omp / sum_active,
217 8 .omp_load_balance = (float)(sum_active_not_omp + omp_serialization_time)
218 8 / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time),
219 .omp_scheduling_efficiency =
220 8 (float)(sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time)
221 8 / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time
222 8 + omp_scheduling_time),
223 8 .omp_serialization_efficiency = (float)sum_active_not_omp
224 8 / (sum_active_not_omp + omp_serialization_time),
225 };
226 8 }
227
228 /* Actual 'public' function. DLB_talp.c invokes this function with base_metrics
229 * as input and the computed pop_metrics as output. */
230 8 static inline void talp_base_metrics_to_pop_metrics(const char *monitor_name,
231 const pop_base_metrics_t *base_metrics, dlb_pop_metrics_t *pop_metrics) {
232
233 /* Compute POP metrics */
234 8 perf_metrics_hybrid_t metrics = {0};
235
236 1/2 8 if (base_metrics->useful_time > 0) {
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
237
238 1/3 8 switch(thread_spd->options.talp_model) {
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
239 case TALP_MODEL_HYBRID_V1:
240 perf_metrics__compute_hybrid_model_v1(&metrics, base_metrics);
241 break;
242 8 case TALP_MODEL_HYBRID_V2:
243 8 perf_metrics__compute_hybrid_model_v2(&metrics, base_metrics);
244 8 break;
245 };
246 }
247
248 /* Initialize structure */
249 8 *pop_metrics = (const dlb_pop_metrics_t) {
250 8 .num_cpus = base_metrics->num_cpus,
251 8 .num_mpi_ranks = base_metrics->num_mpi_ranks,
252 8 .num_nodes = base_metrics->num_nodes,
253 8 .avg_cpus = base_metrics->avg_cpus,
254 8 .cycles = base_metrics->cycles,
255 8 .instructions = base_metrics->instructions,
256 8 .num_measurements = base_metrics->num_measurements,
257 8 .num_mpi_calls = base_metrics->num_mpi_calls,
258 8 .num_omp_parallels = base_metrics->num_omp_parallels,
259 8 .num_omp_tasks = base_metrics->num_omp_tasks,
260 8 .elapsed_time = base_metrics->elapsed_time,
261 8 .useful_time = base_metrics->useful_time,
262 8 .mpi_time = base_metrics->mpi_time,
263 8 .omp_load_imbalance_time = base_metrics->omp_load_imbalance_time,
264 8 .omp_scheduling_time = base_metrics->omp_scheduling_time,
265 8 .omp_serialization_time = base_metrics->omp_serialization_time,
266 8 .useful_normd_app = base_metrics->useful_normd_app,
267 8 .mpi_normd_app = base_metrics->mpi_normd_app,
268 8 .max_useful_normd_proc = base_metrics->max_useful_normd_proc,
269 8 .max_useful_normd_node = base_metrics->max_useful_normd_node,
270 8 .mpi_normd_of_max_useful = base_metrics->mpi_normd_of_max_useful,
271 8 .parallel_efficiency = metrics.parallel_efficiency,
272 8 .mpi_parallel_efficiency = metrics.mpi_parallel_efficiency,
273 8 .mpi_communication_efficiency = metrics.mpi_communication_efficiency,
274 8 .mpi_load_balance = metrics.mpi_load_balance,
275 8 .mpi_load_balance_in = metrics.mpi_load_balance_in,
276 8 .mpi_load_balance_out = metrics.mpi_load_balance_out,
277 8 .omp_parallel_efficiency = metrics.omp_parallel_efficiency,
278 8 .omp_load_balance = metrics.omp_load_balance,
279 8 .omp_scheduling_efficiency = metrics.omp_scheduling_efficiency,
280 8 .omp_serialization_efficiency = metrics.omp_serialization_efficiency,
281 };
282 8 snprintf(pop_metrics->name, DLB_MONITOR_NAME_MAX, "%s", monitor_name);
283 8 }
284
285 #endif /* PERF_METRICS_H */
286
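
Note (added for illustration, not part of the covered header): the pure-MPI model efficiencies above factor multiplicatively, i.e. parallel_efficiency = communication_efficiency * load_balance and load_balance = lb_in * lb_out, and the hybrid model v1 keeps the analogous property parallel_efficiency = mpi_parallel_efficiency * omp_parallel_efficiency, which v2 deliberately gives up as its comment notes. The sketch below is a hypothetical standalone program (name, sample times and helper values are all invented) that re-implements those formulas without including the header or depending on DLB, only to make the identities concrete.

    /* check_pop_identities.c (hypothetical): checks the multiplicative
     * structure of the POP formulas from perf_metrics.h with sample numbers. */
    #include <assert.h>
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Invented pure-MPI measurements for 2 nodes with 4 CPUs each */
        int     num_cpus        = 8;
        int     num_nodes       = 2;
        int64_t elapsed_time    = 1000;  /* elapsed (wall-clock) time          */
        int64_t elapsed_useful  = 900;   /* elapsed useful time                */
        int64_t app_sum_useful  = 6400;  /* useful time summed over the app    */
        int64_t node_sum_useful = 3400;  /* useful time summed over one node   */

        /* Same formulas as perf_metrics__compute_mpi_model */
        float pe  = (float)app_sum_useful / (elapsed_time * num_cpus);
        float ce  = (float)elapsed_useful / elapsed_time;
        float lb  = (float)app_sum_useful / (elapsed_useful * num_cpus);
        float lbi = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus);
        float lbo = (float)app_sum_useful / (node_sum_useful * num_nodes);

        printf("PE=%.3f CE=%.3f LB=%.3f LBin=%.3f LBout=%.3f\n", pe, ce, lb, lbi, lbo);
        assert(fabsf(pe - ce * lb)   < 1e-5f);  /* PE = CE * LB      */
        assert(fabsf(lb - lbi * lbo) < 1e-5f);  /* LB = LBin * LBout */

        /* Invented hybrid times (summed over all processes), v1 formulas */
        int64_t useful  = 6400, mpi = 800;
        int64_t omp_lb  = 400, omp_sch = 200, omp_ser = 200;
        int64_t sum_active         = useful + mpi + omp_lb + omp_sch + omp_ser;
        int64_t sum_active_not_omp = useful + mpi;

        float hyb_pe  = (float)useful / sum_active;
        float hyb_mpe = (float)useful / sum_active_not_omp;
        float hyb_ope = (float)sum_active_not_omp / sum_active;
        assert(fabsf(hyb_pe - hyb_mpe * hyb_ope) < 1e-5f);  /* v1: PE = MPE * OPE */

        return 0;
    }

With these sample values the program prints PE=0.800, CE=0.900, LB=0.889, LBin=0.944 and LBout=0.941, and both products match to within float rounding; compile with something like "cc check_pop_identities.c -lm" to run the checks.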