Line | Branch | Exec | Source |
---|---|---|---|
1 | /*********************************************************************************/ | ||
2 | /* Copyright 2009-2024 Barcelona Supercomputing Center */ | ||
3 | /* */ | ||
4 | /* This file is part of the DLB library. */ | ||
5 | /* */ | ||
6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
9 | /* (at your option) any later version. */ | ||
10 | /* */ | ||
11 | /* DLB is distributed in the hope that it will be useful, */ | ||
12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
14 | /* GNU Lesser General Public License for more details. */ | ||
15 | /* */ | ||
16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
18 | /*********************************************************************************/ | ||
19 | |||
20 | #ifndef PERF_METRICS_H | ||
21 | #define PERF_METRICS_H | ||
22 | |||
#include "LB_core/spd.h"
#include "support/options.h"

#include <stdint.h>
#include <stdio.h>
27 | |||
28 | |||
29 | /*********************************************************************************/ | ||
30 | /* POP metrics - pure MPI model */ | ||
31 | /*********************************************************************************/ | ||
32 | |||
/* POP metrics for pure MPI executions */
typedef struct perf_metrics_mpi_t {
    float parallel_efficiency;      /* useful time / (elapsed time * num. CPUs) */
    float communication_efficiency; /* max useful time / elapsed time */
    float load_balance;             /* avg useful time / max useful time */
    float lb_in;                    /* load balance within the node */
    float lb_out;                   /* load balance across nodes */
} perf_metrics_mpi_t;

/* Compute POP metrics for the MPI model.
 * (This function is actually not used anywhere)
 *
 * metrics:          [out] computed efficiencies, all 0 if elapsed_time == 0
 * num_cpus:         total number of CPUs of the application
 * num_nodes:        number of nodes of the application
 * elapsed_time:     wall-clock time of the measurement
 * elapsed_useful:   elapsed time spent in useful (non-MPI) computation
 * app_sum_useful:   sum of useful times of all processes
 * node_sum_useful:  sum of useful times of the processes in one node
 */
static inline void perf_metrics__compute_mpi_model(
        perf_metrics_mpi_t *metrics,
        int num_cpus,
        int num_nodes,
        int64_t elapsed_time,
        int64_t elapsed_useful,
        int64_t app_sum_useful,
        int64_t node_sum_useful) {

    if (elapsed_time > 0) {
        *metrics = (const perf_metrics_mpi_t) {
            .parallel_efficiency = (float)app_sum_useful / (elapsed_time * num_cpus),
            .communication_efficiency = (float)elapsed_useful / elapsed_time,
            .load_balance = (float)app_sum_useful / (elapsed_useful * num_cpus),
            .lb_in = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus),
            .lb_out = (float)app_sum_useful / (node_sum_useful * num_nodes),
        };
    } else {
        /* Fix: '{}' is an empty initializer, only valid in C23 or as a GNU
         * extension; '{0}' is the portable zero-initializer. */
        *metrics = (const perf_metrics_mpi_t) {0};
    }
}
65 | |||
66 | /* Compute POP metrics for the MPI model, but with some inferred values: | ||
67 | * (Only useful for node metrics) */ | ||
68 | 5 | static inline void perf_metrics__infer_mpi_model( | |
69 | perf_metrics_mpi_t *metrics, | ||
70 | int processes_per_node, | ||
71 | int64_t node_sum_useful, | ||
72 | int64_t node_sum_mpi, | ||
73 | int64_t max_useful_time) { | ||
74 | |||
75 | 5 | int64_t elapsed_time = (node_sum_useful + node_sum_mpi) / processes_per_node; | |
76 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | if (elapsed_time > 0) { |
77 | 5 | *metrics = (const perf_metrics_mpi_t) { | |
78 | 5 | .parallel_efficiency = (float)node_sum_useful / (node_sum_useful + node_sum_mpi), | |
79 | 5 | .communication_efficiency = (float)max_useful_time / elapsed_time, | |
80 | 5 | .load_balance = ((float)node_sum_useful / processes_per_node) / max_useful_time, | |
81 | }; | ||
82 | } else { | ||
83 | ✗ | *metrics = (const perf_metrics_mpi_t) {}; | |
84 | } | ||
85 | 5 | } | |
86 | |||
87 | |||
88 | /*********************************************************************************/ | ||
89 | /* POP metrics - hybrid MPI + OpenMP model */ | ||
90 | /*********************************************************************************/ | ||
91 | |||
/* Internal struct to contain everything that's needed to actually construct a
 * dlb_pop_metrics_t. Can be thought of as an abstract to the app_reduction_t
 * combining serial and parallel program flow. Everything in here is coming
 * directly from measurement, or computed in an MPI reduction. */
typedef struct pop_base_metrics_t {
    /* Resources */
    int num_cpus;                   /* total number of CPUs of the application */
    int num_mpi_ranks;              /* number of MPI processes */
    int num_nodes;                  /* number of nodes used by the application */
    float avg_cpus;                 /* average number of CPUs during the measurement */
    /* Hardware counters */
    double cycles;
    double instructions;
    /* Statistics */
    int64_t num_measurements;
    int64_t num_mpi_calls;
    int64_t num_omp_parallels;
    int64_t num_omp_tasks;
    /* Sum of times among all processes */
    /* NOTE(review): the time unit is not visible in this header -- presumably
     * nanoseconds; confirm against the producer in DLB_talp.c */
    int64_t elapsed_time;
    int64_t useful_time;
    int64_t mpi_time;
    int64_t omp_load_imbalance_time;
    int64_t omp_scheduling_time;
    int64_t omp_serialization_time;
    /* Normalized times by the number of assigned CPUs */
    double useful_normd_app;        /* Useful time normalized by num. CPUs at application level */
    double mpi_normd_app;           /* MPI time normalized by num. CPUs at application level */
    double max_useful_normd_proc;   /* Max value of useful times normalized at process level */
    double max_useful_normd_node;   /* Max value of useful times normalized at node level */
    double mpi_normd_of_max_useful; /* MPI time normalized at process level of the process with
                                       the max useful time */
} pop_base_metrics_t;
125 | |||
/* Computed efficiency metrics for the POP hybrid model.
 * Filled in by perf_metrics__compute_hybrid_model_v1/v2 below; see those
 * functions for the exact formula behind each field. */
typedef struct perf_metrics_hybrid_t {
    float parallel_efficiency;           /* useful time over total active time */
    float mpi_parallel_efficiency;       /* MPI-level component of the parallel eff. */
    float mpi_communication_efficiency;
    float mpi_load_balance;
    float mpi_load_balance_in;           /* MPI load balance within the node */
    float mpi_load_balance_out;          /* MPI load balance across nodes */
    float omp_parallel_efficiency;       /* OpenMP-level component of the parallel eff. */
    float omp_load_balance;
    float omp_scheduling_efficiency;
    float omp_serialization_efficiency;
} perf_metrics_hybrid_t;
139 | |||
140 | /* Compute POP metrics for the hybrid MPI + OpenMP model | ||
141 | * (Ver. 1: All metrics are multiplicative, but some of them are > 1) */ | ||
142 | ✗ | static inline void perf_metrics__compute_hybrid_model_v1( | |
143 | perf_metrics_hybrid_t *metrics, | ||
144 | const pop_base_metrics_t *base_metrics) { | ||
145 | |||
146 | ✗ | int64_t useful_time = base_metrics->useful_time; | |
147 | ✗ | int64_t mpi_time = base_metrics->mpi_time; | |
148 | ✗ | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
149 | ✗ | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
150 | ✗ | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
151 | ✗ | double useful_normd_app = base_metrics->useful_normd_app; | |
152 | ✗ | double mpi_normd_app = base_metrics->mpi_normd_app; | |
153 | ✗ | double max_useful_normd_proc = base_metrics->max_useful_normd_proc; | |
154 | ✗ | double max_useful_normd_node = base_metrics->max_useful_normd_node; | |
155 | |||
156 | /* Active is the union of all times (CPU not disabled) */ | ||
157 | ✗ | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
158 | omp_scheduling_time + omp_serialization_time; | ||
159 | |||
160 | /* Equivalent to all CPU time if OMP was not present */ | ||
161 | ✗ | int64_t sum_active_not_omp = useful_time + mpi_time; | |
162 | |||
163 | /* Compute output metrics */ | ||
164 | ✗ | *metrics = (const perf_metrics_hybrid_t) { | |
165 | ✗ | .parallel_efficiency = (float)useful_time / sum_active, | |
166 | ✗ | .mpi_parallel_efficiency = (float)useful_time / sum_active_not_omp, | |
167 | .mpi_communication_efficiency = | ||
168 | ✗ | max_useful_normd_proc / (useful_normd_app + mpi_normd_app), | |
169 | ✗ | .mpi_load_balance = useful_normd_app / max_useful_normd_proc, | |
170 | ✗ | .mpi_load_balance_in = max_useful_normd_node / max_useful_normd_proc, | |
171 | ✗ | .mpi_load_balance_out = useful_normd_app / max_useful_normd_node, | |
172 | ✗ | .omp_parallel_efficiency = (float)sum_active_not_omp / sum_active, | |
173 | ✗ | .omp_load_balance = (float)(sum_active_not_omp + omp_serialization_time) | |
174 | ✗ | / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time), | |
175 | .omp_scheduling_efficiency = | ||
176 | ✗ | (float)(sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time) | |
177 | ✗ | / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time | |
178 | ✗ | + omp_scheduling_time), | |
179 | ✗ | .omp_serialization_efficiency = (float)sum_active_not_omp | |
180 | ✗ | / (sum_active_not_omp + omp_serialization_time), | |
181 | }; | ||
182 | } | ||
183 | |||
184 | /* Compute POP metrics for the hybrid MPI + OpenMP model (Ver. 2: PE != MPE * OPE) */ | ||
185 | 8 | static inline void perf_metrics__compute_hybrid_model_v2( | |
186 | perf_metrics_hybrid_t *metrics, | ||
187 | const pop_base_metrics_t *base_metrics) { | ||
188 | |||
189 | 8 | int64_t useful_time = base_metrics->useful_time; | |
190 | 8 | int64_t mpi_time = base_metrics->mpi_time; | |
191 | 8 | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
192 | 8 | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
193 | 8 | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
194 | 8 | double useful_normd_app = base_metrics->useful_normd_app; | |
195 | 8 | double max_useful_normd_proc = base_metrics->max_useful_normd_proc; | |
196 | 8 | double max_useful_normd_node = base_metrics->max_useful_normd_node; | |
197 | 8 | double mpi_normd_of_max_useful = base_metrics->mpi_normd_of_max_useful; | |
198 | |||
199 | /* Active is the union of all times (CPU not disabled) */ | ||
200 | 8 | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
201 | omp_scheduling_time + omp_serialization_time; | ||
202 | |||
203 | /* Equivalent to all CPU time if OMP was not present */ | ||
204 | 8 | int64_t sum_active_not_omp = useful_time + mpi_time; | |
205 | |||
206 | /* Compute output metrics */ | ||
207 | 8 | *metrics = (const perf_metrics_hybrid_t) { | |
208 | 8 | .parallel_efficiency = (float)useful_time / sum_active, | |
209 | .mpi_parallel_efficiency = | ||
210 | 8 | useful_normd_app / (max_useful_normd_proc + mpi_normd_of_max_useful), | |
211 | .mpi_communication_efficiency = | ||
212 | 8 | max_useful_normd_proc / (max_useful_normd_proc + mpi_normd_of_max_useful), | |
213 | 8 | .mpi_load_balance = useful_normd_app / max_useful_normd_proc, | |
214 | 8 | .mpi_load_balance_in = max_useful_normd_node / max_useful_normd_proc, | |
215 | 8 | .mpi_load_balance_out = useful_normd_app / max_useful_normd_node, | |
216 | 8 | .omp_parallel_efficiency = (float)sum_active_not_omp / sum_active, | |
217 | 8 | .omp_load_balance = (float)(sum_active_not_omp + omp_serialization_time) | |
218 | 8 | / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time), | |
219 | .omp_scheduling_efficiency = | ||
220 | 8 | (float)(sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time) | |
221 | 8 | / (sum_active_not_omp + omp_serialization_time + omp_load_imbalance_time | |
222 | 8 | + omp_scheduling_time), | |
223 | 8 | .omp_serialization_efficiency = (float)sum_active_not_omp | |
224 | 8 | / (sum_active_not_omp + omp_serialization_time), | |
225 | }; | ||
226 | 8 | } | |
227 | |||
228 | /* Actual 'public' function. DLB_talp.c invokes this function with base_metrics | ||
229 | * as input, and the computed pop_metrics as output. */ | ||
230 | 8 | static inline void talp_base_metrics_to_pop_metrics(const char *monitor_name, | |
231 | const pop_base_metrics_t *base_metrics, dlb_pop_metrics_t *pop_metrics) { | ||
232 | |||
233 | /* Compute POP metrics */ | ||
234 | 8 | perf_metrics_hybrid_t metrics = {0}; | |
235 | |||
236 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | if (base_metrics->useful_time > 0) { |
237 | |||
238 |
1/3✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | switch(thread_spd->options.talp_model) { |
239 | ✗ | case TALP_MODEL_HYBRID_V1: | |
240 | ✗ | perf_metrics__compute_hybrid_model_v1(&metrics, base_metrics); | |
241 | ✗ | break; | |
242 | 8 | case TALP_MODEL_HYBRID_V2: | |
243 | 8 | perf_metrics__compute_hybrid_model_v2(&metrics, base_metrics); | |
244 | 8 | break; | |
245 | }; | ||
246 | } | ||
247 | |||
248 | /* Initialize structure */ | ||
249 | 8 | *pop_metrics = (const dlb_pop_metrics_t) { | |
250 | 8 | .num_cpus = base_metrics->num_cpus, | |
251 | 8 | .num_mpi_ranks = base_metrics->num_mpi_ranks, | |
252 | 8 | .num_nodes = base_metrics->num_nodes, | |
253 | 8 | .avg_cpus = base_metrics->avg_cpus, | |
254 | 8 | .cycles = base_metrics->cycles, | |
255 | 8 | .instructions = base_metrics->instructions, | |
256 | 8 | .num_measurements = base_metrics->num_measurements, | |
257 | 8 | .num_mpi_calls = base_metrics->num_mpi_calls, | |
258 | 8 | .num_omp_parallels = base_metrics->num_omp_parallels, | |
259 | 8 | .num_omp_tasks = base_metrics->num_omp_tasks, | |
260 | 8 | .elapsed_time = base_metrics->elapsed_time, | |
261 | 8 | .useful_time = base_metrics->useful_time, | |
262 | 8 | .mpi_time = base_metrics->mpi_time, | |
263 | 8 | .omp_load_imbalance_time = base_metrics->omp_load_imbalance_time, | |
264 | 8 | .omp_scheduling_time = base_metrics->omp_scheduling_time, | |
265 | 8 | .omp_serialization_time = base_metrics->omp_serialization_time, | |
266 | 8 | .useful_normd_app = base_metrics->useful_normd_app, | |
267 | 8 | .mpi_normd_app = base_metrics->mpi_normd_app, | |
268 | 8 | .max_useful_normd_proc = base_metrics->max_useful_normd_proc, | |
269 | 8 | .max_useful_normd_node = base_metrics->max_useful_normd_node, | |
270 | 8 | .mpi_normd_of_max_useful = base_metrics->mpi_normd_of_max_useful, | |
271 | 8 | .parallel_efficiency = metrics.parallel_efficiency, | |
272 | 8 | .mpi_parallel_efficiency = metrics.mpi_parallel_efficiency, | |
273 | 8 | .mpi_communication_efficiency = metrics.mpi_communication_efficiency, | |
274 | 8 | .mpi_load_balance = metrics.mpi_load_balance, | |
275 | 8 | .mpi_load_balance_in = metrics.mpi_load_balance_in, | |
276 | 8 | .mpi_load_balance_out = metrics.mpi_load_balance_out, | |
277 | 8 | .omp_parallel_efficiency = metrics.omp_parallel_efficiency, | |
278 | 8 | .omp_load_balance = metrics.omp_load_balance, | |
279 | 8 | .omp_scheduling_efficiency = metrics.omp_scheduling_efficiency, | |
280 | 8 | .omp_serialization_efficiency = metrics.omp_serialization_efficiency, | |
281 | }; | ||
282 | 8 | snprintf(pop_metrics->name, DLB_MONITOR_NAME_MAX, "%s", monitor_name); | |
283 | 8 | } | |
284 | |||
285 | #endif /* PERF_METRICS_H */ | ||
286 |