| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2025 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/perf_metrics.h" | ||
| 21 | |||
| 22 | #include "LB_core/spd.h" | ||
| 23 | #include "apis/dlb_talp.h" | ||
| 24 | #include "support/debug.h" | ||
| 25 | #ifdef MPI_LIB | ||
| 26 | #include "mpi/mpi_core.h" | ||
| 27 | #endif | ||
| 28 | |||
| 29 | #include <stddef.h> | ||
| 30 | #include <stdio.h> | ||
| 31 | |||
| 32 | /*********************************************************************************/ | ||
| 33 | /* POP metrics - pure MPI model */ | ||
| 34 | /*********************************************************************************/ | ||
| 35 | |||
| 36 | /* Compute POP metrics for the MPI model | ||
| 37 | * (This function is actually not used anywhere) */ | ||
| 38 | static inline void perf_metrics__compute_mpi_model( | ||
| 39 | perf_metrics_mpi_t *metrics, | ||
| 40 | int num_cpus, | ||
| 41 | int num_nodes, | ||
| 42 | int64_t elapsed_time, | ||
| 43 | int64_t elapsed_useful, | ||
| 44 | int64_t app_sum_useful, | ||
| 45 | int64_t node_sum_useful) __attribute__((unused)); | ||
| 46 | static inline void perf_metrics__compute_mpi_model( | ||
| 47 | perf_metrics_mpi_t *metrics, | ||
| 48 | int num_cpus, | ||
| 49 | int num_nodes, | ||
| 50 | int64_t elapsed_time, | ||
| 51 | int64_t elapsed_useful, | ||
| 52 | int64_t app_sum_useful, | ||
| 53 | int64_t node_sum_useful) { | ||
| 54 | |||
| 55 | if (elapsed_time > 0) { | ||
| 56 | *metrics = (const perf_metrics_mpi_t) { | ||
| 57 | .parallel_efficiency = (float)app_sum_useful / (elapsed_time * num_cpus), | ||
| 58 | .communication_efficiency = (float)elapsed_useful / elapsed_time, | ||
| 59 | .load_balance = (float)app_sum_useful / (elapsed_useful * num_cpus), | ||
| 60 | .lb_in = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus), | ||
| 61 | .lb_out = (float)app_sum_useful / (node_sum_useful * num_nodes), | ||
| 62 | }; | ||
| 63 | } else { | ||
| 64 | *metrics = (const perf_metrics_mpi_t) {}; | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | /* Compute POP metrics for the MPI model, but with some inferred values: | ||
| 69 | * (Only useful for node metrics) */ | ||
| 70 | 6 | void perf_metrics__infer_mpi_model( | |
| 71 | perf_metrics_mpi_t *metrics, | ||
| 72 | int processes_per_node, | ||
| 73 | int64_t node_sum_useful, | ||
| 74 | int64_t node_sum_mpi, | ||
| 75 | int64_t max_useful_time) { | ||
| 76 | |||
| 77 | 6 | int64_t elapsed_time = (node_sum_useful + node_sum_mpi) / processes_per_node; | |
| 78 | 1/2 ✓ Branch 0 taken 6 times. ✗ Branch 1 not taken. | 6 | if (elapsed_time > 0) { |
| 79 | 6 | *metrics = (const perf_metrics_mpi_t) { | |
| 80 | 6 | .parallel_efficiency = (float)node_sum_useful / (node_sum_useful + node_sum_mpi), | |
| 81 | 6 | .communication_efficiency = (float)max_useful_time / elapsed_time, | |
| 82 | 6 | .load_balance = ((float)node_sum_useful / processes_per_node) / max_useful_time, | |
| 83 | }; | ||
| 84 | } else { | ||
| 85 | ✗ | *metrics = (const perf_metrics_mpi_t) {}; | |
| 86 | } | ||
| 87 | 6 | } | |
| 88 | |||
| 89 | |||
| 90 | /*********************************************************************************/ | ||
| 91 | /* POP metrics - hybrid MPI + OpenMP model */ | ||
| 92 | /*********************************************************************************/ | ||
| 93 | |||
| 94 | /* Computed efficiency metrics for the POP hybrid model */ | ||
| 95 | typedef struct perf_metrics_hybrid_t { | ||
| 96 | float parallel_efficiency; | ||
| 97 | float mpi_parallel_efficiency; | ||
| 98 | float mpi_communication_efficiency; | ||
| 99 | float mpi_load_balance; | ||
| 100 | float mpi_load_balance_in; | ||
| 101 | float mpi_load_balance_out; | ||
| 102 | float omp_parallel_efficiency; | ||
| 103 | float omp_load_balance; | ||
| 104 | float omp_scheduling_efficiency; | ||
| 105 | float omp_serialization_efficiency; | ||
| 106 | float device_offload_efficiency; | ||
| 107 | float gpu_parallel_efficiency; | ||
| 108 | float gpu_load_balance; | ||
| 109 | float gpu_communication_efficiency; | ||
| 110 | float gpu_orchestration_efficiency; | ||
| 111 | } perf_metrics_hybrid_t; | ||
| 112 | |||
| 113 | |||
| 114 | /* Compute POP metrics for the hybrid MPI + OpenMP model | ||
| 115 | * (Ver. 1: All metrics are multiplicative, but some of them are > 1) */ | ||
| 116 | ✗ | static inline void perf_metrics__compute_hybrid_model_v1( | |
| 117 | perf_metrics_hybrid_t *metrics, | ||
| 118 | const pop_base_metrics_t *base_metrics) { | ||
| 119 | |||
| 120 | ✗ | int num_cpus = base_metrics->num_cpus; | |
| 121 | ✗ | int num_gpus = base_metrics->num_gpus; | |
| 122 | ✗ | int64_t elapsed_time = base_metrics->elapsed_time; | |
| 123 | ✗ | int64_t useful_time = base_metrics->useful_time; | |
| 124 | ✗ | int64_t mpi_time = base_metrics->mpi_time; | |
| 125 | ✗ | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
| 126 | ✗ | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
| 127 | ✗ | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
| 128 | ✗ | int64_t gpu_runtime_time = base_metrics->gpu_runtime_time; | |
| 129 | ✗ | double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc; | |
| 130 | ✗ | double min_mpi_normd_node = base_metrics->min_mpi_normd_node; | |
| 131 | ✗ | int64_t gpu_useful_time = base_metrics->gpu_useful_time; | |
| 132 | ✗ | int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time; | |
| 133 | ✗ | int64_t max_gpu_active_time = base_metrics->max_gpu_active_time; | |
| 134 | |||
| 135 | /* Active is the union of all times (while CPU is not disabled) */ | ||
| 136 | ✗ | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
| 137 | ✗ | omp_scheduling_time + omp_serialization_time + gpu_runtime_time; | |
| 138 | |||
| 139 | /* Equivalent to all CPU time if OMP was not present */ | ||
| 140 | ✗ | int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time; | |
| 141 | |||
| 142 | /* Equivalent to all CPU time if GPU was not present */ | ||
| 143 | ✗ | int64_t sum_active_non_gpu = sum_active - gpu_runtime_time; | |
| 144 | |||
| 145 | /* MPI time normalized at application level */ | ||
| 146 | ✗ | double mpi_normd_app = (double)mpi_time / num_cpus; | |
| 147 | |||
| 148 | /* Non-MPI time normalized at application level */ | ||
| 149 | ✗ | double non_mpi_normd_app = elapsed_time - mpi_normd_app; | |
| 150 | |||
| 151 | /* Max value of non-MPI times normalized at process level */ | ||
| 152 | ✗ | double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc; | |
| 153 | |||
| 154 | /* Max value of non-MPI times normalized at node level */ | ||
| 155 | ✗ | double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node; | |
| 156 | |||
| 157 | /* All Device time */ | ||
| 158 | ✗ | int64_t sum_device_time = elapsed_time * num_gpus; | |
| 159 | |||
| 160 | /* Compute output metrics */ | ||
| 161 | ✗ | *metrics = (const perf_metrics_hybrid_t) { | |
| 162 | ✗ | .parallel_efficiency = (float)useful_time / sum_active, | |
| 163 | ✗ | .mpi_parallel_efficiency = (float)useful_time / (useful_time + mpi_time), | |
| 164 | .mpi_communication_efficiency = | ||
| 165 | ✗ | max_non_mpi_normd_proc / (non_mpi_normd_app + mpi_normd_app), | |
| 166 | ✗ | .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc, | |
| 167 | ✗ | .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc, | |
| 168 | ✗ | .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node, | |
| 169 | ✗ | .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active, | |
| 170 | ✗ | .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time) | |
| 171 | ✗ | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time), | |
| 172 | .omp_scheduling_efficiency = | ||
| 173 | ✗ | (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time) | |
| 174 | ✗ | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time | |
| 175 | ✗ | + omp_scheduling_time), | |
| 176 | ✗ | .omp_serialization_efficiency = (float)sum_active_non_omp | |
| 177 | ✗ | / (sum_active_non_omp + omp_serialization_time), | |
| 178 | ✗ | .device_offload_efficiency = (float)sum_active_non_gpu / sum_active, | |
| 179 | .gpu_parallel_efficiency = sum_device_time == 0 ? 0 | ||
| 180 | ✗ | : (float)gpu_useful_time / sum_device_time, | |
| 181 | .gpu_load_balance = sum_device_time == 0 ? 0 | ||
| 182 | ✗ | : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus), | |
| 183 | .gpu_communication_efficiency = sum_device_time == 0 ? 0 | ||
| 184 | ✗ | : (float)max_gpu_useful_time / max_gpu_active_time, | |
| 185 | .gpu_orchestration_efficiency = sum_device_time == 0 ? 0 | ||
| 186 | ✗ | : (float)max_gpu_active_time / elapsed_time, | |
| 187 | }; | ||
| 188 | } | ||
| 189 | |||
| 190 | /* Compute POP metrics for the hybrid MPI + OpenMP model (Ver. 2: PE != MPE * OPE) */ | ||
| 191 | 15 | static inline void perf_metrics__compute_hybrid_model_v2( | |
| 192 | perf_metrics_hybrid_t *metrics, | ||
| 193 | const pop_base_metrics_t *base_metrics) { | ||
| 194 | |||
| 195 | 15 | int num_cpus = base_metrics->num_cpus; | |
| 196 | 15 | int num_gpus = base_metrics->num_gpus; | |
| 197 | 15 | int64_t elapsed_time = base_metrics->elapsed_time; | |
| 198 | 15 | int64_t useful_time = base_metrics->useful_time; | |
| 199 | 15 | int64_t mpi_time = base_metrics->mpi_time; | |
| 200 | 15 | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
| 201 | 15 | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
| 202 | 15 | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
| 203 | 15 | int64_t gpu_runtime_time = base_metrics->gpu_runtime_time; | |
| 204 | 15 | double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc; | |
| 205 | 15 | double min_mpi_normd_node = base_metrics->min_mpi_normd_node; | |
| 206 | 15 | int64_t gpu_useful_time = base_metrics->gpu_useful_time; | |
| 207 | 15 | int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time; | |
| 208 | 15 | int64_t max_gpu_active_time = base_metrics->max_gpu_active_time; | |
| 209 | |||
| 210 | /* Active is the union of all times (CPU not disabled) */ | ||
| 211 | 15 | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
| 212 | 15 | omp_scheduling_time + omp_serialization_time + gpu_runtime_time; | |
| 213 | |||
| 214 | /* Equivalent to all CPU time if OMP was not present */ | ||
| 215 | 15 | int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time; | |
| 216 | |||
| 217 | /* CPU time of OpenMP not useful */ | ||
| 218 | 15 | int64_t sum_omp_not_useful = omp_load_imbalance_time + omp_scheduling_time + | |
| 219 | omp_serialization_time; | ||
| 220 | |||
| 221 | /* MPI time normalized at application level */ | ||
| 222 | 15 | double mpi_normd_app = (double)mpi_time / num_cpus; | |
| 223 | |||
| 224 | /* Non-MPI time normalized at application level */ | ||
| 225 | 15 | double non_mpi_normd_app = elapsed_time - mpi_normd_app; | |
| 226 | |||
| 227 | /* Max value of non-MPI times normalized at process level */ | ||
| 228 | 15 | double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc; | |
| 229 | |||
| 230 | /* Max value of non-MPI times normalized at node level */ | ||
| 231 | 15 | double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node; | |
| 232 | |||
| 233 | /* All Device time */ | ||
| 234 | 15 | int64_t sum_device_time = elapsed_time * num_gpus; | |
| 235 | |||
| 236 | /* Compute output metrics */ | ||
| 237 | 15 | *metrics = (const perf_metrics_hybrid_t) { | |
| 238 | 15 | .parallel_efficiency = (float)useful_time / sum_active, | |
| 239 | 15 | .mpi_parallel_efficiency = non_mpi_normd_app / elapsed_time, | |
| 240 | 15 | .mpi_communication_efficiency = max_non_mpi_normd_proc / elapsed_time, | |
| 241 | 15 | .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc, | |
| 242 | 15 | .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc, | |
| 243 | 15 | .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node, | |
| 244 | 15 | .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active, | |
| 245 | 15 | .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time) | |
| 246 | 15 | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time), | |
| 247 | .omp_scheduling_efficiency = | ||
| 248 | 15 | (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time) | |
| 249 | 15 | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time | |
| 250 | 15 | + omp_scheduling_time), | |
| 251 | 15 | .omp_serialization_efficiency = (float)sum_active_non_omp | |
| 252 | 15 | / (sum_active_non_omp + omp_serialization_time), | |
| 253 | 15 | .device_offload_efficiency = (float)(useful_time + sum_omp_not_useful) | |
| 254 | 15 | / (useful_time + sum_omp_not_useful + gpu_runtime_time), | |
| 255 | .gpu_parallel_efficiency = sum_device_time == 0 ? 0 | ||
| 256 | 2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 14 times. | 15 | : (float)gpu_useful_time / sum_device_time, |
| 257 | .gpu_load_balance = sum_device_time == 0 ? 0 | ||
| 258 | 2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 14 times. | 15 | : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus), |
| 259 | .gpu_communication_efficiency = sum_device_time == 0 ? 0 | ||
| 260 | 2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 14 times. | 15 | : (float)max_gpu_useful_time / max_gpu_active_time, |
| 261 | .gpu_orchestration_efficiency = sum_device_time == 0 ? 0 | ||
| 262 | 2/2 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 14 times. | 15 | : (float)max_gpu_active_time / elapsed_time, |
| 263 | }; | ||
| 264 | 15 | } | |
| 265 | |||
| 266 | #ifdef MPI_LIB | ||
| 267 | |||
| 268 | /* The following node and app reductions are needed to compute POP metrics: */ | ||
| 269 | |||
| 270 | /*** Node reduction ***/ | ||
| 271 | |||
| 272 | /* Data type to reduce among processes in node */ | ||
| 273 | typedef struct node_reduction_t { | ||
| 274 | bool node_used; | ||
| 275 | int cpus_node; | ||
| 276 | int64_t mpi_time; | ||
| 277 | } node_reduction_t; | ||
| 278 | |||
| 279 | /* Function called in the MPI node reduction */ | ||
| 280 | static void mpi_node_reduction_fn(void *invec, void *inoutvec, int *len, | ||
| 281 | MPI_Datatype *datatype) { | ||
| 282 | node_reduction_t *in = invec; | ||
| 283 | node_reduction_t *inout = inoutvec; | ||
| 284 | |||
| 285 | int _len = *len; | ||
| 286 | for (int i = 0; i < _len; ++i) { | ||
| 287 | if (in[i].node_used) { | ||
| 288 | inout[i].node_used = true; | ||
| 289 | inout[i].cpus_node += in[i].cpus_node; | ||
| 290 | inout[i].mpi_time += in[i].mpi_time; | ||
| 291 | } | ||
| 292 | } | ||
| 293 | } | ||
| 294 | |||
| 295 | /* Function to perform the reduction at node level */ | ||
| 296 | static void reduce_pop_metrics_node_reduction(node_reduction_t *node_reduction, | ||
| 297 | const dlb_monitor_t *monitor) { | ||
| 298 | |||
| 299 | const node_reduction_t node_reduction_send = { | ||
| 300 | .node_used = monitor->num_measurements > 0, | ||
| 301 | .cpus_node = monitor->num_cpus, | ||
| 302 | .mpi_time = monitor->mpi_time, | ||
| 303 | }; | ||
| 304 | |||
| 305 | /* MPI type: int64_t */ | ||
| 306 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 307 | |||
| 308 | /* MPI struct type: node_reduction_t */ | ||
| 309 | MPI_Datatype mpi_node_reduction_type; | ||
| 310 | { | ||
| 311 | int count = 3; | ||
| 312 | int blocklengths[] = {1, 1, 1}; | ||
| 313 | MPI_Aint displacements[] = { | ||
| 314 | offsetof(node_reduction_t, node_used), | ||
| 315 | offsetof(node_reduction_t, cpus_node), | ||
| 316 | offsetof(node_reduction_t, mpi_time)}; | ||
| 317 | MPI_Datatype types[] = {MPI_C_BOOL, MPI_INT, mpi_int64_type}; | ||
| 318 | MPI_Datatype tmp_type; | ||
| 319 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 320 | PMPI_Type_create_resized(tmp_type, 0, sizeof(node_reduction_t), | ||
| 321 | &mpi_node_reduction_type); | ||
| 322 | PMPI_Type_commit(&mpi_node_reduction_type); | ||
| 323 | } | ||
| 324 | |||
| 325 | /* Define MPI operation */ | ||
| 326 | MPI_Op node_reduction_op; | ||
| 327 | PMPI_Op_create(mpi_node_reduction_fn, true, &node_reduction_op); | ||
| 328 | |||
| 329 | /* MPI reduction */ | ||
| 330 | PMPI_Reduce(&node_reduction_send, node_reduction, 1, | ||
| 331 | mpi_node_reduction_type, node_reduction_op, | ||
| 332 | 0, getNodeComm()); | ||
| 333 | |||
| 334 | /* Free MPI types */ | ||
| 335 | PMPI_Type_free(&mpi_node_reduction_type); | ||
| 336 | PMPI_Op_free(&node_reduction_op); | ||
| 337 | } | ||
| 338 | |||
| 339 | /*** App reduction ***/ | ||
| 340 | |||
| 341 | /* Data type to reduce among processes in application */ | ||
| 342 | typedef struct app_reduction_t { | ||
| 343 | /* Resources */ | ||
| 344 | int num_cpus; | ||
| 345 | int num_nodes; | ||
| 346 | float avg_cpus; | ||
| 347 | int num_gpus; | ||
| 348 | /* Hardware Counters */ | ||
| 349 | double cycles; | ||
| 350 | double instructions; | ||
| 351 | /* Statistics */ | ||
| 352 | int64_t num_measurements; | ||
| 353 | int64_t num_mpi_calls; | ||
| 354 | int64_t num_omp_parallels; | ||
| 355 | int64_t num_omp_tasks; | ||
| 356 | int64_t num_gpu_runtime_calls; | ||
| 357 | /* Host Times */ | ||
| 358 | int64_t elapsed_time; | ||
| 359 | int64_t useful_time; | ||
| 360 | int64_t mpi_time; | ||
| 361 | int64_t omp_load_imbalance_time; | ||
| 362 | int64_t omp_scheduling_time; | ||
| 363 | int64_t omp_serialization_time; | ||
| 364 | int64_t gpu_runtime_time; | ||
| 365 | /* Host Normalized Times */ | ||
| 366 | double min_mpi_normd_proc; | ||
| 367 | double min_mpi_normd_node; | ||
| 368 | /* Device Times */ | ||
| 369 | int64_t gpu_useful_time; | ||
| 370 | int64_t gpu_communication_time; | ||
| 371 | int64_t gpu_inactive_time; | ||
| 372 | /* Device Max Times */ | ||
| 373 | int64_t max_gpu_useful_time; | ||
| 374 | int64_t max_gpu_active_time; | ||
| 375 | } app_reduction_t; | ||
| 376 | |||
| 377 | /* Function called in the MPI app reduction */ | ||
| 378 | static void mpi_reduction_fn(void *invec, void *inoutvec, int *len, | ||
| 379 | MPI_Datatype *datatype) { | ||
| 380 | app_reduction_t *in = invec; | ||
| 381 | app_reduction_t *inout = inoutvec; | ||
| 382 | |||
| 383 | int _len = *len; | ||
| 384 | for (int i = 0; i < _len; ++i) { | ||
| 385 | /* Resources */ | ||
| 386 | inout[i].num_cpus += in[i].num_cpus; | ||
| 387 | inout[i].num_nodes += in[i].num_nodes; | ||
| 388 | inout[i].avg_cpus += in[i].avg_cpus; | ||
| 389 | inout[i].num_gpus += in[i].num_gpus; | ||
| 390 | /* Hardware Counters */ | ||
| 391 | inout[i].cycles += in[i].cycles; | ||
| 392 | inout[i].instructions += in[i].instructions; | ||
| 393 | /* Statistics */ | ||
| 394 | inout[i].num_measurements += in[i].num_measurements; | ||
| 395 | inout[i].num_mpi_calls += in[i].num_mpi_calls; | ||
| 396 | inout[i].num_omp_parallels += in[i].num_omp_parallels; | ||
| 397 | inout[i].num_omp_tasks += in[i].num_omp_tasks; | ||
| 398 | inout[i].num_gpu_runtime_calls += in[i].num_gpu_runtime_calls; | ||
| 399 | /* Host Times */ | ||
| 400 | inout[i].elapsed_time = max_int64(inout[i].elapsed_time, in[i].elapsed_time); | ||
| 401 | inout[i].useful_time += in[i].useful_time; | ||
| 402 | inout[i].mpi_time += in[i].mpi_time; | ||
| 403 | inout[i].omp_load_imbalance_time += in[i].omp_load_imbalance_time; | ||
| 404 | inout[i].omp_scheduling_time += in[i].omp_scheduling_time; | ||
| 405 | inout[i].omp_serialization_time += in[i].omp_serialization_time; | ||
| 406 | inout[i].gpu_runtime_time += in[i].gpu_runtime_time; | ||
| 407 | |||
| 408 | /* Host Normalized Times */ | ||
| 409 | inout[i].min_mpi_normd_proc = | ||
| 410 | min_double_non_zero(inout[i].min_mpi_normd_proc, in[i].min_mpi_normd_proc); | ||
| 411 | inout[i].min_mpi_normd_node = | ||
| 412 | min_double_non_zero(inout[i].min_mpi_normd_node, in[i].min_mpi_normd_node); | ||
| 413 | |||
| 414 | /* Device Times */ | ||
| 415 | inout[i].gpu_useful_time += in[i].gpu_useful_time; | ||
| 416 | inout[i].gpu_communication_time += in[i].gpu_communication_time; | ||
| 417 | inout[i].gpu_inactive_time += in[i].gpu_inactive_time; | ||
| 418 | |||
| 419 | /* Device Max Times */ | ||
| 420 | inout[i].max_gpu_useful_time = | ||
| 421 | max_int64(inout[i].max_gpu_useful_time, in[i].max_gpu_useful_time); | ||
| 422 | inout[i].max_gpu_active_time = | ||
| 423 | max_int64(inout[i].max_gpu_active_time, in[i].max_gpu_active_time); | ||
| 424 | } | ||
| 425 | } | ||
| 426 | |||
| 427 | /* Function to perform the reduction at application level */ | ||
| 428 | static void reduce_pop_metrics_app_reduction(app_reduction_t *app_reduction, | ||
| 429 | const node_reduction_t *node_reduction, const dlb_monitor_t *monitor, | ||
| 430 | bool all_to_all) { | ||
| 431 | |||
| 432 | double min_mpi_normd_proc = monitor->num_cpus == 0 ? 0.0 | ||
| 433 | : (double)monitor->mpi_time / monitor->num_cpus; | ||
| 434 | double min_mpi_normd_node = _process_id != 0 ? 0.0 | ||
| 435 | : node_reduction->cpus_node == 0 ? 0.0 | ||
| 436 | : (double)node_reduction->mpi_time / node_reduction->cpus_node; | ||
| 437 | |||
| 438 | bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0); | ||
| 439 | |||
| 440 | const app_reduction_t app_reduction_send = { | ||
| 441 | /* Resources */ | ||
| 442 | .num_cpus = monitor->num_cpus, | ||
| 443 | .num_nodes = _process_id == 0 && node_reduction->node_used ? 1 : 0, | ||
| 444 | .avg_cpus = monitor->avg_cpus, | ||
| 445 | .num_gpus = have_gpus ? 1 : 0, | ||
| 446 | /* Hardware Counters */ | ||
| 447 | .cycles = (double)monitor->cycles, | ||
| 448 | .instructions = (double)monitor->instructions, | ||
| 449 | /* Statistics */ | ||
| 450 | .num_measurements = monitor->num_measurements, | ||
| 451 | .num_mpi_calls = monitor->num_mpi_calls, | ||
| 452 | .num_omp_parallels = monitor->num_omp_parallels, | ||
| 453 | .num_omp_tasks = monitor->num_omp_tasks, | ||
| 454 | .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls, | ||
| 455 | /* Host Times */ | ||
| 456 | .elapsed_time = monitor->elapsed_time, | ||
| 457 | .useful_time = monitor->useful_time, | ||
| 458 | .mpi_time = monitor->mpi_time, | ||
| 459 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | ||
| 460 | .omp_scheduling_time = monitor->omp_scheduling_time, | ||
| 461 | .omp_serialization_time = monitor->omp_serialization_time, | ||
| 462 | .gpu_runtime_time = monitor->gpu_runtime_time, | ||
| 463 | /* Host Normalized Times */ | ||
| 464 | .min_mpi_normd_proc = min_mpi_normd_proc, | ||
| 465 | .min_mpi_normd_node = min_mpi_normd_node, | ||
| 466 | /* Device Times */ | ||
| 467 | .gpu_useful_time = monitor->gpu_useful_time, | ||
| 468 | .gpu_communication_time = monitor->gpu_communication_time, | ||
| 469 | .gpu_inactive_time = monitor->gpu_inactive_time, | ||
| 470 | /* Device Max Times */ | ||
| 471 | .max_gpu_useful_time = monitor->gpu_useful_time, | ||
| 472 | .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time, | ||
| 473 | }; | ||
| 474 | |||
| 475 | /* MPI type: int64_t */ | ||
| 476 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 477 | |||
| 478 | /* MPI struct type: app_reduction_t */ | ||
| 479 | MPI_Datatype mpi_app_reduction_type; | ||
| 480 | { | ||
| 481 | int blocklengths[] = { | ||
| 482 | 1, 1, 1, 1, /* Resources */ | ||
| 483 | 1, 1, /* Hardware Counters */ | ||
| 484 | 1, 1, 1, 1, 1, /* Statistics */ | ||
| 485 | 1, 1, 1, 1, 1, 1, 1, /* Host Times */ | ||
| 486 | 1, 1, /* Host Normalized Times */ | ||
| 487 | 1, 1, 1, /* Device Times */ | ||
| 488 | 1, 1}; /* Device Max Times */ | ||
| 489 | |||
| 490 | enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])}; | ||
| 491 | |||
| 492 | MPI_Aint displacements[] = { | ||
| 493 | /* Resources */ | ||
| 494 | offsetof(app_reduction_t, num_cpus), | ||
| 495 | offsetof(app_reduction_t, num_nodes), | ||
| 496 | offsetof(app_reduction_t, avg_cpus), | ||
| 497 | offsetof(app_reduction_t, num_gpus), | ||
| 498 | /* Hardware Counters */ | ||
| 499 | offsetof(app_reduction_t, cycles), | ||
| 500 | offsetof(app_reduction_t, instructions), | ||
| 501 | /* Statistics */ | ||
| 502 | offsetof(app_reduction_t, num_measurements), | ||
| 503 | offsetof(app_reduction_t, num_mpi_calls), | ||
| 504 | offsetof(app_reduction_t, num_omp_parallels), | ||
| 505 | offsetof(app_reduction_t, num_omp_tasks), | ||
| 506 | offsetof(app_reduction_t, num_gpu_runtime_calls), | ||
| 507 | /* Host Times */ | ||
| 508 | offsetof(app_reduction_t, elapsed_time), | ||
| 509 | offsetof(app_reduction_t, useful_time), | ||
| 510 | offsetof(app_reduction_t, mpi_time), | ||
| 511 | offsetof(app_reduction_t, omp_load_imbalance_time), | ||
| 512 | offsetof(app_reduction_t, omp_scheduling_time), | ||
| 513 | offsetof(app_reduction_t, omp_serialization_time), | ||
| 514 | offsetof(app_reduction_t, gpu_runtime_time), | ||
| 515 | /* Normalized Times */ | ||
| 516 | offsetof(app_reduction_t, min_mpi_normd_proc), | ||
| 517 | offsetof(app_reduction_t, min_mpi_normd_node), | ||
| 518 | /* Device Times */ | ||
| 519 | offsetof(app_reduction_t, gpu_useful_time), | ||
| 520 | offsetof(app_reduction_t, gpu_communication_time), | ||
| 521 | offsetof(app_reduction_t, gpu_inactive_time), | ||
| 522 | /* Device Max Times */ | ||
| 523 | offsetof(app_reduction_t, max_gpu_useful_time), | ||
| 524 | offsetof(app_reduction_t, max_gpu_active_time), | ||
| 525 | }; | ||
| 526 | |||
| 527 | MPI_Datatype types[] = { | ||
| 528 | MPI_INT, MPI_INT, MPI_FLOAT, MPI_INT, /* Resources */ | ||
| 529 | MPI_DOUBLE, MPI_DOUBLE, /* Hardware Counters */ | ||
| 530 | mpi_int64_type, mpi_int64_type, | ||
| 531 | mpi_int64_type, mpi_int64_type, | ||
| 532 | mpi_int64_type, /* Statistics */ | ||
| 533 | mpi_int64_type, mpi_int64_type, | ||
| 534 | mpi_int64_type, mpi_int64_type, | ||
| 535 | mpi_int64_type, mpi_int64_type, | ||
| 536 | mpi_int64_type, /* Host Times */ | ||
| 537 | MPI_DOUBLE, MPI_DOUBLE, /* Host Normalized Times */ | ||
| 538 | mpi_int64_type, mpi_int64_type, | ||
| 539 | mpi_int64_type, /* Device Times */ | ||
| 540 | mpi_int64_type, mpi_int64_type, /* Device Max Times */ | ||
| 541 | }; | ||
| 542 | |||
| 543 | MPI_Datatype tmp_type; | ||
| 544 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 545 | PMPI_Type_create_resized(tmp_type, 0, sizeof(app_reduction_t), | ||
| 546 | &mpi_app_reduction_type); | ||
| 547 | PMPI_Type_commit(&mpi_app_reduction_type); | ||
| 548 | |||
| 549 | static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count, | ||
| 550 | "blocklengths size mismatch"); | ||
| 551 | static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count, | ||
| 552 | "displacements size mismatch"); | ||
| 553 | static_ensure(sizeof(types)/sizeof(types[0]) == count, | ||
| 554 | "types size mismatch"); | ||
| 555 | } | ||
| 556 | |||
| 557 | /* Define MPI operation */ | ||
| 558 | MPI_Op app_reduction_op; | ||
| 559 | PMPI_Op_create(mpi_reduction_fn, true, &app_reduction_op); | ||
| 560 | |||
| 561 | /* MPI reduction */ | ||
| 562 | if (!all_to_all) { | ||
| 563 | PMPI_Reduce(&app_reduction_send, app_reduction, 1, | ||
| 564 | mpi_app_reduction_type, app_reduction_op, | ||
| 565 | 0, getWorldComm()); | ||
| 566 | } else { | ||
| 567 | PMPI_Allreduce(&app_reduction_send, app_reduction, 1, | ||
| 568 | mpi_app_reduction_type, app_reduction_op, | ||
| 569 | getWorldComm()); | ||
| 570 | } | ||
| 571 | |||
| 572 | /* Free MPI types */ | ||
| 573 | PMPI_Type_free(&mpi_app_reduction_type); | ||
| 574 | PMPI_Op_free(&app_reduction_op); | ||
| 575 | } | ||
| 576 | |||
| 577 | #endif | ||
| 578 | |||
| 579 | |||
| 580 | |||
| 581 | #ifdef MPI_LIB | ||
| 582 | /* Construct a base metrics struct out of a monitor reduced via MPI */ | ||
| 583 | void perf_metrics__reduce_monitor_into_base_metrics(pop_base_metrics_t *base_metrics, | ||
| 584 | const dlb_monitor_t *monitor, bool all_to_all) { | ||
| 585 | |||
| 586 | /* First, reduce some values among processes in the node, | ||
| 587 | * needed to compute pop metrics */ | ||
| 588 | node_reduction_t node_reduction = {0}; | ||
| 589 | reduce_pop_metrics_node_reduction(&node_reduction, monitor); | ||
| 590 | |||
| 591 | /* With the node reduction, reduce again among all processes */ | ||
| 592 | app_reduction_t app_reduction = {0}; | ||
| 593 | reduce_pop_metrics_app_reduction(&app_reduction, &node_reduction, | ||
| 594 | monitor, all_to_all); | ||
| 595 | |||
| 596 | /* Finally, fill output base_metrics... */ | ||
| 597 | |||
| 598 | int num_mpi_ranks; | ||
| 599 | PMPI_Comm_size(getWorldComm(), &num_mpi_ranks); | ||
| 600 | |||
| 601 | *base_metrics = (const pop_base_metrics_t) { | ||
| 602 | .num_cpus = app_reduction.num_cpus, | ||
| 603 | .num_mpi_ranks = num_mpi_ranks, | ||
| 604 | .num_nodes = app_reduction.num_nodes, | ||
| 605 | .avg_cpus = app_reduction.avg_cpus, | ||
| 606 | .num_gpus = app_reduction.num_gpus, | ||
| 607 | .cycles = app_reduction.cycles, | ||
| 608 | .instructions = app_reduction.instructions, | ||
| 609 | .num_measurements = app_reduction.num_measurements, | ||
| 610 | .num_mpi_calls = app_reduction.num_mpi_calls, | ||
| 611 | .num_omp_parallels = app_reduction.num_omp_parallels, | ||
| 612 | .num_omp_tasks = app_reduction.num_omp_tasks, | ||
| 613 | .num_gpu_runtime_calls = app_reduction.num_gpu_runtime_calls, | ||
| 614 | .elapsed_time = app_reduction.elapsed_time, | ||
| 615 | .useful_time = app_reduction.useful_time, | ||
| 616 | .mpi_time = app_reduction.mpi_time, | ||
| 617 | .omp_load_imbalance_time = app_reduction.omp_load_imbalance_time, | ||
| 618 | .omp_scheduling_time = app_reduction.omp_scheduling_time, | ||
| 619 | .omp_serialization_time = app_reduction.omp_serialization_time, | ||
| 620 | .gpu_runtime_time = app_reduction.gpu_runtime_time, | ||
| 621 | .min_mpi_normd_proc = app_reduction.min_mpi_normd_proc, | ||
| 622 | .min_mpi_normd_node = app_reduction.min_mpi_normd_node, | ||
| 623 | .gpu_useful_time = app_reduction.gpu_useful_time, | ||
| 624 | .gpu_communication_time = app_reduction.gpu_communication_time, | ||
| 625 | .gpu_inactive_time = app_reduction.gpu_inactive_time, | ||
| 626 | .max_gpu_useful_time = app_reduction.max_gpu_useful_time, | ||
| 627 | .max_gpu_active_time = app_reduction.max_gpu_active_time, | ||
| 628 | }; | ||
| 629 | } | ||
| 630 | #endif | ||
| 631 | |||
| 632 | |||
| 633 | /* Construct a base metrics struct out of a single monitor */ | ||
| 634 | 15 | void perf_metrics__local_monitor_into_base_metrics(pop_base_metrics_t *base_metrics, | |
| 635 | const dlb_monitor_t *monitor) { | ||
| 636 | |||
| 637 | 15 | bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0); | |
| 638 | |||
| 639 | 15 | *base_metrics = (const pop_base_metrics_t){ | |
| 640 | 15 | .num_cpus = monitor->num_cpus, | |
| 641 | .num_mpi_ranks = 0, | ||
| 642 | .num_nodes = 1, | ||
| 643 | 15 | .avg_cpus = monitor->avg_cpus, | |
| 644 | 15 | .num_gpus = have_gpus ? 1 : 0, | |
| 645 | 15 | .cycles = (double)monitor->cycles, | |
| 646 | 15 | .instructions = (double)monitor->instructions, | |
| 647 | 15 | .num_measurements = monitor->num_measurements, | |
| 648 | 15 | .num_mpi_calls = monitor->num_mpi_calls, | |
| 649 | 15 | .num_omp_parallels = monitor->num_omp_parallels, | |
| 650 | 15 | .num_omp_tasks = monitor->num_omp_tasks, | |
| 651 | 15 | .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls, | |
| 652 | 15 | .elapsed_time = monitor->elapsed_time, | |
| 653 | 15 | .useful_time = monitor->useful_time, | |
| 654 | 15 | .mpi_time = monitor->mpi_time, | |
| 655 | 15 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | |
| 656 | 15 | .omp_scheduling_time = monitor->omp_scheduling_time, | |
| 657 | 15 | .omp_serialization_time = monitor->omp_serialization_time, | |
| 658 | 15 | .gpu_runtime_time = monitor->gpu_runtime_time, | |
| 659 | 15 | .min_mpi_normd_proc = (double)monitor->mpi_time / monitor->num_cpus, | |
| 660 | 15 | .min_mpi_normd_node = (double)monitor->mpi_time / monitor->num_cpus, | |
| 661 | 15 | .gpu_useful_time = monitor->gpu_useful_time, | |
| 662 | 15 | .gpu_communication_time = monitor->gpu_communication_time, | |
| 663 | 15 | .gpu_inactive_time = monitor->gpu_inactive_time, | |
| 664 | 15 | .max_gpu_useful_time = monitor->gpu_useful_time, | |
| 665 | 15 | .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time, | |
| 666 | }; | ||
| 667 | 15 | } | |
| 668 | |||
| 669 | /* Compute POP metrics out of a base metrics struct */ | ||
| 670 | 15 | void perf_metrics__base_to_pop_metrics(const char *monitor_name, | |
| 671 | const pop_base_metrics_t *base_metrics, dlb_pop_metrics_t *pop_metrics) { | ||
| 672 | |||
| 673 | /* Compute POP metrics */ | ||
| 674 | 15 | perf_metrics_hybrid_t metrics = {0}; | |
| 675 | |||
| 676 | 1/2 ✓ Branch 0 taken 15 times. ✗ Branch 1 not taken. | 15 | if (base_metrics->useful_time > 0) { |
| 677 | |||
| 678 | 1/3 ✗ Branch 0 not taken. ✓ Branch 1 taken 15 times. ✗ Branch 2 not taken. | 15 | switch(thread_spd->options.talp_model) { |
| 679 | ✗ | case TALP_MODEL_HYBRID_V1: | |
| 680 | ✗ | perf_metrics__compute_hybrid_model_v1(&metrics, base_metrics); | |
| 681 | ✗ | break; | |
| 682 | 15 | case TALP_MODEL_HYBRID_V2: | |
| 683 | 15 | perf_metrics__compute_hybrid_model_v2(&metrics, base_metrics); | |
| 684 | 15 | break; | |
| 685 | }; | ||
| 686 | } | ||
| 687 | |||
| 688 | /* Initialize structure */ | ||
| 689 | 15 | *pop_metrics = (const dlb_pop_metrics_t) { | |
| 690 | 15 | .num_cpus = base_metrics->num_cpus, | |
| 691 | 15 | .num_mpi_ranks = base_metrics->num_mpi_ranks, | |
| 692 | 15 | .num_nodes = base_metrics->num_nodes, | |
| 693 | 15 | .avg_cpus = base_metrics->avg_cpus, | |
| 694 | 15 | .num_gpus = base_metrics->num_gpus, | |
| 695 | 15 | .cycles = base_metrics->cycles, | |
| 696 | 15 | .instructions = base_metrics->instructions, | |
| 697 | 15 | .num_measurements = base_metrics->num_measurements, | |
| 698 | 15 | .num_mpi_calls = base_metrics->num_mpi_calls, | |
| 699 | 15 | .num_omp_parallels = base_metrics->num_omp_parallels, | |
| 700 | 15 | .num_omp_tasks = base_metrics->num_omp_tasks, | |
| 701 | 15 | .num_gpu_runtime_calls = base_metrics->num_gpu_runtime_calls, | |
| 702 | 15 | .elapsed_time = base_metrics->elapsed_time, | |
| 703 | 15 | .useful_time = base_metrics->useful_time, | |
| 704 | 15 | .mpi_time = base_metrics->mpi_time, | |
| 705 | 15 | .omp_load_imbalance_time = base_metrics->omp_load_imbalance_time, | |
| 706 | 15 | .omp_scheduling_time = base_metrics->omp_scheduling_time, | |
| 707 | 15 | .omp_serialization_time = base_metrics->omp_serialization_time, | |
| 708 | 15 | .gpu_runtime_time = base_metrics->gpu_runtime_time, | |
| 709 | 15 | .min_mpi_normd_proc = base_metrics->min_mpi_normd_proc, | |
| 710 | 15 | .min_mpi_normd_node = base_metrics->min_mpi_normd_node, | |
| 711 | 15 | .gpu_useful_time = base_metrics->gpu_useful_time, | |
| 712 | 15 | .gpu_communication_time = base_metrics->gpu_communication_time, | |
| 713 | 15 | .gpu_inactive_time = base_metrics->gpu_inactive_time, | |
| 714 | 15 | .max_gpu_useful_time = base_metrics->max_gpu_useful_time, | |
| 715 | 15 | .max_gpu_active_time = base_metrics->max_gpu_active_time, | |
| 716 | 15 | .parallel_efficiency = metrics.parallel_efficiency, | |
| 717 | 15 | .mpi_parallel_efficiency = metrics.mpi_parallel_efficiency, | |
| 718 | 15 | .mpi_communication_efficiency = metrics.mpi_communication_efficiency, | |
| 719 | 15 | .mpi_load_balance = metrics.mpi_load_balance, | |
| 720 | 15 | .mpi_load_balance_in = metrics.mpi_load_balance_in, | |
| 721 | 15 | .mpi_load_balance_out = metrics.mpi_load_balance_out, | |
| 722 | 15 | .omp_parallel_efficiency = metrics.omp_parallel_efficiency, | |
| 723 | 15 | .omp_load_balance = metrics.omp_load_balance, | |
| 724 | 15 | .omp_scheduling_efficiency = metrics.omp_scheduling_efficiency, | |
| 725 | 15 | .omp_serialization_efficiency = metrics.omp_serialization_efficiency, | |
| 726 | 15 | .device_offload_efficiency = metrics.device_offload_efficiency, | |
| 727 | 15 | .gpu_parallel_efficiency = metrics.gpu_parallel_efficiency, | |
| 728 | 15 | .gpu_load_balance = metrics.gpu_load_balance, | |
| 729 | 15 | .gpu_communication_efficiency = metrics.gpu_communication_efficiency, | |
| 730 | 15 | .gpu_orchestration_efficiency = metrics.gpu_orchestration_efficiency, | |
| 731 | }; | ||
| 732 | 15 | snprintf(pop_metrics->name, DLB_MONITOR_NAME_MAX, "%s", monitor_name); | |
| 733 | 15 | } | |
| 734 | |||
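
As a quick sanity check on the formulas above (a sketch derived only from the expressions visible in `perf_metrics__infer_mpi_model` and `perf_metrics__compute_hybrid_model_v2`; the symbols below are shorthand introduced here, not identifiers from the DLB headers), both the inferred MPI model and the hybrid-model GPU metrics are multiplicative decompositions of their respective parallel efficiencies. With `P = processes_per_node` and `G = num_gpus`:

```latex
% Inferred MPI model: PE = CE * LB
%   T_elapsed = (sum_useful + sum_mpi) / P
%   CE = max_useful / T_elapsed,   LB = (sum_useful / P) / max_useful
\[
  \mathit{CE}\cdot\mathit{LB}
  = \frac{T^{\mathrm{useful}}_{\max}}{T_{\mathrm{elapsed}}}
    \cdot \frac{\sum T^{\mathrm{useful}}/P}{T^{\mathrm{useful}}_{\max}}
  = \frac{\sum T^{\mathrm{useful}}/P}{T_{\mathrm{elapsed}}}
  = \frac{\sum T^{\mathrm{useful}}}{\sum T^{\mathrm{useful}} + \sum T^{\mathrm{MPI}}}
  = \mathit{PE}
\]
% Hybrid v2 GPU metrics: GPU_PE = GPU_LB * GPU_CE * GPU_OE
\[
  \frac{T^{\mathrm{gpu\text{-}useful}}}{G\,T^{\mathrm{gpu\text{-}useful}}_{\max}}
  \cdot \frac{T^{\mathrm{gpu\text{-}useful}}_{\max}}{T^{\mathrm{gpu\text{-}active}}_{\max}}
  \cdot \frac{T^{\mathrm{gpu\text{-}active}}_{\max}}{T_{\mathrm{elapsed}}}
  = \frac{T^{\mathrm{gpu\text{-}useful}}}{G\,T_{\mathrm{elapsed}}}
  = \mathit{GPU\_PE}
\]
```

The same cancellation gives the hybrid-model MPI identities: `mpi_load_balance_in * mpi_load_balance_out = mpi_load_balance`, and in the v2 model `mpi_communication_efficiency * mpi_load_balance = mpi_parallel_efficiency`.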