| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2025 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/perf_metrics.h" | ||
| 21 | |||
| 22 | #include "LB_core/spd.h" | ||
| 23 | #include "apis/dlb_talp.h" | ||
| 24 | #include "support/debug.h" | ||
| 25 | #ifdef MPI_LIB | ||
| 26 | #include "mpi/mpi_core.h" | ||
| 27 | #endif | ||
| 28 | |||
| 29 | #include <stddef.h> | ||
| 30 | #include <stdio.h> | ||
| 31 | |||
| 32 | /*********************************************************************************/ | ||
| 33 | /* POP metrics - pure MPI model */ | ||
| 34 | /*********************************************************************************/ | ||
| 35 | |||
| 36 | /* Compute POP metrics for the MPI model | ||
| 37 | * (This funtion is actually not used anywhere) */ | ||
| 38 | static inline void perf_metrics__compute_mpi_model( | ||
| 39 | perf_metrics_mpi_t *metrics, | ||
| 40 | int num_cpus, | ||
| 41 | int num_nodes, | ||
| 42 | int64_t elapsed_time, | ||
| 43 | int64_t elapsed_useful, | ||
| 44 | int64_t app_sum_useful, | ||
| 45 | int64_t node_sum_useful) __attribute__((unused)); | ||
| 46 | static inline void perf_metrics__compute_mpi_model( | ||
| 47 | perf_metrics_mpi_t *metrics, | ||
| 48 | int num_cpus, | ||
| 49 | int num_nodes, | ||
| 50 | int64_t elapsed_time, | ||
| 51 | int64_t elapsed_useful, | ||
| 52 | int64_t app_sum_useful, | ||
| 53 | int64_t node_sum_useful) { | ||
| 54 | |||
| 55 | if (elapsed_time > 0) { | ||
| 56 | *metrics = (const perf_metrics_mpi_t) { | ||
| 57 | .parallel_efficiency = (float)app_sum_useful / (elapsed_time * num_cpus), | ||
| 58 | .communication_efficiency = (float)elapsed_useful / elapsed_time, | ||
| 59 | .load_balance = (float)app_sum_useful / (elapsed_useful * num_cpus), | ||
| 60 | .lb_in = (float)(node_sum_useful * num_nodes) / (elapsed_useful * num_cpus), | ||
| 61 | .lb_out = (float)app_sum_useful / (node_sum_useful * num_nodes), | ||
| 62 | }; | ||
| 63 | } else { | ||
| 64 | *metrics = (const perf_metrics_mpi_t) {}; | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | /* Compute POP metrics for the MPI model, but with some inferred values: | ||
| 69 | * (Only useful for node metrics) */ | ||
| 70 | 6 | void perf_metrics__infer_mpi_model( | |
| 71 | perf_metrics_mpi_t *metrics, | ||
| 72 | int processes_per_node, | ||
| 73 | int64_t node_sum_useful, | ||
| 74 | int64_t node_sum_mpi, | ||
| 75 | int64_t max_useful_time) { | ||
| 76 | |||
| 77 | 6 | int64_t elapsed_time = (node_sum_useful + node_sum_mpi) / processes_per_node; | |
| 78 |
1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
|
6 | if (elapsed_time > 0) { |
| 79 | 6 | *metrics = (const perf_metrics_mpi_t) { | |
| 80 | 6 | .parallel_efficiency = (float)node_sum_useful / (node_sum_useful + node_sum_mpi), | |
| 81 | 6 | .communication_efficiency = (float)max_useful_time / elapsed_time, | |
| 82 | 6 | .load_balance = ((float)node_sum_useful / processes_per_node) / max_useful_time, | |
| 83 | }; | ||
| 84 | } else { | ||
| 85 | ✗ | *metrics = (const perf_metrics_mpi_t) {}; | |
| 86 | } | ||
| 87 | 6 | } | |
| 88 | |||
| 89 | |||
| 90 | /*********************************************************************************/ | ||
| 91 | /* POP metrics - hybrid MPI + OpenMP model */ | ||
| 92 | /*********************************************************************************/ | ||
| 93 | |||
/* Efficiency metrics computed by the POP hybrid model
 * (filled in by perf_metrics__compute_hybrid_model_v1 / _v2) */
typedef struct perf_metrics_hybrid_t {
    /* Global */
    float parallel_efficiency;
    /* MPI */
    float mpi_parallel_efficiency;
    float mpi_communication_efficiency;
    float mpi_load_balance;
    float mpi_load_balance_in;
    float mpi_load_balance_out;
    /* OpenMP */
    float omp_parallel_efficiency;
    float omp_load_balance;
    float omp_scheduling_efficiency;
    float omp_serialization_efficiency;
    /* Device / GPU */
    float device_offload_efficiency;
    float gpu_parallel_efficiency;
    float gpu_load_balance;
    float gpu_communication_efficiency;
    float gpu_orchestration_efficiency;
} perf_metrics_hybrid_t;
| 112 | |||
| 113 | |||
| 114 | /* Compute POP metrics for the hybrid MPI + OpenMP model | ||
| 115 | * (Ver. 1: All metrics are multiplicative, but some of them are > 1) */ | ||
| 116 | ✗ | static inline void perf_metrics__compute_hybrid_model_v1( | |
| 117 | perf_metrics_hybrid_t *metrics, | ||
| 118 | const pop_base_metrics_t *base_metrics) { | ||
| 119 | |||
| 120 | ✗ | int num_cpus = base_metrics->num_cpus; | |
| 121 | ✗ | int num_gpus = base_metrics->num_gpus; | |
| 122 | ✗ | int64_t elapsed_time = base_metrics->elapsed_time; | |
| 123 | ✗ | int64_t useful_time = base_metrics->useful_time; | |
| 124 | ✗ | int64_t mpi_time = base_metrics->mpi_time; | |
| 125 | ✗ | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
| 126 | ✗ | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
| 127 | ✗ | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
| 128 | ✗ | int64_t gpu_runtime_time = base_metrics->gpu_runtime_time; | |
| 129 | ✗ | double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc; | |
| 130 | ✗ | double min_mpi_normd_node = base_metrics->min_mpi_normd_node; | |
| 131 | ✗ | int64_t gpu_useful_time = base_metrics->gpu_useful_time; | |
| 132 | ✗ | int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time; | |
| 133 | ✗ | int64_t max_gpu_active_time = base_metrics->max_gpu_active_time; | |
| 134 | |||
| 135 | /* Active is the union of all times (while CPU is not disabled) */ | ||
| 136 | ✗ | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
| 137 | ✗ | omp_scheduling_time + omp_serialization_time + gpu_runtime_time; | |
| 138 | |||
| 139 | /* Equivalent to all CPU time if OMP was not present */ | ||
| 140 | ✗ | int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time; | |
| 141 | |||
| 142 | /* Equivalent to all CPU time if GPU was not present */ | ||
| 143 | ✗ | int64_t sum_active_non_gpu = sum_active - gpu_runtime_time; | |
| 144 | |||
| 145 | /* MPI time normalized at application level */ | ||
| 146 | ✗ | double mpi_normd_app = (double)mpi_time / num_cpus; | |
| 147 | |||
| 148 | /* Non-MPI time normalized at application level */ | ||
| 149 | ✗ | double non_mpi_normd_app = elapsed_time - mpi_normd_app; | |
| 150 | |||
| 151 | /* Max value of non-MPI times normalized at process level */ | ||
| 152 | ✗ | double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc; | |
| 153 | |||
| 154 | /* Max value of non-MPI times normalized at node level */ | ||
| 155 | ✗ | double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node; | |
| 156 | |||
| 157 | /* All Device time */ | ||
| 158 | ✗ | int64_t sum_device_time = elapsed_time * num_gpus; | |
| 159 | |||
| 160 | /* Compute output metrics */ | ||
| 161 | ✗ | *metrics = (const perf_metrics_hybrid_t) { | |
| 162 | ✗ | .parallel_efficiency = (float)useful_time / sum_active, | |
| 163 | ✗ | .mpi_parallel_efficiency = (float)useful_time / (useful_time + mpi_time), | |
| 164 | .mpi_communication_efficiency = | ||
| 165 | ✗ | max_non_mpi_normd_proc / (non_mpi_normd_app + mpi_normd_app), | |
| 166 | ✗ | .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc, | |
| 167 | ✗ | .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc, | |
| 168 | ✗ | .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node, | |
| 169 | ✗ | .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active, | |
| 170 | ✗ | .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time) | |
| 171 | ✗ | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time), | |
| 172 | .omp_scheduling_efficiency = | ||
| 173 | ✗ | (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time) | |
| 174 | ✗ | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time | |
| 175 | ✗ | + omp_scheduling_time), | |
| 176 | ✗ | .omp_serialization_efficiency = (float)sum_active_non_omp | |
| 177 | ✗ | / (sum_active_non_omp + omp_serialization_time), | |
| 178 | ✗ | .device_offload_efficiency = (float)sum_active_non_gpu / sum_active, | |
| 179 | .gpu_parallel_efficiency = sum_device_time == 0 ? 0 | ||
| 180 | ✗ | : (float)gpu_useful_time / sum_device_time, | |
| 181 | ✗ | .gpu_load_balance = max_gpu_useful_time * num_gpus == 0 ? 0 | |
| 182 | ✗ | : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus), | |
| 183 | .gpu_communication_efficiency = max_gpu_active_time == 0 ? 0 | ||
| 184 | ✗ | : (float)max_gpu_useful_time / max_gpu_active_time, | |
| 185 | .gpu_orchestration_efficiency = sum_device_time == 0 ? 0 | ||
| 186 | ✗ | : (float)max_gpu_active_time / elapsed_time, | |
| 187 | }; | ||
| 188 | } | ||
| 189 | |||
| 190 | /* Compute POP metrics for the hybrid MPI + OpenMP model (Ver. 2: PE != MPE * OPE) */ | ||
| 191 | 18 | static inline void perf_metrics__compute_hybrid_model_v2( | |
| 192 | perf_metrics_hybrid_t *metrics, | ||
| 193 | const pop_base_metrics_t *base_metrics) { | ||
| 194 | |||
| 195 | 18 | int num_cpus = base_metrics->num_cpus; | |
| 196 | 18 | int num_gpus = base_metrics->num_gpus; | |
| 197 | 18 | int64_t elapsed_time = base_metrics->elapsed_time; | |
| 198 | 18 | int64_t useful_time = base_metrics->useful_time; | |
| 199 | 18 | int64_t mpi_time = base_metrics->mpi_time; | |
| 200 | 18 | int64_t mpi_worker_idle_time = base_metrics->mpi_worker_idle_time; | |
| 201 | 18 | int64_t omp_load_imbalance_time = base_metrics->omp_load_imbalance_time; | |
| 202 | 18 | int64_t omp_scheduling_time = base_metrics->omp_scheduling_time; | |
| 203 | 18 | int64_t omp_serialization_time = base_metrics->omp_serialization_time; | |
| 204 | 18 | int64_t gpu_runtime_time = base_metrics->gpu_runtime_time; | |
| 205 | 18 | double min_mpi_normd_proc = base_metrics->min_mpi_normd_proc; | |
| 206 | 18 | double min_mpi_normd_node = base_metrics->min_mpi_normd_node; | |
| 207 | 18 | int64_t gpu_useful_time = base_metrics->gpu_useful_time; | |
| 208 | 18 | int64_t max_gpu_useful_time = base_metrics->max_gpu_useful_time; | |
| 209 | 18 | int64_t max_gpu_active_time = base_metrics->max_gpu_active_time; | |
| 210 | |||
| 211 | /* Active is the union of all times (CPU not disabled) */ | ||
| 212 | 18 | int64_t sum_active = useful_time + mpi_time + omp_load_imbalance_time + | |
| 213 | 18 | omp_scheduling_time + omp_serialization_time + gpu_runtime_time; | |
| 214 | |||
| 215 | /* Equivalent to all CPU time if OMP was not present */ | ||
| 216 | 18 | int64_t sum_active_non_omp = useful_time + mpi_time + gpu_runtime_time; | |
| 217 | |||
| 218 | /* CPU time of OpenMP not useful */ | ||
| 219 | 18 | int64_t sum_omp_not_useful = omp_load_imbalance_time + omp_scheduling_time + | |
| 220 | omp_serialization_time; | ||
| 221 | |||
| 222 | /* MPI time normalized at application level */ | ||
| 223 | 18 | double mpi_normd_app = (double)(mpi_time + mpi_worker_idle_time) / num_cpus; | |
| 224 | |||
| 225 | /* Non-MPI time normalized at application level */ | ||
| 226 | 18 | double non_mpi_normd_app = elapsed_time - mpi_normd_app; | |
| 227 | |||
| 228 | /* Max value of non-MPI times normalized at process level */ | ||
| 229 | 18 | double max_non_mpi_normd_proc = elapsed_time - min_mpi_normd_proc; | |
| 230 | |||
| 231 | /* Max value of non-MPI times normalized at node level */ | ||
| 232 | 18 | double max_non_mpi_normd_node = elapsed_time - min_mpi_normd_node; | |
| 233 | |||
| 234 | /* All Device time */ | ||
| 235 | 18 | int64_t sum_device_time = elapsed_time * num_gpus; | |
| 236 | |||
| 237 | /* Compute output metrics */ | ||
| 238 | 18 | *metrics = (const perf_metrics_hybrid_t) { | |
| 239 | 18 | .parallel_efficiency = (float)useful_time / sum_active, | |
| 240 | 18 | .mpi_parallel_efficiency = non_mpi_normd_app / elapsed_time, | |
| 241 | 18 | .mpi_communication_efficiency = max_non_mpi_normd_proc / elapsed_time, | |
| 242 | 18 | .mpi_load_balance = non_mpi_normd_app / max_non_mpi_normd_proc, | |
| 243 | 18 | .mpi_load_balance_in = max_non_mpi_normd_node / max_non_mpi_normd_proc, | |
| 244 | 18 | .mpi_load_balance_out = non_mpi_normd_app / max_non_mpi_normd_node, | |
| 245 | 18 | .omp_parallel_efficiency = (float)sum_active_non_omp / sum_active, | |
| 246 | 18 | .omp_load_balance = (float)(sum_active_non_omp + omp_serialization_time) | |
| 247 | 18 | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time), | |
| 248 | .omp_scheduling_efficiency = | ||
| 249 | 18 | (float)(sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time) | |
| 250 | 18 | / (sum_active_non_omp + omp_serialization_time + omp_load_imbalance_time | |
| 251 | 18 | + omp_scheduling_time), | |
| 252 | 18 | .omp_serialization_efficiency = (float)sum_active_non_omp | |
| 253 | 18 | / (sum_active_non_omp + omp_serialization_time), | |
| 254 | 18 | .device_offload_efficiency = (float)(useful_time + sum_omp_not_useful) | |
| 255 | 18 | / (useful_time + sum_omp_not_useful + gpu_runtime_time), | |
| 256 | .gpu_parallel_efficiency = sum_device_time == 0 ? 0 | ||
| 257 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 17 times.
|
18 | : (float)gpu_useful_time / sum_device_time, |
| 258 | 18 | .gpu_load_balance = max_gpu_useful_time * num_gpus == 0 ? 0 | |
| 259 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 17 times.
|
18 | : (float)gpu_useful_time / (max_gpu_useful_time * num_gpus), |
| 260 | .gpu_communication_efficiency = max_gpu_active_time == 0 ? 0 | ||
| 261 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 17 times.
|
18 | : (float)max_gpu_useful_time / max_gpu_active_time, |
| 262 | .gpu_orchestration_efficiency = sum_device_time == 0 ? 0 | ||
| 263 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 17 times.
|
18 | : (float)max_gpu_active_time / elapsed_time, |
| 264 | }; | ||
| 265 | 18 | } | |
| 266 | |||
| 267 | #ifdef MPI_LIB | ||
| 268 | |||
| 269 | /* The following node and app reductions are needed to compute POP metrics: */ | ||
| 270 | |||
| 271 | /*** Node reduction ***/ | ||
| 272 | |||
/* Per-node partial result, reduced among the processes of a node */
typedef struct node_reduction_t {
    bool node_used;                 /* set once any contributing process has it set */
    int cpus_node;                  /* accumulated num_cpus */
    int64_t mpi_time;               /* accumulated MPI time */
    int64_t mpi_worker_idle_time;   /* accumulated MPI worker idle time */
} node_reduction_t;
| 280 | |||
| 281 | /* Function called in the MPI node reduction */ | ||
| 282 | static void mpi_node_reduction_fn(void *invec, void *inoutvec, int *len, | ||
| 283 | MPI_Datatype *datatype) { | ||
| 284 | node_reduction_t *in = invec; | ||
| 285 | node_reduction_t *inout = inoutvec; | ||
| 286 | |||
| 287 | int _len = *len; | ||
| 288 | for (int i = 0; i < _len; ++i) { | ||
| 289 | if (in[i].node_used) { | ||
| 290 | inout[i].node_used = true; | ||
| 291 | inout[i].cpus_node += in[i].cpus_node; | ||
| 292 | inout[i].mpi_time += in[i].mpi_time; | ||
| 293 | inout[i].mpi_worker_idle_time += in[i].mpi_worker_idle_time; | ||
| 294 | } | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | /* Function to perform the reduction at node level */ | ||
| 299 | static void reduce_pop_metrics_node_reduction(node_reduction_t *node_reduction, | ||
| 300 | const dlb_monitor_t *monitor) { | ||
| 301 | |||
| 302 | const node_reduction_t node_reduction_send = { | ||
| 303 | .node_used = monitor->num_measurements > 0, | ||
| 304 | .cpus_node = monitor->num_cpus, | ||
| 305 | .mpi_time = monitor->mpi_time, | ||
| 306 | .mpi_worker_idle_time = monitor->mpi_worker_idle_time, | ||
| 307 | }; | ||
| 308 | |||
| 309 | /* MPI type: int64_t */ | ||
| 310 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 311 | |||
| 312 | /* MPI struct type: node_reduction_t */ | ||
| 313 | MPI_Datatype mpi_node_reduction_type; | ||
| 314 | { | ||
| 315 | int count = 4; | ||
| 316 | int blocklengths[] = {1, 1, 1, 1}; | ||
| 317 | MPI_Aint displacements[] = { | ||
| 318 | offsetof(node_reduction_t, node_used), | ||
| 319 | offsetof(node_reduction_t, cpus_node), | ||
| 320 | offsetof(node_reduction_t, mpi_time), | ||
| 321 | offsetof(node_reduction_t, mpi_worker_idle_time)}; | ||
| 322 | MPI_Datatype types[] = {MPI_C_BOOL, MPI_INT, mpi_int64_type, mpi_int64_type}; | ||
| 323 | MPI_Datatype tmp_type; | ||
| 324 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 325 | PMPI_Type_create_resized(tmp_type, 0, sizeof(node_reduction_t), | ||
| 326 | &mpi_node_reduction_type); | ||
| 327 | PMPI_Type_commit(&mpi_node_reduction_type); | ||
| 328 | } | ||
| 329 | |||
| 330 | /* Define MPI operation */ | ||
| 331 | MPI_Op node_reduction_op; | ||
| 332 | PMPI_Op_create(mpi_node_reduction_fn, true, &node_reduction_op); | ||
| 333 | |||
| 334 | /* MPI reduction */ | ||
| 335 | PMPI_Reduce(&node_reduction_send, node_reduction, 1, | ||
| 336 | mpi_node_reduction_type, node_reduction_op, | ||
| 337 | 0, getNodeComm()); | ||
| 338 | |||
| 339 | /* Free MPI types */ | ||
| 340 | PMPI_Type_free(&mpi_node_reduction_type); | ||
| 341 | PMPI_Op_free(&node_reduction_op); | ||
| 342 | } | ||
| 343 | |||
| 344 | /*** App reduction ***/ | ||
| 345 | |||
/* Per-process contribution, and reduced result, of the application-wide
 * reduction. How each field is combined is defined in mpi_reduction_fn. */
typedef struct app_reduction_t {
    /* --- Resources --- */
    int num_cpus;
    int num_nodes;
    float avg_cpus;
    int num_gpus;

    /* --- Hardware Counters --- */
    double cycles;
    double instructions;

    /* --- Statistics --- */
    int64_t num_measurements;
    int64_t num_mpi_calls;
    int64_t num_omp_parallels;
    int64_t num_omp_tasks;
    int64_t num_gpu_runtime_calls;

    /* --- Host Times --- */
    int64_t elapsed_time;
    int64_t useful_time;
    int64_t mpi_time;
    int64_t mpi_worker_idle_time;
    int64_t omp_load_imbalance_time;
    int64_t omp_scheduling_time;
    int64_t omp_serialization_time;
    int64_t gpu_runtime_time;

    /* --- Host Normalized Times --- */
    double min_mpi_normd_proc;
    double min_mpi_normd_node;

    /* --- Device Times --- */
    int64_t gpu_useful_time;
    int64_t gpu_communication_time;
    int64_t gpu_inactive_time;

    /* --- Device Max Times --- */
    int64_t max_gpu_useful_time;
    int64_t max_gpu_active_time;
} app_reduction_t;
| 382 | |||
| 383 | /* Function called in the MPI app reduction */ | ||
| 384 | static void mpi_reduction_fn(void *invec, void *inoutvec, int *len, | ||
| 385 | MPI_Datatype *datatype) { | ||
| 386 | app_reduction_t *in = invec; | ||
| 387 | app_reduction_t *inout = inoutvec; | ||
| 388 | |||
| 389 | int _len = *len; | ||
| 390 | for (int i = 0; i < _len; ++i) { | ||
| 391 | /* Resources */ | ||
| 392 | inout[i].num_cpus += in[i].num_cpus; | ||
| 393 | inout[i].num_nodes += in[i].num_nodes; | ||
| 394 | inout[i].avg_cpus += in[i].avg_cpus; | ||
| 395 | inout[i].num_gpus += in[i].num_gpus; | ||
| 396 | /* Hardware Counters */ | ||
| 397 | inout[i].cycles += in[i].cycles; | ||
| 398 | inout[i].instructions += in[i].instructions; | ||
| 399 | /* Statistics */ | ||
| 400 | inout[i].num_measurements += in[i].num_measurements; | ||
| 401 | inout[i].num_mpi_calls += in[i].num_mpi_calls; | ||
| 402 | inout[i].num_omp_parallels += in[i].num_omp_parallels; | ||
| 403 | inout[i].num_omp_tasks += in[i].num_omp_tasks; | ||
| 404 | inout[i].num_gpu_runtime_calls += in[i].num_gpu_runtime_calls; | ||
| 405 | /* Host Times */ | ||
| 406 | inout[i].elapsed_time = max_int64(inout[i].elapsed_time, in[i].elapsed_time); | ||
| 407 | inout[i].useful_time += in[i].useful_time; | ||
| 408 | inout[i].mpi_time += in[i].mpi_time; | ||
| 409 | inout[i].mpi_worker_idle_time += in[i].mpi_worker_idle_time; | ||
| 410 | inout[i].omp_load_imbalance_time += in[i].omp_load_imbalance_time; | ||
| 411 | inout[i].omp_scheduling_time += in[i].omp_scheduling_time; | ||
| 412 | inout[i].omp_serialization_time += in[i].omp_serialization_time; | ||
| 413 | inout[i].gpu_runtime_time += in[i].gpu_runtime_time; | ||
| 414 | |||
| 415 | /* Host Normalized Times */ | ||
| 416 | inout[i].min_mpi_normd_proc = | ||
| 417 | min_double_non_zero(inout[i].min_mpi_normd_proc, in[i].min_mpi_normd_proc); | ||
| 418 | inout[i].min_mpi_normd_node = | ||
| 419 | min_double_non_zero(inout[i].min_mpi_normd_node, in[i].min_mpi_normd_node); | ||
| 420 | |||
| 421 | /* Device Times */ | ||
| 422 | inout[i].gpu_useful_time += in[i].gpu_useful_time; | ||
| 423 | inout[i].gpu_communication_time += in[i].gpu_communication_time; | ||
| 424 | inout[i].gpu_inactive_time += in[i].gpu_inactive_time; | ||
| 425 | |||
| 426 | /* Device Max Times */ | ||
| 427 | inout[i].max_gpu_useful_time = | ||
| 428 | max_int64(inout[i].max_gpu_useful_time, in[i].max_gpu_useful_time); | ||
| 429 | inout[i].max_gpu_active_time = | ||
| 430 | max_int64(inout[i].max_gpu_active_time, in[i].max_gpu_active_time); | ||
| 431 | } | ||
| 432 | } | ||
| 433 | |||
| 434 | /* Function to perform the reduction at application level */ | ||
| 435 | static void reduce_pop_metrics_app_reduction(app_reduction_t *app_reduction, | ||
| 436 | const node_reduction_t *node_reduction, const dlb_monitor_t *monitor, | ||
| 437 | bool all_to_all) { | ||
| 438 | |||
| 439 | double min_mpi_normd_proc = monitor->num_cpus == 0 ? 0.0 | ||
| 440 | : (double)(monitor->mpi_time + monitor->mpi_worker_idle_time) / monitor->num_cpus; | ||
| 441 | double min_mpi_normd_node = _process_id != 0 ? 0.0 | ||
| 442 | : node_reduction->cpus_node == 0 ? 0.0 | ||
| 443 | : (double)(node_reduction->mpi_time + node_reduction->mpi_worker_idle_time) | ||
| 444 | / node_reduction->cpus_node; | ||
| 445 | |||
| 446 | bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0); | ||
| 447 | |||
| 448 | const app_reduction_t app_reduction_send = { | ||
| 449 | /* Resources */ | ||
| 450 | .num_cpus = monitor->num_cpus, | ||
| 451 | .num_nodes = _process_id == 0 && node_reduction->node_used ? 1 : 0, | ||
| 452 | .avg_cpus = monitor->avg_cpus, | ||
| 453 | .num_gpus = have_gpus ? 1 : 0, | ||
| 454 | /* Hardware Counters */ | ||
| 455 | .cycles = (double)monitor->cycles, | ||
| 456 | .instructions = (double)monitor->instructions, | ||
| 457 | /* Statistics */ | ||
| 458 | .num_measurements = monitor->num_measurements, | ||
| 459 | .num_mpi_calls = monitor->num_mpi_calls, | ||
| 460 | .num_omp_parallels = monitor->num_omp_parallels, | ||
| 461 | .num_omp_tasks = monitor->num_omp_tasks, | ||
| 462 | .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls, | ||
| 463 | /* Host Times */ | ||
| 464 | .elapsed_time = monitor->elapsed_time, | ||
| 465 | .useful_time = monitor->useful_time, | ||
| 466 | .mpi_time = monitor->mpi_time, | ||
| 467 | .mpi_worker_idle_time = monitor->mpi_worker_idle_time, | ||
| 468 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | ||
| 469 | .omp_scheduling_time = monitor->omp_scheduling_time, | ||
| 470 | .omp_serialization_time = monitor->omp_serialization_time, | ||
| 471 | .gpu_runtime_time = monitor->gpu_runtime_time, | ||
| 472 | /* Host Normalized Times */ | ||
| 473 | .min_mpi_normd_proc = min_mpi_normd_proc, | ||
| 474 | .min_mpi_normd_node = min_mpi_normd_node, | ||
| 475 | /* Device Times */ | ||
| 476 | .gpu_useful_time = monitor->gpu_useful_time, | ||
| 477 | .gpu_communication_time = monitor->gpu_communication_time, | ||
| 478 | .gpu_inactive_time = monitor->gpu_inactive_time, | ||
| 479 | /* Device Max Times */ | ||
| 480 | .max_gpu_useful_time = monitor->gpu_useful_time, | ||
| 481 | .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time, | ||
| 482 | }; | ||
| 483 | |||
| 484 | /* MPI type: int64_t */ | ||
| 485 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 486 | |||
| 487 | /* MPI struct type: app_reduction_t */ | ||
| 488 | MPI_Datatype mpi_app_reduction_type; | ||
| 489 | { | ||
| 490 | int blocklengths[] = { | ||
| 491 | 1, 1, 1, 1, /* Resources */ | ||
| 492 | 1, 1, /* Hardware Counters */ | ||
| 493 | 1, 1, 1, 1, 1, /* Statistics */ | ||
| 494 | 1, 1, 1, 1, 1, 1, 1, 1, /* Host Times */ | ||
| 495 | 1, 1, /* Host Normalized Times */ | ||
| 496 | 1, 1, 1, /* Device Times */ | ||
| 497 | 1, 1}; /* Device Max Times */ | ||
| 498 | |||
| 499 | enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])}; | ||
| 500 | |||
| 501 | MPI_Aint displacements[] = { | ||
| 502 | /* Resources */ | ||
| 503 | offsetof(app_reduction_t, num_cpus), | ||
| 504 | offsetof(app_reduction_t, num_nodes), | ||
| 505 | offsetof(app_reduction_t, avg_cpus), | ||
| 506 | offsetof(app_reduction_t, num_gpus), | ||
| 507 | /* Hardware Counters */ | ||
| 508 | offsetof(app_reduction_t, cycles), | ||
| 509 | offsetof(app_reduction_t, instructions), | ||
| 510 | /* Statistics */ | ||
| 511 | offsetof(app_reduction_t, num_measurements), | ||
| 512 | offsetof(app_reduction_t, num_mpi_calls), | ||
| 513 | offsetof(app_reduction_t, num_omp_parallels), | ||
| 514 | offsetof(app_reduction_t, num_omp_tasks), | ||
| 515 | offsetof(app_reduction_t, num_gpu_runtime_calls), | ||
| 516 | /* Host Times */ | ||
| 517 | offsetof(app_reduction_t, elapsed_time), | ||
| 518 | offsetof(app_reduction_t, useful_time), | ||
| 519 | offsetof(app_reduction_t, mpi_time), | ||
| 520 | offsetof(app_reduction_t, mpi_worker_idle_time), | ||
| 521 | offsetof(app_reduction_t, omp_load_imbalance_time), | ||
| 522 | offsetof(app_reduction_t, omp_scheduling_time), | ||
| 523 | offsetof(app_reduction_t, omp_serialization_time), | ||
| 524 | offsetof(app_reduction_t, gpu_runtime_time), | ||
| 525 | /* Normalized Times */ | ||
| 526 | offsetof(app_reduction_t, min_mpi_normd_proc), | ||
| 527 | offsetof(app_reduction_t, min_mpi_normd_node), | ||
| 528 | /* Device Times */ | ||
| 529 | offsetof(app_reduction_t, gpu_useful_time), | ||
| 530 | offsetof(app_reduction_t, gpu_communication_time), | ||
| 531 | offsetof(app_reduction_t, gpu_inactive_time), | ||
| 532 | /* Device Max Times */ | ||
| 533 | offsetof(app_reduction_t, max_gpu_useful_time), | ||
| 534 | offsetof(app_reduction_t, max_gpu_active_time), | ||
| 535 | }; | ||
| 536 | |||
| 537 | MPI_Datatype types[] = { | ||
| 538 | /* Resources */ | ||
| 539 | MPI_INT, MPI_INT, MPI_FLOAT, MPI_INT, | ||
| 540 | /* Hardware Counters */ | ||
| 541 | MPI_DOUBLE, MPI_DOUBLE, | ||
| 542 | /* Statistics */ | ||
| 543 | mpi_int64_type, mpi_int64_type, | ||
| 544 | mpi_int64_type, mpi_int64_type, | ||
| 545 | mpi_int64_type, | ||
| 546 | /* Host Times */ | ||
| 547 | mpi_int64_type, mpi_int64_type, | ||
| 548 | mpi_int64_type, mpi_int64_type, | ||
| 549 | mpi_int64_type, mpi_int64_type, | ||
| 550 | mpi_int64_type, mpi_int64_type, | ||
| 551 | /* Host Normalized Times */ | ||
| 552 | MPI_DOUBLE, MPI_DOUBLE, | ||
| 553 | /* Device Times */ | ||
| 554 | mpi_int64_type, mpi_int64_type, | ||
| 555 | mpi_int64_type, | ||
| 556 | /* Device Max Times */ | ||
| 557 | mpi_int64_type, mpi_int64_type, | ||
| 558 | }; | ||
| 559 | |||
| 560 | MPI_Datatype tmp_type; | ||
| 561 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 562 | PMPI_Type_create_resized(tmp_type, 0, sizeof(app_reduction_t), | ||
| 563 | &mpi_app_reduction_type); | ||
| 564 | PMPI_Type_commit(&mpi_app_reduction_type); | ||
| 565 | |||
| 566 | static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count, | ||
| 567 | "blocklengths size mismatch"); | ||
| 568 | static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count, | ||
| 569 | "displacements size mismatch"); | ||
| 570 | static_ensure(sizeof(types)/sizeof(types[0]) == count, | ||
| 571 | "types size mismatch"); | ||
| 572 | } | ||
| 573 | |||
| 574 | /* Define MPI operation */ | ||
| 575 | MPI_Op app_reduction_op; | ||
| 576 | PMPI_Op_create(mpi_reduction_fn, true, &app_reduction_op); | ||
| 577 | |||
| 578 | /* MPI reduction */ | ||
| 579 | if (!all_to_all) { | ||
| 580 | PMPI_Reduce(&app_reduction_send, app_reduction, 1, | ||
| 581 | mpi_app_reduction_type, app_reduction_op, | ||
| 582 | 0, getWorldComm()); | ||
| 583 | } else { | ||
| 584 | PMPI_Allreduce(&app_reduction_send, app_reduction, 1, | ||
| 585 | mpi_app_reduction_type, app_reduction_op, | ||
| 586 | getWorldComm()); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* Free MPI types */ | ||
| 590 | PMPI_Type_free(&mpi_app_reduction_type); | ||
| 591 | PMPI_Op_free(&app_reduction_op); | ||
| 592 | } | ||
| 593 | |||
| 594 | #endif | ||
| 595 | |||
| 596 | |||
| 597 | |||
#ifdef MPI_LIB
/* Construct a base metrics struct out of a monitor reduced via MPI.
 *
 *  base_metrics: output; every field is overwritten
 *  monitor:      local monitor that provides this process' contribution
 *  all_to_all:   forwarded to the app reduction; if true every rank obtains
 *                the reduced values, otherwise only the root of the reduction
 *
 * num_mpi_ranks is taken from the size of the world communicator; all other
 * fields are copied from the result of the application-level reduction. */
void perf_metrics__reduce_monitor_into_base_metrics(pop_base_metrics_t *base_metrics,
        const dlb_monitor_t *monitor, bool all_to_all) {

    /* First, reduce some values among processes in the node,
     * needed to compute pop metrics */
    node_reduction_t node_reduction = {0};
    reduce_pop_metrics_node_reduction(&node_reduction, monitor);

    /* With the node reduction, reduce again among all processes */
    app_reduction_t app_reduction = {0};
    reduce_pop_metrics_app_reduction(&app_reduction, &node_reduction,
            monitor, all_to_all);

    /* Finally, fill output base_metrics... */

    int num_mpi_ranks;
    PMPI_Comm_size(getWorldComm(), &num_mpi_ranks);

    *base_metrics = (const pop_base_metrics_t) {
        .num_cpus                = app_reduction.num_cpus,
        .num_mpi_ranks           = num_mpi_ranks,
        .num_nodes               = app_reduction.num_nodes,
        .avg_cpus                = app_reduction.avg_cpus,
        .num_gpus                = app_reduction.num_gpus,
        .cycles                  = app_reduction.cycles,
        .instructions            = app_reduction.instructions,
        .num_measurements        = app_reduction.num_measurements,
        .num_mpi_calls           = app_reduction.num_mpi_calls,
        .num_omp_parallels       = app_reduction.num_omp_parallels,
        .num_omp_tasks           = app_reduction.num_omp_tasks,
        .num_gpu_runtime_calls   = app_reduction.num_gpu_runtime_calls,
        .elapsed_time            = app_reduction.elapsed_time,
        .useful_time             = app_reduction.useful_time,
        .mpi_time                = app_reduction.mpi_time,
        .mpi_worker_idle_time    = app_reduction.mpi_worker_idle_time,
        .omp_load_imbalance_time = app_reduction.omp_load_imbalance_time,
        .omp_scheduling_time     = app_reduction.omp_scheduling_time,
        .omp_serialization_time  = app_reduction.omp_serialization_time,
        .gpu_runtime_time        = app_reduction.gpu_runtime_time,
        .min_mpi_normd_proc      = app_reduction.min_mpi_normd_proc,
        .min_mpi_normd_node      = app_reduction.min_mpi_normd_node,
        .gpu_useful_time         = app_reduction.gpu_useful_time,
        .gpu_communication_time  = app_reduction.gpu_communication_time,
        .gpu_inactive_time       = app_reduction.gpu_inactive_time,
        .max_gpu_useful_time     = app_reduction.max_gpu_useful_time,
        .max_gpu_active_time     = app_reduction.max_gpu_active_time,
    };
}
#endif
| 649 | |||
| 650 | |||
| 651 | /* Construct a base metrics struct out of a single monitor */ | ||
| 652 | 18 | void perf_metrics__local_monitor_into_base_metrics(pop_base_metrics_t *base_metrics, | |
| 653 | const dlb_monitor_t *monitor) { | ||
| 654 | |||
| 655 | 18 | bool have_gpus = (monitor->gpu_useful_time + monitor->gpu_communication_time > 0); | |
| 656 | |||
| 657 | 18 | double mpi_normd = | |
| 658 | 18 | (double)(monitor->mpi_time + monitor->mpi_worker_idle_time) / monitor->num_cpus; | |
| 659 | |||
| 660 | 18 | *base_metrics = (const pop_base_metrics_t){ | |
| 661 | 18 | .num_cpus = monitor->num_cpus, | |
| 662 | .num_mpi_ranks = 0, | ||
| 663 | .num_nodes = 1, | ||
| 664 | 18 | .avg_cpus = monitor->avg_cpus, | |
| 665 | 18 | .num_gpus = have_gpus ? 1 : 0, | |
| 666 | 18 | .cycles = (double)monitor->cycles, | |
| 667 | 18 | .instructions = (double)monitor->instructions, | |
| 668 | 18 | .num_measurements = monitor->num_measurements, | |
| 669 | 18 | .num_mpi_calls = monitor->num_mpi_calls, | |
| 670 | 18 | .num_omp_parallels = monitor->num_omp_parallels, | |
| 671 | 18 | .num_omp_tasks = monitor->num_omp_tasks, | |
| 672 | 18 | .num_gpu_runtime_calls = monitor->num_gpu_runtime_calls, | |
| 673 | 18 | .elapsed_time = monitor->elapsed_time, | |
| 674 | 18 | .useful_time = monitor->useful_time, | |
| 675 | 18 | .mpi_time = monitor->mpi_time, | |
| 676 | 18 | .mpi_worker_idle_time = monitor->mpi_worker_idle_time, | |
| 677 | 18 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | |
| 678 | 18 | .omp_scheduling_time = monitor->omp_scheduling_time, | |
| 679 | 18 | .omp_serialization_time = monitor->omp_serialization_time, | |
| 680 | 18 | .gpu_runtime_time = monitor->gpu_runtime_time, | |
| 681 | .min_mpi_normd_proc = mpi_normd, | ||
| 682 | .min_mpi_normd_node = mpi_normd, | ||
| 683 | 18 | .gpu_useful_time = monitor->gpu_useful_time, | |
| 684 | 18 | .gpu_communication_time = monitor->gpu_communication_time, | |
| 685 | 18 | .gpu_inactive_time = monitor->gpu_inactive_time, | |
| 686 | 18 | .max_gpu_useful_time = monitor->gpu_useful_time, | |
| 687 | 18 | .max_gpu_active_time = monitor->gpu_useful_time + monitor->gpu_communication_time, | |
| 688 | }; | ||
| 689 | 18 | } | |
| 690 | |||
| 691 | /* Compute POP metrics out of a base metrics struct */ | ||
| 692 | 18 | void perf_metrics__base_to_pop_metrics(const char *monitor_name, | |
| 693 | const pop_base_metrics_t *base_metrics, dlb_pop_metrics_t *pop_metrics) { | ||
| 694 | |||
| 695 | /* Compute POP metrics */ | ||
| 696 | 18 | perf_metrics_hybrid_t metrics = {0}; | |
| 697 | |||
| 698 |
1/2✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
|
18 | if (base_metrics->useful_time > 0) { |
| 699 | |||
| 700 |
1/3✗ Branch 0 not taken.
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
|
18 | switch(thread_spd->options.talp_model) { |
| 701 | ✗ | case TALP_MODEL_HYBRID_V1: | |
| 702 | ✗ | perf_metrics__compute_hybrid_model_v1(&metrics, base_metrics); | |
| 703 | ✗ | break; | |
| 704 | 18 | case TALP_MODEL_HYBRID_V2: | |
| 705 | 18 | perf_metrics__compute_hybrid_model_v2(&metrics, base_metrics); | |
| 706 | 18 | break; | |
| 707 | }; | ||
| 708 | } | ||
| 709 | |||
| 710 | /* Initialize structure */ | ||
| 711 | 18 | *pop_metrics = (const dlb_pop_metrics_t) { | |
| 712 | 18 | .num_cpus = base_metrics->num_cpus, | |
| 713 | 18 | .num_mpi_ranks = base_metrics->num_mpi_ranks, | |
| 714 | 18 | .num_nodes = base_metrics->num_nodes, | |
| 715 | 18 | .avg_cpus = base_metrics->avg_cpus, | |
| 716 | 18 | .num_gpus = base_metrics->num_gpus, | |
| 717 | 18 | .cycles = base_metrics->cycles, | |
| 718 | 18 | .instructions = base_metrics->instructions, | |
| 719 | 18 | .num_measurements = base_metrics->num_measurements, | |
| 720 | 18 | .num_mpi_calls = base_metrics->num_mpi_calls, | |
| 721 | 18 | .num_omp_parallels = base_metrics->num_omp_parallels, | |
| 722 | 18 | .num_omp_tasks = base_metrics->num_omp_tasks, | |
| 723 | 18 | .num_gpu_runtime_calls = base_metrics->num_gpu_runtime_calls, | |
| 724 | 18 | .elapsed_time = base_metrics->elapsed_time, | |
| 725 | 18 | .useful_time = base_metrics->useful_time, | |
| 726 | 18 | .mpi_time = base_metrics->mpi_time, | |
| 727 | 18 | .mpi_worker_idle_time = base_metrics->mpi_worker_idle_time, | |
| 728 | 18 | .omp_load_imbalance_time = base_metrics->omp_load_imbalance_time, | |
| 729 | 18 | .omp_scheduling_time = base_metrics->omp_scheduling_time, | |
| 730 | 18 | .omp_serialization_time = base_metrics->omp_serialization_time, | |
| 731 | 18 | .gpu_runtime_time = base_metrics->gpu_runtime_time, | |
| 732 | 18 | .min_mpi_normd_proc = base_metrics->min_mpi_normd_proc, | |
| 733 | 18 | .min_mpi_normd_node = base_metrics->min_mpi_normd_node, | |
| 734 | 18 | .gpu_useful_time = base_metrics->gpu_useful_time, | |
| 735 | 18 | .gpu_communication_time = base_metrics->gpu_communication_time, | |
| 736 | 18 | .gpu_inactive_time = base_metrics->gpu_inactive_time, | |
| 737 | 18 | .max_gpu_useful_time = base_metrics->max_gpu_useful_time, | |
| 738 | 18 | .max_gpu_active_time = base_metrics->max_gpu_active_time, | |
| 739 | 18 | .parallel_efficiency = metrics.parallel_efficiency, | |
| 740 | 18 | .mpi_parallel_efficiency = metrics.mpi_parallel_efficiency, | |
| 741 | 18 | .mpi_communication_efficiency = metrics.mpi_communication_efficiency, | |
| 742 | 18 | .mpi_load_balance = metrics.mpi_load_balance, | |
| 743 | 18 | .mpi_load_balance_in = metrics.mpi_load_balance_in, | |
| 744 | 18 | .mpi_load_balance_out = metrics.mpi_load_balance_out, | |
| 745 | 18 | .omp_parallel_efficiency = metrics.omp_parallel_efficiency, | |
| 746 | 18 | .omp_load_balance = metrics.omp_load_balance, | |
| 747 | 18 | .omp_scheduling_efficiency = metrics.omp_scheduling_efficiency, | |
| 748 | 18 | .omp_serialization_efficiency = metrics.omp_serialization_efficiency, | |
| 749 | 18 | .device_offload_efficiency = metrics.device_offload_efficiency, | |
| 750 | 18 | .gpu_parallel_efficiency = metrics.gpu_parallel_efficiency, | |
| 751 | 18 | .gpu_load_balance = metrics.gpu_load_balance, | |
| 752 | 18 | .gpu_communication_efficiency = metrics.gpu_communication_efficiency, | |
| 753 | 18 | .gpu_orchestration_efficiency = metrics.gpu_orchestration_efficiency, | |
| 754 | }; | ||
| 755 | 18 | snprintf(pop_metrics->name, DLB_MONITOR_NAME_MAX, "%s", monitor_name); | |
| 756 | 18 | } | |
| 757 |