| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2025 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/talp_record.h" | ||
| 21 | |||
| 22 | #include "LB_comm/shmem_talp.h" | ||
| 23 | #include "LB_core/node_barrier.h" | ||
| 24 | #include "LB_core/spd.h" | ||
| 25 | #include "apis/dlb_talp.h" | ||
| 26 | #include "support/debug.h" | ||
| 27 | #include "support/mask_utils.h" | ||
| 28 | #include "support/options.h" | ||
| 29 | #include "talp/perf_metrics.h" | ||
| 30 | #include "talp/regions.h" | ||
| 31 | #include "talp/talp_output.h" | ||
| 32 | #include "talp/talp_types.h" | ||
| 33 | #ifdef MPI_LIB | ||
| 34 | #include "mpi/mpi_core.h" | ||
| 35 | #endif | ||
| 36 | |||
| 37 | #include <stddef.h> | ||
| 38 | #include <stdio.h> | ||
| 39 | #include <unistd.h> | ||
| 40 | |||
| 41 | |||
| 42 | /*********************************************************************************/ | ||
| 43 | /* TALP Record in serial (non-MPI) mode */ | ||
| 44 | /*********************************************************************************/ | ||
| 45 | |||
| 46 | /* For any given monitor, record metrics considering only this (sub-)process */ | ||
| 47 | 4818 | void talp_record_monitor(const subprocess_descriptor_t *spd, | |
| 48 | const dlb_monitor_t *monitor) { | ||
| 49 | 2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 4816 times. | 4818 | if (spd->options.talp_summary & SUMMARY_PROCESS) { |
| 50 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 2 times. | 2 | verbose(VB_TALP, "TALP process summary: recording region %s", monitor->name); |
| 51 | |||
| 52 | 2 | process_record_t process_record = { | |
| 53 | .rank = 0, | ||
| 54 | 2 | .pid = spd->id, | |
| 55 | .monitor = *monitor, | ||
| 56 | }; | ||
| 57 | |||
| 58 | /* Fill hostname and CPU mask strings in process_record */ | ||
| 59 | 2 | gethostname(process_record.hostname, HOST_NAME_MAX); | |
| 60 | 2 | snprintf(process_record.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s", | |
| 61 | mu_to_str(&spd->process_mask)); | ||
| 62 | 2 | mu_get_quoted_mask(&spd->process_mask, process_record.cpuset_quoted, | |
| 63 | TALP_OUTPUT_CPUSET_MAX); | ||
| 64 | |||
| 65 | /* Add record */ | ||
| 66 | 2 | talp_output_record_process(monitor->name, &process_record, 1); | |
| 67 | } | ||
| 68 | |||
| 69 | 2/2 ✓ Branch 0 taken 15 times. ✓ Branch 1 taken 4803 times. | 4818 | if (spd->options.talp_summary & SUMMARY_POP_METRICS) { |
| 70 | 2/2 ✓ Branch 0 taken 14 times. ✓ Branch 1 taken 1 times. | 15 | if (monitor->elapsed_time > 0) { |
| 71 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 14 times. | 14 | verbose(VB_TALP, "TALP summary: recording region %s", monitor->name); |
| 72 | |||
| 73 | pop_base_metrics_t base_metrics; | ||
| 74 | 14 | perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor); | |
| 75 | |||
| 76 | dlb_pop_metrics_t pop_metrics; | ||
| 77 | 14 | perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics); | |
| 78 | 14 | talp_output_record_pop_metrics(&pop_metrics); | |
| 79 | |||
| 80 | 14 | talp_info_t *talp_info = spd->talp_info; | |
| 81 | 2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 6 times. | 14 | if (monitor == talp_info->monitor) { |
| 82 | 8 | talp_output_record_resources(monitor->num_cpus, | |
| 83 | /* num_nodes */ 1, /* num_ranks */ 0, base_metrics.num_gpus); | ||
| 84 | } | ||
| 85 | |||
| 86 | } else { | ||
| 87 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 1 times. | 1 | verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name); |
| 88 | 1 | dlb_pop_metrics_t pop_metrics = {0}; | |
| 89 | 1 | snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name); | |
| 90 | 1 | talp_output_record_pop_metrics(&pop_metrics); | |
| 91 | } | ||
| 92 | } | ||
| 93 | 4818 | } | |
| 94 | |||
| 95 | |||
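In serial mode, `talp_record_monitor()` consumes a `dlb_monitor_t` that the application created through the public TALP API declared in `apis/dlb_talp.h`. As a rough illustration of how a region handle ends up on this code path, the sketch below uses the public entry points; the region name is illustrative, and the exact reporting flow depends on the `--talp-summary` option value (`process`, `pop-metrics`):

```c
#include <dlb_talp.h>

int main(void) {
    /* Register a named region; DLB returns an opaque monitor handle */
    dlb_monitor_t *mon = DLB_MonitoringRegionRegister("solver");

    DLB_MonitoringRegionStart(mon);
    /* ... measured computation phase ... */
    DLB_MonitoringRegionStop(mon);

    /* At finalization, each region reaches talp_record_monitor() when
     * --talp-summary requests process or POP-metrics summaries */
    return 0;
}
```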
| 96 | /*********************************************************************************/ | ||
| 97 | /* TALP Record in MPI mode */ | ||
| 98 | /*********************************************************************************/ | ||
| 99 | |||
| 100 | #ifdef MPI_LIB | ||
| 101 | |||
| 102 | /* Compute Node summary of all Global Monitors and record data */ | ||
| 103 | void talp_record_node_summary(const subprocess_descriptor_t *spd) { | ||
| 104 | node_record_t *node_summary = NULL; | ||
| 105 | size_t node_summary_size = 0; | ||
| 106 | |||
| 107 | /* Perform a barrier so that all processes in the node have arrived at | ||
| 108 | * MPI_Finalize */ | ||
| 109 | node_barrier(spd, NULL); | ||
| 110 | |||
| 111 | /* Node process 0 reduces all global regions from all processes in the node */ | ||
| 112 | if (_process_id == 0) { | ||
| 113 | /* Obtain a list of regions associated with the Global Region Name, sorted by PID */ | ||
| 114 | int max_procs = mu_get_system_size(); | ||
| 115 | talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t)); | ||
| 116 | int nelems; | ||
| 117 | shmem_talp__get_regionlist(region_list, &nelems, max_procs, region_get_global_name()); | ||
| 118 | |||
| 119 | /* Allocate and initialize node summary structure */ | ||
| 120 | node_summary_size = sizeof(node_record_t) + sizeof(process_in_node_record_t) * nelems; | ||
| 121 | node_summary = malloc(node_summary_size); | ||
| 122 | *node_summary = (const node_record_t) { | ||
| 123 | .node_id = _node_id, | ||
| 124 | .nelems = nelems, | ||
| 125 | }; | ||
| 126 | |||
| 127 | /* Iterate the PID list and gather times of every process */ | ||
| 128 | for (int i = 0; i < nelems; ++i) { | ||
| 129 | int64_t mpi_time = region_list[i].mpi_time; | ||
| 130 | int64_t useful_time = region_list[i].useful_time; | ||
| 131 | |||
| 132 | /* Save times in local structure */ | ||
| 133 | node_summary->processes[i].pid = region_list[i].pid; | ||
| 134 | node_summary->processes[i].mpi_time = mpi_time; | ||
| 135 | node_summary->processes[i].useful_time = useful_time; | ||
| 136 | |||
| 137 | /* Accumulate total and max values */ | ||
| 138 | node_summary->avg_useful_time += useful_time; | ||
| 139 | node_summary->avg_mpi_time += mpi_time; | ||
| 140 | node_summary->max_useful_time = max_int64(useful_time, node_summary->max_useful_time); | ||
| 141 | node_summary->max_mpi_time = max_int64(mpi_time, node_summary->max_mpi_time); | ||
| 142 | } | ||
| 143 | free(region_list); | ||
| 144 | |||
| 145 | /* Compute average values */ | ||
| 146 | node_summary->avg_useful_time /= node_summary->nelems; | ||
| 147 | node_summary->avg_mpi_time /= node_summary->nelems; | ||
| 148 | } | ||
| 149 | |||
| 150 | /* Perform a final barrier so that all processes wait until _process_id 0 | ||
| 151 | * has gathered all the data */ | ||
| 152 | node_barrier(spd, NULL); | ||
| 153 | |||
| 154 | /* The main process of each node sends its data to rank 0 */ | ||
| 155 | if (_process_id == 0) { | ||
| 156 | verbose(VB_TALP, "Node summary: gathering data"); | ||
| 157 | |||
| 158 | /* MPI type: int64_t */ | ||
| 159 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 160 | |||
| 161 | /* MPI type: pid_t */ | ||
| 162 | MPI_Datatype mpi_pid_type; | ||
| 163 | PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type); | ||
| 164 | |||
| 165 | /* MPI struct type: process_in_node_record_t */ | ||
| 166 | MPI_Datatype mpi_process_info_type; | ||
| 167 | { | ||
| 168 | int count = 3; | ||
| 169 | int blocklengths[] = {1, 1, 1}; | ||
| 170 | MPI_Aint displacements[] = { | ||
| 171 | offsetof(process_in_node_record_t, pid), | ||
| 172 | offsetof(process_in_node_record_t, mpi_time), | ||
| 173 | offsetof(process_in_node_record_t, useful_time)}; | ||
| 174 | MPI_Datatype types[] = {mpi_pid_type, mpi_int64_type, mpi_int64_type}; | ||
| 175 | MPI_Datatype tmp_type; | ||
| 176 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 177 | PMPI_Type_create_resized(tmp_type, 0, sizeof(process_in_node_record_t), | ||
| 178 | &mpi_process_info_type); | ||
| 179 | PMPI_Type_commit(&mpi_process_info_type); | ||
| 180 | } | ||
| 181 | |||
| 182 | /* MPI struct type: node_record_t */ | ||
| 183 | MPI_Datatype mpi_node_record_type; | ||
| 184 | { | ||
| 185 | int count = 7; | ||
| 186 | int blocklengths[] = {1, 1, 1, 1, 1, 1, node_summary->nelems}; | ||
| 187 | MPI_Aint displacements[] = { | ||
| 188 | offsetof(node_record_t, node_id), | ||
| 189 | offsetof(node_record_t, nelems), | ||
| 190 | offsetof(node_record_t, avg_useful_time), | ||
| 191 | offsetof(node_record_t, avg_mpi_time), | ||
| 192 | offsetof(node_record_t, max_useful_time), | ||
| 193 | offsetof(node_record_t, max_mpi_time), | ||
| 194 | offsetof(node_record_t, processes)}; | ||
| 195 | MPI_Datatype types[] = {MPI_INT, MPI_INT, mpi_int64_type, mpi_int64_type, | ||
| 196 | mpi_int64_type, mpi_int64_type, mpi_process_info_type}; | ||
| 197 | MPI_Datatype tmp_type; | ||
| 198 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 199 | PMPI_Type_create_resized(tmp_type, 0, node_summary_size, &mpi_node_record_type); | ||
| 200 | PMPI_Type_commit(&mpi_node_record_type); | ||
| 201 | } | ||
| 202 | |||
| 203 | /* Gather data */ | ||
| 204 | void *recvbuf = NULL; | ||
| 205 | if (_mpi_rank == 0) { | ||
| 206 | recvbuf = malloc(_num_nodes * node_summary_size); | ||
| 207 | } | ||
| 208 | PMPI_Gather(node_summary, 1, mpi_node_record_type, | ||
| 209 | recvbuf, 1, mpi_node_record_type, | ||
| 210 | 0, getInterNodeComm()); | ||
| 211 | |||
| 212 | /* Free send buffer and MPI Datatypes */ | ||
| 213 | free(node_summary); | ||
| 214 | PMPI_Type_free(&mpi_process_info_type); | ||
| 215 | PMPI_Type_free(&mpi_node_record_type); | ||
| 216 | |||
| 217 | /* Add records */ | ||
| 218 | if (_mpi_rank == 0) { | ||
| 219 | for (int node_id = 0; node_id < _num_nodes; ++node_id) { | ||
| 220 | verbose(VB_TALP, "Node summary: recording node %d", node_id); | ||
| 221 | node_record_t *node_record = (node_record_t*)( | ||
| 222 | (unsigned char *)recvbuf + node_summary_size * node_id); | ||
| 223 | ensure( node_id == node_record->node_id, "Node id error in %s", __func__ ); | ||
| 224 | talp_output_record_node(node_record); | ||
| 225 | } | ||
| 226 | free(recvbuf); | ||
| 227 | } | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
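`talp_record_node_summary()` gathers one variable-length `node_record_t` per node: the struct ends in a flexible array member, and `PMPI_Type_create_resized` stretches the datatype's extent to the full malloc'd size so that consecutive records pack back-to-back in the receive buffer. A minimal standalone sketch of the same pattern, with a hypothetical struct and names that are not DLB's:

```c
#include <mpi.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative record ending in a flexible array member, mirroring the
 * node_record_t pattern above (names here are hypothetical). */
typedef struct {
    int     id;
    int     nelems;
    int64_t values[];    /* flexible array member */
} record_t;

/* Build a committed MPI datatype whose extent equals the full allocation
 * size, so that consecutive variable-length records in a gather buffer
 * start exactly record_size bytes apart. */
static MPI_Datatype make_record_type(int nelems, size_t record_size) {
    int blocklengths[] = {1, 1, nelems};
    MPI_Aint displacements[] = {
        offsetof(record_t, id),
        offsetof(record_t, nelems),
        offsetof(record_t, values)};
    MPI_Datatype types[] = {MPI_INT, MPI_INT, MPI_INT64_T};
    MPI_Datatype tmp_type, record_type;
    MPI_Type_create_struct(3, blocklengths, displacements, types, &tmp_type);
    MPI_Type_create_resized(tmp_type, 0, (MPI_Aint)record_size, &record_type);
    MPI_Type_commit(&record_type);
    MPI_Type_free(&tmp_type);
    return record_type;
}
```

A gather with such a type only lines up if every rank builds it with the same `nelems` (and therefore the same `record_size`), which the function above implicitly assumes when it gathers one `node_record_t` per node.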
| 231 | /* Gather PROCESS data of a monitor among all ranks and record it in rank 0 */ | ||
| 232 | void talp_record_process_summary(const subprocess_descriptor_t *spd, | ||
| 233 | const dlb_monitor_t *monitor) { | ||
| 234 | |||
| 235 | /* Internal monitors will not be recorded */ | ||
| 236 | if (((monitor_data_t*)monitor->_data)->flags.internal) { | ||
| 237 | return; | ||
| 238 | } | ||
| 239 | |||
| 240 | if (_mpi_rank == 0) { | ||
| 241 | verbose(VB_TALP, "Process summary: gathering region %s", monitor->name); | ||
| 242 | } | ||
| 243 | |||
| 244 | process_record_t process_record_send = { | ||
| 245 | .rank = _mpi_rank, | ||
| 246 | .pid = spd->id, | ||
| 247 | .node_id = _node_id, | ||
| 248 | .monitor = *monitor, | ||
| 249 | }; | ||
| 250 | |||
| 251 | /* Invalidate pointers of the copied monitor */ | ||
| 252 | process_record_send.monitor.name = NULL; | ||
| 253 | process_record_send.monitor._data = NULL; | ||
| 254 | |||
| 255 | /* Fill hostname and CPU mask strings in process_record_send */ | ||
| 256 | gethostname(process_record_send.hostname, HOST_NAME_MAX); | ||
| 257 | snprintf(process_record_send.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s", | ||
| 258 | mu_to_str(&spd->process_mask)); | ||
| 259 | mu_get_quoted_mask(&spd->process_mask, process_record_send.cpuset_quoted, | ||
| 260 | TALP_OUTPUT_CPUSET_MAX); | ||
| 261 | |||
| 262 | /* MPI type: int64_t */ | ||
| 263 | MPI_Datatype mpi_int64_type = get_mpi_int64_type(); | ||
| 264 | |||
| 265 | /* MPI type: pid_t */ | ||
| 266 | MPI_Datatype mpi_pid_type; | ||
| 267 | PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type); | ||
| 268 | |||
| 269 | /* Note: sending addresses via MPI is meaningless, but we send the whole | ||
| 270 | * dlb_monitor_t as-is; the address fields are discarded on the receiving | ||
| 271 | * side either way. */ | ||
| 272 | |||
| 273 | /* MPI type: void* */ | ||
| 274 | MPI_Datatype address_type; | ||
| 275 | PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(void*), &address_type); | ||
| 276 | |||
| 277 | /* MPI struct type: dlb_monitor_t */ | ||
| 278 | MPI_Datatype mpi_dlb_monitor_type; | ||
| 279 | { | ||
| 280 | int blocklengths[] = { | ||
| 281 | 1, 1, 1, /* Name + Resources: num_cpus, avg_cpus */ | ||
| 282 | 1, 1, /* Hardware counters: cycles, instructions */ | ||
| 283 | 1, 1, 1, 1, 1, 1, /* Statistics: num_* */ | ||
| 284 | 1, 1, /* Monitor Start and Stop times */ | ||
| 285 | 1, 1, 1, 1, 1, 1, 1, /* Host Times */ | ||
| 286 | 1, 1, 1, /* Device Times */ | ||
| 287 | 1}; /* _data */ | ||
| 288 | |||
| 289 | enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])}; | ||
| 290 | |||
| 291 | MPI_Aint displacements[] = { | ||
| 292 | offsetof(dlb_monitor_t, name), | ||
| 293 | /* Resources */ | ||
| 294 | offsetof(dlb_monitor_t, num_cpus), | ||
| 295 | offsetof(dlb_monitor_t, avg_cpus), | ||
| 296 | /* Hardware counters */ | ||
| 297 | offsetof(dlb_monitor_t, cycles), | ||
| 298 | offsetof(dlb_monitor_t, instructions), | ||
| 299 | /* Statistics */ | ||
| 300 | offsetof(dlb_monitor_t, num_measurements), | ||
| 301 | offsetof(dlb_monitor_t, num_resets), | ||
| 302 | offsetof(dlb_monitor_t, num_mpi_calls), | ||
| 303 | offsetof(dlb_monitor_t, num_omp_parallels), | ||
| 304 | offsetof(dlb_monitor_t, num_omp_tasks), | ||
| 305 | offsetof(dlb_monitor_t, num_gpu_runtime_calls), | ||
| 306 | /* Monitor Start and Stop times */ | ||
| 307 | offsetof(dlb_monitor_t, start_time), | ||
| 308 | offsetof(dlb_monitor_t, stop_time), | ||
| 309 | /* Host Times */ | ||
| 310 | offsetof(dlb_monitor_t, elapsed_time), | ||
| 311 | offsetof(dlb_monitor_t, useful_time), | ||
| 312 | offsetof(dlb_monitor_t, mpi_time), | ||
| 313 | offsetof(dlb_monitor_t, omp_load_imbalance_time), | ||
| 314 | offsetof(dlb_monitor_t, omp_scheduling_time), | ||
| 315 | offsetof(dlb_monitor_t, omp_serialization_time), | ||
| 316 | offsetof(dlb_monitor_t, gpu_runtime_time), | ||
| 317 | /* Device Times */ | ||
| 318 | offsetof(dlb_monitor_t, gpu_useful_time), | ||
| 319 | offsetof(dlb_monitor_t, gpu_communication_time), | ||
| 320 | offsetof(dlb_monitor_t, gpu_inactive_time), | ||
| 321 | /* _data */ | ||
| 322 | offsetof(dlb_monitor_t, _data)}; | ||
| 323 | |||
| 324 | MPI_Datatype types[] = { | ||
| 325 | address_type, MPI_INT, MPI_FLOAT, /* Name + Resources: num_cpus, avg_cpus */ | ||
| 326 | mpi_int64_type, mpi_int64_type, /* Hardware counters: cycles, instructions */ | ||
| 327 | MPI_INT, MPI_INT, | ||
| 328 | mpi_int64_type, mpi_int64_type, | ||
| 329 | mpi_int64_type, mpi_int64_type, /* Statistics: num_* */ | ||
| 330 | mpi_int64_type, mpi_int64_type, /* Monitor Start and Stop times */ | ||
| 331 | mpi_int64_type, mpi_int64_type, | ||
| 332 | mpi_int64_type, mpi_int64_type, | ||
| 333 | mpi_int64_type, mpi_int64_type, | ||
| 334 | mpi_int64_type, /* Host Times */ | ||
| 335 | mpi_int64_type, mpi_int64_type, | ||
| 336 | mpi_int64_type, /* Device Times */ | ||
| 337 | address_type}; /* _data */ | ||
| 338 | |||
| 339 | MPI_Datatype tmp_type; | ||
| 340 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 341 | PMPI_Type_create_resized(tmp_type, 0, sizeof(dlb_monitor_t), &mpi_dlb_monitor_type); | ||
| 342 | PMPI_Type_commit(&mpi_dlb_monitor_type); | ||
| 343 | |||
| 344 | static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count, | ||
| 345 | "blocklengths size mismatch"); | ||
| 346 | static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count, | ||
| 347 | "displacements size mismatch"); | ||
| 348 | static_ensure(sizeof(types)/sizeof(types[0]) == count, | ||
| 349 | "types size mismatch"); | ||
| 350 | } | ||
| 351 | |||
| 352 | /* MPI struct type: process_record_t */ | ||
| 353 | MPI_Datatype mpi_process_record_type; | ||
| 354 | { | ||
| 355 | int count = 7; | ||
| 356 | int blocklengths[] = {1, 1, 1, HOST_NAME_MAX, | ||
| 357 | TALP_OUTPUT_CPUSET_MAX, TALP_OUTPUT_CPUSET_MAX, 1}; | ||
| 358 | MPI_Aint displacements[] = { | ||
| 359 | offsetof(process_record_t, rank), | ||
| 360 | offsetof(process_record_t, pid), | ||
| 361 | offsetof(process_record_t, node_id), | ||
| 362 | offsetof(process_record_t, hostname), | ||
| 363 | offsetof(process_record_t, cpuset), | ||
| 364 | offsetof(process_record_t, cpuset_quoted), | ||
| 365 | offsetof(process_record_t, monitor)}; | ||
| 366 | MPI_Datatype types[] = {MPI_INT, mpi_pid_type, MPI_INT, MPI_CHAR, MPI_CHAR, | ||
| 367 | MPI_CHAR, mpi_dlb_monitor_type}; | ||
| 368 | MPI_Datatype tmp_type; | ||
| 369 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
| 370 | PMPI_Type_create_resized(tmp_type, 0, sizeof(process_record_t), | ||
| 371 | &mpi_process_record_type); | ||
| 372 | PMPI_Type_commit(&mpi_process_record_type); | ||
| 373 | } | ||
| 374 | |||
| 375 | /* Gather data */ | ||
| 376 | process_record_t *recvbuf = NULL; | ||
| 377 | if (_mpi_rank == 0) { | ||
| 378 | recvbuf = malloc(_mpi_size * sizeof(process_record_t)); | ||
| 379 | } | ||
| 380 | PMPI_Gather(&process_record_send, 1, mpi_process_record_type, | ||
| 381 | recvbuf, 1, mpi_process_record_type, | ||
| 382 | 0, getWorldComm()); | ||
| 383 | |||
| 384 | /* Add records */ | ||
| 385 | if (_mpi_rank == 0) { | ||
| 386 | for (int rank = 0; rank < _mpi_size; ++rank) { | ||
| 387 | verbose(VB_TALP, "Process summary: recording region %s on rank %d", | ||
| 388 | monitor->name, rank); | ||
| 389 | talp_output_record_process(monitor->name, &recvbuf[rank], _mpi_size); | ||
| 390 | } | ||
| 391 | free(recvbuf); | ||
| 392 | } | ||
| 393 | |||
| 394 | /* Free MPI types */ | ||
| 395 | PMPI_Type_free(&mpi_dlb_monitor_type); | ||
| 396 | PMPI_Type_free(&mpi_process_record_type); | ||
| 397 | } | ||
| 398 | |||
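Both gather functions need MPI datatypes for C types that have no fixed predefined MPI equivalent: `pid_t`, and the pointer fields inside `dlb_monitor_t`. `PMPI_Type_match_size` selects a predefined integer datatype of matching size at run time, so the struct layout is transmitted byte-exactly. A condensed sketch of the idiom (function name is illustrative):

```c
#include <mpi.h>
#include <sys/types.h>   /* pid_t */

/* Select predefined MPI datatypes that match opaque C types by size.
 * Datatypes returned by MPI_Type_match_size are predefined: they must
 * not be committed or freed. */
static void match_opaque_types(MPI_Datatype *pid_type, MPI_Datatype *addr_type) {
    MPI_Type_match_size(MPI_TYPECLASS_INTEGER, (int)sizeof(pid_t), pid_type);
    /* Pointer fields travel as opaque integers; receivers discard them,
     * which is why the code above nulls monitor.name and monitor._data
     * before sending */
    MPI_Type_match_size(MPI_TYPECLASS_INTEGER, (int)sizeof(void *), addr_type);
}
```

The file itself calls the `PMPI_` variants rather than `MPI_`, presumably so that DLB's own bookkeeping traffic bypasses the MPI profiling layer and is not counted as application MPI calls.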
| 399 | /* Gather POP METRICS data of a monitor among all ranks and record it in rank 0 */ | ||
| 400 | void talp_record_pop_summary(const subprocess_descriptor_t *spd, | ||
| 401 | const dlb_monitor_t *monitor) { | ||
| 402 | |||
| 403 | /* Internal monitors will not be recorded */ | ||
| 404 | if (((monitor_data_t*)monitor->_data)->flags.internal) { | ||
| 405 | return; | ||
| 406 | } | ||
| 407 | |||
| 408 | if (_mpi_rank == 0) { | ||
| 409 | verbose(VB_TALP, "TALP summary: gathering region %s", monitor->name); | ||
| 410 | } | ||
| 411 | |||
| 412 | talp_info_t *talp_info = spd->talp_info; | ||
| 413 | |||
| 414 | /* Reduce monitor among all MPI ranks into MPI rank 0 */ | ||
| 415 | pop_base_metrics_t base_metrics; | ||
| 416 | perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, false); | ||
| 417 | |||
| 418 | if (_mpi_rank == 0) { | ||
| 419 | if (base_metrics.elapsed_time > 0) { | ||
| 420 | |||
| 421 | /* Only the global region records the resources */ | ||
| 422 | if (monitor == talp_info->monitor) { | ||
| 423 | talp_output_record_resources(base_metrics.num_cpus, | ||
| 424 | base_metrics.num_nodes, base_metrics.num_mpi_ranks, | ||
| 425 | base_metrics.num_gpus); | ||
| 426 | } | ||
| 427 | |||
| 428 | /* Construct pop_metrics out of base metrics */ | ||
| 429 | dlb_pop_metrics_t pop_metrics; | ||
| 430 | perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics); | ||
| 431 | |||
| 432 | /* Record */ | ||
| 433 | verbose(VB_TALP, "TALP summary: recording region %s", monitor->name); | ||
| 434 | talp_output_record_pop_metrics(&pop_metrics); | ||
| 435 | |||
| 436 | } else { | ||
| 437 | /* Record empty */ | ||
| 438 | verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name); | ||
| 439 | dlb_pop_metrics_t pop_metrics = {0}; | ||
| 440 | snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name); | ||
| 441 | talp_output_record_pop_metrics(&pop_metrics); | ||
| 442 | } | ||
| 443 | } | ||
| 444 | } | ||
| 445 | |||
| 446 | #endif /* MPI_LIB */ | ||
| 447 |
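`perf_metrics__base_to_pop_metrics()` (defined in `talp/perf_metrics.c`, not shown here) turns the reduced base metrics into POP-style efficiency factors. Purely for orientation, the core host-side relations reduce to something like the sketch below; this is a hypothetical simplification, not DLB's formula, and the real computation also folds in OpenMP, GPU, and hardware-counter inputs:

```c
#include <stdint.h>

/* Simplified view of host-side POP efficiency factors. All times are in
 * the same unit (TALP keeps times as int64_t nanoseconds). */
typedef struct {
    double parallel_efficiency;
    double communication_efficiency;
    double load_balance;
} pop_factors_t;

static pop_factors_t pop_factors(int num_cpus, int64_t elapsed_time,
                                 int64_t total_useful_time,  /* sum over CPUs */
                                 int64_t max_useful_time) {  /* max over CPUs */
    double avg_useful = (double)total_useful_time / num_cpus;
    return (pop_factors_t) {
        /* fraction of total CPU time spent on useful computation */
        .parallel_efficiency      = avg_useful / elapsed_time,
        /* how busy the busiest CPU is, net of load imbalance */
        .communication_efficiency = (double)max_useful_time / elapsed_time,
        /* average vs. maximum useful time across CPUs */
        .load_balance             = avg_useful / max_useful_time,
    };
}
```

The factors compose multiplicatively: `parallel_efficiency == communication_efficiency * load_balance`, the usual POP decomposition of the MPI host side.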