| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2025 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #ifdef HAVE_CONFIG_H | ||
| 21 | #include <config.h> | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #include "talp/talp.h" | ||
| 25 | |||
| 26 | #include "LB_core/node_barrier.h" | ||
| 27 | #include "LB_core/spd.h" | ||
| 28 | #include "LB_comm/shmem_talp.h" | ||
| 29 | #include "apis/dlb_errors.h" | ||
| 30 | #include "apis/dlb_talp.h" | ||
| 31 | #include "support/atomic.h" | ||
| 32 | #include "support/debug.h" | ||
| 33 | #include "support/error.h" | ||
| 34 | #include "support/gslist.h" | ||
| 35 | #include "support/gtree.h" | ||
| 36 | #include "support/mytime.h" | ||
| 37 | #include "support/tracing.h" | ||
| 38 | #include "support/options.h" | ||
| 39 | #include "support/mask_utils.h" | ||
| 40 | #include "talp/perf_metrics.h" | ||
| 41 | #include "talp/regions.h" | ||
| 42 | #include "talp/talp_gpu.h" | ||
| 43 | #include "talp/talp_output.h" | ||
| 44 | #include "talp/talp_record.h" | ||
| 45 | #include "talp/talp_types.h" | ||
| 46 | #ifdef MPI_LIB | ||
| 47 | #include "mpi/mpi_core.h" | ||
| 48 | #endif | ||
| 49 | |||
| 50 | #include <stdlib.h> | ||
| 51 | #include <pthread.h> | ||
| 52 | |||
| 53 | #ifdef PAPI_LIB | ||
| 54 | #include <papi.h> | ||
| 55 | #endif | ||
| 56 | |||
| 57 | extern __thread bool thread_is_observer; | ||
| 58 | |||
| 59 | static void talp_dealloc_samples(const subprocess_descriptor_t *spd); | ||
| 60 | |||
| 61 | |||
| 62 | #if PAPI_LIB | ||
| 63 | static __thread int EventSet = PAPI_NULL; | ||
| 64 | #endif | ||
| 65 | |||
| 66 | |||
| 67 | /* Update all open regions with the macrosample */ | ||
| 68 | 12532 | static void update_regions_with_macrosample(const subprocess_descriptor_t *spd, | |
| 69 | const talp_macrosample_t *macrosample, int num_cpus) { | ||
| 70 | 12532 | talp_info_t *talp_info = spd->talp_info; | |
| 71 | |||
| 72 | /* Update all open regions */ | ||
| 73 | 12532 | pthread_mutex_lock(&talp_info->regions_mutex); | |
| 74 | { | ||
| 75 | 12532 | for (GSList *node = talp_info->open_regions; | |
| 76 |
2/2✓ Branch 0 taken 16708 times.
✓ Branch 1 taken 12532 times.
|
29240 | node != NULL; |
| 77 | 16708 | node = node->next) { | |
| 78 | 16708 | dlb_monitor_t *monitor = node->data; | |
| 79 | 16708 | monitor_data_t *monitor_data = monitor->_data; | |
| 80 | |||
| 81 | /* Update number of CPUs if needed */ | ||
| 82 | 16708 | monitor->num_cpus = max_int(monitor->num_cpus, num_cpus); | |
| 83 | |||
| 84 | /* Timers */ | ||
| 85 | 16708 | monitor->useful_time += macrosample->timers.useful; | |
| 86 | 16708 | monitor->mpi_time += macrosample->timers.not_useful_mpi; | |
| 87 | 16708 | monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb; | |
| 88 | 16708 | monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched; | |
| 89 | 16708 | monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out; | |
| 90 | 16708 | monitor->gpu_runtime_time += macrosample->timers.not_useful_gpu; | |
| 91 | |||
| 92 | /* GPU Timers */ | ||
| 93 | 16708 | monitor->gpu_useful_time += macrosample->gpu_timers.useful; | |
| 94 | 16708 | monitor->gpu_communication_time += macrosample->gpu_timers.communication; | |
| 95 | 16708 | monitor->gpu_inactive_time += macrosample->gpu_timers.inactive; | |
| 96 | |||
| 97 | #ifdef PAPI_LIB | ||
| 98 | /* Counters */ | ||
| 99 | monitor->cycles += macrosample->counters.cycles; | ||
| 100 | monitor->instructions += macrosample->counters.instructions; | ||
| 101 | #endif | ||
| 102 | /* Stats */ | ||
| 103 | 16708 | monitor->num_mpi_calls += macrosample->stats.num_mpi_calls; | |
| 104 | 16708 | monitor->num_omp_parallels += macrosample->stats.num_omp_parallels; | |
| 105 | 16708 | monitor->num_omp_tasks += macrosample->stats.num_omp_tasks; | |
| 106 | 16708 | monitor->num_gpu_runtime_calls += macrosample->stats.num_gpu_runtime_calls; | |
| 107 | |||
| 108 | /* Update shared memory only if requested */ | ||
| 109 |
2/2✓ Branch 0 taken 14403 times.
✓ Branch 1 taken 2305 times.
|
16708 | if (talp_info->flags.external_profiler) { |
| 110 | 14403 | shmem_talp__set_times(monitor_data->node_shared_id, | |
| 111 | monitor->mpi_time, | ||
| 112 | monitor->useful_time); | ||
| 113 | } | ||
| 114 | } | ||
| 115 | } | ||
| 116 | 12532 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
| 117 | 12532 | } | |
| 118 | |||
| 119 | |||
| 120 | |||
| 121 | /*********************************************************************************/ | ||
| 122 | /* PAPI counters */ | ||
| 123 | /*********************************************************************************/ | ||
| 124 | |||
| 125 | /* Called once in the main thread */ | ||
| 126 | static inline int init_papi(void) __attribute__((unused)); | ||
| 127 | static inline int init_papi(void) { | ||
| 128 | #ifdef PAPI_LIB | ||
| 129 | ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi, | ||
| 130 | "Error invoking %s when PAPI has been disabled", __FUNCTION__); | ||
| 131 | |||
| 132 | /* Library init */ | ||
| 133 | int retval = PAPI_library_init(PAPI_VER_CURRENT); | ||
| 134 | if(retval != PAPI_VER_CURRENT || retval == PAPI_EINVAL){ | ||
| 135 | // PAPI init failed because of a version mismatch | ||
| 136 | // Maybe we can rectify the situation by cheating a bit. | ||
| 137 | // Lets first check if the major version is the same | ||
| 138 | int loaded_lib_version = PAPI_get_opt(PAPI_LIB_VERSION, NULL); | ||
| 139 | if(PAPI_VERSION_MAJOR(PAPI_VER_CURRENT) == PAPI_VERSION_MAJOR(loaded_lib_version)){ | ||
| 140 | warning("The PAPI version loaded at runtime differs from the one DLB was compiled with. Expected version %d.%d.%d but got %d.%d.%d. Continuing on best effort.", | ||
| 141 | PAPI_VERSION_MAJOR(PAPI_VER_CURRENT), | ||
| 142 | PAPI_VERSION_MINOR(PAPI_VER_CURRENT), | ||
| 143 | PAPI_VERSION_REVISION(PAPI_VER_CURRENT), | ||
| 144 | PAPI_VERSION_MAJOR(loaded_lib_version), | ||
| 145 | PAPI_VERSION_MINOR(loaded_lib_version), | ||
| 146 | PAPI_VERSION_REVISION(loaded_lib_version)); | ||
| 147 | // retval here can only be loaded_lib_version or negative. We assume loaded_lib_version and check for other failures below | ||
| 148 | retval = PAPI_library_init(loaded_lib_version); | ||
| 149 | } | ||
| 150 | else{ | ||
| 151 | // we have a different major version. we fail. | ||
| 152 | warning("The PAPI version loaded at runtime differs greatly from the one DLB was compiled with. Expected version%d.%d.%d but got %d.%d.%d.", | ||
| 153 | PAPI_VERSION_MAJOR(PAPI_VER_CURRENT), | ||
| 154 | PAPI_VERSION_MINOR(PAPI_VER_CURRENT), | ||
| 155 | PAPI_VERSION_REVISION(PAPI_VER_CURRENT), | ||
| 156 | PAPI_VERSION_MAJOR(loaded_lib_version), | ||
| 157 | PAPI_VERSION_MINOR(loaded_lib_version), | ||
| 158 | PAPI_VERSION_REVISION(loaded_lib_version)); | ||
| 159 | return -1; | ||
| 160 | } | ||
| 161 | } | ||
| 162 | |||
| 163 | if(retval < 0 || PAPI_is_initialized() == PAPI_NOT_INITED){ | ||
| 164 | // so we failed, we can maybe get some info why: | ||
| 165 | if(retval == PAPI_ENOMEM){ | ||
| 166 | warning("PAPI initialization failed: Insufficient memory to complete the operation."); | ||
| 167 | } | ||
| 168 | if(retval == PAPI_ECMP){ | ||
| 169 | warning("PAPI initialization failed: This component does not support the underlying hardware."); | ||
| 170 | } | ||
| 171 | if(retval == PAPI_ESYS){ | ||
| 172 | warning("PAPI initialization failed: A system or C library call failed inside PAPI."); | ||
| 173 | } | ||
| 174 | return -1; | ||
| 175 | } | ||
| 176 | |||
| 177 | /* Activate thread tracing */ | ||
| 178 | int error = PAPI_thread_init(pthread_self); | ||
| 179 | if (error != PAPI_OK) { | ||
| 180 | warning("PAPI Error during thread initialization. %d: %s", | ||
| 181 | error, PAPI_strerror(error)); | ||
| 182 | return -1; | ||
| 183 | } | ||
| 184 | #endif | ||
| 185 | return 0; | ||
| 186 | } | ||
| 187 | |||
| 188 | /* Called once per thread */ | ||
| 189 | ✗ | int talp_init_papi_counters(void) { | |
| 190 | #ifdef PAPI_LIB | ||
| 191 | ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi, | ||
| 192 | "Error invoking %s when PAPI has been disabled", __FUNCTION__); | ||
| 193 | |||
| 194 | int error = PAPI_register_thread(); | ||
| 195 | if (error != PAPI_OK) { | ||
| 196 | warning("PAPI Error during thread registration. %d: %s", | ||
| 197 | error, PAPI_strerror(error)); | ||
| 198 | return -1; | ||
| 199 | } | ||
| 200 | |||
| 201 | /* Eventset creation */ | ||
| 202 | EventSet = PAPI_NULL; | ||
| 203 | error = PAPI_create_eventset(&EventSet); | ||
| 204 | if (error != PAPI_OK) { | ||
| 205 | warning("PAPI Error during eventset creation. %d: %s", | ||
| 206 | error, PAPI_strerror(error)); | ||
| 207 | return -1; | ||
| 208 | } | ||
| 209 | |||
| 210 | int Events[2] = {PAPI_TOT_CYC, PAPI_TOT_INS}; | ||
| 211 | error = PAPI_add_events(EventSet, Events, 2); | ||
| 212 | if (error != PAPI_OK) { | ||
| 213 | warning("PAPI Error adding events. %d: %s", | ||
| 214 | error, PAPI_strerror(error)); | ||
| 215 | return -1; | ||
| 216 | } | ||
| 217 | |||
| 218 | /* Start tracing */ | ||
| 219 | error = PAPI_start(EventSet); | ||
| 220 | if (error != PAPI_OK) { | ||
| 221 | warning("PAPI Error during tracing initialization: %d: %s", | ||
| 222 | error, PAPI_strerror(error)); | ||
| 223 | return -1; | ||
| 224 | } | ||
| 225 | #endif | ||
| 226 | ✗ | return 0; | |
| 227 | } | ||
| 228 | |||
| 229 | ✗ | static inline void update_last_read_papi_counters(talp_sample_t* sample) { | |
| 230 | #ifdef PAPI_LIB | ||
| 231 | ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi, | ||
| 232 | "Error invoking %s when PAPI has been disabled", __FUNCTION__); | ||
| 233 | |||
| 234 | if(sample->state == useful) { | ||
| 235 | // Similar to the old logic, we "reset" the PAPI counters here. | ||
| 236 | // But instead of calling PAPI_Reset we just store the current value, | ||
| 237 | // such that we can subtract the value later when updating the sample. | ||
| 238 | long long papi_values[2]; | ||
| 239 | int error = PAPI_read(EventSet, papi_values); | ||
| 240 | if (error != PAPI_OK) { | ||
| 241 | verbose(VB_TALP, "Error reading PAPI counters: %d, %s", error, PAPI_strerror(error)); | ||
| 242 | } | ||
| 243 | |||
| 244 | /* Update sample */ | ||
| 245 | sample->last_read_counters.cycles = papi_values[0]; | ||
| 246 | sample->last_read_counters.instructions = papi_values[1]; | ||
| 247 | } | ||
| 248 | #endif | ||
| 249 | } | ||
| 250 | |||
| 251 | /* Called once in the main thread */ | ||
| 252 | static inline void fini_papi(void) __attribute__((unused)); | ||
| 253 | static inline void fini_papi(void) { | ||
| 254 | #ifdef PAPI_LIB | ||
| 255 | ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi, | ||
| 256 | "Error invoking %s when PAPI has been disabled", __FUNCTION__); | ||
| 257 | |||
| 258 | PAPI_shutdown(); | ||
| 259 | #endif | ||
| 260 | } | ||
| 261 | |||
| 262 | /* Called once per thread */ | ||
| 263 | ✗ | void talp_fini_papi_counters(void) { | |
| 264 | #ifdef PAPI_LIB | ||
| 265 | ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi, | ||
| 266 | "Error invoking %s when PAPI has been disabled", __FUNCTION__); | ||
| 267 | |||
| 268 | int error; | ||
| 269 | |||
| 270 | if (EventSet != PAPI_NULL) { | ||
| 271 | // Stop counters (ignore if already stopped) | ||
| 272 | error = PAPI_stop(EventSet, NULL); | ||
| 273 | if (error != PAPI_OK && error != PAPI_ENOTRUN) { | ||
| 274 | warning("PAPI Error stopping counters. %d: %s", | ||
| 275 | error, PAPI_strerror(error)); | ||
| 276 | } | ||
| 277 | |||
| 278 | // Cleanup and destroy the EventSet | ||
| 279 | error = PAPI_cleanup_eventset(EventSet); | ||
| 280 | if (error != PAPI_OK) { | ||
| 281 | warning("PAPI Error cleaning eventset. %d: %s", | ||
| 282 | error, PAPI_strerror(error)); | ||
| 283 | } | ||
| 284 | |||
| 285 | error = PAPI_destroy_eventset(&EventSet); | ||
| 286 | if (error != PAPI_OK) { | ||
| 287 | warning("PAPI Error destroying eventset. %d: %s", | ||
| 288 | error, PAPI_strerror(error)); | ||
| 289 | } | ||
| 290 | |||
| 291 | EventSet = PAPI_NULL; | ||
| 292 | } | ||
| 293 | |||
| 294 | // Unregister this thread from PAPI | ||
| 295 | error = PAPI_unregister_thread(); | ||
| 296 | if (error != PAPI_OK) { | ||
| 297 | warning("PAPI Error unregistering thread. %d: %s", | ||
| 298 | error, PAPI_strerror(error)); | ||
| 299 | } | ||
| 300 | #endif | ||
| 301 | } | ||
| 302 | |||
| 303 | |||
| 304 | /*********************************************************************************/ | ||
| 305 | /* Init / Finalize */ | ||
| 306 | /*********************************************************************************/ | ||
| 307 | |||
| 308 | 17 | void talp_init(subprocess_descriptor_t *spd) { | |
| 309 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | ensure(!spd->talp_info, "TALP already initialized"); |
| 310 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | ensure(!thread_is_observer, "An observer thread cannot call talp_init"); |
| 311 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 14 times.
|
17 | verbose(VB_TALP, "Initializing TALP module"); |
| 312 | |||
| 313 | /* Initialize talp info */ | ||
| 314 | 17 | talp_info_t *talp_info = malloc(sizeof(talp_info_t)); | |
| 315 | 17 | *talp_info = (const talp_info_t) { | |
| 316 | .flags = { | ||
| 317 | 17 | .external_profiler = spd->options.talp_external_profiler, | |
| 318 | 17 | .papi = spd->options.talp_papi, | |
| 319 | 17 | .have_shmem = spd->options.talp_external_profiler, | |
| 320 | 17 | .have_minimal_shmem = !spd->options.talp_external_profiler | |
| 321 |
3/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
|
17 | && spd->options.talp_summary & SUMMARY_NODE, |
| 322 | }, | ||
| 323 | 17 | .regions = g_tree_new_full( | |
| 324 | (GCompareDataFunc)region_compare_by_name, | ||
| 325 | NULL, NULL, region_dealloc), | ||
| 326 | .regions_mutex = PTHREAD_MUTEX_INITIALIZER, | ||
| 327 | .samples_mutex = PTHREAD_MUTEX_INITIALIZER, | ||
| 328 | }; | ||
| 329 | 17 | spd->talp_info = talp_info; | |
| 330 | |||
| 331 | /* Initialize shared memory */ | ||
| 332 |
3/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
|
17 | if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) { |
| 333 | /* If we only need a minimal shmem, its size will be the user-provided | ||
| 334 | * multiplier times 'system_size' (usually, 1 region per process) | ||
| 335 | * Otherwise, we multiply it by DEFAULT_REGIONS_PER_PROC. | ||
| 336 | */ | ||
| 337 | enum { DEFAULT_REGIONS_PER_PROC = 100 }; | ||
| 338 | 10 | int shmem_size_multiplier = spd->options.shm_size_multiplier | |
| 339 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | * (talp_info->flags.have_shmem ? DEFAULT_REGIONS_PER_PROC : 1); |
| 340 | 5 | shmem_talp__init(spd->options.shm_key, shmem_size_multiplier); | |
| 341 | } | ||
| 342 | |||
| 343 | /* Initialize global region monitor | ||
| 344 | * (at this point we don't know how many CPUs, it will be fixed in talp_openmp_init) */ | ||
| 345 | 17 | talp_info->monitor = region_register(spd, region_get_global_name()); | |
| 346 | |||
| 347 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | verbose(VB_TALP, "TALP module with workers mask: %s", mu_to_str(&spd->process_mask)); |
| 348 | |||
| 349 | /* Initialize and start running PAPI */ | ||
| 350 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 9 times.
|
17 | if (talp_info->flags.papi) { |
| 351 | #ifdef PAPI_LIB | ||
| 352 | int papi_local_fail = 0; | ||
| 353 | int papi_global_fail = 0; | ||
| 354 | |||
| 355 | if (init_papi() != 0 || talp_init_papi_counters() != 0) { | ||
| 356 | papi_local_fail = 1; | ||
| 357 | } | ||
| 358 | |||
| 359 | #ifdef MPI_LIB | ||
| 360 | PMPI_Allreduce(&papi_local_fail, &papi_global_fail, 1, MPI_INT, MPI_MAX, getWorldComm()); | ||
| 361 | #endif | ||
| 362 | |||
| 363 | if (papi_global_fail && !papi_local_fail) { | ||
| 364 | /* Un-initialize */ | ||
| 365 | talp_fini_papi_counters(); | ||
| 366 | fini_papi(); | ||
| 367 | } | ||
| 368 | |||
| 369 | if (papi_global_fail || papi_local_fail) { | ||
| 370 | warning0("PAPI initialization has failed, disabling option."); | ||
| 371 | talp_info->flags.papi = false; | ||
| 372 | } | ||
| 373 | #else | ||
| 374 | 8 | warning0("DLB has not been configured with PAPI support, disabling option."); | |
| 375 | 8 | talp_info->flags.papi = false; | |
| 376 | #endif | ||
| 377 | } | ||
| 378 | |||
| 379 | /* Start global region */ | ||
| 380 | 17 | region_start(spd, talp_info->monitor); | |
| 381 | 17 | } | |
| 382 | |||
| 383 | 17 | void talp_finalize(subprocess_descriptor_t *spd) { | |
| 384 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | ensure(spd->talp_info, "TALP is not initialized"); |
| 385 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | ensure(!thread_is_observer, "An observer thread cannot call talp_finalize"); |
| 386 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
|
17 | verbose(VB_TALP, "Finalizing TALP module"); |
| 387 | |||
| 388 | 17 | talp_info_t *talp_info = spd->talp_info; | |
| 389 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
|
17 | if (!talp_info->flags.have_mpi) { |
| 390 | /* If we don't have MPI support, regions may be still running and | ||
| 391 | * without being recorded to talp_output. Do that now. */ | ||
| 392 | |||
| 393 | /* Stop open regions | ||
| 394 | * (Note that region_stop need to acquire the regions_mutex | ||
| 395 | * lock, so we we need to iterate without it) */ | ||
| 396 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 12 times.
|
21 | while(talp_info->open_regions != NULL) { |
| 397 | 9 | dlb_monitor_t *monitor = talp_info->open_regions->data; | |
| 398 | 9 | region_stop(spd, monitor); | |
| 399 | } | ||
| 400 | |||
| 401 | 12 | pthread_mutex_lock(&talp_info->regions_mutex); | |
| 402 | { | ||
| 403 | /* Record all regions */ | ||
| 404 | 12 | for (GTreeNode *node = g_tree_node_first(talp_info->regions); | |
| 405 |
2/2✓ Branch 0 taken 4818 times.
✓ Branch 1 taken 12 times.
|
4830 | node != NULL; |
| 406 | 4818 | node = g_tree_node_next(node)) { | |
| 407 | 4818 | const dlb_monitor_t *monitor = g_tree_node_value(node); | |
| 408 | 4818 | talp_record_monitor(spd, monitor); | |
| 409 | } | ||
| 410 | } | ||
| 411 | 12 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
| 412 | } | ||
| 413 | |||
| 414 | /* Print/write all collected summaries */ | ||
| 415 | 17 | talp_output_finalize(spd->options.talp_output_file); | |
| 416 | |||
| 417 | /* Deallocate samples structure */ | ||
| 418 | 17 | talp_dealloc_samples(spd); | |
| 419 | |||
| 420 | /* Finalize shared memory */ | ||
| 421 |
3/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
|
17 | if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) { |
| 422 | 5 | shmem_talp__finalize(spd->id); | |
| 423 | } | ||
| 424 | |||
| 425 | #ifdef PAPI_LIB | ||
| 426 | if (talp_info->flags.papi) { | ||
| 427 | fini_papi(); | ||
| 428 | } | ||
| 429 | #endif | ||
| 430 | |||
| 431 | /* Deallocate monitoring regions and talp_info */ | ||
| 432 | 17 | pthread_mutex_lock(&talp_info->regions_mutex); | |
| 433 | { | ||
| 434 | /* Destroy GTree, each node is deallocated with the function region_dealloc */ | ||
| 435 | 17 | g_tree_destroy(talp_info->regions); | |
| 436 | 17 | talp_info->regions = NULL; | |
| 437 | 17 | talp_info->monitor = NULL; | |
| 438 | |||
| 439 | /* Destroy list of open regions */ | ||
| 440 | 17 | g_slist_free(talp_info->open_regions); | |
| 441 | 17 | talp_info->open_regions = NULL; | |
| 442 | } | ||
| 443 | 17 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
| 444 | 17 | free(talp_info); | |
| 445 | 17 | spd->talp_info = NULL; | |
| 446 | 17 | } | |
| 447 | |||
| 448 | |||
| 449 | /*********************************************************************************/ | ||
| 450 | /* Sample functions */ | ||
| 451 | /*********************************************************************************/ | ||
| 452 | |||
| 453 | static __thread talp_sample_t* _tls_sample = NULL; | ||
| 454 | |||
| 455 | /* Quick test, without locking and without generating a new sample */ | ||
| 456 | static inline bool is_talp_sample_mine(const talp_sample_t *sample) { | ||
| 457 | return sample != NULL && sample == _tls_sample; | ||
| 458 | } | ||
| 459 | |||
| 460 | 17 | static void talp_dealloc_samples(const subprocess_descriptor_t *spd) { | |
| 461 | |||
| 462 | /* Warning about _tls_sample in worker threads: | ||
| 463 | * worker threads do not currently deallocate their sample. | ||
| 464 | * In some cases, it might happen that a worker thread exits without | ||
| 465 | * the main thread reducing its sample, so in these cases the sample | ||
| 466 | * needs to outlive the thread. | ||
| 467 | * The main thread could deallocate it at this point, but then the | ||
| 468 | * TLS variable would be broken if TALP is reinitialized again. | ||
| 469 | * For now we will keep it like this and will revisit if needed. */ | ||
| 470 | |||
| 471 | /* Deallocate main thread sample */ | ||
| 472 | 17 | free(_tls_sample); | |
| 473 | 17 | _tls_sample = NULL; | |
| 474 | |||
| 475 | /* Deallocate samples list */ | ||
| 476 | 17 | talp_info_t *talp_info = spd->talp_info; | |
| 477 | 17 | pthread_mutex_lock(&talp_info->samples_mutex); | |
| 478 | { | ||
| 479 | 17 | free(talp_info->samples); | |
| 480 | 17 | talp_info->samples = NULL; | |
| 481 | 17 | talp_info->ncpus = 0; | |
| 482 | } | ||
| 483 | 17 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
| 484 | 17 | } | |
| 485 | |||
| 486 | /* Get the TLS associated sample */ | ||
| 487 | 12583 | talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd) { | |
| 488 | /* Thread already has an allocated sample, return it */ | ||
| 489 |
2/2✓ Branch 0 taken 12565 times.
✓ Branch 1 taken 18 times.
|
12583 | if (likely(_tls_sample != NULL)) return _tls_sample; |
| 490 | |||
| 491 | /* Observer threads don't have a valid sample */ | ||
| 492 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 18 times.
|
18 | if (unlikely(thread_is_observer)) return NULL; |
| 493 | |||
| 494 | /* Otherwise, allocate */ | ||
| 495 | 18 | talp_info_t *talp_info = spd->talp_info; | |
| 496 | 18 | pthread_mutex_lock(&talp_info->samples_mutex); | |
| 497 | { | ||
| 498 | 18 | int ncpus = ++talp_info->ncpus; | |
| 499 | 18 | void *samples = realloc(talp_info->samples, sizeof(talp_sample_t*)*ncpus); | |
| 500 |
1/2✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
|
18 | if (samples) { |
| 501 | 18 | talp_info->samples = samples; | |
| 502 | void *new_sample; | ||
| 503 |
1/2✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
|
18 | if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) { |
| 504 | 18 | _tls_sample = new_sample; | |
| 505 | 18 | talp_info->samples[ncpus-1] = new_sample; | |
| 506 | } | ||
| 507 | } | ||
| 508 | } | ||
| 509 | 18 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
| 510 | |||
| 511 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 18 times.
|
18 | fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample"); |
| 512 | |||
| 513 | /* If a thread is created mid-region, its initial time is that of the | ||
| 514 | * innermost open region, otherwise it is the current time */ | ||
| 515 | int64_t last_updated_timestamp; | ||
| 516 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 16 times.
|
18 | if (talp_info->open_regions) { |
| 517 | 2 | const dlb_monitor_t *monitor = talp_info->open_regions->data; | |
| 518 | 2 | last_updated_timestamp = monitor->start_time; | |
| 519 | } else { | ||
| 520 | 16 | last_updated_timestamp = get_time_in_ns(); | |
| 521 | } | ||
| 522 | |||
| 523 | 18 | *_tls_sample = (const talp_sample_t) { | |
| 524 | .last_updated_timestamp = last_updated_timestamp, | ||
| 525 | }; | ||
| 526 | |||
| 527 | 18 | talp_set_sample_state(_tls_sample, disabled, talp_info->flags.papi); | |
| 528 | |||
| 529 | #ifdef INSTRUMENTATION_VERSION | ||
| 530 | unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR}; | ||
| 531 | long long papi_values[] = {0, 0}; | ||
| 532 | instrument_nevent(2, events, papi_values); | ||
| 533 | #endif | ||
| 534 | |||
| 535 | 18 | return _tls_sample; | |
| 536 | } | ||
| 537 | |||
| 538 | /* WARNING: this function may only be called when updating own thread's sample */ | ||
| 539 | 82 | void talp_set_sample_state(talp_sample_t *sample, enum talp_sample_state state, | |
| 540 | bool papi) { | ||
| 541 | 82 | sample->state = state; | |
| 542 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 82 times.
|
82 | if (papi) { |
| 543 | ✗ | update_last_read_papi_counters(sample); | |
| 544 | } | ||
| 545 | instrument_event(MONITOR_STATE, | ||
| 546 | state == disabled ? MONITOR_STATE_DISABLED | ||
| 547 | : state == useful ? MONITOR_STATE_USEFUL | ||
| 548 | : state == not_useful_mpi ? MONITOR_STATE_NOT_USEFUL_MPI | ||
| 549 | : state == not_useful_omp_in ? MONITOR_STATE_NOT_USEFUL_OMP_IN | ||
| 550 | : state == not_useful_omp_out ? MONITOR_STATE_NOT_USEFUL_OMP_OUT | ||
| 551 | : state == not_useful_gpu ? MONITOR_STATE_NOT_USEFUL_GPU | ||
| 552 | : 0, | ||
| 553 | EVENT_BEGIN); | ||
| 554 | 82 | } | |
| 555 | |||
| 556 | /* Compute new microsample (time since last update) and update sample values */ | ||
| 557 | 12559 | void talp_update_sample(talp_sample_t *sample, bool papi, int64_t timestamp) { | |
| 558 | /* Observer threads ignore this function */ | ||
| 559 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12559 times.
|
12559 | if (unlikely(sample == NULL)) return; |
| 560 | |||
| 561 | /* Compute duration and set new last_updated_timestamp */ | ||
| 562 |
2/2✓ Branch 0 taken 37 times.
✓ Branch 1 taken 12522 times.
|
12559 | int64_t now = timestamp == TALP_NO_TIMESTAMP ? get_time_in_ns() : timestamp; |
| 563 | 12559 | int64_t microsample_duration = now - sample->last_updated_timestamp; | |
| 564 | 12559 | sample->last_updated_timestamp = now; | |
| 565 | |||
| 566 | /* Update the appropriate sample timer */ | ||
| 567 |
5/7✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12535 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 11 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
|
12559 | switch(sample->state) { |
| 568 | 4 | case disabled: | |
| 569 | 4 | break; | |
| 570 | 12535 | case useful: | |
| 571 | 12535 | DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration); | |
| 572 | 12535 | break; | |
| 573 | 7 | case not_useful_mpi: | |
| 574 | 7 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration); | |
| 575 | 7 | break; | |
| 576 | 11 | case not_useful_omp_in: | |
| 577 | 11 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration); | |
| 578 | 11 | break; | |
| 579 | 2 | case not_useful_omp_out: | |
| 580 | 2 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration); | |
| 581 | 2 | break; | |
| 582 | ✗ | case not_useful_gpu: | |
| 583 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration); | |
| 584 | ✗ | break; | |
| 585 | } | ||
| 586 | |||
| 587 | #ifdef PAPI_LIB | ||
| 588 | if (papi) { | ||
| 589 | /* Only read counters if we are updating this thread's sample */ | ||
| 590 | if (is_talp_sample_mine(sample)) { | ||
| 591 | if (sample->state == useful) { | ||
| 592 | /* Read */ | ||
| 593 | long long papi_values[2]; | ||
| 594 | int error = PAPI_read(EventSet, papi_values); | ||
| 595 | if (error != PAPI_OK) { | ||
| 596 | verbose(VB_TALP, "stop return code: %d, %s", error, PAPI_strerror(error)); | ||
| 597 | } | ||
| 598 | // Compute the difference from the last value read | ||
| 599 | const long long useful_cycles = papi_values[0] - sample->last_read_counters.cycles; | ||
| 600 | const long long useful_instructions = papi_values[1] - sample->last_read_counters.instructions; | ||
| 601 | |||
| 602 | /* Atomically add papi_values to sample structure */ | ||
| 603 | DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, useful_cycles); | ||
| 604 | DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, useful_instructions); | ||
| 605 | |||
| 606 | #ifdef INSTRUMENTATION_VERSION | ||
| 607 | unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR}; | ||
| 608 | instrument_nevent(2, events, papi_values); | ||
| 609 | #endif | ||
| 610 | |||
| 611 | /* Counters are updated here and every time the sample is set to useful */ | ||
| 612 | sample->last_read_counters.cycles = papi_values[0]; | ||
| 613 | sample->last_read_counters.instructions = papi_values[1]; | ||
| 614 | } | ||
| 615 | else { | ||
| 616 | #ifdef INSTRUMENTATION_VERSION | ||
| 617 | /* Emit 0's to distinguish useful chunks in traces */ | ||
| 618 | unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR}; | ||
| 619 | long long papi_values[] = {0, 0}; | ||
| 620 | instrument_nevent(2, events, papi_values); | ||
| 621 | #endif | ||
| 622 | } | ||
| 623 | } | ||
| 624 | } | ||
| 625 | #endif /* PAPI_LIB */ | ||
| 626 | } | ||
| 627 | |||
| 628 | /* Flush and aggregate a single sample into a macrosample */ | ||
| 629 | 12522 | static inline void flush_sample_to_macrosample(talp_sample_t *sample, | |
| 630 | talp_macrosample_t *macrosample) { | ||
| 631 | |||
| 632 | /* Timers */ | ||
| 633 | 12522 | macrosample->timers.useful += | |
| 634 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0); | |
| 635 | 12522 | macrosample->timers.not_useful_mpi += | |
| 636 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0); | |
| 637 | 12522 | macrosample->timers.not_useful_omp_out += | |
| 638 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0); | |
| 639 | /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */ | ||
| 640 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12522 times.
|
12522 | ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0, |
| 641 | "Inconsistency in TALP sample metric not_useful_omp_in." | ||
| 642 | " Please, report bug at " PACKAGE_BUGREPORT); | ||
| 643 | 12522 | macrosample->timers.not_useful_gpu += | |
| 644 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_gpu, 0); | |
| 645 | |||
| 646 | #ifdef PAPI_LIB | ||
| 647 | /* Counters */ | ||
| 648 | macrosample->counters.cycles += | ||
| 649 | DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0); | ||
| 650 | macrosample->counters.instructions += | ||
| 651 | DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0); | ||
| 652 | #endif | ||
| 653 | |||
| 654 | /* Stats */ | ||
| 655 | 12522 | macrosample->stats.num_mpi_calls += | |
| 656 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0); | |
| 657 | 12522 | macrosample->stats.num_omp_parallels += | |
| 658 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0); | |
| 659 | 12522 | macrosample->stats.num_omp_tasks += | |
| 660 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0); | |
| 661 | 12522 | macrosample->stats.num_gpu_runtime_calls += | |
| 662 | 12522 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_gpu_runtime_calls, 0); | |
| 663 | 12522 | } | |
| 664 | |||
| 665 | /* Flush values from gpu_sample to macrosample */ | ||
| 666 | ✗ | void flush_gpu_sample_to_macrosample(talp_gpu_sample_t *gpu_sample, | |
| 667 | talp_macrosample_t *macrosample) { | ||
| 668 | |||
| 669 | ✗ | macrosample->gpu_timers.useful = gpu_sample->timers.useful; | |
| 670 | ✗ | macrosample->gpu_timers.communication = gpu_sample->timers.communication; | |
| 671 | ✗ | macrosample->gpu_timers.inactive = gpu_sample->timers.inactive; | |
| 672 | |||
| 673 | /* Reset */ | ||
| 674 | ✗ | *gpu_sample = (const talp_gpu_sample_t){0}; | |
| 675 | } | ||
| 676 | |||
| 677 | /* Accumulate values from samples of all threads and update regions */ | ||
| 678 | 12531 | int talp_flush_samples_to_regions(const subprocess_descriptor_t *spd) { | |
| 679 | |||
| 680 | /* Observer threads don't have a valid sample so they cannot start/stop regions */ | ||
| 681 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 12530 times.
|
12531 | if (unlikely(thread_is_observer)) return DLB_ERR_PERM; |
| 682 | |||
| 683 | int num_cpus; | ||
| 684 | 12530 | talp_info_t *talp_info = spd->talp_info; | |
| 685 | |||
| 686 | /* Accumulate samples from all threads */ | ||
| 687 | 12530 | talp_macrosample_t macrosample = (const talp_macrosample_t) {}; | |
| 688 | 12530 | pthread_mutex_lock(&talp_info->samples_mutex); | |
| 689 | { | ||
| 690 | 12530 | num_cpus = talp_info->ncpus; | |
| 691 | |||
| 692 | /* Force-update and aggregate all samples */ | ||
| 693 | 12530 | int64_t timestamp = get_time_in_ns(); | |
| 694 |
2/2✓ Branch 0 taken 12519 times.
✓ Branch 1 taken 12530 times.
|
25049 | for (int i = 0; i < num_cpus; ++i) { |
| 695 | 12519 | talp_update_sample(talp_info->samples[i], talp_info->flags.papi, timestamp); | |
| 696 | 12519 | flush_sample_to_macrosample(talp_info->samples[i], ¯osample); | |
| 697 | } | ||
| 698 | } | ||
| 699 | 12530 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
| 700 | |||
| 701 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12530 times.
|
12530 | if (talp_info->flags.have_gpu) { |
| 702 | /* gpu_sample data is filled asynchronously, first we need to synchronize | ||
| 703 | * and wait for the measurements to be read, then flush it */ | ||
| 704 | ✗ | talp_gpu_sync_measurements(); | |
| 705 | ✗ | flush_gpu_sample_to_macrosample(&talp_info->gpu_sample, ¯osample); | |
| 706 | } | ||
| 707 | |||
| 708 | /* Update all started regions */ | ||
| 709 | 12530 | update_regions_with_macrosample(spd, ¯osample, num_cpus); | |
| 710 | |||
| 711 | 12530 | return DLB_SUCCESS; | |
| 712 | } | ||
| 713 | |||
| 714 | /* Accumulate samples from only a subset of samples of a parallel region. | ||
| 715 | * Load Balance and Scheduling are computed here based on all samples. */ | ||
| 716 | 2 | void talp_flush_sample_subset_to_regions(const subprocess_descriptor_t *spd, | |
| 717 | talp_sample_t **samples, unsigned int nelems) { | ||
| 718 | |||
| 719 | 2 | talp_info_t *talp_info = spd->talp_info; | |
| 720 | 2 | talp_macrosample_t macrosample = (const talp_macrosample_t) {}; | |
| 721 | 2 | pthread_mutex_lock(&talp_info->samples_mutex); | |
| 722 | { | ||
| 723 | /* Iterate first to force-update all samples and compute the minimum | ||
| 724 | * not-useful-omp-in among them */ | ||
| 725 | 2 | int64_t timestamp = get_time_in_ns(); | |
| 726 | 2 | int64_t min_not_useful_omp_in = INT64_MAX; | |
| 727 | unsigned int i; | ||
| 728 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (i=0; i<nelems; ++i) { |
| 729 | 3 | talp_update_sample(samples[i], talp_info->flags.papi, timestamp); | |
| 730 | 3 | min_not_useful_omp_in = min_int64(min_not_useful_omp_in, | |
| 731 | 3 | DLB_ATOMIC_LD_RLX(&samples[i]->timers.not_useful_omp_in)); | |
| 732 | } | ||
| 733 | |||
| 734 | /* Iterate again to accumulate Load Balance, and to aggregate sample */ | ||
| 735 | 2 | int64_t sched_timer = min_not_useful_omp_in * nelems; | |
| 736 | 2 | int64_t lb_timer = 0; | |
| 737 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (i=0; i<nelems; ++i) { |
| 738 | 3 | lb_timer += DLB_ATOMIC_EXCH_RLX(&samples[i]->timers.not_useful_omp_in, 0) | |
| 739 | 3 | - min_not_useful_omp_in; | |
| 740 | 3 | flush_sample_to_macrosample(samples[i], ¯osample); | |
| 741 | } | ||
| 742 | |||
| 743 | /* Update derived timers into macrosample */ | ||
| 744 | 2 | macrosample.timers.not_useful_omp_in_lb = lb_timer; | |
| 745 | 2 | macrosample.timers.not_useful_omp_in_sched = sched_timer; | |
| 746 | } | ||
| 747 | 2 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
| 748 | |||
| 749 | /* Update all started regions */ | ||
| 750 | 2 | update_regions_with_macrosample(spd, ¯osample, nelems); | |
| 751 | 2 | } | |
| 752 | |||
| 753 | |||
| 754 | /*********************************************************************************/ | ||
| 755 | /* TALP collect functions for 3rd party programs: */ | ||
| 756 | /* - It's also safe to call it from a 1st party program */ | ||
| 757 | /* - Requires --talp-external-profiler set up in application */ | ||
| 758 | /* - Does not need to synchronize with application */ | ||
| 759 | /*********************************************************************************/ | ||
| 760 | |||
| 761 | /* Function that may be called from a third-party process to compute | ||
| 762 | * node_metrics for a given region */ | ||
| 763 | 6 | int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) { | |
| 764 | |||
| 765 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
|
6 | if (name == NULL) { |
| 766 | 1 | name = region_get_global_name(); | |
| 767 | } | ||
| 768 | |||
| 769 | 6 | int error = DLB_SUCCESS; | |
| 770 | 6 | int64_t total_mpi_time = 0; | |
| 771 | 6 | int64_t total_useful_time = 0; | |
| 772 | 6 | int64_t max_mpi_time = 0; | |
| 773 | 6 | int64_t max_useful_time = 0; | |
| 774 | |||
| 775 | /* Obtain a list of regions in the node associated with given region */ | ||
| 776 | 6 | int max_procs = mu_get_system_size(); | |
| 777 | 6 | talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t)); | |
| 778 | int nelems; | ||
| 779 | 6 | shmem_talp__get_regionlist(region_list, &nelems, max_procs, name); | |
| 780 | |||
| 781 | /* Count how many processes have started the region */ | ||
| 782 | 6 | int processes_per_node = 0; | |
| 783 | |||
| 784 | /* Iterate the PID list and gather times of every process */ | ||
| 785 | int i; | ||
| 786 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
|
13 | for (i = 0; i <nelems; ++i) { |
| 787 | 7 | int64_t mpi_time = region_list[i].mpi_time; | |
| 788 | 7 | int64_t useful_time = region_list[i].useful_time; | |
| 789 | |||
| 790 | /* Accumulate total and max values */ | ||
| 791 |
3/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
|
7 | if (mpi_time > 0 || useful_time > 0) { |
| 792 | 7 | ++processes_per_node; | |
| 793 | 7 | total_mpi_time += mpi_time; | |
| 794 | 7 | total_useful_time += useful_time; | |
| 795 | 7 | max_mpi_time = max_int64(mpi_time, max_mpi_time); | |
| 796 | 7 | max_useful_time = max_int64(useful_time, max_useful_time); | |
| 797 | } | ||
| 798 | } | ||
| 799 | 6 | free(region_list); | |
| 800 | |||
| 801 | #if MPI_LIB | ||
| 802 | int node_id = _node_id; | ||
| 803 | #else | ||
| 804 | 6 | int node_id = 0; | |
| 805 | #endif | ||
| 806 | |||
| 807 |
1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
|
6 | if (processes_per_node > 0) { |
| 808 | /* Compute POP metrics with some inferred values */ | ||
| 809 | perf_metrics_mpi_t metrics; | ||
| 810 | 6 | perf_metrics__infer_mpi_model( | |
| 811 | &metrics, | ||
| 812 | processes_per_node, | ||
| 813 | total_useful_time, | ||
| 814 | total_mpi_time, | ||
| 815 | max_useful_time); | ||
| 816 | |||
| 817 | /* Initialize structure */ | ||
| 818 | 6 | *node_metrics = (const dlb_node_metrics_t) { | |
| 819 | .node_id = node_id, | ||
| 820 | .processes_per_node = processes_per_node, | ||
| 821 | .total_useful_time = total_useful_time, | ||
| 822 | .total_mpi_time = total_mpi_time, | ||
| 823 | .max_useful_time = max_useful_time, | ||
| 824 | .max_mpi_time = max_mpi_time, | ||
| 825 | 6 | .parallel_efficiency = metrics.parallel_efficiency, | |
| 826 | 6 | .communication_efficiency = metrics.communication_efficiency, | |
| 827 | 6 | .load_balance = metrics.load_balance, | |
| 828 | }; | ||
| 829 | 6 | snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name); | |
| 830 | } else { | ||
| 831 | ✗ | error = DLB_ERR_NOENT; | |
| 832 | } | ||
| 833 | |||
| 834 | 6 | return error; | |
| 835 | } | ||
| 836 | |||
| 837 | |||
| 838 | /*********************************************************************************/ | ||
| 839 | /* TALP collect functions for 1st party programs */ | ||
| 840 | /* - Requires synchronization (MPI or node barrier) among all processes */ | ||
| 841 | /*********************************************************************************/ | ||
| 842 | |||
| 843 | /* Compute the current POP metrics for the specified monitor. If monitor is NULL, | ||
| 844 | * the global monitoring region is assumed. | ||
| 845 | * Pre-conditions: | ||
| 846 | * - if MPI, the given monitor must have been registered in all MPI ranks | ||
| 847 | * - pop_metrics is an allocated structure | ||
| 848 | */ | ||
| 849 | 1 | int talp_collect_pop_metrics(const subprocess_descriptor_t *spd, | |
| 850 | dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) { | ||
| 851 | 1 | talp_info_t *talp_info = spd->talp_info; | |
| 852 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (monitor == NULL) { |
| 853 | ✗ | monitor = talp_info->monitor; | |
| 854 | } | ||
| 855 | |||
| 856 | /* Stop monitor so that metrics are updated */ | ||
| 857 | 1 | bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS; | |
| 858 | |||
| 859 | pop_base_metrics_t base_metrics; | ||
| 860 | #ifdef MPI_LIB | ||
| 861 | /* Reduce monitor among all MPI ranks and everbody collects (all-to-all) */ | ||
| 862 | perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, true); | ||
| 863 | #else | ||
| 864 | /* Construct base metrics using only the monitor from this process */ | ||
| 865 | 1 | perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor); | |
| 866 | #endif | ||
| 867 | |||
| 868 | /* Construct output pop_metrics out of base metrics */ | ||
| 869 | 1 | perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, pop_metrics); | |
| 870 | |||
| 871 | /* Resume monitor */ | ||
| 872 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (resume_region) { |
| 873 | ✗ | region_start(spd, monitor); | |
| 874 | } | ||
| 875 | |||
| 876 | 1 | return DLB_SUCCESS; | |
| 877 | } | ||
| 878 | |||
| 879 | /* Node-collective function to compute node_metrics for a given region */ | ||
| 880 | 5 | int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd, | |
| 881 | dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) { | ||
| 882 | |||
| 883 | 5 | talp_info_t *talp_info = spd->talp_info; | |
| 884 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 3 times.
|
5 | monitor = monitor ? monitor : talp_info->monitor; |
| 885 | 5 | monitor_data_t *monitor_data = monitor->_data; | |
| 886 | |||
| 887 | /* Stop monitor so that metrics are updated */ | ||
| 888 | 5 | bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS; | |
| 889 | |||
| 890 | /* This functionality needs a shared memory, create a temporary one if needed */ | ||
| 891 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | if (!talp_info->flags.have_shmem) { |
| 892 | 5 | shmem_talp__init(spd->options.shm_key, 1); | |
| 893 | 5 | shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name, | |
| 894 | &monitor_data->node_shared_id); | ||
| 895 | } | ||
| 896 | |||
| 897 | /* Update the shared memory with this process' metrics */ | ||
| 898 | 5 | shmem_talp__set_times(monitor_data->node_shared_id, | |
| 899 | monitor->mpi_time, | ||
| 900 | monitor->useful_time); | ||
| 901 | |||
| 902 | /* Perform a node barrier to ensure everyone has updated their metrics */ | ||
| 903 | 5 | node_barrier(spd, NULL); | |
| 904 | |||
| 905 | /* Compute node metrics for that region name */ | ||
| 906 | 5 | talp_query_pop_node_metrics(monitor->name, node_metrics); | |
| 907 | |||
| 908 | /* Remove shared memory if it was a temporary one */ | ||
| 909 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | if (!talp_info->flags.have_shmem) { |
| 910 | 5 | shmem_talp__finalize(spd->id); | |
| 911 | } | ||
| 912 | |||
| 913 | /* Resume monitor */ | ||
| 914 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
|
5 | if (resume_region) { |
| 915 | 1 | region_start(spd, monitor); | |
| 916 | } | ||
| 917 | |||
| 918 | 5 | return DLB_SUCCESS; | |
| 919 | } | ||
| 920 |