GCC Code Coverage Report


Directory: src/
File: src/talp/talp.c
Date: 2025-11-21 10:34:40
             Exec   Total   Coverage
Lines:        244     262      93.1%
Functions:     13      17      76.5%
Branches:      73      99      73.7%
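
Coverage is the ratio of executed to instrumented items: 244/262 lines ≈ 93.1%, 13/17 functions ≈ 76.5%, and 73/99 branches ≈ 73.7%. In the annotated listing below, lines shown without an execution count were either not instrumented or not executed in this run, and branches marked '✗ ... not taken' were never followed.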

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/talp.h"
25
26 #include "LB_core/node_barrier.h"
27 #include "LB_core/spd.h"
28 #include "LB_comm/shmem_talp.h"
29 #include "apis/dlb_errors.h"
30 #include "apis/dlb_talp.h"
31 #include "support/atomic.h"
32 #include "support/debug.h"
33 #include "support/error.h"
34 #include "support/gslist.h"
35 #include "support/gtree.h"
36 #include "support/mytime.h"
37 #include "support/tracing.h"
38 #include "support/options.h"
39 #include "support/mask_utils.h"
40 #include "talp/perf_metrics.h"
41 #include "talp/regions.h"
42 #include "talp/talp_gpu.h"
43 #include "talp/talp_output.h"
44 #include "talp/talp_record.h"
45 #include "talp/talp_types.h"
46 #ifdef MPI_LIB
47 #include "mpi/mpi_core.h"
48 #endif
49
50 #include <stdlib.h>
51 #include <pthread.h>
52
53 #ifdef PAPI_LIB
54 #include <papi.h>
55 #endif
56
57 extern __thread bool thread_is_observer;
58
59 static void talp_dealloc_samples(const subprocess_descriptor_t *spd);
60
61
62 #ifdef PAPI_LIB
63 static __thread int EventSet = PAPI_NULL;
64 #endif
65
66
67 /* Update all open regions with the macrosample */
68 12532 static void update_regions_with_macrosample(const subprocess_descriptor_t *spd,
69 const talp_macrosample_t *macrosample, int num_cpus) {
70 12532 talp_info_t *talp_info = spd->talp_info;
71
72 /* Update all open regions */
73 12532 pthread_mutex_lock(&talp_info->regions_mutex);
74 {
75 12532 for (GSList *node = talp_info->open_regions;
76
2/2
✓ Branch 0 taken 16708 times.
✓ Branch 1 taken 12532 times.
29240 node != NULL;
77 16708 node = node->next) {
78 16708 dlb_monitor_t *monitor = node->data;
79 16708 monitor_data_t *monitor_data = monitor->_data;
80
81 /* Update number of CPUs if needed */
82 16708 monitor->num_cpus = max_int(monitor->num_cpus, num_cpus);
83
84 /* Timers */
85 16708 monitor->useful_time += macrosample->timers.useful;
86 16708 monitor->mpi_time += macrosample->timers.not_useful_mpi;
87 16708 monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb;
88 16708 monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched;
89 16708 monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out;
90 16708 monitor->gpu_runtime_time += macrosample->timers.not_useful_gpu;
91
92 /* GPU Timers */
93 16708 monitor->gpu_useful_time += macrosample->gpu_timers.useful;
94 16708 monitor->gpu_communication_time += macrosample->gpu_timers.communication;
95 16708 monitor->gpu_inactive_time += macrosample->gpu_timers.inactive;
96
97 #ifdef PAPI_LIB
98 /* Counters */
99 monitor->cycles += macrosample->counters.cycles;
100 monitor->instructions += macrosample->counters.instructions;
101 #endif
102 /* Stats */
103 16708 monitor->num_mpi_calls += macrosample->stats.num_mpi_calls;
104 16708 monitor->num_omp_parallels += macrosample->stats.num_omp_parallels;
105 16708 monitor->num_omp_tasks += macrosample->stats.num_omp_tasks;
106 16708 monitor->num_gpu_runtime_calls += macrosample->stats.num_gpu_runtime_calls;
107
108 /* Update shared memory only if requested */
109
2/2
✓ Branch 0 taken 14403 times.
✓ Branch 1 taken 2305 times.
16708 if (talp_info->flags.external_profiler) {
110 14403 shmem_talp__set_times(monitor_data->node_shared_id,
111 monitor->mpi_time,
112 monitor->useful_time);
113 }
114 }
115 }
116 12532 pthread_mutex_unlock(&talp_info->regions_mutex);
117 12532 }
118
119
120
121 /*********************************************************************************/
122 /* PAPI counters */
123 /*********************************************************************************/
124
125 /* Called once in the main thread */
126 static inline int init_papi(void) __attribute__((unused));
127 static inline int init_papi(void) {
128 #ifdef PAPI_LIB
129 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
130 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
131
132 /* Library init */
133 int retval = PAPI_library_init(PAPI_VER_CURRENT);
134 if(retval != PAPI_VER_CURRENT || retval == PAPI_EINVAL){
135 // PAPI init failed because of a version mismatch
136 // Maybe we can rectify the situation by cheating a bit.
137 // Let's first check if the major version is the same
138 int loaded_lib_version = PAPI_get_opt(PAPI_LIB_VERSION, NULL);
139 if(PAPI_VERSION_MAJOR(PAPI_VER_CURRENT) == PAPI_VERSION_MAJOR(loaded_lib_version)){
140 warning("The PAPI version loaded at runtime differs from the one DLB was compiled with. Expected version %d.%d.%d but got %d.%d.%d. Continuing on best effort.",
141 PAPI_VERSION_MAJOR(PAPI_VER_CURRENT),
142 PAPI_VERSION_MINOR(PAPI_VER_CURRENT),
143 PAPI_VERSION_REVISION(PAPI_VER_CURRENT),
144 PAPI_VERSION_MAJOR(loaded_lib_version),
145 PAPI_VERSION_MINOR(loaded_lib_version),
146 PAPI_VERSION_REVISION(loaded_lib_version));
147 // retval here can only be loaded_lib_version or negative. We assume loaded_lib_version and check for other failures below
148 retval = PAPI_library_init(loaded_lib_version);
149 }
150 else{
151 // We have a different major version: fail.
152 warning("The PAPI version loaded at runtime differs greatly from the one DLB was compiled with. Expected version%d.%d.%d but got %d.%d.%d.",
153 PAPI_VERSION_MAJOR(PAPI_VER_CURRENT),
154 PAPI_VERSION_MINOR(PAPI_VER_CURRENT),
155 PAPI_VERSION_REVISION(PAPI_VER_CURRENT),
156 PAPI_VERSION_MAJOR(loaded_lib_version),
157 PAPI_VERSION_MINOR(loaded_lib_version),
158 PAPI_VERSION_REVISION(loaded_lib_version));
159 return -1;
160 }
161 }
162
163 if(retval < 0 || PAPI_is_initialized() == PAPI_NOT_INITED){
164 // So we failed; maybe we can get some info about why:
165 if(retval == PAPI_ENOMEM){
166 warning("PAPI initialization failed: Insufficient memory to complete the operation.");
167 }
168 if(retval == PAPI_ECMP){
169 warning("PAPI initialization failed: This component does not support the underlying hardware.");
170 }
171 if(retval == PAPI_ESYS){
172 warning("PAPI initialization failed: A system or C library call failed inside PAPI.");
173 }
174 return -1;
175 }
176
177 /* Activate thread tracing */
178 int error = PAPI_thread_init(pthread_self);
179 if (error != PAPI_OK) {
180 warning("PAPI Error during thread initialization. %d: %s",
181 error, PAPI_strerror(error));
182 return -1;
183 }
184 #endif
185 return 0;
186 }
187
188 /* Called once per thread */
189 int talp_init_papi_counters(void) {
190 #ifdef PAPI_LIB
191 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
192 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
193
194 int error = PAPI_register_thread();
195 if (error != PAPI_OK) {
196 warning("PAPI Error during thread registration. %d: %s",
197 error, PAPI_strerror(error));
198 return -1;
199 }
200
201 /* Eventset creation */
202 EventSet = PAPI_NULL;
203 error = PAPI_create_eventset(&EventSet);
204 if (error != PAPI_OK) {
205 warning("PAPI Error during eventset creation. %d: %s",
206 error, PAPI_strerror(error));
207 return -1;
208 }
209
210 int Events[2] = {PAPI_TOT_CYC, PAPI_TOT_INS};
211 error = PAPI_add_events(EventSet, Events, 2);
212 if (error != PAPI_OK) {
213 warning("PAPI Error adding events. %d: %s",
214 error, PAPI_strerror(error));
215 return -1;
216 }
217
218 /* Start tracing */
219 error = PAPI_start(EventSet);
220 if (error != PAPI_OK) {
221 warning("PAPI Error during tracing initialization: %d: %s",
222 error, PAPI_strerror(error));
223 return -1;
224 }
225 #endif
226 return 0;
227 }
228
229 static inline void update_last_read_papi_counters(talp_sample_t* sample) {
230 #ifdef PAPI_LIB
231 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
232 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
233
234 if(sample->state == useful) {
235 // Similar to the old logic, we "reset" the PAPI counters here.
236 // But instead of calling PAPI_Reset we just store the current value,
237 // such that we can subtract the value later when updating the sample.
238 long long papi_values[2];
239 int error = PAPI_read(EventSet, papi_values);
240 if (error != PAPI_OK) {
241 verbose(VB_TALP, "Error reading PAPI counters: %d, %s", error, PAPI_strerror(error));
242 }
243
244 /* Update sample */
245 sample->last_read_counters.cycles = papi_values[0];
246 sample->last_read_counters.instructions = papi_values[1];
247 }
248 #endif
249 }
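
To make the baseline trick above concrete (numbers invented for illustration): if last_read_counters.cycles holds 1,000,000 and the next PAPI_read in talp_update_sample returns 1,450,000 while the sample is in the useful state, only the 450,000-cycle difference is accumulated into the sample, and 1,450,000 becomes the new baseline for the following read.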
250
251 /* Called once in the main thread */
252 static inline void fini_papi(void) __attribute__((unused));
253 static inline void fini_papi(void) {
254 #ifdef PAPI_LIB
255 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
256 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
257
258 PAPI_shutdown();
259 #endif
260 }
261
262 /* Called once per thread */
263 void talp_fini_papi_counters(void) {
264 #ifdef PAPI_LIB
265 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
266 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
267
268 int error;
269
270 if (EventSet != PAPI_NULL) {
271 // Stop counters (ignore if already stopped)
272 error = PAPI_stop(EventSet, NULL);
273 if (error != PAPI_OK && error != PAPI_ENOTRUN) {
274 warning("PAPI Error stopping counters. %d: %s",
275 error, PAPI_strerror(error));
276 }
277
278 // Cleanup and destroy the EventSet
279 error = PAPI_cleanup_eventset(EventSet);
280 if (error != PAPI_OK) {
281 warning("PAPI Error cleaning eventset. %d: %s",
282 error, PAPI_strerror(error));
283 }
284
285 error = PAPI_destroy_eventset(&EventSet);
286 if (error != PAPI_OK) {
287 warning("PAPI Error destroying eventset. %d: %s",
288 error, PAPI_strerror(error));
289 }
290
291 EventSet = PAPI_NULL;
292 }
293
294 // Unregister this thread from PAPI
295 error = PAPI_unregister_thread();
296 if (error != PAPI_OK) {
297 warning("PAPI Error unregistering thread. %d: %s",
298 error, PAPI_strerror(error));
299 }
300 #endif
301 }
302
303
304 /*********************************************************************************/
305 /* Init / Finalize */
306 /*********************************************************************************/
307
308 17 void talp_init(subprocess_descriptor_t *spd) {
309
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 ensure(!spd->talp_info, "TALP already initialized");
310
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 ensure(!thread_is_observer, "An observer thread cannot call talp_init");
311
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 14 times.
17 verbose(VB_TALP, "Initializing TALP module");
312
313 /* Initialize talp info */
314 17 talp_info_t *talp_info = malloc(sizeof(talp_info_t));
315 17 *talp_info = (const talp_info_t) {
316 .flags = {
317 17 .external_profiler = spd->options.talp_external_profiler,
318 17 .papi = spd->options.talp_papi,
319 17 .have_shmem = spd->options.talp_external_profiler,
320 17 .have_minimal_shmem = !spd->options.talp_external_profiler
321
3/4
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
17 && spd->options.talp_summary & SUMMARY_NODE,
322 },
323 17 .regions = g_tree_new_full(
324 (GCompareDataFunc)region_compare_by_name,
325 NULL, NULL, region_dealloc),
326 .regions_mutex = PTHREAD_MUTEX_INITIALIZER,
327 .samples_mutex = PTHREAD_MUTEX_INITIALIZER,
328 };
329 17 spd->talp_info = talp_info;
330
331 /* Initialize shared memory */
332
3/4
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
17 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
333 /* If we only need a minimal shmem, its size will be the user-provided
334 * multiplier times 'system_size' (usually, 1 region per process)
335 * Otherwise, we multiply it by DEFAULT_REGIONS_PER_PROC.
336 */
337 enum { DEFAULT_REGIONS_PER_PROC = 100 };
338 10 int shmem_size_multiplier = spd->options.shm_size_multiplier
339
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 * (talp_info->flags.have_shmem ? DEFAULT_REGIONS_PER_PROC : 1);
340 5 shmem_talp__init(spd->options.shm_key, shmem_size_multiplier);
341 }
342
343 /* Initialize global region monitor
344 * (at this point we don't know how many CPUs; this will be fixed in talp_openmp_init) */
345 17 talp_info->monitor = region_register(spd, region_get_global_name());
346
347
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 verbose(VB_TALP, "TALP module with workers mask: %s", mu_to_str(&spd->process_mask));
348
349 /* Initialize and start running PAPI */
350
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 9 times.
17 if (talp_info->flags.papi) {
351 #ifdef PAPI_LIB
352 int papi_local_fail = 0;
353 int papi_global_fail = 0;
354
355 if (init_papi() != 0 || talp_init_papi_counters() != 0) {
356 papi_local_fail = 1;
357 }
358
359 #ifdef MPI_LIB
360 PMPI_Allreduce(&papi_local_fail, &papi_global_fail, 1, MPI_INT, MPI_MAX, getWorldComm());
361 #endif
362
363 if (papi_global_fail && !papi_local_fail) {
364 /* Un-initialize */
365 talp_fini_papi_counters();
366 fini_papi();
367 }
368
369 if (papi_global_fail || papi_local_fail) {
370 warning0("PAPI initialization has failed, disabling option.");
371 talp_info->flags.papi = false;
372 }
373 #else
374 8 warning0("DLB has not been configured with PAPI support, disabling option.");
375 8 talp_info->flags.papi = false;
376 #endif
377 }
378
379 /* Start global region */
380 17 region_start(spd, talp_info->monitor);
381 17 }
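
Note on the collective fallback above: if any rank fails to initialize PAPI locally, the MPI_MAX all-reduce propagates the failure to every rank, so ranks that had succeeded locally tear PAPI back down and every rank disables the option together, keeping the measurement configuration consistent across the job.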
382
383 17 void talp_finalize(subprocess_descriptor_t *spd) {
384
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 ensure(spd->talp_info, "TALP is not initialized");
385
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 ensure(!thread_is_observer, "An observer thread cannot call talp_finalize");
386
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 verbose(VB_TALP, "Finalizing TALP module");
387
388 17 talp_info_t *talp_info = spd->talp_info;
389
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
17 if (!talp_info->flags.have_mpi) {
390 /* If we don't have MPI support, regions may be still running and
391 * without being recorded to talp_output. Do that now. */
392
393 /* Stop open regions
394 * (Note that region_stop needs to acquire the regions_mutex
395 * lock, so we need to iterate without holding it) */
396
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 12 times.
21 while(talp_info->open_regions != NULL) {
397 9 dlb_monitor_t *monitor = talp_info->open_regions->data;
398 9 region_stop(spd, monitor);
399 }
400
401 12 pthread_mutex_lock(&talp_info->regions_mutex);
402 {
403 /* Record all regions */
404 12 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
405
2/2
✓ Branch 0 taken 4818 times.
✓ Branch 1 taken 12 times.
4830 node != NULL;
406 4818 node = g_tree_node_next(node)) {
407 4818 const dlb_monitor_t *monitor = g_tree_node_value(node);
408 4818 talp_record_monitor(spd, monitor);
409 }
410 }
411 12 pthread_mutex_unlock(&talp_info->regions_mutex);
412 }
413
414 /* Print/write all collected summaries */
415 17 talp_output_finalize(spd->options.talp_output_file);
416
417 /* Deallocate samples structure */
418 17 talp_dealloc_samples(spd);
419
420 /* Finalize shared memory */
421
3/4
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
17 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
422 5 shmem_talp__finalize(spd->id);
423 }
424
425 #ifdef PAPI_LIB
426 if (talp_info->flags.papi) {
427 fini_papi();
428 }
429 #endif
430
431 /* Deallocate monitoring regions and talp_info */
432 17 pthread_mutex_lock(&talp_info->regions_mutex);
433 {
434 /* Destroy GTree, each node is deallocated with the function region_dealloc */
435 17 g_tree_destroy(talp_info->regions);
436 17 talp_info->regions = NULL;
437 17 talp_info->monitor = NULL;
438
439 /* Destroy list of open regions */
440 17 g_slist_free(talp_info->open_regions);
441 17 talp_info->open_regions = NULL;
442 }
443 17 pthread_mutex_unlock(&talp_info->regions_mutex);
444 17 free(talp_info);
445 17 spd->talp_info = NULL;
446 17 }
447
448
449 /*********************************************************************************/
450 /* Sample functions */
451 /*********************************************************************************/
452
453 static __thread talp_sample_t* _tls_sample = NULL;
454
455 /* Quick test, without locking and without generating a new sample */
456 static inline bool is_talp_sample_mine(const talp_sample_t *sample) {
457 return sample != NULL && sample == _tls_sample;
458 }
459
460 17 static void talp_dealloc_samples(const subprocess_descriptor_t *spd) {
461
462 /* Warning about _tls_sample in worker threads:
463 * worker threads do not currently deallocate their sample.
464 * In some cases, it might happen that a worker thread exits without
465 * the main thread reducing its sample, so in these cases the sample
466 * needs to outlive the thread.
467 * The main thread could deallocate it at this point, but then the
468 * TLS variable would be broken if TALP were reinitialized.
469 * For now we will keep it like this and will revisit if needed. */
470
471 /* Deallocate main thread sample */
472 17 free(_tls_sample);
473 17 _tls_sample = NULL;
474
475 /* Deallocate samples list */
476 17 talp_info_t *talp_info = spd->talp_info;
477 17 pthread_mutex_lock(&talp_info->samples_mutex);
478 {
479 17 free(talp_info->samples);
480 17 talp_info->samples = NULL;
481 17 talp_info->ncpus = 0;
482 }
483 17 pthread_mutex_unlock(&talp_info->samples_mutex);
484 17 }
485
486 /* Get the TLS associated sample */
487 12583 talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd) {
488 /* Thread already has an allocated sample, return it */
489
2/2
✓ Branch 0 taken 12565 times.
✓ Branch 1 taken 18 times.
12583 if (likely(_tls_sample != NULL)) return _tls_sample;
490
491 /* Observer threads don't have a valid sample */
492
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 18 times.
18 if (unlikely(thread_is_observer)) return NULL;
493
494 /* Otherwise, allocate */
495 18 talp_info_t *talp_info = spd->talp_info;
496 18 pthread_mutex_lock(&talp_info->samples_mutex);
497 {
498 18 int ncpus = ++talp_info->ncpus;
499 18 void *samples = realloc(talp_info->samples, sizeof(talp_sample_t*)*ncpus);
500
1/2
✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
18 if (samples) {
501 18 talp_info->samples = samples;
502 void *new_sample;
503
1/2
✓ Branch 0 taken 18 times.
✗ Branch 1 not taken.
18 if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) {
504 18 _tls_sample = new_sample;
505 18 talp_info->samples[ncpus-1] = new_sample;
506 }
507 }
508 }
509 18 pthread_mutex_unlock(&talp_info->samples_mutex);
510
511
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 18 times.
18 fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample");
512
513 /* If a thread is created mid-region, its initial time is that of the
514 * innermost open region, otherwise it is the current time */
515 int64_t last_updated_timestamp;
516
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 16 times.
18 if (talp_info->open_regions) {
517 2 const dlb_monitor_t *monitor = talp_info->open_regions->data;
518 2 last_updated_timestamp = monitor->start_time;
519 } else {
520 16 last_updated_timestamp = get_time_in_ns();
521 }
522
523 18 *_tls_sample = (const talp_sample_t) {
524 .last_updated_timestamp = last_updated_timestamp,
525 };
526
527 18 talp_set_sample_state(_tls_sample, disabled, talp_info->flags.papi);
528
529 #ifdef INSTRUMENTATION_VERSION
530 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
531 long long papi_values[] = {0, 0};
532 instrument_nevent(2, events, papi_values);
533 #endif
534
535 18 return _tls_sample;
536 }
537
538 /* WARNING: this function may only be called when updating own thread's sample */
539 82 void talp_set_sample_state(talp_sample_t *sample, enum talp_sample_state state,
540 bool papi) {
541 82 sample->state = state;
542
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 82 times.
82 if (papi) {
543 update_last_read_papi_counters(sample);
544 }
545 instrument_event(MONITOR_STATE,
546 state == disabled ? MONITOR_STATE_DISABLED
547 : state == useful ? MONITOR_STATE_USEFUL
548 : state == not_useful_mpi ? MONITOR_STATE_NOT_USEFUL_MPI
549 : state == not_useful_omp_in ? MONITOR_STATE_NOT_USEFUL_OMP_IN
550 : state == not_useful_omp_out ? MONITOR_STATE_NOT_USEFUL_OMP_OUT
551 : state == not_useful_gpu ? MONITOR_STATE_NOT_USEFUL_GPU
552 : 0,
553 EVENT_BEGIN);
554 82 }
555
556 /* Compute new microsample (time since last update) and update sample values */
557 12559 void talp_update_sample(talp_sample_t *sample, bool papi, int64_t timestamp) {
558 /* Observer threads ignore this function */
559
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12559 times.
12559 if (unlikely(sample == NULL)) return;
560
561 /* Compute duration and set new last_updated_timestamp */
562
2/2
✓ Branch 0 taken 37 times.
✓ Branch 1 taken 12522 times.
12559 int64_t now = timestamp == TALP_NO_TIMESTAMP ? get_time_in_ns() : timestamp;
563 12559 int64_t microsample_duration = now - sample->last_updated_timestamp;
564 12559 sample->last_updated_timestamp = now;
565
566 /* Update the appropriate sample timer */
567
5/7
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 12535 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 11 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
12559 switch(sample->state) {
568 4 case disabled:
569 4 break;
570 12535 case useful:
571 12535 DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
572 12535 break;
573 7 case not_useful_mpi:
574 7 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
575 7 break;
576 11 case not_useful_omp_in:
577 11 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
578 11 break;
579 2 case not_useful_omp_out:
580 2 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
581 2 break;
582 case not_useful_gpu:
583 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration);
584 break;
585 }
586
587 #ifdef PAPI_LIB
588 if (papi) {
589 /* Only read counters if we are updating this thread's sample */
590 if (is_talp_sample_mine(sample)) {
591 if (sample->state == useful) {
592 /* Read */
593 long long papi_values[2];
594 int error = PAPI_read(EventSet, papi_values);
595 if (error != PAPI_OK) {
596 verbose(VB_TALP, "stop return code: %d, %s", error, PAPI_strerror(error));
597 }
598 // Compute the difference from the last value read
599 const long long useful_cycles = papi_values[0] - sample->last_read_counters.cycles;
600 const long long useful_instructions = papi_values[1] - sample->last_read_counters.instructions;
601
602 /* Atomically add papi_values to sample structure */
603 DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, useful_cycles);
604 DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, useful_instructions);
605
606 #ifdef INSTRUMENTATION_VERSION
607 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
608 instrument_nevent(2, events, papi_values);
609 #endif
610
611 /* Counters are updated here and every time the sample is set to useful */
612 sample->last_read_counters.cycles = papi_values[0];
613 sample->last_read_counters.instructions = papi_values[1];
614 }
615 else {
616 #ifdef INSTRUMENTATION_VERSION
617 /* Emit 0's to distinguish useful chunks in traces */
618 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
619 long long papi_values[] = {0, 0};
620 instrument_nevent(2, events, papi_values);
621 #endif
622 }
623 }
624 }
625 #endif /* PAPI_LIB */
626 }
627
628 /* Flush and aggregate a single sample into a macrosample */
629 12522 static inline void flush_sample_to_macrosample(talp_sample_t *sample,
630 talp_macrosample_t *macrosample) {
631
632 /* Timers */
633 12522 macrosample->timers.useful +=
634 12522 DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0);
635 12522 macrosample->timers.not_useful_mpi +=
636 12522 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0);
637 12522 macrosample->timers.not_useful_omp_out +=
638 12522 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0);
639 /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */
640
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12522 times.
12522 ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0,
641 "Inconsistency in TALP sample metric not_useful_omp_in."
642 " Please, report bug at " PACKAGE_BUGREPORT);
643 12522 macrosample->timers.not_useful_gpu +=
644 12522 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_gpu, 0);
645
646 #ifdef PAPI_LIB
647 /* Counters */
648 macrosample->counters.cycles +=
649 DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0);
650 macrosample->counters.instructions +=
651 DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0);
652 #endif
653
654 /* Stats */
655 12522 macrosample->stats.num_mpi_calls +=
656 12522 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0);
657 12522 macrosample->stats.num_omp_parallels +=
658 12522 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0);
659 12522 macrosample->stats.num_omp_tasks +=
660 12522 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0);
661 12522 macrosample->stats.num_gpu_runtime_calls +=
662 12522 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_gpu_runtime_calls, 0);
663 12522 }
664
665 /* Flush values from gpu_sample to macrosample */
666 void flush_gpu_sample_to_macrosample(talp_gpu_sample_t *gpu_sample,
667 talp_macrosample_t *macrosample) {
668
669 macrosample->gpu_timers.useful = gpu_sample->timers.useful;
670 macrosample->gpu_timers.communication = gpu_sample->timers.communication;
671 macrosample->gpu_timers.inactive = gpu_sample->timers.inactive;
672
673 /* Reset */
674 *gpu_sample = (const talp_gpu_sample_t){0};
675 }
676
677 /* Accumulate values from samples of all threads and update regions */
678 12531 int talp_flush_samples_to_regions(const subprocess_descriptor_t *spd) {
679
680 /* Observer threads don't have a valid sample so they cannot start/stop regions */
681
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 12530 times.
12531 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
682
683 int num_cpus;
684 12530 talp_info_t *talp_info = spd->talp_info;
685
686 /* Accumulate samples from all threads */
687 12530 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
688 12530 pthread_mutex_lock(&talp_info->samples_mutex);
689 {
690 12530 num_cpus = talp_info->ncpus;
691
692 /* Force-update and aggregate all samples */
693 12530 int64_t timestamp = get_time_in_ns();
694
2/2
✓ Branch 0 taken 12519 times.
✓ Branch 1 taken 12530 times.
25049 for (int i = 0; i < num_cpus; ++i) {
695 12519 talp_update_sample(talp_info->samples[i], talp_info->flags.papi, timestamp);
696 12519 flush_sample_to_macrosample(talp_info->samples[i], &macrosample);
697 }
698 }
699 12530 pthread_mutex_unlock(&talp_info->samples_mutex);
700
701
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12530 times.
12530 if (talp_info->flags.have_gpu) {
702 /* gpu_sample data is filled asynchronously; first we need to synchronize
703 * and wait for the measurements to be read, then flush it */
704 talp_gpu_sync_measurements();
705 flush_gpu_sample_to_macrosample(&talp_info->gpu_sample, &macrosample);
706 }
707
708 /* Update all started regions */
709 12530 update_regions_with_macrosample(spd, &macrosample, num_cpus);
710
711 12530 return DLB_SUCCESS;
712 }
713
714 /* Accumulate samples from only a subset of the threads of a parallel region.
715 * Load Balance and Scheduling are computed here based on all the given samples. */
716 2 void talp_flush_sample_subset_to_regions(const subprocess_descriptor_t *spd,
717 talp_sample_t **samples, unsigned int nelems) {
718
719 2 talp_info_t *talp_info = spd->talp_info;
720 2 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
721 2 pthread_mutex_lock(&talp_info->samples_mutex);
722 {
723 /* Iterate first to force-update all samples and compute the minimum
724 * not-useful-omp-in among them */
725 2 int64_t timestamp = get_time_in_ns();
726 2 int64_t min_not_useful_omp_in = INT64_MAX;
727 unsigned int i;
728
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
729 3 talp_update_sample(samples[i], talp_info->flags.papi, timestamp);
730 3 min_not_useful_omp_in = min_int64(min_not_useful_omp_in,
731 3 DLB_ATOMIC_LD_RLX(&samples[i]->timers.not_useful_omp_in));
732 }
733
734 /* Iterate again to accumulate Load Balance, and to aggregate sample */
735 2 int64_t sched_timer = min_not_useful_omp_in * nelems;
736 2 int64_t lb_timer = 0;
737
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
738 3 lb_timer += DLB_ATOMIC_EXCH_RLX(&samples[i]->timers.not_useful_omp_in, 0)
739 3 - min_not_useful_omp_in;
740 3 flush_sample_to_macrosample(samples[i], &macrosample);
741 }
742
743 /* Update derived timers into macrosample */
744 2 macrosample.timers.not_useful_omp_in_lb = lb_timer;
745 2 macrosample.timers.not_useful_omp_in_sched = sched_timer;
746 }
747 2 pthread_mutex_unlock(&talp_info->samples_mutex);
748
749 /* Update all started regions */
750 2 update_regions_with_macrosample(spd, &macrosample, nelems);
751 2 }
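
Worked illustration of the split above (numbers invented, not taken from this run): if three samples arrive with not-useful-omp-in times of 2 ms, 5 ms and 8 ms, the minimum is 2 ms, so the scheduling time charged to the open regions is 2 ms x 3 = 6 ms and the load-balance time is (2-2) + (5-2) + (8-2) = 9 ms; together they account for the full 15 ms of not-useful time in the subset.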
752
753
754 /*********************************************************************************/
755 /* TALP collect functions for 3rd party programs: */
756 /* - They are also safe to call from a 1st party program */
757 /* - Require --talp-external-profiler to be set in the application */
758 /* - Do not need to synchronize with the application */
759 /*********************************************************************************/
760
761 /* Function that may be called from a third-party process to compute
762 * node_metrics for a given region */
763 6 int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) {
764
765
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (name == NULL) {
766 1 name = region_get_global_name();
767 }
768
769 6 int error = DLB_SUCCESS;
770 6 int64_t total_mpi_time = 0;
771 6 int64_t total_useful_time = 0;
772 6 int64_t max_mpi_time = 0;
773 6 int64_t max_useful_time = 0;
774
775 /* Obtain a list of regions in the node associated with given region */
776 6 int max_procs = mu_get_system_size();
777 6 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
778 int nelems;
779 6 shmem_talp__get_regionlist(region_list, &nelems, max_procs, name);
780
781 /* Count how many processes have started the region */
782 6 int processes_per_node = 0;
783
784 /* Iterate the PID list and gather times of every process */
785 int i;
786
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
13 for (i = 0; i < nelems; ++i) {
787 7 int64_t mpi_time = region_list[i].mpi_time;
788 7 int64_t useful_time = region_list[i].useful_time;
789
790 /* Accumulate total and max values */
791
3/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
7 if (mpi_time > 0 || useful_time > 0) {
792 7 ++processes_per_node;
793 7 total_mpi_time += mpi_time;
794 7 total_useful_time += useful_time;
795 7 max_mpi_time = max_int64(mpi_time, max_mpi_time);
796 7 max_useful_time = max_int64(useful_time, max_useful_time);
797 }
798 }
799 6 free(region_list);
800
801 #ifdef MPI_LIB
802 int node_id = _node_id;
803 #else
804 6 int node_id = 0;
805 #endif
806
807
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (processes_per_node > 0) {
808 /* Compute POP metrics with some inferred values */
809 perf_metrics_mpi_t metrics;
810 6 perf_metrics__infer_mpi_model(
811 &metrics,
812 processes_per_node,
813 total_useful_time,
814 total_mpi_time,
815 max_useful_time);
816
817 /* Initialize structure */
818 6 *node_metrics = (const dlb_node_metrics_t) {
819 .node_id = node_id,
820 .processes_per_node = processes_per_node,
821 .total_useful_time = total_useful_time,
822 .total_mpi_time = total_mpi_time,
823 .max_useful_time = max_useful_time,
824 .max_mpi_time = max_mpi_time,
825 6 .parallel_efficiency = metrics.parallel_efficiency,
826 6 .communication_efficiency = metrics.communication_efficiency,
827 6 .load_balance = metrics.load_balance,
828 };
829 6 snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name);
830 } else {
831 error = DLB_ERR_NOENT;
832 }
833
834 6 return error;
835 }
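
A rough usage sketch for the query above (illustration only, not part of this file): a third-party profiler that has already attached to the DLB TALP shared memory could read node-level POP metrics for the global region roughly as follows. The prototype and the dlb_node_metrics_t fields are taken from this listing; the assumption that talp/talp.h exports the prototype, and the helper name print_global_node_metrics, are mine.

    #include <stdio.h>

    #include "apis/dlb_talp.h"     /* dlb_node_metrics_t */
    #include "apis/dlb_errors.h"   /* DLB_SUCCESS, DLB_ERR_NOENT */
    #include "talp/talp.h"         /* assumed to declare talp_query_pop_node_metrics() */

    /* Print node-level POP metrics for the implicit global region.
     * Assumes the monitored application runs with --talp-external-profiler
     * and that this process is already attached to the TALP shared memory. */
    static int print_global_node_metrics(void) {
        dlb_node_metrics_t node_metrics;
        int err = talp_query_pop_node_metrics(NULL, &node_metrics); /* NULL selects the global region */
        if (err != DLB_SUCCESS) {
            /* e.g., DLB_ERR_NOENT: no process on this node has started the region */
            return err;
        }
        printf("%s: %d processes, PE = %.2f (CommEff = %.2f, LB = %.2f)\n",
                node_metrics.name,
                node_metrics.processes_per_node,
                node_metrics.parallel_efficiency,
                node_metrics.communication_efficiency,
                node_metrics.load_balance);
        return 0;
    }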
836
837
838 /*********************************************************************************/
839 /* TALP collect functions for 1st party programs */
840 /* - Requires synchronization (MPI or node barrier) among all processes */
841 /*********************************************************************************/
842
843 /* Compute the current POP metrics for the specified monitor. If monitor is NULL,
844 * the global monitoring region is assumed.
845 * Pre-conditions:
846 * - if MPI, the given monitor must have been registered in all MPI ranks
847 * - pop_metrics is an allocated structure
848 */
849 1 int talp_collect_pop_metrics(const subprocess_descriptor_t *spd,
850 dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) {
851 1 talp_info_t *talp_info = spd->talp_info;
852
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (monitor == NULL) {
853 monitor = talp_info->monitor;
854 }
855
856 /* Stop monitor so that metrics are updated */
857 1 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
858
859 pop_base_metrics_t base_metrics;
860 #ifdef MPI_LIB
861 /* Reduce monitor among all MPI ranks and everybody collects (all-to-all) */
862 perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, true);
863 #else
864 /* Construct base metrics using only the monitor from this process */
865 1 perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);
866 #endif
867
868 /* Construct output pop_metrics out of base metrics */
869 1 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, pop_metrics);
870
871 /* Resume monitor */
872
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (resume_region) {
873 region_start(spd, monitor);
874 }
875
876 1 return DLB_SUCCESS;
877 }
878
879 /* Node-collective function to compute node_metrics for a given region */
880 5 int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd,
881 dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) {
882
883 5 talp_info_t *talp_info = spd->talp_info;
884
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 3 times.
5 monitor = monitor ? monitor : talp_info->monitor;
885 5 monitor_data_t *monitor_data = monitor->_data;
886
887 /* Stop monitor so that metrics are updated */
888 5 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
889
890 /* This functionality needs a shared memory; create a temporary one if needed */
891
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
892 5 shmem_talp__init(spd->options.shm_key, 1);
893 5 shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name,
894 &monitor_data->node_shared_id);
895 }
896
897 /* Update the shared memory with this process' metrics */
898 5 shmem_talp__set_times(monitor_data->node_shared_id,
899 monitor->mpi_time,
900 monitor->useful_time);
901
902 /* Perform a node barrier to ensure everyone has updated their metrics */
903 5 node_barrier(spd, NULL);
904
905 /* Compute node metrics for that region name */
906 5 talp_query_pop_node_metrics(monitor->name, node_metrics);
907
908 /* Remove shared memory if it was a temporary one */
909
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
910 5 shmem_talp__finalize(spd->id);
911 }
912
913 /* Resume monitor */
914
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
5 if (resume_region) {
915 1 region_start(spd, monitor);
916 }
917
918 5 return DLB_SUCCESS;
919 }
920