GCC Code Coverage Report


Directory: src/
File: src/talp/talp.c
Date: 2026-04-21 15:16:03
Exec Total Coverage
Lines: 268 293 91.5%
Functions: 14 15 93.3%
Branches: 86 133 64.7%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2026 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/talp.h"
25
26 #include "LB_core/node_barrier.h"
27 #include "LB_core/spd.h"
28 #include "LB_comm/shmem_talp.h"
29 #include "apis/dlb_errors.h"
30 #include "apis/dlb_talp.h"
31 #include "support/atomic.h"
32 #include "support/debug.h"
33 #include "support/error.h"
34 #include "support/gslist.h"
35 #include "support/gtree.h"
36 #include "support/mytime.h"
37 #include "support/tracing.h"
38 #include "support/options.h"
39 #include "support/mask_utils.h"
40 #include "talp/backend.h"
41 #include "talp/perf_metrics.h"
42 #include "talp/regions.h"
43 #include "talp/talp_gpu.h"
44 #include "talp/talp_hwc.h"
45 #include "talp/talp_output.h"
46 #include "talp/talp_record.h"
47 #include "talp/talp_types.h"
48 #ifdef MPI_LIB
49 #include "mpi/mpi_core.h"
50 #endif
51
52 #include <stdlib.h>
53 #include <pthread.h>
54
55 extern __thread bool thread_is_observer;
56
57 static void talp_dealloc_samples(const subprocess_descriptor_t *spd);
58
59
60 /* Update all open regions with the macrosample */
61 5348 static void update_regions_with_macrosample(const subprocess_descriptor_t *spd,
62 const talp_macrosample_t *macrosample, int num_cpus) {
63 5348 talp_info_t *talp_info = spd->talp_info;
64
65 /* Update all open regions */
66 5348 pthread_mutex_lock(&talp_info->regions_mutex);
67 {
68 5348 for (GSList *node = talp_info->open_regions;
69
2/2
✓ Branch 0 taken 5918 times.
✓ Branch 1 taken 5348 times.
11266 node != NULL;
70 5918 node = node->next) {
71 5918 dlb_monitor_t *monitor = node->data;
72 5918 monitor_data_t *monitor_data = monitor->_data;
73
74 /* Update number of CPUs if needed */
75 5918 monitor->num_cpus = max_int(monitor->num_cpus, num_cpus);
76
77 /* Timers */
78 5918 monitor->useful_time += macrosample->timers.useful;
79 5918 monitor->mpi_time += macrosample->timers.not_useful_mpi;
80 5918 monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb;
81 5918 monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched;
82 5918 monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out;
83 5918 monitor->gpu_runtime_time += macrosample->timers.not_useful_gpu;
84
85 /* GPU Timers */
86 5918 monitor->gpu_useful_time += macrosample->gpu_timers.useful;
87 5918 monitor->gpu_communication_time += macrosample->gpu_timers.communication;
88 5918 monitor->gpu_inactive_time += macrosample->gpu_timers.inactive;
89
90 /* Counters */
91 5918 monitor->cycles += macrosample->counters.cycles;
92 5918 monitor->instructions += macrosample->counters.instructions;
93
94 /* Stats */
95 5918 monitor->num_mpi_calls += macrosample->stats.num_mpi_calls;
96 5918 monitor->num_omp_parallels += macrosample->stats.num_omp_parallels;
97 5918 monitor->num_omp_tasks += macrosample->stats.num_omp_tasks;
98 5918 monitor->num_gpu_runtime_calls += macrosample->stats.num_gpu_runtime_calls;
99
100 /* Update shared memory only if requested */
101
2/2
✓ Branch 0 taken 3610 times.
✓ Branch 1 taken 2308 times.
5918 if (talp_info->flags.external_profiler) {
102 3610 shmem_talp__set_times(monitor_data->node_shared_id,
103 monitor->mpi_time,
104 monitor->useful_time);
105 }
106 }
107 }
108 5348 pthread_mutex_unlock(&talp_info->regions_mutex);
109 5348 }
110
111
112 #ifdef MPI_LIB
113 /* Returns the number of MPI processes that have HWC enabled */
114 static int get_hwc_init_across_world(const subprocess_descriptor_t *spd) {
115
116 talp_info_t *talp_info = spd->talp_info;
117
118 // status = 1 means HWC are enabled
119 int hwc_local_status = talp_info->flags.have_hwc ? 1 : 0;
120
121 int hwc_global_statuses = 0;
122
123 PMPI_Allreduce(&hwc_local_status, &hwc_global_statuses, 1,
124 MPI_INT, MPI_SUM, getWorldComm());
125
126 return hwc_global_statuses;
127 }
128 #endif
129
130
131 /*********************************************************************************/
132 /* Init / Finalize */
133 /*********************************************************************************/
134
135 23 void talp_init(subprocess_descriptor_t *spd) {
136
137
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!spd->talp_info, "TALP already initialized");
138
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!thread_is_observer, "An observer thread cannot call talp_init");
139
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 20 times.
23 verbose(VB_TALP, "Initializing TALP module with worker mask: %s",
140 mu_to_str(&spd->process_mask));
141
142 /* Initialize talp info */
143 23 talp_info_t *talp_info = malloc(sizeof(talp_info_t));
144 23 *talp_info = (const talp_info_t) {
145 .flags = {
146 23 .external_profiler = spd->options.talp_external_profiler,
147 23 .have_shmem = spd->options.talp_external_profiler,
148 23 .have_minimal_shmem = !spd->options.talp_external_profiler
149
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 && spd->options.talp_summary & SUMMARY_NODE,
150 },
151 23 .regions = g_tree_new_full(
152 (GCompareDataFunc)region_compare_by_name,
153 NULL, NULL, region_dealloc),
154 .regions_mutex = PTHREAD_MUTEX_INITIALIZER,
155 .samples_mutex = PTHREAD_MUTEX_INITIALIZER,
156 };
157 23 spd->talp_info = talp_info;
158
159 /* Initialize shared memory */
160
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
161 /* If we only need a minimal shmem, its size will be the user-provided
162 * multiplier times 'system_size' (usually, 1 region per process)
163 * Otherwise, we multiply it by DEFAULT_REGIONS_PER_PROC.
164 */
165 enum { DEFAULT_REGIONS_PER_PROC = 100 };
166 18 int shmem_size_multiplier = spd->options.shm_size_multiplier
167
1/2
✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
9 * (talp_info->flags.have_shmem ? DEFAULT_REGIONS_PER_PROC : 1);
168 9 shmem_talp__init(spd->options.shm_key, shmem_size_multiplier);
169 }
170
171 /* Initialize TALP components */
172
2/2
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 8 times.
23 if (spd->options.talp & (TALP_COMPONENT_DEFAULT | TALP_COMPONENT_GPU)) {
173
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 15 times.
15 if (talp_gpu_init(spd) == DLB_SUCCESS) {
174 talp_info->flags.have_gpu = true;
175 verbose(VB_TALP, "GPU component enabled successfully");
176 } else {
177
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
15 if (spd->options.talp & TALP_COMPONENT_GPU) {
178 /* component was explicit and failed, warn user */
179 warning("TALP: Failed to load GPU component");
180 }
181 }
182 }
183
1/2
✓ Branch 0 taken 23 times.
✗ Branch 1 not taken.
23 if (spd->options.talp & (TALP_COMPONENT_DEFAULT | TALP_COMPONENT_HWC)) {
184
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 23 times.
23 if (talp_hwc_init(spd) == DLB_SUCCESS) {
185 talp_info->flags.have_hwc = true;
186 verbose(VB_TALP, "HWC component enabled successfully");
187 } else {
188
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 15 times.
23 if (spd->options.talp & TALP_COMPONENT_HWC) {
189 /* component was explicit and failed, warn user */
190 8 warning("TALP: Failed to load HWC component");
191 }
192 }
193 }
194
195 #ifdef MPI_LIB
 196 /* Check HWC status across all processes. Every process needs to do the check
197 * because it's a collective operation and some process may have been started
198 * without the appropriate flag. */
199 if (is_mpi_ready()) {
200 int num_procs_with_hwc = get_hwc_init_across_world(spd);
201 if (num_procs_with_hwc > 0 && num_procs_with_hwc < _mpi_size) {
202 warning0("Hardware Counters initialization has failed, disabling option.");
203 talp_hwc_finalize();
204 talp_info->flags.have_hwc = false;
205 }
206 }
207 #endif
208
209 /* Initialize global region monitor
210 * (at this point we don't know how many CPUs, it will be fixed in talp_openmp_init) */
211 23 talp_info->monitor = region_register(spd, region_get_global_name());
212
213 /* Start global region */
214 23 region_start(spd, talp_info->monitor);
215 23 }
216
217 23 void talp_finalize(subprocess_descriptor_t *spd) {
218
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(spd->talp_info, "TALP is not initialized");
219
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!thread_is_observer, "An observer thread cannot call talp_finalize");
220
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 verbose(VB_TALP, "Finalizing TALP module");
221
222 23 talp_info_t *talp_info = spd->talp_info;
223
224 /* Stop open regions
 225 * (Note that region_stop needs to acquire the regions_mutex
 226 lock, so we need to iterate without it) */
227
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 23 times.
39 while(talp_info->open_regions != NULL) {
228 16 dlb_monitor_t *monitor = talp_info->open_regions->data;
229 16 region_stop(spd, monitor);
230 }
231
232 /* Finalize TALP components */
233
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 if (talp_info->flags.have_gpu) {
234 talp_gpu_finalize();
235 }
236
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 if (talp_info->flags.have_hwc) {
237 talp_hwc_finalize();
238 }
239
240 /* Per-process output (no MPI or requested by user) */
241
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 15 times.
23 if (!talp_info->flags.have_mpi
242
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 || spd->options.talp_partial_output) {
243
244 15 pthread_mutex_lock(&talp_info->regions_mutex);
245 {
246 /* Record all regions */
247 15 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
248
2/2
✓ Branch 0 taken 1221 times.
✓ Branch 1 taken 15 times.
1236 node != NULL;
249 1221 node = g_tree_node_next(node)) {
250 1221 const dlb_monitor_t *monitor = g_tree_node_value(node);
251 1221 talp_record_monitor(spd, monitor);
252 }
253 }
254 15 pthread_mutex_unlock(&talp_info->regions_mutex);
255 }
256
257 /* Print/write all collected summaries */
258 23 talp_output_finalize(spd->options.talp_output_file, spd->options.talp_partial_output);
259
260 /* Deallocate samples structure */
261 23 talp_dealloc_samples(spd);
262
263 /* Finalize shared memory */
264
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
265 9 shmem_talp__finalize(spd->id);
266 }
267
268 /* Deallocate monitoring regions and talp_info */
269 23 pthread_mutex_lock(&talp_info->regions_mutex);
270 {
271 /* Destroy GTree, each node is deallocated with the function region_dealloc */
272 23 g_tree_destroy(talp_info->regions);
273 23 talp_info->regions = NULL;
274 23 talp_info->monitor = NULL;
275
276 /* Destroy list of open regions */
277 23 g_slist_free(talp_info->open_regions);
278 23 talp_info->open_regions = NULL;
279 }
280 23 pthread_mutex_unlock(&talp_info->regions_mutex);
281 23 free(talp_info);
282 23 spd->talp_info = NULL;
283 23 }
284
285
286 /*********************************************************************************/
287 /* Sample functions */
288 /*********************************************************************************/
289
290 static __thread talp_sample_t* _tls_sample = NULL;
291 static __thread bool _is_main_sample = false;
292 static __thread bool _is_main_sample_in_serial_mode = false;
293
294 /* Quick test, without locking and without generating a new sample */
295 static inline bool is_talp_sample_mine(const talp_sample_t *sample) {
296 return sample != NULL && sample == _tls_sample;
297 }
298
299 23 static void talp_dealloc_samples(const subprocess_descriptor_t *spd) {
300
301 /* Warning about _tls_sample in worker threads:
302 * worker threads do not currently deallocate their sample.
303 * In some cases, it might happen that a worker thread exits without
304 * the main thread reducing its sample, so in these cases the sample
305 * needs to outlive the thread.
306 * The main thread could deallocate it at this point, but then the
307 * TLS variable would be broken if TALP is reinitialized again.
308 * For now we will keep it like this and will revisit if needed. */
309
310 /* Deallocate main thread sample */
311 23 free(_tls_sample);
312 23 _tls_sample = NULL;
313
314 /* Deallocate samples list */
315 23 talp_info_t *talp_info = spd->talp_info;
316 23 pthread_mutex_lock(&talp_info->samples_mutex);
317 {
318 23 free(talp_info->samples);
319 23 talp_info->samples = NULL;
320 23 talp_info->ncpus = 0;
321 }
322 23 pthread_mutex_unlock(&talp_info->samples_mutex);
323 23 }
324
325 /* Get the TLS associated sample */
326 5399 talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd) {
327
328 /* Thread already has an allocated sample, return it */
329
2/2
✓ Branch 0 taken 5375 times.
✓ Branch 1 taken 24 times.
5399 if (likely(_tls_sample != NULL)) return _tls_sample;
330
331 /* Observer threads don't have a valid sample */
332
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 if (unlikely(thread_is_observer)) return NULL;
333
334 /* Otherwise, allocate */
335 24 talp_info_t *talp_info = spd->talp_info;
336 24 pthread_mutex_lock(&talp_info->samples_mutex);
337 {
338 24 int ncpus = ++talp_info->ncpus;
339 24 void *samples = realloc(talp_info->samples, sizeof(talp_sample_t*)*ncpus);
340
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (samples) {
341 24 talp_info->samples = samples;
342 void *new_sample;
343
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) {
344 24 _tls_sample = new_sample;
345 24 talp_info->samples[ncpus-1] = new_sample;
346
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 2 times.
24 if (ncpus == 1) {
347 22 _is_main_sample = true;
348 22 _is_main_sample_in_serial_mode = true;
349 }
350 }
351 }
352 }
353 24 pthread_mutex_unlock(&talp_info->samples_mutex);
354
355
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample");
356
357 /* If a thread is created mid-region, its initial time is that of the
358 * innermost open region, otherwise it is the current time */
359 int64_t last_updated_timestamp;
360
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 22 times.
24 if (talp_info->open_regions) {
361 2 const dlb_monitor_t *monitor = talp_info->open_regions->data;
362 2 last_updated_timestamp = monitor->start_time;
363 } else {
364 22 last_updated_timestamp = get_time_in_ns();
365 }
366
367 24 *_tls_sample = (const talp_sample_t) {
368 .last_updated_timestamp = last_updated_timestamp,
369 };
370
371 24 talp_set_sample_state(spd, _tls_sample, TALP_STATE_DISABLED);
372
373 #ifdef INSTRUMENTATION_VERSION
374 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
375 long long hwc_values[] = {0, 0};
376 instrument_nevent(2, events, hwc_values);
377 #endif
378
379 24 return _tls_sample;
380 }
381
382 /* WARNING: this function may only be called when updating own thread's sample */
383 97 void talp_set_sample_state(const subprocess_descriptor_t *spd, talp_sample_t *sample,
384 talp_sample_state_t new_state) {
385
386 97 talp_info_t *talp_info = spd->talp_info;
387
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
97 if (talp_info->flags.have_hwc) {
388 talp_sample_state_t old = sample->state;
389 talp_hwc_on_state_change(old, new_state);
390 }
391
392 97 sample->state = new_state;
393
394 instrument_event(MONITOR_STATE,
395 new_state == TALP_STATE_DISABLED ? MONITOR_STATE_DISABLED
396 : new_state == TALP_STATE_USEFUL ? MONITOR_STATE_USEFUL
397 : new_state == TALP_STATE_NOT_USEFUL_MPI ? MONITOR_STATE_NOT_USEFUL_MPI
398 : new_state == TALP_STATE_NOT_USEFUL_OMP_IN ? MONITOR_STATE_NOT_USEFUL_OMP_IN
399 : new_state == TALP_STATE_NOT_USEFUL_OMP_OUT ? MONITOR_STATE_NOT_USEFUL_OMP_OUT
400 : new_state == TALP_STATE_NOT_USEFUL_GPU ? MONITOR_STATE_NOT_USEFUL_GPU
401 : 0,
402 EVENT_BEGIN);
403 97 }
404
405 /* Compute new microsample (time since last update) and update sample values */
406 5369 void talp_update_sample(const subprocess_descriptor_t *spd, talp_sample_t *sample,
407 int64_t timestamp) {
408
409 /* Observer threads ignore this function */
410
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5369 times.
5369 if (unlikely(sample == NULL)) return;
411
412 5369 talp_info_t *talp_info = spd->talp_info;
413
414 /* Compute duration and set new last_updated_timestamp */
415
2/2
✓ Branch 0 taken 37 times.
✓ Branch 1 taken 5332 times.
5369 int64_t now = timestamp == TALP_NO_TIMESTAMP ? get_time_in_ns() : timestamp;
416 5369 int64_t microsample_duration = now - sample->last_updated_timestamp;
417 5369 sample->last_updated_timestamp = now;
418
419 /* Update the appropriate sample timer */
420
5/7
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 5345 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 11 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
5369 switch(sample->state) {
421 4 case TALP_STATE_DISABLED:
422 4 break;
423 5345 case TALP_STATE_USEFUL:
424 5345 DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
425 5345 break;
426 7 case TALP_STATE_NOT_USEFUL_MPI:
427
1/2
✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
7 if (_is_main_sample_in_serial_mode) {
428 7 int num_cpus = talp_info->ncpus;
429 7 microsample_duration *= num_cpus;
430 }
431 7 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
432 7 break;
433 11 case TALP_STATE_NOT_USEFUL_OMP_IN:
434 11 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
435 11 break;
436 2 case TALP_STATE_NOT_USEFUL_OMP_OUT:
437 2 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
438 2 break;
439 case TALP_STATE_NOT_USEFUL_GPU:
440 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration);
441 break;
442 }
443
444
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5369 times.
5369 if (talp_info->flags.have_hwc) {
445 /* Only read counters if we are updating this thread's sample */
446 if (is_talp_sample_mine(sample)) {
447 hwc_measurements_t measurements;
448 if (talp_hwc_collect(&measurements)) {
449 /* Atomically add HWC values to sample structure */
450 DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, measurements.cycles);
451 DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, measurements.instructions);
452 }
453
454 #ifdef INSTRUMENTATION_VERSION
455 // It's safe to emit even if talp_hwc_collect returned false,
456 // struct is zero'ed in that case
457 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
458 long long hwc_values[] = {measurements.cycles, measurements.instructions};
459 instrument_nevent(2, events, hwc_values);
460 #endif
461 }
462 }
463 }
464
465 /* Flush and aggregate a single sample into a macrosample */
466 5332 static inline void flush_sample_to_macrosample(talp_sample_t *sample,
467 talp_macrosample_t *macrosample) {
468
469 /* Timers */
470 5332 macrosample->timers.useful +=
471 5332 DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0);
472 5332 macrosample->timers.not_useful_mpi +=
473 5332 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0);
474 5332 macrosample->timers.not_useful_omp_out +=
475 5332 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0);
476 /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */
477
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5332 times.
5332 ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0,
478 "Inconsistency in TALP sample metric not_useful_omp_in."
479 " Please, report bug at " PACKAGE_BUGREPORT);
480 5332 macrosample->timers.not_useful_gpu +=
481 5332 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_gpu, 0);
482
483 /* Counters */
484 5332 macrosample->counters.cycles +=
485 5332 DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0);
486 5332 macrosample->counters.instructions +=
487 5332 DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0);
488
489 /* Stats */
490 5332 macrosample->stats.num_mpi_calls +=
491 5332 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0);
492 5332 macrosample->stats.num_omp_parallels +=
493 5332 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0);
494 5332 macrosample->stats.num_omp_tasks +=
495 5332 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0);
496 5332 macrosample->stats.num_gpu_runtime_calls +=
497 5332 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_gpu_runtime_calls, 0);
498 5332 }
499
500 /* Accumulate values from samples of all threads and update regions */
501 5347 int talp_flush_samples_to_regions(const subprocess_descriptor_t *spd) {
502
503 /* Observer threads don't have a valid sample so they cannot start/stop regions */
504
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5346 times.
5347 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
505
506 int num_cpus;
507 5346 talp_info_t *talp_info = spd->talp_info;
508
509 /* Accumulate samples from all threads */
510 5346 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
511 5346 pthread_mutex_lock(&talp_info->samples_mutex);
512 {
513 5346 num_cpus = talp_info->ncpus;
514
515 /* Force-update and aggregate all samples */
516 5346 int64_t timestamp = get_time_in_ns();
517
2/2
✓ Branch 0 taken 5329 times.
✓ Branch 1 taken 5346 times.
10675 for (int i = 0; i < num_cpus; ++i) {
518 5329 talp_update_sample(spd, talp_info->samples[i], timestamp);
519 5329 flush_sample_to_macrosample(talp_info->samples[i], &macrosample);
520 }
521 }
522 5346 pthread_mutex_unlock(&talp_info->samples_mutex);
523
524
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5346 times.
5346 if (talp_info->flags.have_gpu) {
 525 /* Collect GPU measurements up to this point and update macrosample */
526 gpu_measurements_t measurements;
527 talp_gpu_collect(&measurements);
528 macrosample.gpu_timers.useful = measurements.useful_time;
529 macrosample.gpu_timers.communication = measurements.communication_time;
530 macrosample.gpu_timers.inactive = measurements.inactive_time;
531 }
532
533 /* Update all started regions */
534 5346 update_regions_with_macrosample(spd, &macrosample, num_cpus);
535
536 5346 return DLB_SUCCESS;
537 }
538
539 /* Accumulate samples from only a subset of samples of a parallel region.
540 * Load Balance and Scheduling are computed here based on all samples. */
541 2 void talp_flush_sample_subset_to_regions(const subprocess_descriptor_t *spd,
542 talp_sample_t **samples, unsigned int nelems) {
543
544 2 talp_info_t *talp_info = spd->talp_info;
545 2 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
546 2 pthread_mutex_lock(&talp_info->samples_mutex);
547 {
548 /* Iterate first to force-update all samples and compute the minimum
549 * not-useful-omp-in among them */
550 2 int64_t timestamp = get_time_in_ns();
551 2 int64_t min_not_useful_omp_in = INT64_MAX;
552 unsigned int i;
553
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
554 3 talp_update_sample(spd, samples[i], timestamp);
555 3 min_not_useful_omp_in = min_int64(min_not_useful_omp_in,
556 3 DLB_ATOMIC_LD_RLX(&samples[i]->timers.not_useful_omp_in));
557 }
558
559 /* Iterate again to accumulate Load Balance, and to aggregate sample */
560 2 int64_t sched_timer = min_not_useful_omp_in * nelems;
561 2 int64_t lb_timer = 0;
562
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
563 3 lb_timer += DLB_ATOMIC_EXCH_RLX(&samples[i]->timers.not_useful_omp_in, 0)
564 3 - min_not_useful_omp_in;
565 3 flush_sample_to_macrosample(samples[i], &macrosample);
566 }
567
568 /* Update derived timers into macrosample */
569 2 macrosample.timers.not_useful_omp_in_lb = lb_timer;
570 2 macrosample.timers.not_useful_omp_in_sched = sched_timer;
571 }
572 2 pthread_mutex_unlock(&talp_info->samples_mutex);
573
574 /* Update all started regions */
575 2 update_regions_with_macrosample(spd, &macrosample, nelems);
576 2 }
577
578 /* Sets the TLS variable _is_main_sample_in_serial_mode. This function is
579 * called by the main thread when beginning or ending parallel region of level 1.
580 * FIXME: free agent threads may break this condition.
581 *
582 * Sets whether the main thread is running in serial mode. */
583 4 void talp_set_main_sample_in_serial_mode(bool serial_mode) {
584
585
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (_is_main_sample) {
586 4 _is_main_sample_in_serial_mode = serial_mode;
587 }
588 4 }
589
590
591 /*********************************************************************************/
592 /* TALP collect functions for 3rd party programs: */
593 /* - It's also safe to call it from a 1st party program */
594 /* - Requires --talp-external-profiler set up in application */
595 /* - Does not need to synchronize with application */
596 /*********************************************************************************/
597
598 /* Function that may be called from a third-party process to compute
599 * node_metrics for a given region */
600 6 int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) {
601
602
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (name == NULL) {
603 1 name = region_get_global_name();
604 }
605
606 6 int error = DLB_SUCCESS;
607 6 int64_t total_mpi_time = 0;
608 6 int64_t total_useful_time = 0;
609 6 int64_t max_mpi_time = 0;
610 6 int64_t max_useful_time = 0;
611
612 /* Obtain a list of regions in the node associated with given region */
613 6 int max_procs = mu_get_system_size();
614 6 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
615 int nelems;
616 6 shmem_talp__get_regionlist(region_list, &nelems, max_procs, name);
617
618 /* Count how many processes have started the region */
619 6 int processes_per_node = 0;
620
621 /* Iterate the PID list and gather times of every process */
622 int i;
623
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
13 for (i = 0; i <nelems; ++i) {
624 7 int64_t mpi_time = region_list[i].mpi_time;
625 7 int64_t useful_time = region_list[i].useful_time;
626
627 /* Accumulate total and max values */
628
3/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
7 if (mpi_time > 0 || useful_time > 0) {
629 7 ++processes_per_node;
630 7 total_mpi_time += mpi_time;
631 7 total_useful_time += useful_time;
632 7 max_mpi_time = max_int64(mpi_time, max_mpi_time);
633 7 max_useful_time = max_int64(useful_time, max_useful_time);
634 }
635 }
636 6 free(region_list);
637
638 #if MPI_LIB
639 int node_id = _node_id;
640 #else
641 6 int node_id = 0;
642 #endif
643
644
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (processes_per_node > 0) {
645 /* Compute POP metrics with some inferred values */
646 perf_metrics_mpi_t metrics;
647 6 perf_metrics__infer_mpi_model(
648 &metrics,
649 processes_per_node,
650 total_useful_time,
651 total_mpi_time,
652 max_useful_time);
653
654 /* Initialize structure */
655 6 *node_metrics = (const dlb_node_metrics_t) {
656 .node_id = node_id,
657 .processes_per_node = processes_per_node,
658 .total_useful_time = total_useful_time,
659 .total_mpi_time = total_mpi_time,
660 .max_useful_time = max_useful_time,
661 .max_mpi_time = max_mpi_time,
662 6 .parallel_efficiency = metrics.parallel_efficiency,
663 6 .communication_efficiency = metrics.communication_efficiency,
664 6 .load_balance = metrics.load_balance,
665 };
666 6 snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name);
667 } else {
668 error = DLB_ERR_NOENT;
669 }
670
671 6 return error;
672 }
673
674
675 /*********************************************************************************/
676 /* TALP collect functions for 1st party programs */
677 /* - Requires synchronization (MPI or node barrier) among all processes */
678 /*********************************************************************************/
679
680 /* Compute the current POP metrics for the specified monitor. If monitor is NULL,
681 * the global monitoring region is assumed.
682 * Pre-conditions:
683 * - if MPI, the given monitor must have been registered in all MPI ranks
684 * - pop_metrics is an allocated structure
685 */
686 1 int talp_collect_pop_metrics(const subprocess_descriptor_t *spd,
687 dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) {
688 1 talp_info_t *talp_info = spd->talp_info;
689
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (monitor == NULL) {
690 monitor = talp_info->monitor;
691 }
692
693 /* Stop monitor so that metrics are updated */
694 1 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
695
696 pop_base_metrics_t base_metrics;
697 #ifdef MPI_LIB
 698 /* Reduce monitor among all MPI ranks and everybody collects (all-to-all) */
699 perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, true);
700 #else
701 /* Construct base metrics using only the monitor from this process */
702 1 perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);
703 #endif
704
705 /* Construct output pop_metrics out of base metrics */
706 1 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, pop_metrics);
707
708 /* Resume monitor */
709
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (resume_region) {
710 region_start(spd, monitor);
711 }
712
713 1 return DLB_SUCCESS;
714 }
715
716 /* Node-collective function to compute node_metrics for a given region */
717 5 int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd,
718 dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) {
719
720 5 talp_info_t *talp_info = spd->talp_info;
721
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 3 times.
5 monitor = monitor ? monitor : talp_info->monitor;
722 5 monitor_data_t *monitor_data = monitor->_data;
723
724 /* Stop monitor so that metrics are updated */
725 5 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
726
727 /* This functionality needs a shared memory, create a temporary one if needed */
728
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
729 5 shmem_talp__init(spd->options.shm_key, 1);
730 5 shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name,
731 &monitor_data->node_shared_id);
732 }
733
734 /* Update the shared memory with this process' metrics */
735 5 shmem_talp__set_times(monitor_data->node_shared_id,
736 monitor->mpi_time,
737 monitor->useful_time);
738
739 /* Perform a node barrier to ensure everyone has updated their metrics */
740 5 node_barrier(spd, NULL);
741
742 /* Compute node metrics for that region name */
743 5 talp_query_pop_node_metrics(monitor->name, node_metrics);
744
745 /* Remove shared memory if it was a temporary one */
746
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
747 5 shmem_talp__finalize(spd->id);
748 }
749
750 /* Resume monitor */
751
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
5 if (resume_region) {
752 1 region_start(spd, monitor);
753 }
754
755 5 return DLB_SUCCESS;
756 }
757