GCC Code Coverage Report


Directory: src/
File: src/talp/talp.c
Date: 2026-06-05 08:54:23
Exec Total Coverage
Lines: 155 169 91.7%
Functions: 8 8 100.0%
Branches: 57 84 67.9%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2026 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/talp.h"
25
26 #include "LB_core/node_barrier.h"
27 #include "LB_core/spd.h"
28 #include "LB_comm/shmem_talp.h"
29 #include "apis/dlb_errors.h"
30 #include "apis/dlb_talp.h"
31 #include "support/atomic.h"
32 #include "support/debug.h"
33 #include "support/error.h"
34 #include "support/gslist.h"
35 #include "support/gtree.h"
36 #include "support/mytime.h"
37 #include "support/tracing.h"
38 #include "support/options.h"
39 #include "support/mask_utils.h"
40 #include "talp/backend.h"
41 #include "talp/perf_metrics.h"
42 #include "talp/sample.h"
43 #include "talp/regions.h"
44 #include "talp/talp_gpu.h"
45 #include "talp/talp_hwc.h"
46 #include "talp/talp_output.h"
47 #include "talp/talp_record.h"
48 #include "talp/talp_types.h"
49 #ifdef MPI_LIB
50 #include "mpi/mpi_core.h"
51 #endif
52
53 #include <stdlib.h>
54 #include <pthread.h>
55
56 extern __thread bool thread_is_observer;
57
58
59 #ifdef MPI_LIB
60 /* Returns the number of MPI processes that have HWC enabled */
61 static int get_hwc_init_across_world(const subprocess_descriptor_t *spd) {
62
63 talp_info_t *talp_info = spd->talp_info;
64
65 // status = 1 means HWC are enabled
66 int hwc_local_status = talp_info->flags.have_hwc ? 1 : 0;
67
68 int hwc_global_statuses = 0;
69
70 PMPI_Allreduce(&hwc_local_status, &hwc_global_statuses, 1,
71 MPI_INT, MPI_SUM, getWorldComm());
72
73 return hwc_global_statuses;
74 }
75 #endif
76
77
78 /*********************************************************************************/
79 /* Init / Finalize */
80 /*********************************************************************************/
81
82 23 void talp_init(subprocess_descriptor_t *spd) {
83
84
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!spd->talp_info, "TALP already initialized");
85
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!thread_is_observer, "An observer thread cannot call talp_init");
86
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 20 times.
23 verbose(VB_TALP, "Initializing TALP module with worker mask: %s",
87 mu_to_str(&spd->process_mask));
88
89 /* Initialize talp info */
90 23 talp_info_t *talp_info = malloc(sizeof(talp_info_t));
91 23 *talp_info = (const talp_info_t) {
92 .flags = {
93 23 .external_profiler = spd->options.talp_external_profiler,
94 23 .have_shmem = spd->options.talp_external_profiler,
95 23 .have_minimal_shmem = !spd->options.talp_external_profiler
96
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 && spd->options.talp_summary & SUMMARY_NODE,
97 },
98 23 .regions = g_tree_new_full(
99 (GCompareDataFunc)region_compare_by_name,
100 NULL, NULL, region_dealloc),
101 .regions_mutex = PTHREAD_MUTEX_INITIALIZER,
102 };
103 23 spd->talp_info = talp_info;
104
105 /* Initialize sample structure */
106 23 talp_sample_init(talp_info);
107
108 /* Initialize shared memory */
109
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
110 /* If we only need a minimal shmem, its size will be the user-provided
111 * multiplier times 'system_size' (usually, 1 region per process).
112 * Otherwise, we multiply it by DEFAULT_REGIONS_PER_PROC.
113 */
114 enum { DEFAULT_REGIONS_PER_PROC = 100 };
115 18 int shmem_size_multiplier = spd->options.shm_size_multiplier
116
1/2
✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
9 * (talp_info->flags.have_shmem ? DEFAULT_REGIONS_PER_PROC : 1);
117 9 shmem_talp__init(spd->options.shm_key, shmem_size_multiplier);
118 }
119
120 /* Initialize TALP components */
121
2/2
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 8 times.
23 if (spd->options.talp & (TALP_COMPONENT_DEFAULT | TALP_COMPONENT_GPU)) {
122
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 15 times.
15 if (talp_gpu_init(spd) == DLB_SUCCESS) {
123 talp_info->flags.have_gpu = true;
124 verbose(VB_TALP, "GPU component enabled successfully");
125 } else {
126
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
15 if (spd->options.talp & TALP_COMPONENT_GPU) {
127 /* component was explicit and failed, warn user */
128 warning("TALP: Failed to load GPU component");
129 }
130 }
131 }
132
1/2
✓ Branch 0 taken 23 times.
✗ Branch 1 not taken.
23 if (spd->options.talp & (TALP_COMPONENT_DEFAULT | TALP_COMPONENT_HWC)) {
133
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 23 times.
23 if (talp_hwc_init(spd) == DLB_SUCCESS) {
134 talp_info->flags.have_hwc = true;
135 verbose(VB_TALP, "HWC component enabled successfully");
136 } else {
137
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 15 times.
23 if (spd->options.talp & TALP_COMPONENT_HWC) {
138 /* component was explicit and failed, warn user */
139 8 warning("TALP: Failed to load HWC component");
140 }
141 }
142 }
143
144 #ifdef MPI_LIB
145 /* Check HWC status across all processes. Every process needs to do the check
146 * because it's a collective operation and some processes may have been started
147 * without the appropriate flag. */
148 if (is_mpi_ready()) {
149 int num_procs_with_hwc = get_hwc_init_across_world(spd);
150 if (num_procs_with_hwc > 0 && num_procs_with_hwc < _mpi_size) {
151 warning0("Hardware Counters initialization has failed, disabling option.");
152 talp_hwc_finalize();
153 talp_info->flags.have_hwc = false;
154 }
155 }
156 #endif
157
158 /* Initialize global region monitor
159 * (at this point we don't know how many CPUs, it will be fixed in talp_openmp_init) */
160 23 talp_info->monitor = region_register(spd, region_get_global_name());
161
162 /* Start global region */
163 23 region_start(spd, talp_info->monitor);
164 23 }
165
166 23 void talp_finalize(subprocess_descriptor_t *spd) {
167
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(spd->talp_info, "TALP is not initialized");
168
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 ensure(!thread_is_observer, "An observer thread cannot call talp_finalize");
169
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 verbose(VB_TALP, "Finalizing TALP module");
170
171 23 talp_info_t *talp_info = spd->talp_info;
172
173 /* Stop open regions
174 * (Note that region_stop needs to acquire the regions_mutex
175 * lock, so we need to iterate without it) */
176
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 23 times.
39 while(talp_info->open_regions != NULL) {
177 16 dlb_monitor_t *monitor = talp_info->open_regions->data;
178 16 region_stop(spd, monitor);
179 }
180
181 /* Finalize TALP components */
182
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 if (talp_info->flags.have_gpu) {
183 talp_gpu_finalize();
184 }
185
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 23 times.
23 if (talp_info->flags.have_hwc) {
186 talp_hwc_finalize();
187 }
188
189 /* Per-process output (no MPI or requested by user) */
190
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 15 times.
23 if (!talp_info->flags.have_mpi
191
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 || spd->options.talp_partial_output) {
192
193 15 pthread_mutex_lock(&talp_info->regions_mutex);
194 {
195 /* Record all regions */
196 15 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
197
2/2
✓ Branch 0 taken 1221 times.
✓ Branch 1 taken 15 times.
1236 node != NULL;
198 1221 node = g_tree_node_next(node)) {
199 1221 const dlb_monitor_t *monitor = g_tree_node_value(node);
200 1221 talp_record_monitor(spd, monitor);
201 }
202 }
203 15 pthread_mutex_unlock(&talp_info->regions_mutex);
204 }
205
206 /* Print/write all collected summaries */
207 23 talp_output_finalize(spd->options.talp_output_file, spd->options.talp_partial_output);
208
209 /* Deallocate samples structure */
210 23 talp_sample_finalize(talp_info);
211
212 /* Finalize shared memory */
213
3/4
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
23 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
214 9 shmem_talp__finalize(spd->id);
215 }
216
217 /* Deallocate monitoring regions and talp_info */
218 23 pthread_mutex_lock(&talp_info->regions_mutex);
219 {
220 /* Destroy GTree, each node is deallocated with the function region_dealloc */
221 23 g_tree_destroy(talp_info->regions);
222 23 talp_info->regions = NULL;
223 23 talp_info->monitor = NULL;
224
225 /* Destroy list of open regions */
226 23 g_slist_free(talp_info->open_regions);
227 23 talp_info->open_regions = NULL;
228 }
229 23 pthread_mutex_unlock(&talp_info->regions_mutex);
230 23 free(talp_info);
231 23 spd->talp_info = NULL;
232 23 }
233
234
235 /*********************************************************************************/
236 /* Functions for sample aggregation to regions */
237 /*********************************************************************************/
238
239 /* Update all open regions with the macrosample */
240 5347 static void update_regions_with_macrosample(talp_info_t *restrict talp_info,
241 const talp_macrosample_t *restrict macrosample) {
242
243 /* Update all open regions */
244 5347 pthread_mutex_lock(&talp_info->regions_mutex);
245 {
246 5347 for (GSList *node = talp_info->open_regions;
247
2/2
✓ Branch 0 taken 5917 times.
✓ Branch 1 taken 5347 times.
11264 node != NULL;
248 5917 node = node->next) {
249 5917 dlb_monitor_t *monitor = node->data;
250 5917 monitor_data_t *monitor_data = monitor->_data;
251
252 /* Update number of CPUs if needed */
253 5917 monitor->num_cpus = max_int(monitor->num_cpus, macrosample->num_cpus);
254
255 /* Timers */
256 5917 monitor->useful_time += macrosample->timers.useful;
257 5917 monitor->mpi_time += macrosample->timers.not_useful_mpi;
258 5917 monitor->mpi_worker_idle_time += macrosample->timers.not_useful_omp_during_mpi;
259 5917 monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb;
260 5917 monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched;
261 5917 monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out;
262 5917 monitor->gpu_runtime_time += macrosample->timers.not_useful_gpu;
263
264 /* GPU Timers */
265 5917 monitor->gpu_useful_time += macrosample->gpu_timers.useful;
266 5917 monitor->gpu_communication_time += macrosample->gpu_timers.communication;
267 5917 monitor->gpu_inactive_time += macrosample->gpu_timers.inactive;
268
269 /* Counters */
270 5917 monitor->cycles += macrosample->counters.cycles;
271 5917 monitor->instructions += macrosample->counters.instructions;
272
273 /* Stats */
274 5917 monitor->num_mpi_calls += macrosample->stats.num_mpi_calls;
275 5917 monitor->num_omp_parallels += macrosample->stats.num_omp_parallels;
276 5917 monitor->num_omp_tasks += macrosample->stats.num_omp_tasks;
277 5917 monitor->num_gpu_runtime_calls += macrosample->stats.num_gpu_runtime_calls;
278
279 /* Update shared memory only if requested */
280
2/2
✓ Branch 0 taken 3609 times.
✓ Branch 1 taken 2308 times.
5917 if (talp_info->flags.external_profiler) {
281 3609 shmem_talp__set_times(monitor_data->node_shared_id,
282 monitor->mpi_time,
283 monitor->useful_time);
284 }
285 }
286 }
287 5347 pthread_mutex_unlock(&talp_info->regions_mutex);
288 5347 }
289
290 /* Accumulate values from samples of all threads and update regions */
291 5346 int talp_aggregate_samples_to_regions(talp_info_t *talp_info) {
292
293 /* Observer threads don't have a valid sample so they cannot start/stop regions */
294
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5345 times.
5346 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
295
296 /* Accumulate samples from all threads */
297 5345 talp_macrosample_t macrosample = {0};
298 5345 talp_sample_aggregate_all_to_macrosample(talp_info, &macrosample);
299
300
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5345 times.
5345 if (talp_info->flags.have_gpu) {
301 /* Collect GPU measurements up to this point and update macrosample */
302 gpu_measurements_t measurements;
303 talp_gpu_collect(&measurements);
304 macrosample.gpu_timers.useful = measurements.useful_time;
305 macrosample.gpu_timers.communication = measurements.communication_time;
306 macrosample.gpu_timers.inactive = measurements.inactive_time;
307 }
308
309 /* Update all started regions */
310 5345 update_regions_with_macrosample(talp_info, &macrosample);
311
312 5345 return DLB_SUCCESS;
313 }
314
315 /* Accumulate samples from only a subset of samples of a parallel region.
316 * Load Balance and Scheduling are computed here based on all samples. */
317 2 void talp_aggregate_subset_to_regions(talp_info_t *talp_info,
318 talp_sample_t **samples, unsigned int nelems) {
319
320 /* Accumulate samples from subset */
321 2 talp_macrosample_t macrosample = {0};
322 2 talp_sample_aggregate_subset_to_macrosample(talp_info, samples, nelems, &macrosample);
323
324 /* Update all started regions */
325 2 update_regions_with_macrosample(talp_info, &macrosample);
326 2 }
327
328
329 /*********************************************************************************/
330 /* TALP collect functions for 3rd party programs: */
331 /* - It's also safe to call it from a 1st party program */
332 /* - Requires --talp-external-profiler set up in application */
333 /* - Does not need to synchronize with application */
334 /*********************************************************************************/
335
336 /* Function that may be called from a third-party process to compute
337 * node_metrics for a given region */
338 6 int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) {
339
340
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (name == NULL) {
341 1 name = region_get_global_name();
342 }
343
344 6 int error = DLB_SUCCESS;
345 6 int64_t total_mpi_time = 0;
346 6 int64_t total_useful_time = 0;
347 6 int64_t max_mpi_time = 0;
348 6 int64_t max_useful_time = 0;
349
350 /* Obtain a list of regions in the node associated with given region */
351 6 int max_procs = mu_get_system_size();
352 6 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
353 int nelems;
354 6 shmem_talp__get_regionlist(region_list, &nelems, max_procs, name);
355
356 /* Count how many processes have started the region */
357 6 int processes_per_node = 0;
358
359 /* Iterate the PID list and gather times of every process */
360 int i;
361
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
13 for (i = 0; i <nelems; ++i) {
362 7 int64_t mpi_time = region_list[i].mpi_time;
363 7 int64_t useful_time = region_list[i].useful_time;
364
365 /* Accumulate total and max values */
366
3/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
7 if (mpi_time > 0 || useful_time > 0) {
367 7 ++processes_per_node;
368 7 total_mpi_time += mpi_time;
369 7 total_useful_time += useful_time;
370 7 max_mpi_time = max_int64(mpi_time, max_mpi_time);
371 7 max_useful_time = max_int64(useful_time, max_useful_time);
372 }
373 }
374 6 free(region_list);
375
376 #if MPI_LIB
377 int node_id = _node_id;
378 #else
379 6 int node_id = 0;
380 #endif
381
382
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (processes_per_node > 0) {
383 /* Compute POP metrics with some inferred values */
384 perf_metrics_mpi_t metrics;
385 6 perf_metrics__infer_mpi_model(
386 &metrics,
387 processes_per_node,
388 total_useful_time,
389 total_mpi_time,
390 max_useful_time);
391
392 /* Initialize structure */
393 6 *node_metrics = (const dlb_node_metrics_t) {
394 .node_id = node_id,
395 .processes_per_node = processes_per_node,
396 .total_useful_time = total_useful_time,
397 .total_mpi_time = total_mpi_time,
398 .max_useful_time = max_useful_time,
399 .max_mpi_time = max_mpi_time,
400 6 .parallel_efficiency = metrics.parallel_efficiency,
401 6 .communication_efficiency = metrics.communication_efficiency,
402 6 .load_balance = metrics.load_balance,
403 };
404 6 snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name);
405 } else {
406 error = DLB_ERR_NOENT;
407 }
408
409 6 return error;
410 }
411
412
413 /*********************************************************************************/
414 /* TALP collect functions for 1st party programs */
415 /* - Requires synchronization (MPI or node barrier) among all processes */
416 /*********************************************************************************/
417
418 /* Compute the current POP metrics for the specified monitor. If monitor is NULL,
419 * the global monitoring region is assumed.
420 * Pre-conditions:
421 * - if MPI, the given monitor must have been registered in all MPI ranks
422 * - pop_metrics is an allocated structure
423 */
424 1 int talp_collect_pop_metrics(const subprocess_descriptor_t *spd,
425 dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) {
426 1 talp_info_t *talp_info = spd->talp_info;
427
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (monitor == NULL) {
428 monitor = talp_info->monitor;
429 }
430
431 /* Stop monitor so that metrics are updated */
432 1 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
433
434 pop_base_metrics_t base_metrics;
435 #ifdef MPI_LIB
436 /* Reduce monitor among all MPI ranks and everybody collects (all-to-all) */
437 perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, true);
438 #else
439 /* Construct base metrics using only the monitor from this process */
440 1 perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);
441 #endif
442
443 /* Construct output pop_metrics out of base metrics */
444 1 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, pop_metrics);
445
446 /* Resume monitor */
447
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (resume_region) {
448 region_start(spd, monitor);
449 }
450
451 1 return DLB_SUCCESS;
452 }
453
454 /* Node-collective function to compute node_metrics for a given region */
455 5 int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd,
456 dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) {
457
458 5 talp_info_t *talp_info = spd->talp_info;
459
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 3 times.
5 monitor = monitor ? monitor : talp_info->monitor;
460 5 monitor_data_t *monitor_data = monitor->_data;
461
462 /* Stop monitor so that metrics are updated */
463 5 bool resume_region = region_stop(spd, monitor) == DLB_SUCCESS;
464
465 /* This functionality needs a shared memory, create a temporary one if needed */
466
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
467 5 shmem_talp__init(spd->options.shm_key, 1);
468 5 shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name,
469 &monitor_data->node_shared_id);
470 }
471
472 /* Update the shared memory with this process' metrics */
473 5 shmem_talp__set_times(monitor_data->node_shared_id,
474 monitor->mpi_time,
475 monitor->useful_time);
476
477 /* Perform a node barrier to ensure everyone has updated their metrics */
478 5 node_barrier(spd, NULL);
479
480 /* Compute node metrics for that region name */
481 5 talp_query_pop_node_metrics(monitor->name, node_metrics);
482
483 /* Remove shared memory if it was a temporary one */
484
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (!talp_info->flags.have_shmem) {
485 5 shmem_talp__finalize(spd->id);
486 }
487
488 /* Resume monitor */
489
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
5 if (resume_region) {
490 1 region_start(spd, monitor);
491 }
492
493 5 return DLB_SUCCESS;
494 }
495