GCC Code Coverage Report


Directory: src/
File: src/talp/regions.c
Date: 2026-06-05 08:54:23
Exec Total Coverage
Lines: 174 175 99.4%
Functions: 15 15 100.0%
Branches: 88 98 89.8%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2026 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/regions.h"
25
26 #include "LB_comm/shmem_talp.h"
27 #include "LB_core/spd.h"
28 #include "apis/dlb_errors.h"
29 #include "apis/dlb_talp.h"
30 #include "support/debug.h"
31 #include "support/gtree.h"
32 #include "support/mask_utils.h"
33 #include "support/tracing.h"
34 #include "talp/sample.h"
35 #include "talp/talp.h"
36 #include "talp/talp_output.h"
37 #include "talp/talp_types.h"
38
39 #include <pthread.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43
44 extern __thread bool thread_is_observer;
45
46
47 /*********************************************************************************/
48 /* TALP Monitoring Regions */
49 /*********************************************************************************/
50
51 const char *global_region_name = DLB_GLOBAL_REGION_NAME;
52
53
/* Produce a new identifier for a monitoring region.
 * A function-local static atomic counter is incremented by one on every call
 * and the value yielded by DLB_ATOMIC_ADD_FETCH_RLX is handed back. */
static int get_new_monitor_id(void) {
    static atomic_int monitor_id_counter = 0;
    const int new_id = DLB_ATOMIC_ADD_FETCH_RLX(&monitor_id_counter, 1);
    return new_id;
}
59
/* Produce a new sequence number used to name anonymous regions.
 * Uses its own function-local static atomic counter, independent of the
 * monitor id counter. */
static int get_new_anonymous_id(void) {
    static atomic_int anonymous_id_counter = 0;
    const int new_id = DLB_ATOMIC_ADD_FETCH_RLX(&anonymous_id_counter, 1);
    return new_id;
}
65
66 /* Return true if the region is to be enabled.
67 * region_select format:
68 * --talp-region-select=[(include|exclude):]<region-list>
69 */
70 2256 static bool parse_region_select(const char *region_select, const char *region_name) {
71
72 /* Default case, all regions enabled */
73
1/2
✓ Branch 0 taken 2256 times.
✗ Branch 1 not taken.
2256 if (region_select == NULL
74
2/2
✓ Branch 0 taken 2242 times.
✓ Branch 1 taken 14 times.
2256 || region_select[0] == '\0') {
75 2242 return true;
76 }
77
78 /* Select inclusion or exclusion mode,
79 * and advance pointer */
80 14 bool in_inclusion_mode = true;
81
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 10 times.
14 if (strncmp(region_select, "exclude:", strlen("exclude:")) == 0) {
82 4 in_inclusion_mode = false;
83 4 region_select += strlen("exclude:");
84
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 7 times.
10 } else if (strncmp(region_select, "include:", strlen("include:")) == 0) {
85 3 region_select += strlen("include:");
86 }
87
88 /* If "[(include|exclude):]all" */
89
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 11 times.
14 if (strcmp(region_select, "all") == 0) {
90 3 return in_inclusion_mode;
91 }
92
93 /* Break region_select into tokens and find region_name */
94 11 bool found_in_select = false;
95 11 size_t len = strlen(region_select);
96 11 char *region_select_copy = malloc(sizeof(char)*(len+1));
97 11 strcpy(region_select_copy, region_select);
98 11 char *saveptr = NULL;
99 11 char *token = strtok_r(region_select_copy, ",", &saveptr);
100
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 5 times.
21 while (token) {
101 /* Region name is found */
102
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 11 times.
16 if (strcmp(token, region_name) == 0) {
103 5 found_in_select = true;
104 5 break;
105 }
106
107 /* Region name is same as global region, and same as token (ignoring case) */
108
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 8 times.
11 if (strcasecmp(region_name, global_region_name) == 0
109
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 && strcasecmp(token, global_region_name) == 0) {
110 1 found_in_select = true;
111 1 break;
112 }
113
114 /* next token */
115 10 token = strtok_r(NULL, ",", &saveptr);
116 }
117 11 free(region_select_copy);
118
119
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 3 times.
11 return in_inclusion_mode ? found_in_select : !found_in_select;
120 }
121
122 2256 static void region_initialize(dlb_monitor_t *monitor, int id, const
123 char *name, pid_t pid, float avg_cpus, const char *region_select, bool have_shmem) {
124 /* Initialize private monitor data */
125 2256 monitor_data_t *monitor_data = malloc(sizeof(monitor_data_t));
126 2256 *monitor_data = (const monitor_data_t) {
127 .id = id,
128 .node_shared_id = -1,
129 };
130
131 /* Parse --talp-region-select if needed */
132 2256 monitor_data->flags.enabled = parse_region_select(region_select, name);
133
134 /* Allocate monitor name */
135 2256 char *allocated_name = malloc(DLB_MONITOR_NAME_MAX*sizeof(char));
136 2256 snprintf(allocated_name, DLB_MONITOR_NAME_MAX, "%s", name);
137
138 /* Initialize monitor */
139 2256 *monitor = (const dlb_monitor_t) {
140 .name = allocated_name,
141 .avg_cpus = avg_cpus,
142 ._data = monitor_data,
143 };
144
145 /* Register name in the instrumentation tool */
146 instrument_register_event(MONITOR_REGION, monitor_data->id, name);
147
148 /* Register region in shmem */
149
2/2
✓ Branch 0 taken 1210 times.
✓ Branch 1 taken 1046 times.
2256 if (have_shmem) {
150 1210 int err = shmem_talp__register(pid, avg_cpus, monitor->name, &monitor_data->node_shared_id);
151
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1207 times.
1210 if (err == DLB_ERR_NOMEM) {
152 3 warning("Region %s has been correctly registered but cannot be shared among other"
153 " processes due to lack of space in the TALP shared memory. Features like"
154 " node report or gathering data from external processes may not work for"
155 " this region. If needed, increase the TALP shared memory capacity using"
156 " the flag --shm-size-multiplier. Run dlb -hh for more info.",
157 monitor->name);
158
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1207 times.
1207 } else if (err < DLB_SUCCESS) {
159 fatal("Unknown error registering region %s, please report bug at %s",
160 monitor->name, PACKAGE_BUGREPORT);
161 }
162 }
163 2256 }
164
165 9 struct dlb_monitor_t* region_get_global(const subprocess_descriptor_t *spd) {
166 9 talp_info_t *talp_info = spd->talp_info;
167
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
9 return talp_info ? talp_info->monitor : NULL;
168 }
169
170 31 const char* region_get_global_name(void) {
171 31 return global_region_name;
172 }
173
174 /* Helper function for GTree: Compare region names */
175 39408 int region_compare_by_name(const void *a, const void *b) {
176 39408 return strncmp(a, b, DLB_MONITOR_NAME_MAX-1);
177 }
178
179 /* Helper function for GTree: deallocate */
180 2256 void region_dealloc(void *data) {
181
182 2256 dlb_monitor_t *monitor = data;
183
184 /* Free private data */
185 2256 monitor_data_t *monitor_data = monitor->_data;
186 2256 free(monitor_data);
187 2256 monitor_data = NULL;
188
189 /* Free name */
190 2256 free((char*)monitor->name);
191 2256 monitor->name = NULL;
192
193 /* Free monitor */
194 2256 free(monitor);
195 2256 }
196
197 2268 dlb_monitor_t* region_register(const subprocess_descriptor_t *spd, const char* name) {
198
199 /* Forbidden names */
200
2/2
✓ Branch 0 taken 2267 times.
✓ Branch 1 taken 1 times.
2268 if (name == DLB_LAST_OPEN_REGION
201
2/2
✓ Branch 0 taken 2264 times.
✓ Branch 1 taken 3 times.
2267 || (name != NULL
202
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2262 times.
2264 && strncasecmp("all", name, DLB_MONITOR_NAME_MAX-1) == 0)) {
203 3 return NULL;
204 }
205
206 2265 talp_info_t *talp_info = spd->talp_info;
207
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2265 times.
2265 if (talp_info == NULL) return NULL;
208
209 2265 dlb_monitor_t *monitor = NULL;
210
4/4
✓ Branch 0 taken 2262 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2261 times.
2265 bool anonymous_region = (name == NULL || *name == '\0');
211
4/4
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 2237 times.
2265 bool global_region = !anonymous_region && name == global_region_name;
212
213 /* Check again if the pointers are different but the string content is the
214 * same as the global region, ignoring case */
215
2/2
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
2265 if (!anonymous_region
216
2/2
✓ Branch 0 taken 2237 times.
✓ Branch 1 taken 24 times.
2261 && !global_region
217
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 2232 times.
2237 && strncasecmp(global_region_name, name, DLB_MONITOR_NAME_MAX-1) == 0) {
218 5 name = global_region_name;
219 5 global_region = true;
220 }
221
222 /* Found monitor if already registered */
223
2/2
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
2265 if (!anonymous_region) {
224 2261 pthread_mutex_lock(&talp_info->regions_mutex);
225 {
226 2261 monitor = g_tree_lookup(talp_info->regions, name);
227 }
228 2261 pthread_mutex_unlock(&talp_info->regions_mutex);
229
230
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 2252 times.
2261 if (monitor != NULL) {
231 9 return monitor;
232 }
233 }
234
235 /* Otherwise, create new monitoring region */
236 2256 monitor = malloc(sizeof(dlb_monitor_t));
237
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2256 times.
2256 fatal_cond(!monitor, "Could not register a new monitoring region."
238 " Please report at "PACKAGE_BUGREPORT);
239
240 /* Determine the initial number of assigned CPUs for the region */
241 2256 float avg_cpus = CPU_COUNT(&spd->process_mask);
242
243 /* Construct name if anonymous region */
244 char monitor_name[DLB_MONITOR_NAME_MAX];
245
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2252 times.
2256 if (anonymous_region) {
246 4 snprintf(monitor_name, DLB_MONITOR_NAME_MAX, "Anonymous Region %d",
247 get_new_anonymous_id());
248 4 name = monitor_name;
249 }
250
251 /* Initialize values */
252 4512 bool have_shmem = talp_info->flags.have_shmem
253
3/6
✓ Branch 0 taken 1046 times.
✓ Branch 1 taken 1210 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1046 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
2256 || (talp_info->flags.have_minimal_shmem && global_region);
254 2256 region_initialize(monitor, get_new_monitor_id(), name,
255 2256 spd->id, avg_cpus, spd->options.talp_region_select, have_shmem);
256
257 /* Finally, insert */
258 2256 pthread_mutex_lock(&talp_info->regions_mutex);
259 {
260 2256 g_tree_insert(talp_info->regions, (gpointer)monitor->name, monitor);
261 }
262 2256 pthread_mutex_unlock(&talp_info->regions_mutex);
263
264 2256 return monitor;
265 }
266
267 6 int region_reset(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
268
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor == DLB_GLOBAL_REGION) {
269 1 talp_info_t *talp_info = spd->talp_info;
270 1 monitor = talp_info->monitor;
271 }
272
273 6 monitor_data_t *monitor_data = monitor->_data;
274
275 /* Close region if started */
276
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor_data->flags.started) {
277
278 1 talp_info_t *talp_info = spd->talp_info;
279
280 1 pthread_mutex_lock(&talp_info->regions_mutex);
281 {
282 1 monitor_data->flags.started = false;
283 1 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
284 }
285 1 pthread_mutex_unlock(&talp_info->regions_mutex);
286
287
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 verbose(VB_TALP, "Stopping region %s", monitor->name);
288 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
289 }
290
291 /* Reset everything except these fields: */
292 6 *monitor = (const dlb_monitor_t) {
293 6 .name = monitor->name,
294 6 .num_resets = monitor->num_resets + 1,
295 6 ._data = monitor->_data,
296 };
297
298 6 return DLB_SUCCESS;
299 }
300
301 2692 int region_start(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
302 /* Observer threads don't have a valid sample so they cannot start/stop regions */
303
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2690 times.
2692 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
304
305 2690 talp_info_t *talp_info = spd->talp_info;
306
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2687 times.
2690 if (monitor == DLB_GLOBAL_REGION) {
307 3 monitor = talp_info->monitor;
308 }
309
310 int error;
311 2690 monitor_data_t *monitor_data = monitor->_data;
312
313
4/4
✓ Branch 0 taken 2677 times.
✓ Branch 1 taken 13 times.
✓ Branch 2 taken 2669 times.
✓ Branch 3 taken 8 times.
2690 if (!monitor_data->flags.started && monitor_data->flags.enabled) {
314
315 /* Update this sample first */
316 2669 talp_sample_update(talp_info);
317
318 /* Gather samples from all threads and update regions */
319 2669 talp_aggregate_samples_to_regions(talp_info);
320
321
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2669 times.
2669 verbose(VB_TALP, "Starting region %s", monitor->name);
322 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_BEGIN);
323
324 /* Thread sample was just updated, use timestamp as starting time */
325 2669 talp_sample_t *thread_sample = talp_sample_get(talp_info);
326 2669 monitor->start_time = thread_sample->last_updated_ts;
327 2669 monitor->stop_time = 0;
328
329 2669 pthread_mutex_lock(&talp_info->regions_mutex);
330 {
331 2669 monitor_data->flags.started = true;
332 2669 talp_info->open_regions = g_slist_prepend(talp_info->open_regions, monitor);
333 }
334 2669 pthread_mutex_unlock(&talp_info->regions_mutex);
335
336 /* Normally, the sample state will be 'TALP_STATE_USEFUL' at this point,
337 * but on certain cases where neither talp_mpi_init nor talp_openmp_init
338 * have been called, this is necessary */
339
2/2
✓ Branch 0 taken 23 times.
✓ Branch 1 taken 2646 times.
2669 if (thread_sample->state != TALP_STATE_USEFUL) {
340 23 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
341 }
342
343 2669 error = DLB_SUCCESS;
344 } else {
345 21 error = DLB_NOUPDT;
346 }
347
348 2690 return error;
349 }
350
351 2787 int region_stop(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
352 /* Observer threads don't have a valid sample so they cannot start/stop regions */
353
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2784 times.
2787 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
354
355 2784 talp_info_t *talp_info = spd->talp_info;
356
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2781 times.
2784 if (monitor == DLB_GLOBAL_REGION) {
357 3 monitor = talp_info->monitor;
358
2/2
✓ Branch 0 taken 106 times.
✓ Branch 1 taken 2675 times.
2781 } else if (monitor == DLB_LAST_OPEN_REGION) {
359
2/2
✓ Branch 0 taken 105 times.
✓ Branch 1 taken 1 times.
106 if (talp_info->open_regions != NULL) {
360 105 monitor = talp_info->open_regions->data;
361 } else {
362 1 return DLB_ERR_NOENT;
363 }
364 }
365
366 int error;
367 2783 monitor_data_t *monitor_data = monitor->_data;
368
369
2/2
✓ Branch 0 taken 2668 times.
✓ Branch 1 taken 115 times.
2783 if (monitor_data->flags.started) {
370
371 /* Update this sample first */
372 2668 talp_sample_update(talp_info);
373
374 /* Gather samples from all threads and update regions */
375 2668 talp_aggregate_samples_to_regions(talp_info);
376
377 /* Stop timer */
378 2668 talp_sample_t *thread_sample = talp_sample_get(talp_info);
379 2668 monitor->stop_time = thread_sample->last_updated_ts;
380 2668 monitor->elapsed_time += monitor->stop_time - monitor->start_time;
381 2668 ++(monitor->num_measurements);
382
383 2668 pthread_mutex_lock(&talp_info->regions_mutex);
384 {
385 2668 monitor_data->flags.started = false;
386 2668 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
387 }
388 2668 pthread_mutex_unlock(&talp_info->regions_mutex);
389
390
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2668 times.
2668 verbose(VB_TALP, "Stopping region %s", monitor->name);
391 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
392 2668 error = DLB_SUCCESS;
393 } else {
394 115 error = DLB_NOUPDT;
395 }
396
397 2783 return error;
398 }
399
400 14 bool region_is_started(const dlb_monitor_t *monitor) {
401 14 return ((monitor_data_t*)monitor->_data)->flags.started;
402 }
403
404 1 void region_set_internal(struct dlb_monitor_t *monitor, bool internal) {
405 1 ((monitor_data_t*)monitor->_data)->flags.internal = internal;
406 1 }
407
408 8 int region_report(const subprocess_descriptor_t *spd, const dlb_monitor_t *monitor) {
409 8 talp_info_t *talp_info = spd->talp_info;
410
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (monitor == DLB_GLOBAL_REGION) {
411 1 monitor = talp_info->monitor;
412 }
413
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (((monitor_data_t*)monitor->_data)->flags.internal) {
414 1 return DLB_NOUPDT;
415 }
416
417 7 talp_output_print_monitoring_region(monitor, mu_to_str(&spd->process_mask),
418 talp_info->flags);
419
420 7 return DLB_SUCCESS;
421 }
422