GCC Code Coverage Report


Directory: src/
File: src/talp/regions.c
Date: 2026-04-21 15:16:03
Exec Total Coverage
Lines: 172 173 99.4%
Functions: 15 15 100.0%
Branches: 88 98 89.8%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/regions.h"
25
26 #include "LB_comm/shmem_talp.h"
27 #include "LB_core/spd.h"
28 #include "apis/dlb_errors.h"
29 #include "apis/dlb_talp.h"
30 #include "support/debug.h"
31 #include "support/gtree.h"
32 #include "support/mask_utils.h"
33 #include "support/tracing.h"
34 #include "talp/talp.h"
35 #include "talp/talp_output.h"
36 #include "talp/talp_types.h"
37
38 #include <pthread.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42
43 extern __thread bool thread_is_observer;
44
45
46 /*********************************************************************************/
47 /* TALP Monitoring Regions */
48 /*********************************************************************************/
49
50 const char *global_region_name = DLB_GLOBAL_REGION_NAME;
51
52
53 /* Unique region ids */
54 2256 static int get_new_monitor_id(void) {
55 static atomic_int id = 0;
56 2256 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
57 }
58
59 /* Unique anonymous regions */
60 4 static int get_new_anonymous_id(void) {
61 static atomic_int id = 0;
62 4 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
63 }
64
65 /* Return true if the region is to be enabled.
66 * region_select format:
67 * --talp-region-select=[(include|exclude):]<region-list>
68 */
69 2256 static bool parse_region_select(const char *region_select, const char *region_name) {
70
71 /* Default case, all regions enabled */
72
1/2
✓ Branch 0 taken 2256 times.
✗ Branch 1 not taken.
2256 if (region_select == NULL
73
2/2
✓ Branch 0 taken 2242 times.
✓ Branch 1 taken 14 times.
2256 || region_select[0] == '\0') {
74 2242 return true;
75 }
76
77 /* Select inclusion or exclusion mode,
78 * and advance pointer */
79 14 bool in_inclusion_mode = true;
80
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 10 times.
14 if (strncmp(region_select, "exclude:", strlen("exclude:")) == 0) {
81 4 in_inclusion_mode = false;
82 4 region_select += strlen("exclude:");
83
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 7 times.
10 } else if (strncmp(region_select, "include:", strlen("include:")) == 0) {
84 3 region_select += strlen("include:");
85 }
86
87 /* If "[(include|exclude):]all" */
88
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 11 times.
14 if (strcmp(region_select, "all") == 0) {
89 3 return in_inclusion_mode;
90 }
91
92 /* Break region_select into tokens and find region_name */
93 11 bool found_in_select = false;
94 11 size_t len = strlen(region_select);
95 11 char *region_select_copy = malloc(sizeof(char)*(len+1));
96 11 strcpy(region_select_copy, region_select);
97 11 char *saveptr = NULL;
98 11 char *token = strtok_r(region_select_copy, ",", &saveptr);
99
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 5 times.
21 while (token) {
100 /* Region name is found */
101
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 11 times.
16 if (strcmp(token, region_name) == 0) {
102 5 found_in_select = true;
103 5 break;
104 }
105
106 /* Region name is same as global region, and same as token (ignoring case) */
107
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 8 times.
11 if (strcasecmp(region_name, global_region_name) == 0
108
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 && strcasecmp(token, global_region_name) == 0) {
109 1 found_in_select = true;
110 1 break;
111 }
112
113 /* next token */
114 10 token = strtok_r(NULL, ",", &saveptr);
115 }
116 11 free(region_select_copy);
117
118
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 3 times.
11 return in_inclusion_mode ? found_in_select : !found_in_select;
119 }
120
121 2256 static void region_initialize(dlb_monitor_t *monitor, int id, const
122 char *name, pid_t pid, float avg_cpus, const char *region_select, bool have_shmem) {
123 /* Initialize private monitor data */
124 2256 monitor_data_t *monitor_data = malloc(sizeof(monitor_data_t));
125 2256 *monitor_data = (const monitor_data_t) {
126 .id = id,
127 .node_shared_id = -1,
128 };
129
130 /* Parse --talp-region-select if needed */
131 2256 monitor_data->flags.enabled = parse_region_select(region_select, name);
132
133 /* Allocate monitor name */
134 2256 char *allocated_name = malloc(DLB_MONITOR_NAME_MAX*sizeof(char));
135 2256 snprintf(allocated_name, DLB_MONITOR_NAME_MAX, "%s", name);
136
137 /* Initialize monitor */
138 2256 *monitor = (const dlb_monitor_t) {
139 .name = allocated_name,
140 .avg_cpus = avg_cpus,
141 ._data = monitor_data,
142 };
143
144 /* Register name in the instrumentation tool */
145 instrument_register_event(MONITOR_REGION, monitor_data->id, name);
146
147 /* Register region in shmem */
148
2/2
✓ Branch 0 taken 1210 times.
✓ Branch 1 taken 1046 times.
2256 if (have_shmem) {
149 1210 int err = shmem_talp__register(pid, avg_cpus, monitor->name, &monitor_data->node_shared_id);
150
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1207 times.
1210 if (err == DLB_ERR_NOMEM) {
151 3 warning("Region %s has been correctly registered but cannot be shared among other"
152 " processes due to lack of space in the TALP shared memory. Features like"
153 " node report or gathering data from external processes may not work for"
154 " this region. If needed, increase the TALP shared memory capacity using"
155 " the flag --shm-size-multiplier. Run dlb -hh for more info.",
156 monitor->name);
157
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1207 times.
1207 } else if (err < DLB_SUCCESS) {
158 fatal("Unknown error registering region %s, please report bug at %s",
159 monitor->name, PACKAGE_BUGREPORT);
160 }
161 }
162 2256 }
163
164 9 struct dlb_monitor_t* region_get_global(const subprocess_descriptor_t *spd) {
165 9 talp_info_t *talp_info = spd->talp_info;
166
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
9 return talp_info ? talp_info->monitor : NULL;
167 }
168
169 31 const char* region_get_global_name(void) {
170 31 return global_region_name;
171 }
172
173 /* Helper function for GTree: Compare region names */
174 39408 int region_compare_by_name(const void *a, const void *b) {
175 39408 return strncmp(a, b, DLB_MONITOR_NAME_MAX-1);
176 }
177
178 /* Helper function for GTree: deallocate */
179 2256 void region_dealloc(void *data) {
180
181 2256 dlb_monitor_t *monitor = data;
182
183 /* Free private data */
184 2256 monitor_data_t *monitor_data = monitor->_data;
185 2256 free(monitor_data);
186 2256 monitor_data = NULL;
187
188 /* Free name */
189 2256 free((char*)monitor->name);
190 2256 monitor->name = NULL;
191
192 /* Free monitor */
193 2256 free(monitor);
194 2256 }
195
196 2268 dlb_monitor_t* region_register(const subprocess_descriptor_t *spd, const char* name) {
197
198 /* Forbidden names */
199
2/2
✓ Branch 0 taken 2267 times.
✓ Branch 1 taken 1 times.
2268 if (name == DLB_LAST_OPEN_REGION
200
2/2
✓ Branch 0 taken 2264 times.
✓ Branch 1 taken 3 times.
2267 || (name != NULL
201
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2262 times.
2264 && strncasecmp("all", name, DLB_MONITOR_NAME_MAX-1) == 0)) {
202 3 return NULL;
203 }
204
205 2265 talp_info_t *talp_info = spd->talp_info;
206
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2265 times.
2265 if (talp_info == NULL) return NULL;
207
208 2265 dlb_monitor_t *monitor = NULL;
209
4/4
✓ Branch 0 taken 2262 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 2261 times.
2265 bool anonymous_region = (name == NULL || *name == '\0');
210
4/4
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 2237 times.
2265 bool global_region = !anonymous_region && name == global_region_name;
211
212 /* Check again if the pointers are different but the string content is the
213 * same as the global region, ignoring case */
214
2/2
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
2265 if (!anonymous_region
215
2/2
✓ Branch 0 taken 2237 times.
✓ Branch 1 taken 24 times.
2261 && !global_region
216
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 2232 times.
2237 && strncasecmp(global_region_name, name, DLB_MONITOR_NAME_MAX-1) == 0) {
217 5 name = global_region_name;
218 5 global_region = true;
219 }
220
221 /* Found monitor if already registered */
222
2/2
✓ Branch 0 taken 2261 times.
✓ Branch 1 taken 4 times.
2265 if (!anonymous_region) {
223 2261 pthread_mutex_lock(&talp_info->regions_mutex);
224 {
225 2261 monitor = g_tree_lookup(talp_info->regions, name);
226 }
227 2261 pthread_mutex_unlock(&talp_info->regions_mutex);
228
229
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 2252 times.
2261 if (monitor != NULL) {
230 9 return monitor;
231 }
232 }
233
234 /* Otherwise, create new monitoring region */
235 2256 monitor = malloc(sizeof(dlb_monitor_t));
236
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2256 times.
2256 fatal_cond(!monitor, "Could not register a new monitoring region."
237 " Please report at "PACKAGE_BUGREPORT);
238
239 /* Determine the initial number of assigned CPUs for the region */
240 2256 float avg_cpus = CPU_COUNT(&spd->process_mask);
241
242 /* Construct name if anonymous region */
243 char monitor_name[DLB_MONITOR_NAME_MAX];
244
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2252 times.
2256 if (anonymous_region) {
245 4 snprintf(monitor_name, DLB_MONITOR_NAME_MAX, "Anonymous Region %d",
246 get_new_anonymous_id());
247 4 name = monitor_name;
248 }
249
250 /* Initialize values */
251 4512 bool have_shmem = talp_info->flags.have_shmem
252
3/6
✓ Branch 0 taken 1046 times.
✓ Branch 1 taken 1210 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1046 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
2256 || (talp_info->flags.have_minimal_shmem && global_region);
253 2256 region_initialize(monitor, get_new_monitor_id(), name,
254 2256 spd->id, avg_cpus, spd->options.talp_region_select, have_shmem);
255
256 /* Finally, insert */
257 2256 pthread_mutex_lock(&talp_info->regions_mutex);
258 {
259 2256 g_tree_insert(talp_info->regions, (gpointer)monitor->name, monitor);
260 }
261 2256 pthread_mutex_unlock(&talp_info->regions_mutex);
262
263 2256 return monitor;
264 }
265
266 6 int region_reset(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
267
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor == DLB_GLOBAL_REGION) {
268 1 talp_info_t *talp_info = spd->talp_info;
269 1 monitor = talp_info->monitor;
270 }
271
272 6 monitor_data_t *monitor_data = monitor->_data;
273
274 /* Close region if started */
275
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor_data->flags.started) {
276
277 1 talp_info_t *talp_info = spd->talp_info;
278
279 1 pthread_mutex_lock(&talp_info->regions_mutex);
280 {
281 1 monitor_data->flags.started = false;
282 1 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
283 }
284 1 pthread_mutex_unlock(&talp_info->regions_mutex);
285
286
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 verbose(VB_TALP, "Stopping region %s", monitor->name);
287 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
288 }
289
290 /* Reset everything except these fields: */
291 6 *monitor = (const dlb_monitor_t) {
292 6 .name = monitor->name,
293 6 .num_resets = monitor->num_resets + 1,
294 6 ._data = monitor->_data,
295 };
296
297 6 return DLB_SUCCESS;
298 }
299
300 2692 int region_start(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
301 /* Observer threads don't have a valid sample so they cannot start/stop regions */
302
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2690 times.
2692 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
303
304 2690 talp_info_t *talp_info = spd->talp_info;
305
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2687 times.
2690 if (monitor == DLB_GLOBAL_REGION) {
306 3 monitor = talp_info->monitor;
307 }
308
309 int error;
310 2690 monitor_data_t *monitor_data = monitor->_data;
311
312
4/4
✓ Branch 0 taken 2677 times.
✓ Branch 1 taken 13 times.
✓ Branch 2 taken 2669 times.
✓ Branch 3 taken 8 times.
2690 if (!monitor_data->flags.started && monitor_data->flags.enabled) {
313 /* Gather samples from all threads and update regions */
314 2669 talp_flush_samples_to_regions(spd);
315
316
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2669 times.
2669 verbose(VB_TALP, "Starting region %s", monitor->name);
317 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_BEGIN);
318
319 /* Thread sample was just updated, use timestamp as starting time */
320 2669 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
321 2669 monitor->start_time = thread_sample->last_updated_timestamp;
322 2669 monitor->stop_time = 0;
323
324 2669 pthread_mutex_lock(&talp_info->regions_mutex);
325 {
326 2669 monitor_data->flags.started = true;
327 2669 talp_info->open_regions = g_slist_prepend(talp_info->open_regions, monitor);
328 }
329 2669 pthread_mutex_unlock(&talp_info->regions_mutex);
330
331 /* Normally, the sample state will be 'TALP_STATE_USEFUL' at this point,
332 * but on certain cases where neither talp_mpi_init nor talp_openmp_init
333 * have been called, this is necessary */
334
2/2
✓ Branch 0 taken 23 times.
✓ Branch 1 taken 2646 times.
2669 if (thread_sample->state != TALP_STATE_USEFUL) {
335 23 talp_set_sample_state(spd, thread_sample, TALP_STATE_USEFUL);
336 }
337
338 2669 error = DLB_SUCCESS;
339 } else {
340 21 error = DLB_NOUPDT;
341 }
342
343 2690 return error;
344 }
345
346 2787 int region_stop(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
347 /* Observer threads don't have a valid sample so they cannot start/stop regions */
348
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2784 times.
2787 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
349
350 2784 talp_info_t *talp_info = spd->talp_info;
351
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2781 times.
2784 if (monitor == DLB_GLOBAL_REGION) {
352 3 monitor = talp_info->monitor;
353
2/2
✓ Branch 0 taken 106 times.
✓ Branch 1 taken 2675 times.
2781 } else if (monitor == DLB_LAST_OPEN_REGION) {
354
2/2
✓ Branch 0 taken 105 times.
✓ Branch 1 taken 1 times.
106 if (talp_info->open_regions != NULL) {
355 105 monitor = talp_info->open_regions->data;
356 } else {
357 1 return DLB_ERR_NOENT;
358 }
359 }
360
361 int error;
362 2783 monitor_data_t *monitor_data = monitor->_data;
363
364
2/2
✓ Branch 0 taken 2668 times.
✓ Branch 1 taken 115 times.
2783 if (monitor_data->flags.started) {
365 /* Gather samples from all threads and update regions */
366 2668 talp_flush_samples_to_regions(spd);
367
368 /* Stop timer */
369 2668 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
370 2668 monitor->stop_time = thread_sample->last_updated_timestamp;
371 2668 monitor->elapsed_time += monitor->stop_time - monitor->start_time;
372 2668 ++(monitor->num_measurements);
373
374 2668 pthread_mutex_lock(&talp_info->regions_mutex);
375 {
376 2668 monitor_data->flags.started = false;
377 2668 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
378 }
379 2668 pthread_mutex_unlock(&talp_info->regions_mutex);
380
381
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2668 times.
2668 verbose(VB_TALP, "Stopping region %s", monitor->name);
382 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
383 2668 error = DLB_SUCCESS;
384 } else {
385 115 error = DLB_NOUPDT;
386 }
387
388 2783 return error;
389 }
390
391 14 bool region_is_started(const dlb_monitor_t *monitor) {
392 14 return ((monitor_data_t*)monitor->_data)->flags.started;
393 }
394
395 1 void region_set_internal(struct dlb_monitor_t *monitor, bool internal) {
396 1 ((monitor_data_t*)monitor->_data)->flags.internal = internal;
397 1 }
398
399 8 int region_report(const subprocess_descriptor_t *spd, const dlb_monitor_t *monitor) {
400 8 talp_info_t *talp_info = spd->talp_info;
401
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (monitor == DLB_GLOBAL_REGION) {
402 1 monitor = talp_info->monitor;
403 }
404
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (((monitor_data_t*)monitor->_data)->flags.internal) {
405 1 return DLB_NOUPDT;
406 }
407
408 7 talp_output_print_monitoring_region(monitor, mu_to_str(&spd->process_mask),
409 talp_info->flags);
410
411 7 return DLB_SUCCESS;
412 }
413