GCC Code Coverage Report


Directory: src/
File: src/talp/regions.c
Date: 2025-11-21 10:34:40
Exec Total Coverage
Lines: 175 176 99.4%
Functions: 15 15 100.0%
Branches: 88 98 89.8%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "talp/regions.h"
25
26 #include "LB_comm/shmem_talp.h"
27 #include "LB_core/spd.h"
28 #include "apis/dlb_errors.h"
29 #include "apis/dlb_talp.h"
30 #include "support/debug.h"
31 #include "support/gtree.h"
32 #include "support/mask_utils.h"
33 #include "support/tracing.h"
34 #include "talp/talp.h"
35 #include "talp/talp_output.h"
36 #include "talp/talp_types.h"
37
38 #include <pthread.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42
43 extern __thread bool thread_is_observer;
44
45
46 /*********************************************************************************/
47 /* TALP Monitoring Regions */
48 /*********************************************************************************/
49
50 const char *global_region_name = DLB_GLOBAL_REGION_NAME;
51
52
53 /* Unique region ids */
54 5850 static int get_new_monitor_id(void) {
55 static atomic_int id = 0;
56 5850 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
57 }
58
59 /* Unique anonymous regions */
60 4 static int get_new_anonymous_id(void) {
61 static atomic_int id = 0;
62 4 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
63 }
64
65 /* Return true if the region is to be enabled.
66 * region_select format:
67 * --talp-region-select=[(include|exclude):]<region-list>
68 */
69 5850 static bool parse_region_select(const char *region_select, const char *region_name) {
70
71 /* Default case, all regions enabled */
72
1/2
✓ Branch 0 taken 5850 times.
✗ Branch 1 not taken.
5850 if (region_select == NULL
73
2/2
✓ Branch 0 taken 5836 times.
✓ Branch 1 taken 14 times.
5850 || region_select[0] == '\0') {
74 5836 return true;
75 }
76
77 /* Select inclusion or exclusion mode,
78 * and advance pointer */
79 14 bool in_inclusion_mode = true;
80
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 10 times.
14 if (strncmp(region_select, "exclude:", strlen("exclude:")) == 0) {
81 4 in_inclusion_mode = false;
82 4 region_select += strlen("exclude:");
83
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 7 times.
10 } else if (strncmp(region_select, "include:", strlen("include:")) == 0) {
84 3 region_select += strlen("include:");
85 }
86
87 /* If "[(include|exclude):]all" */
88
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 11 times.
14 if (strcmp(region_select, "all") == 0) {
89 3 return in_inclusion_mode;
90 }
91
92 /* Break region_select into tokens and find region_name */
93 11 bool found_in_select = false;
94 11 size_t len = strlen(region_select);
95 11 char *region_select_copy = malloc(sizeof(char)*(len+1));
96 11 strcpy(region_select_copy, region_select);
97 11 char *saveptr = NULL;
98 11 char *token = strtok_r(region_select_copy, ",", &saveptr);
99
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 5 times.
21 while (token) {
100 /* Region name is found */
101
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 11 times.
16 if (strcmp(token, region_name) == 0) {
102 5 found_in_select = true;
103 5 break;
104 }
105
106 /* Region name is same as global region, and same as token (ignoring case) */
107
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 8 times.
11 if (strcasecmp(region_name, global_region_name) == 0
108
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 && strcasecmp(token, global_region_name) == 0) {
109 1 found_in_select = true;
110 1 break;
111 }
112
113 /* next token */
114 10 token = strtok_r(NULL, ",", &saveptr);
115 }
116 11 free(region_select_copy);
117
118
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 3 times.
11 return in_inclusion_mode ? found_in_select : !found_in_select;
119 }
120
121 5850 static void region_initialize(dlb_monitor_t *monitor, int id, const
122 char *name, pid_t pid, float avg_cpus, const char *region_select, bool have_shmem) {
123 /* Initialize private monitor data */
124 5850 monitor_data_t *monitor_data = malloc(sizeof(monitor_data_t));
125 5850 *monitor_data = (const monitor_data_t) {
126 .id = id,
127 .node_shared_id = -1,
128 };
129
130 /* Parse --talp-region-select if needed */
131 5850 monitor_data->flags.enabled = parse_region_select(region_select, name);
132
133 /* Allocate monitor name */
134 5850 char *allocated_name = malloc(DLB_MONITOR_NAME_MAX*sizeof(char));
135 5850 snprintf(allocated_name, DLB_MONITOR_NAME_MAX, "%s", name);
136
137 /* Initialize monitor */
138 5850 *monitor = (const dlb_monitor_t) {
139 .name = allocated_name,
140 .avg_cpus = avg_cpus,
141 ._data = monitor_data,
142 };
143
144 /* Register name in the instrumentation tool */
145 instrument_register_event(MONITOR_REGION, monitor_data->id, name);
146
147 /* Register region in shmem */
148
2/2
✓ Branch 0 taken 4806 times.
✓ Branch 1 taken 1044 times.
5850 if (have_shmem) {
149 4806 int err = shmem_talp__register(pid, avg_cpus, monitor->name, &monitor_data->node_shared_id);
150
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 4803 times.
4806 if (err == DLB_ERR_NOMEM) {
151 3 warning("Region %s has been correctly registered but cannot be shared among other"
152 " processes due to lack of space in the TALP shared memory. Features like"
153 " node report or gathering data from external processes may not work for"
154 " this region. If needed, increase the TALP shared memory capacity using"
155 " the flag --shm-size-multiplier. Run dlb -hh for more info.",
156 monitor->name);
157
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4803 times.
4803 } else if (err < DLB_SUCCESS) {
158 fatal("Unknown error registering region %s, please report bug at %s",
159 monitor->name, PACKAGE_BUGREPORT);
160 }
161 }
162 5850 }
163
164 9 struct dlb_monitor_t* region_get_global(const subprocess_descriptor_t *spd) {
165 9 talp_info_t *talp_info = spd->talp_info;
166
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
9 return talp_info ? talp_info->monitor : NULL;
167 }
168
169 25 const char* region_get_global_name(void) {
170 25 return global_region_name;
171 }
172
173 /* Helper function for GTree: Compare region names */
174 121326 int region_compare_by_name(const void *a, const void *b) {
175 121326 return strncmp(a, b, DLB_MONITOR_NAME_MAX-1);
176 }
177
178 /* Helper function for GTree: deallocate */
179 5850 void region_dealloc(void *data) {
180
181 5850 dlb_monitor_t *monitor = data;
182
183 /* Free private data */
184 5850 monitor_data_t *monitor_data = monitor->_data;
185 5850 free(monitor_data);
186 5850 monitor_data = NULL;
187
188 /* Free name */
189 5850 free((char*)monitor->name);
190 5850 monitor->name = NULL;
191
192 /* Free monitor */
193 5850 free(monitor);
194 5850 }
195
196 5862 dlb_monitor_t* region_register(const subprocess_descriptor_t *spd, const char* name) {
197
198 /* Forbidden names */
199
2/2
✓ Branch 0 taken 5861 times.
✓ Branch 1 taken 1 times.
5862 if (name == DLB_LAST_OPEN_REGION
200
2/2
✓ Branch 0 taken 5858 times.
✓ Branch 1 taken 3 times.
5861 || (name != NULL
201
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 5856 times.
5858 && strncasecmp("all", name, DLB_MONITOR_NAME_MAX-1) == 0)) {
202 3 return NULL;
203 }
204
205 5859 talp_info_t *talp_info = spd->talp_info;
206
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5859 times.
5859 if (talp_info == NULL) return NULL;
207
208 5859 dlb_monitor_t *monitor = NULL;
209
4/4
✓ Branch 0 taken 5856 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 5855 times.
5859 bool anonymous_region = (name == NULL || *name == '\0');
210
4/4
✓ Branch 0 taken 5855 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 5837 times.
5859 bool global_region = !anonymous_region && name == global_region_name;
211
212 /* Check again if the pointers are different but the string content is the
213 * same as the global region, ignoring case */
214
2/2
✓ Branch 0 taken 5855 times.
✓ Branch 1 taken 4 times.
5859 if (!anonymous_region
215
2/2
✓ Branch 0 taken 5837 times.
✓ Branch 1 taken 18 times.
5855 && !global_region
216
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5832 times.
5837 && strncasecmp(global_region_name, name, DLB_MONITOR_NAME_MAX-1) == 0) {
217 5 name = global_region_name;
218 5 global_region = true;
219 }
220
221 /* Found monitor if already registered */
222
2/2
✓ Branch 0 taken 5855 times.
✓ Branch 1 taken 4 times.
5859 if (!anonymous_region) {
223 5855 pthread_mutex_lock(&talp_info->regions_mutex);
224 {
225 5855 monitor = g_tree_lookup(talp_info->regions, name);
226 }
227 5855 pthread_mutex_unlock(&talp_info->regions_mutex);
228
229
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 5846 times.
5855 if (monitor != NULL) {
230 9 return monitor;
231 }
232 }
233
234 /* Otherwise, create new monitoring region */
235 5850 monitor = malloc(sizeof(dlb_monitor_t));
236
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5850 times.
5850 fatal_cond(!monitor, "Could not register a new monitoring region."
237 " Please report at "PACKAGE_BUGREPORT);
238
239 /* Determine the initial number of assigned CPUs for the region */
240 5850 float avg_cpus = CPU_COUNT(&spd->process_mask);
241
242 /* Construct name if anonymous region */
243 char monitor_name[DLB_MONITOR_NAME_MAX];
244
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 5846 times.
5850 if (anonymous_region) {
245 4 snprintf(monitor_name, DLB_MONITOR_NAME_MAX, "Anonymous Region %d",
246 get_new_anonymous_id());
247 4 name = monitor_name;
248 }
249
250 /* Initialize values */
251 11700 bool have_shmem = talp_info->flags.have_shmem
252
3/6
✓ Branch 0 taken 1044 times.
✓ Branch 1 taken 4806 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1044 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
5850 || (talp_info->flags.have_minimal_shmem && global_region);
253 5850 region_initialize(monitor, get_new_monitor_id(), name,
254 5850 spd->id, avg_cpus, spd->options.talp_region_select, have_shmem);
255
256 /* Finally, insert */
257 5850 pthread_mutex_lock(&talp_info->regions_mutex);
258 {
259 5850 g_tree_insert(talp_info->regions, (gpointer)monitor->name, monitor);
260 }
261 5850 pthread_mutex_unlock(&talp_info->regions_mutex);
262
263 5850 return monitor;
264 }
265
266 6 int region_reset(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
267
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor == DLB_GLOBAL_REGION) {
268 1 talp_info_t *talp_info = spd->talp_info;
269 1 monitor = talp_info->monitor;
270 }
271
272 6 monitor_data_t *monitor_data = monitor->_data;
273
274 /* Close region if started */
275
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5 times.
6 if (monitor_data->flags.started) {
276
277 1 talp_info_t *talp_info = spd->talp_info;
278
279 1 pthread_mutex_lock(&talp_info->regions_mutex);
280 {
281 1 monitor_data->flags.started = false;
282 1 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
283 }
284 1 pthread_mutex_unlock(&talp_info->regions_mutex);
285
286
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 verbose(VB_TALP, "Stopping region %s", monitor->name);
287 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
288 }
289
290 /* Reset everything except these fields: */
291 6 *monitor = (const dlb_monitor_t) {
292 6 .name = monitor->name,
293 6 .num_resets = monitor->num_resets + 1,
294 6 ._data = monitor->_data,
295 };
296
297 6 return DLB_SUCCESS;
298 }
299
300 6283 int region_start(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
301 /* Observer threads don't have a valid sample so they cannot start/stop regions */
302
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 6281 times.
6283 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
303
304 6281 talp_info_t *talp_info = spd->talp_info;
305
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6278 times.
6281 if (monitor == DLB_GLOBAL_REGION) {
306 3 monitor = talp_info->monitor;
307 }
308
309 int error;
310 6281 monitor_data_t *monitor_data = monitor->_data;
311
312
4/4
✓ Branch 0 taken 6271 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 6263 times.
✓ Branch 3 taken 8 times.
6281 if (!monitor_data->flags.started && monitor_data->flags.enabled) {
313 /* Gather samples from all threads and update regions */
314 6263 talp_flush_samples_to_regions(spd);
315
316
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6263 times.
6263 verbose(VB_TALP, "Starting region %s", monitor->name);
317 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_BEGIN);
318
319 /* Thread sample was just updated, use timestamp as starting time */
320 6263 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
321 6263 monitor->start_time = thread_sample->last_updated_timestamp;
322 6263 monitor->stop_time = 0;
323
324 6263 pthread_mutex_lock(&talp_info->regions_mutex);
325 {
326 6263 monitor_data->flags.started = true;
327 6263 talp_info->open_regions = g_slist_prepend(talp_info->open_regions, monitor);
328 }
329 6263 pthread_mutex_unlock(&talp_info->regions_mutex);
330
331 /* Normally, the sample state will be 'useful' at this point, but on
332 * certain cases where neither talp_mpi_init nor talp_openmp_init have
333 * been called, this is necessary */
334
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 6246 times.
6263 if (thread_sample->state != useful) {
335 17 talp_set_sample_state(thread_sample, useful, talp_info->flags.papi);
336 }
337
338 6263 error = DLB_SUCCESS;
339 } else {
340 18 error = DLB_NOUPDT;
341 }
342
343 6281 return error;
344 }
345
346 6380 int region_stop(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
347 /* Observer threads don't have a valid sample so they cannot start/stop regions */
348
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6377 times.
6380 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
349
350 6377 talp_info_t *talp_info = spd->talp_info;
351
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6374 times.
6377 if (monitor == DLB_GLOBAL_REGION) {
352 3 monitor = talp_info->monitor;
353
2/2
✓ Branch 0 taken 106 times.
✓ Branch 1 taken 6268 times.
6374 } else if (monitor == DLB_LAST_OPEN_REGION) {
354
2/2
✓ Branch 0 taken 105 times.
✓ Branch 1 taken 1 times.
106 if (talp_info->open_regions != NULL) {
355 105 monitor = talp_info->open_regions->data;
356 } else {
357 1 return DLB_ERR_NOENT;
358 }
359 }
360
361 int error;
362 6376 monitor_data_t *monitor_data = monitor->_data;
363
364
2/2
✓ Branch 0 taken 6261 times.
✓ Branch 1 taken 115 times.
6376 if (monitor_data->flags.started) {
365 /* Gather samples from all threads and update regions */
366 6261 talp_flush_samples_to_regions(spd);
367
368 /* Stop timer */
369 6261 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
370 6261 monitor->stop_time = thread_sample->last_updated_timestamp;
371 6261 monitor->elapsed_time += monitor->stop_time - monitor->start_time;
372 6261 ++(monitor->num_measurements);
373
374 6261 pthread_mutex_lock(&talp_info->regions_mutex);
375 {
376 6261 monitor_data->flags.started = false;
377 6261 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
378 }
379 6261 pthread_mutex_unlock(&talp_info->regions_mutex);
380
381
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6261 times.
6261 verbose(VB_TALP, "Stopping region %s", monitor->name);
382 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
383 6261 error = DLB_SUCCESS;
384 } else {
385 115 error = DLB_NOUPDT;
386 }
387
388 6376 return error;
389 }
390
391 12 bool region_is_started(const dlb_monitor_t *monitor) {
392 12 return ((monitor_data_t*)monitor->_data)->flags.started;
393 }
394
395 1 void region_set_internal(struct dlb_monitor_t *monitor, bool internal) {
396 1 ((monitor_data_t*)monitor->_data)->flags.internal = internal;
397 1 }
398
399 8 int region_report(const subprocess_descriptor_t *spd, const dlb_monitor_t *monitor) {
400 8 talp_info_t *talp_info = spd->talp_info;
401
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (monitor == DLB_GLOBAL_REGION) {
402 1 monitor = talp_info->monitor;
403 }
404
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (((monitor_data_t*)monitor->_data)->flags.internal) {
405 1 return DLB_NOUPDT;
406 }
407
408 #ifdef PAPI_LIB
409 bool have_papi = talp_info->flags.papi;
410 #else
411 7 bool have_papi = false;
412 #endif
413 7 talp_output_print_monitoring_region(monitor, mu_to_str(&spd->process_mask),
414 7 talp_info->flags.have_mpi, talp_info->flags.have_openmp,
415 7 talp_info->flags.have_gpu, have_papi);
416
417 7 return DLB_SUCCESS;
418 }
419