GCC Code Coverage Report


Directory: src/
File: src/talp/talp_openmp.c
Date: 2026-04-21 15:16:03
Exec Total Coverage
Lines: 144 158 91.1%
Functions: 15 15 100.0%
Branches: 47 90 52.2%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/talp_openmp.h"
21
22 #include "LB_numThreads/omptool.h"
23 #include "LB_comm/shmem_talp.h"
24 #include "LB_core/DLB_kernel.h"
25 #include "apis/dlb_talp.h"
26 #include "support/debug.h"
27 #include "talp/regions.h"
28 #include "talp/talp.h"
29 #include "talp/talp_hwc.h"
30 #include "talp/talp_types.h"
31
32 #include <unistd.h>
33
34 extern __thread bool thread_is_observer;
35
36 /* Update all open nested regions (so, excluding the innermost) and add the
37 * time since its start time until the sample's last timestamp (which is the time
38 * that has not yet been added to the regions) as omp_serialization_time */
39 1 static void update_serialization_in_nested_regions(const subprocess_descriptor_t *spd,
40 const talp_sample_t *sample) {
41
42 1 talp_info_t *talp_info = spd->talp_info;
43
44 /* Update all open nested regions */
45 1 pthread_mutex_lock(&talp_info->regions_mutex);
46 {
47 2 GSList *nested_open_regions = talp_info->open_regions
48 1 ? talp_info->open_regions->next
49
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 : NULL;
50
51 1 for (GSList *node = nested_open_regions;
52
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 node != NULL;
53 1 node = node->next) {
54
55 1 dlb_monitor_t *monitor = node->data;
56 1 monitor->omp_serialization_time +=
57 1 sample->last_updated_timestamp - monitor->start_time;
58 }
59 }
60 1 pthread_mutex_unlock(&talp_info->regions_mutex);
61 1 }
62
63
64 /*********************************************************************************/
65 /* TALP OpenMP functions */
66 /*********************************************************************************/
67
68 /* samples involved in parallel level 1 */
69 static talp_sample_t** parallel_samples_l1 = NULL;
70 static unsigned int parallel_samples_l1_capacity = 0;
71
72 1 void talp_openmp_init(pid_t pid, const options_t* options) {
73
74
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init");
75
76 1 const subprocess_descriptor_t *spd = thread_spd;
77 1 talp_info_t *talp_info = spd->talp_info;
78
79
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (talp_info) {
80 1 monitor_data_t *monitor_data = talp_info->monitor->_data;
81 1 talp_info->flags.have_openmp = true;
82
83 /* Fix up number of CPUs for the global region */
84 1 float cpus = CPU_COUNT(&spd->process_mask);
85 1 talp_info->monitor->avg_cpus = cpus;
86 1 shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus);
87
88 /* Start global region (no-op if already started) */
89 1 region_start(spd, talp_info->monitor);
90
91 /* Set useful state */
92 1 talp_sample_t *sample = talp_get_thread_sample(spd);
93 1 talp_set_sample_state(spd, sample, TALP_STATE_USEFUL);
94 }
95 1 }
96
97 1 void talp_openmp_finalize(void) {
98
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (parallel_samples_l1 != NULL) {
99 1 free(parallel_samples_l1);
100 1 parallel_samples_l1 = NULL;
101 1 parallel_samples_l1_capacity = 0;
102 }
103 1 }
104
105 2 void talp_openmp_thread_begin(ompt_thread_t thread_type) {
106
107 2 const subprocess_descriptor_t *spd = thread_spd;
108 2 talp_info_t *talp_info = spd->talp_info;
109
110
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
111
112 /* Initial thread is already in useful state, set omp_out for others */
113 2 talp_sample_t *sample = talp_get_thread_sample(spd);
114
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (sample->state == TALP_STATE_DISABLED) {
115 /* Not initial thread: */
116
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.have_hwc) {
117 talp_hwc_thread_init();
118 }
119 1 talp_set_sample_state(spd, sample, TALP_STATE_NOT_USEFUL_OMP_OUT);
120
121 /* The initial time of the sample is set to match the start time of
122 * the innermost open region, but other nested open regions need to
123 * be fixed */
124 1 update_serialization_in_nested_regions(spd, sample);
125 }
126 }
127
128 2 void talp_openmp_thread_end(void) {
129
130 2 const subprocess_descriptor_t *spd = thread_spd;
131 2 talp_info_t *talp_info = spd->talp_info;
132
133
3/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
134
135 /* Update thread sample with the last microsample */
136 1 talp_sample_t *sample = talp_get_thread_sample(spd);
137 1 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
138
139 /* Update state */
140 1 talp_set_sample_state(spd, sample, TALP_STATE_DISABLED);
141
142 /* Finalize PAPI per-thread state */
143
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.have_hwc) {
144 talp_hwc_thread_finalize();
145 }
146 }
147
148 2 void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) {
149
150
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(parallel_data->requested_parallelism < 1,
151 "Requested parallel region of invalid size in %s. Please report bug at %s.",
152 __func__, PACKAGE_BUGREPORT);
153
154 2 const subprocess_descriptor_t *spd = thread_spd;
155 2 talp_info_t *talp_info = spd->talp_info;
156
157
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
158
159 2 int parallel_level = parallel_data->level;
160
161
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
162 /* Resize samples of parallel 1 if needed */
163 2 unsigned int requested_parallelism = parallel_data->requested_parallelism;
164
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (requested_parallelism > parallel_samples_l1_capacity) {
165 2 void *ptr = realloc(parallel_samples_l1,
166 sizeof(talp_sample_t*)*requested_parallelism);
167
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(!ptr, "realloc failed in %s", __func__);
168 2 parallel_samples_l1 = ptr;
169 2 parallel_samples_l1_capacity = requested_parallelism;
170 }
171
172 /* Assign local data */
173 2 parallel_data->talp_parallel_data = parallel_samples_l1;
174
175 } else if (parallel_level > 1) {
176 /* Allocate parallel samples array */
177 unsigned int requested_parallelism = parallel_data->requested_parallelism;
178 void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism);
179 fatal_cond(!ptr, "malloc failed in %s", __func__);
180
181 /* Assign local data */
182 parallel_data->talp_parallel_data = ptr;
183 }
184
185 /* Update stats */
186 2 talp_sample_t *sample = talp_get_thread_sample(spd);
187 2 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1);
188
189 /* Update main thread serial mode if this is the outermost parallel region */
190
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
191 2 talp_set_main_sample_in_serial_mode(false);
192 }
193 }
194
195 2 void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) {
196
197 2 const subprocess_descriptor_t *spd = thread_spd;
198 2 talp_info_t *talp_info = spd->talp_info;
199
200
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
201
202 /* Update thread sample with the last microsample */
203 2 talp_sample_t *sample = talp_get_thread_sample(spd);
204 2 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
205
206 2 int parallel_level = parallel_data->level;
207
208
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
209 /* Flush and aggregate all samples of the parallel region */
210 2 talp_flush_sample_subset_to_regions(spd,
211 2 parallel_data->talp_parallel_data,
212 parallel_data->actual_parallelism);
213
214 } else if (parallel_level > 1) {
215 /* Flush and aggregate all samples of this parallel except this
216 * thread's sample. The primary thread of a nested parallel region
217 * will keep its samples until it finishes as non-primary
218 * team-worker or reaches the level 1 parallel region */
219 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
220 talp_flush_sample_subset_to_regions(spd,
221 &parallel_samples[1],
222 parallel_data->actual_parallelism-1);
223
224 /* free local data */
225 free(parallel_data->talp_parallel_data);
226 parallel_data->talp_parallel_data = NULL;
227 }
228
229 /* Update current thread's state */
230 2 talp_set_sample_state(spd, sample, TALP_STATE_USEFUL);
231
232 /* Update the state of the rest of team-worker threads
233 * (note that talp_set_sample_state cannot be used here because we are
234 * impersonating a worker thread) */
235 2 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
236
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 for (unsigned int i = 1; i < parallel_data->actual_parallelism; ++i) {
237 1 talp_sample_t *worker_sample = parallel_samples[i];
238
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (worker_sample->state == TALP_STATE_NOT_USEFUL_OMP_IN) {
239 worker_sample->state = TALP_STATE_NOT_USEFUL_OMP_OUT;
240 }
241 }
242
243 /* Update main thread serial mode if this was the outermost parallel region */
244
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
245 2 talp_set_main_sample_in_serial_mode(true);
246 }
247 }
248
249 3 void talp_openmp_into_parallel_function(
250 omptool_parallel_data_t *parallel_data, unsigned int index) {
251
252 3 const subprocess_descriptor_t *spd = thread_spd;
253 3 talp_info_t *talp_info = spd->talp_info;
254
255
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
256
257 /* Assign thread sample as team-worker of this parallel */
258 3 talp_sample_t *sample = talp_get_thread_sample(spd);
259 3 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
260 /* Probably optimized, but try to avoid invalidating
261 * the cache line on reused parallel data */
262
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
3 if (parallel_samples[index] != sample) {
263 2 parallel_samples[index] = sample;
264 }
265
266 /* Update thread sample with the last microsample */
267 3 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
268
269 /* Update state */
270 3 talp_set_sample_state(spd, sample, TALP_STATE_USEFUL);
271 }
272
273 1 void talp_openmp_outof_parallel_function(void) {
274
275 1 const subprocess_descriptor_t *spd = thread_spd;
276 1 talp_info_t *talp_info = spd->talp_info;
277
278
2/4
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
1 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
279
280 /* Update thread sample with the last microsample */
281 1 talp_sample_t *sample = talp_get_thread_sample(spd);
282 1 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
283
284 /* Update state */
285 1 talp_set_sample_state(spd, sample, TALP_STATE_NOT_USEFUL_OMP_OUT);
286 }
287
288 3 void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) {
289
290 3 const subprocess_descriptor_t *spd = thread_spd;
291 3 talp_info_t *talp_info = spd->talp_info;
292
293
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
294
295 /* Update thread sample with the last microsample */
296 3 talp_sample_t *sample = talp_get_thread_sample(spd);
297 3 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
298
299 /* Update state */
300 3 talp_set_sample_state(spd, sample, TALP_STATE_NOT_USEFUL_OMP_IN);
301 }
302
303 3 void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) {
304
305 3 const subprocess_descriptor_t *spd = thread_spd;
306 3 talp_info_t *talp_info = spd->talp_info;
307
308
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
309
310 /* Update thread sample with the last microsample */
311 3 talp_sample_t *sample = talp_get_thread_sample(spd);
312 3 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
313
314 /* Update state */
315 3 talp_set_sample_state(spd, sample, TALP_STATE_NOT_USEFUL_OMP_IN);
316 }
317
318 3 void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) {
319
320 3 const subprocess_descriptor_t *spd = thread_spd;
321 3 talp_info_t *talp_info = spd->talp_info;
322
323
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
324
325 /* Update thread sample with the last microsample */
326 3 talp_sample_t *sample = talp_get_thread_sample(spd);
327 3 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
328
329 /* Update state */
330 3 talp_set_sample_state(spd, sample, TALP_STATE_USEFUL);
331 }
332
333 3 void talp_openmp_task_create(void) {
334
335 3 const subprocess_descriptor_t *spd = thread_spd;
336 3 talp_info_t *talp_info = spd->talp_info;
337
338
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
339
340 /* Just update stats */
341 3 talp_sample_t *sample = talp_get_thread_sample(spd);
342 3 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1);
343 }
344
345 3 void talp_openmp_task_complete(void) {
346
347 3 const subprocess_descriptor_t *spd = thread_spd;
348 3 talp_info_t *talp_info = spd->talp_info;
349
350
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
351
352 /* Update thread sample with the last microsample */
353 3 talp_sample_t *sample = talp_get_thread_sample(spd);
354 3 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
355
356 /* Update state (FIXME: tasks outside of parallels?) */
357 3 talp_set_sample_state(spd, sample, TALP_STATE_NOT_USEFUL_OMP_IN);
358 }
359
360 6 void talp_openmp_task_switch(void) {
361
362 6 const subprocess_descriptor_t *spd = thread_spd;
363 6 talp_info_t *talp_info = spd->talp_info;
364
365
2/4
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 6 times.
6 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
366
367 /* Update thread sample with the last microsample */
368 6 talp_sample_t *sample = talp_get_thread_sample(spd);
369 6 talp_update_sample(spd, sample, TALP_NO_TIMESTAMP);
370
371 /* Update state */
372 6 talp_set_sample_state(spd, sample, TALP_STATE_USEFUL);
373 }
374
375
376 /*********************************************************************************/
377 /* Vtable for handling omptool events */
378 /*********************************************************************************/
379
380 const omptool_event_funcs_t talp_events_vtable = (const omptool_event_funcs_t) {
381 .init = talp_openmp_init,
382 .finalize = talp_openmp_finalize,
383 .into_mpi = NULL,
384 .outof_mpi = NULL,
385 .lend_from_api = NULL,
386 .thread_begin = talp_openmp_thread_begin,
387 .thread_end = talp_openmp_thread_end,
388 .thread_role_shift = NULL,
389 .parallel_begin = talp_openmp_parallel_begin,
390 .parallel_end = talp_openmp_parallel_end,
391 .into_parallel_function = talp_openmp_into_parallel_function,
392 .outof_parallel_function = talp_openmp_outof_parallel_function,
393 .into_parallel_implicit_barrier = talp_openmp_into_parallel_implicit_barrier,
394 .into_parallel_sync = talp_openmp_into_parallel_sync,
395 .outof_parallel_sync = talp_openmp_outof_parallel_sync,
396 .task_create = talp_openmp_task_create,
397 .task_complete = talp_openmp_task_complete,
398 .task_switch = talp_openmp_task_switch,
399 };
400