GCC Code Coverage Report


Directory: src/
File: src/talp/talp_openmp.c
Date: 2026-06-05 08:54:23
Exec Total Coverage
Lines: 135 147 91.8%
Functions: 15 15 100.0%
Branches: 47 90 52.2%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2026 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/talp_openmp.h"
21
22 #include "LB_numThreads/omptool.h"
23 #include "LB_comm/shmem_talp.h"
24 #include "LB_core/DLB_kernel.h"
25 #include "apis/dlb_talp.h"
26 #include "support/debug.h"
27 #include "talp/regions.h"
28 #include "talp/sample.h"
29 #include "talp/talp.h"
30 #include "talp/talp_hwc.h"
31 #include "talp/talp_types.h"
32
33 #include <unistd.h>
34
35 extern __thread bool thread_is_observer;
36
37 /* Update all open nested regions (so, excluding the innermost) and add the
38 * time since its start time until the sample last timestamp (which is the time
39 * that has yet not been added to the regions) as omp_serialization_time */
40 1 static void update_serialization_in_nested_regions(const subprocess_descriptor_t *spd,
41 const talp_sample_t *sample) {
42
43 1 talp_info_t *talp_info = spd->talp_info;
44
45 /* Update all open nested regions */
46 1 pthread_mutex_lock(&talp_info->regions_mutex);
47 {
48 2 GSList *nested_open_regions = talp_info->open_regions
49 1 ? talp_info->open_regions->next
50
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 : NULL;
51
52 1 for (GSList *node = nested_open_regions;
53
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 node != NULL;
54 1 node = node->next) {
55
56 1 dlb_monitor_t *monitor = node->data;
57 1 monitor->omp_serialization_time +=
58 1 sample->last_updated_ts - monitor->start_time;
59 }
60 }
61 1 pthread_mutex_unlock(&talp_info->regions_mutex);
62 1 }
63
64
65 /*********************************************************************************/
66 /* TALP OpenMP functions */
67 /*********************************************************************************/
68
69 /* samples involved in parallel level 1 */
70 static talp_sample_t** parallel_samples_l1 = NULL;
71 static unsigned int parallel_samples_l1_capacity = 0;
72
73 1 void talp_openmp_init(pid_t pid, const options_t* options) {
74
75
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init");
76
77 1 const subprocess_descriptor_t *spd = thread_spd;
78 1 talp_info_t *talp_info = spd->talp_info;
79
80
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (talp_info) {
81 1 monitor_data_t *monitor_data = talp_info->monitor->_data;
82 1 talp_info->flags.have_openmp = true;
83
84 /* Fix up number of CPUs for the global region */
85 1 float cpus = CPU_COUNT(&spd->process_mask);
86 1 talp_info->monitor->avg_cpus = cpus;
87 1 shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus);
88
89 /* Start global region (no-op if already started) */
90 1 region_start(spd, talp_info->monitor);
91
92 /* Set useful state */
93 1 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
94 }
95 1 }
96
97 1 void talp_openmp_finalize(void) {
98
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (parallel_samples_l1 != NULL) {
99 1 free(parallel_samples_l1);
100 1 parallel_samples_l1 = NULL;
101 1 parallel_samples_l1_capacity = 0;
102 }
103 1 }
104
105 2 void talp_openmp_thread_begin(ompt_thread_t thread_type) {
106
107 2 const subprocess_descriptor_t *spd = thread_spd;
108 2 talp_info_t *talp_info = spd->talp_info;
109
110
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
111
112 /* Initial thread is already in useful state, set omp_out for others */
113 2 talp_sample_t *sample = talp_sample_get(talp_info);
114
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (sample->state == TALP_STATE_DISABLED) {
115 /* Not initial thread: */
116
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.have_hwc) {
117 talp_hwc_thread_init();
118 }
119 1 talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_OMP_OUT);
120
121 /* The initial time of the sample is set to match the start time of
122 * the innermost open region, but other nested open regions need to
123 * be fixed */
124 1 update_serialization_in_nested_regions(spd, sample);
125 }
126 }
127
128 2 void talp_openmp_thread_end(void) {
129
130 2 const subprocess_descriptor_t *spd = thread_spd;
131 2 talp_info_t *talp_info = spd->talp_info;
132
133
3/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
134
135 /* Update thread sample */
136 1 talp_sample_update(talp_info);
137
138 /* Update state */
139 1 talp_sample_set_state(talp_info, TALP_STATE_DISABLED);
140
141 /* Finalize PAPI per-thread state */
142
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.have_hwc) {
143 talp_hwc_thread_finalize();
144 }
145 }
146
147 2 void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) {
148
149
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(parallel_data->requested_parallelism < 1,
150 "Requested parallel region of invalid size in %s. Please report bug at %s.",
151 __func__, PACKAGE_BUGREPORT);
152
153 2 const subprocess_descriptor_t *spd = thread_spd;
154 2 talp_info_t *talp_info = spd->talp_info;
155
156
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
157
158 2 int parallel_level = parallel_data->level;
159
160
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
161 /* Resize samples of parallel 1 if needed */
162 2 unsigned int requested_parallelism = parallel_data->requested_parallelism;
163
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (requested_parallelism > parallel_samples_l1_capacity) {
164 2 void *ptr = realloc(parallel_samples_l1,
165 sizeof(talp_sample_t*)*requested_parallelism);
166
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(!ptr, "realloc failed in %s", __func__);
167 2 parallel_samples_l1 = ptr;
168 2 parallel_samples_l1_capacity = requested_parallelism;
169 }
170
171 /* Assign local data */
172 2 parallel_data->talp_parallel_data = parallel_samples_l1;
173
174 } else if (parallel_level > 1) {
175 /* Allocate parallel samples array */
176 unsigned int requested_parallelism = parallel_data->requested_parallelism;
177 void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism);
178 fatal_cond(!ptr, "malloc failed in %s", __func__);
179
180 /* Assign local data */
181 parallel_data->talp_parallel_data = ptr;
182 }
183
184 /* Update stats */
185 2 talp_sample_t *sample = talp_sample_get(talp_info);
186 2 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1);
187
188 /* Update main thread serial mode if this is the outermost parallel region */
189
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
190 2 talp_sample_set_main_serial_mode(false);
191 }
192 }
193
194 2 void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) {
195
196 2 const subprocess_descriptor_t *spd = thread_spd;
197 2 talp_info_t *talp_info = spd->talp_info;
198
199
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
200
201 /* Update thread */
202 2 talp_sample_update(talp_info);
203
204 2 int parallel_level = parallel_data->level;
205 2 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
206 2 unsigned int num_samples = parallel_data->actual_parallelism;
207
208
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
209 /* Flush and aggregate all samples of the parallel region */
210 2 talp_aggregate_subset_to_regions(talp_info,
211 parallel_samples,
212 num_samples);
213
214 } else if (parallel_level > 1) {
215 /* Flush and aggregate all samples of this parallel except this
216 * thread's sample. The primary thread of a nested parallel region
217 * will keep its samples until it finishes as non-primary
218 * team-worker or reaches the level 1 parallel region */
219 talp_aggregate_subset_to_regions(talp_info,
220 &parallel_samples[1],
221 num_samples - 1);
222 }
223
224 /* Update the state of the rest of team-worker threads
225 * (note that talp_set_sample_state cannot be used here because we are
226 * impersonating a worker thread) */
227
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 for (unsigned int i = 1; i < num_samples; ++i) {
228 1 talp_sample_t *worker_sample = parallel_samples[i];
229
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (worker_sample->state == TALP_STATE_NOT_USEFUL_OMP_IN) {
230 worker_sample->state = TALP_STATE_NOT_USEFUL_OMP_OUT;
231 }
232 }
233
234 /* Update current threads's state */
235 2 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
236
237
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_level == 1) {
238 /* Update main thread serial mode if this was the outermost parallel region */
239 2 talp_sample_set_main_serial_mode(true);
240 } else {
241 /* free local data */
242 free(parallel_data->talp_parallel_data);
243 parallel_data->talp_parallel_data = NULL;
244 }
245 }
246
247 3 void talp_openmp_into_parallel_function(
248 omptool_parallel_data_t *parallel_data, unsigned int index) {
249
250 3 const subprocess_descriptor_t *spd = thread_spd;
251 3 talp_info_t *talp_info = spd->talp_info;
252
253
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
254
255 /* Assign thread sample as team-worker of this parallel */
256 3 talp_sample_t *sample = talp_sample_get(talp_info);
257 3 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
258 /* Probably optimized, but try to avoid invalidating
259 * the cache line on reused parallel data */
260
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
3 if (parallel_samples[index] != sample) {
261 2 parallel_samples[index] = sample;
262 }
263
264 /* Update thread sample */
265 3 talp_sample_update(talp_info);
266
267 /* Update state */
268 3 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
269 }
270
271 1 void talp_openmp_outof_parallel_function(void) {
272
273 1 const subprocess_descriptor_t *spd = thread_spd;
274 1 talp_info_t *talp_info = spd->talp_info;
275
276
2/4
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 times.
1 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
277 /* Update thread sample */
278 1 talp_sample_update(talp_info);
279
280 /* Update state */
281 1 talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_OMP_OUT);
282 }
283
284 3 void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) {
285
286 3 const subprocess_descriptor_t *spd = thread_spd;
287 3 talp_info_t *talp_info = spd->talp_info;
288
289
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
290
291 /* Update thread sample */
292 3 talp_sample_update(talp_info);
293
294 /* Update state */
295 3 talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_OMP_IN);
296 }
297
298 3 void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) {
299
300 3 const subprocess_descriptor_t *spd = thread_spd;
301 3 talp_info_t *talp_info = spd->talp_info;
302
303
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
304
305 /* Update thread sample */
306 3 talp_sample_update(talp_info);
307
308 /* Update state */
309 3 talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_OMP_IN);
310 }
311
312 3 void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) {
313
314 3 const subprocess_descriptor_t *spd = thread_spd;
315 3 talp_info_t *talp_info = spd->talp_info;
316
317
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
318
319 /* Update thread sample */
320 3 talp_sample_update(talp_info);
321
322 /* Update state */
323 3 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
324 }
325
326 3 void talp_openmp_task_create(void) {
327
328 3 const subprocess_descriptor_t *spd = thread_spd;
329 3 talp_info_t *talp_info = spd->talp_info;
330
331
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
332
333 /* Just update stats */
334 3 talp_sample_t *sample = talp_sample_get(talp_info);
335 3 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1);
336 }
337
338 3 void talp_openmp_task_complete(void) {
339
340 3 const subprocess_descriptor_t *spd = thread_spd;
341 3 talp_info_t *talp_info = spd->talp_info;
342
343
2/4
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
3 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
344
345 /* Update thread sample */
346 3 talp_sample_update(talp_info);
347
348 /* Update state (FIXME: tasks outside of parallels?) */
349 3 talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_OMP_IN);
350 }
351
352 6 void talp_openmp_task_switch(void) {
353
354 6 const subprocess_descriptor_t *spd = thread_spd;
355 6 talp_info_t *talp_info = spd->talp_info;
356
357
2/4
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 6 times.
6 if (talp_info == NULL || !talp_info->flags.have_openmp) return;
358
359 /* Update thread sample */
360 6 talp_sample_update(talp_info);
361
362 /* Update state */
363 6 talp_sample_set_state(talp_info, TALP_STATE_USEFUL);
364 }
365
366
367 /*********************************************************************************/
368 /* Vtable for handling omptool events */
369 /*********************************************************************************/
370
371 const omptool_event_funcs_t talp_events_vtable = (const omptool_event_funcs_t) {
372 .init = talp_openmp_init,
373 .finalize = talp_openmp_finalize,
374 .into_mpi = NULL,
375 .outof_mpi = NULL,
376 .lend_from_api = NULL,
377 .thread_begin = talp_openmp_thread_begin,
378 .thread_end = talp_openmp_thread_end,
379 .thread_role_shift = NULL,
380 .parallel_begin = talp_openmp_parallel_begin,
381 .parallel_end = talp_openmp_parallel_end,
382 .into_parallel_function = talp_openmp_into_parallel_function,
383 .outof_parallel_function = talp_openmp_outof_parallel_function,
384 .into_parallel_implicit_barrier = talp_openmp_into_parallel_implicit_barrier,
385 .into_parallel_sync = talp_openmp_into_parallel_sync,
386 .outof_parallel_sync = talp_openmp_outof_parallel_sync,
387 .task_create = talp_openmp_task_create,
388 .task_complete = talp_openmp_task_complete,
389 .task_switch = talp_openmp_task_switch,
390 };
391