GCC Code Coverage Report


Directory: src/
File: src/talp/talp_openmp.c
Date: 2025-11-21 10:34:40
Exec Total Coverage
Lines: 150 164 91.5%
Functions: 15 15 100.0%
Branches: 33 62 53.2%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/talp_openmp.h"
21
22 #include "LB_numThreads/omptool.h"
23 #include "LB_comm/shmem_talp.h"
24 #include "LB_core/DLB_kernel.h"
25 #include "apis/dlb_talp.h"
26 #include "support/debug.h"
27 #include "talp/regions.h"
28 #include "talp/talp.h"
29 #include "talp/talp_types.h"
30
31 #include <unistd.h>
32
33 extern __thread bool thread_is_observer;
34
35 /* Update all open nested regions (that is, excluding the innermost) and add the
36 * time from each region's start time until the sample's last timestamp (which is
37 * the time that has not yet been added to the regions) as omp_serialization_time */
38 1 static void update_serialization_in_nested_regions(const subprocess_descriptor_t *spd,
39 const talp_sample_t *sample) {
40
41 1 talp_info_t *talp_info = spd->talp_info;
42
43 /* Update all open nested regions */
44 1 pthread_mutex_lock(&talp_info->regions_mutex);
45 {
46 2 GSList *nested_open_regions = talp_info->open_regions
47 1 ? talp_info->open_regions->next
48
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 : NULL;
49
50 1 for (GSList *node = nested_open_regions;
51
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 node != NULL;
52 1 node = node->next) {
53
54 1 dlb_monitor_t *monitor = node->data;
55 1 monitor->omp_serialization_time +=
56 1 sample->last_updated_timestamp - monitor->start_time;
57 }
58 }
59 1 pthread_mutex_unlock(&talp_info->regions_mutex);
60 1 }
61
62
63 /*********************************************************************************/
64 /* TALP OpenMP functions */
65 /*********************************************************************************/
66
67 /* samples involved in parallel level 1 */
68 static talp_sample_t** parallel_samples_l1 = NULL;
69 static unsigned int parallel_samples_l1_capacity = 0;
70
71 1 void talp_openmp_init(pid_t pid, const options_t* options) {
72
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init");
73
74 1 const subprocess_descriptor_t *spd = thread_spd;
75 1 talp_info_t *talp_info = spd->talp_info;
76
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (talp_info) {
77 1 monitor_data_t *monitor_data = talp_info->monitor->_data;
78 1 talp_info->flags.have_openmp = true;
79
80 /* Fix up number of CPUs for the global region */
81 1 float cpus = CPU_COUNT(&spd->process_mask);
82 1 talp_info->monitor->avg_cpus = cpus;
83 1 shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus);
84
85 /* Start global region (no-op if already started) */
86 1 region_start(spd, talp_info->monitor);
87
88 /* Set useful state */
89 1 talp_sample_t *sample = talp_get_thread_sample(spd);
90 1 talp_set_sample_state(sample, useful, talp_info->flags.papi);
91 }
92 1 }
93
94 1 void talp_openmp_finalize(void) {
95
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (parallel_samples_l1 != NULL) {
96 1 free(parallel_samples_l1);
97 1 parallel_samples_l1 = NULL;
98 1 parallel_samples_l1_capacity = 0;
99 }
100 1 }
101
102 2 void talp_openmp_thread_begin(ompt_thread_t thread_type) {
103 2 const subprocess_descriptor_t *spd = thread_spd;
104 2 talp_info_t *talp_info = spd->talp_info;
105
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
106 /* Initial thread is already in useful state, set omp_out for others */
107 2 talp_sample_t *sample = talp_get_thread_sample(spd);
108
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (sample->state == disabled) {
109 /* Not initial thread: */
110
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.papi) {
111 talp_init_papi_counters();
112 }
113 1 talp_set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi);
114
115 /* The initial time of the sample is set to match the start time of
116 * the innermost open region, but other nested open regions need to
117 * be fixed */
118 1 update_serialization_in_nested_regions(spd, sample);
119 }
120 }
121 2 }
122
123 2 void talp_openmp_thread_end(void) {
124 2 const subprocess_descriptor_t *spd = thread_spd;
125 2 talp_info_t *talp_info = spd->talp_info;
126
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (talp_info) {
127 /* Update thread sample with the last microsample */
128 1 talp_sample_t *sample = talp_get_thread_sample(spd);
129 1 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
130
131 /* Update state */
132 1 talp_set_sample_state(sample, disabled, talp_info->flags.papi);
133
134 /* Finalize PAPI per-thread state */
135
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.papi) {
136 talp_fini_papi_counters();
137 }
138 }
139 2 }
140
141 2 void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) {
142
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(parallel_data->requested_parallelism < 1,
143 "Requested parallel region of invalid size in %s. Please report bug at %s.",
144 __func__, PACKAGE_BUGREPORT);
145
146 2 const subprocess_descriptor_t *spd = thread_spd;
147 2 talp_info_t *talp_info = spd->talp_info;
148
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
149
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_data->level == 1) {
150 /* Resize samples of parallel 1 if needed */
151 2 unsigned int requested_parallelism = parallel_data->requested_parallelism;
152
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (requested_parallelism > parallel_samples_l1_capacity) {
153 2 void *ptr = realloc(parallel_samples_l1,
154 sizeof(talp_sample_t*)*requested_parallelism);
155
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(!ptr, "realloc failed in %s", __func__);
156 2 parallel_samples_l1 = ptr;
157 2 parallel_samples_l1_capacity = requested_parallelism;
158 }
159
160 /* Assign local data */
161 2 parallel_data->talp_parallel_data = parallel_samples_l1;
162
163 } else if (parallel_data->level > 1) {
164 /* Allocate parallel samples array */
165 unsigned int requested_parallelism = parallel_data->requested_parallelism;
166 void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism);
167 fatal_cond(!ptr, "malloc failed in %s", __func__);
168
169 /* Assign local data */
170 parallel_data->talp_parallel_data = ptr;
171 }
172
173 /* Update stats */
174 2 talp_sample_t *sample = talp_get_thread_sample(spd);
175 2 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1);
176 }
177 2 }
178
179 2 void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) {
180 2 const subprocess_descriptor_t *spd = thread_spd;
181 2 talp_info_t *talp_info = spd->talp_info;
182
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
183 /* Update thread sample with the last microsample */
184 2 talp_sample_t *sample = talp_get_thread_sample(spd);
185 2 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
186
187
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_data->level == 1) {
188 /* Flush and aggregate all samples of the parallel region */
189 2 talp_flush_sample_subset_to_regions(spd,
190 2 parallel_data->talp_parallel_data,
191 parallel_data->actual_parallelism);
192
193 } else if (parallel_data->level > 1) {
194 /* Flush and aggregate all samples of this parallel except this
195 * thread's sample. The primary thread of a nested parallel region
196 * will keep its samples until it finishes as non-primary
197 * team-worker or reaches the level 1 parallel region */
198 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
199 talp_flush_sample_subset_to_regions(spd,
200 &parallel_samples[1],
201 parallel_data->actual_parallelism-1);
202
203 /* free local data */
204 free(parallel_data->talp_parallel_data);
205 parallel_data->talp_parallel_data = NULL;
206 }
207
208 /* Update current thread's state */
209 2 talp_set_sample_state(sample, useful, talp_info->flags.papi);
210
211 /* Update the state of the rest of team-worker threads
212 * (note that talp_set_sample_state cannot be used here because we are
213 * impersonating a worker thread) */
214 2 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
215
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 for (unsigned int i = 1; i < parallel_data->actual_parallelism; ++i) {
216 1 talp_sample_t *worker_sample = parallel_samples[i];
217
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (worker_sample->state == not_useful_omp_in) {
218 worker_sample->state = not_useful_omp_out;
219 }
220 }
221 }
222 2 }
223
224 3 void talp_openmp_into_parallel_function(
225 omptool_parallel_data_t *parallel_data, unsigned int index) {
226 3 const subprocess_descriptor_t *spd = thread_spd;
227 3 talp_info_t *talp_info = spd->talp_info;
228
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
229 /* Assign thread sample as team-worker of this parallel */
230 3 talp_sample_t *sample = talp_get_thread_sample(spd);
231 3 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
232 /* Probably optimized, but try to avoid invalidating
233 * the cache line on reused parallel data */
234
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
3 if (parallel_samples[index] != sample) {
235 2 parallel_samples[index] = sample;
236 }
237
238 /* Update thread sample with the last microsample */
239 3 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
240
241 /* Update state */
242 3 talp_set_sample_state(sample, useful, talp_info->flags.papi);
243 }
244 3 }
245
246 1 void talp_openmp_outof_parallel_function(void) {
247 1 const subprocess_descriptor_t *spd = thread_spd;
248 1 talp_info_t *talp_info = spd->talp_info;
249
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (talp_info) {
250 /* Update thread sample with the last microsample */
251 1 talp_sample_t *sample = talp_get_thread_sample(spd);
252 1 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
253
254 /* Update state */
255 1 talp_set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi);
256 }
257 1 }
258
259 3 void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) {
260 3 const subprocess_descriptor_t *spd = thread_spd;
261 3 talp_info_t *talp_info = spd->talp_info;
262
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
263 /* Update thread sample with the last microsample */
264 3 talp_sample_t *sample = talp_get_thread_sample(spd);
265 3 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
266
267 /* Update state */
268 3 talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
269 }
270 3 }
271
272 3 void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) {
273 3 const subprocess_descriptor_t *spd = thread_spd;
274 3 talp_info_t *talp_info = spd->talp_info;
275
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
276 /* Update thread sample with the last microsample */
277 3 talp_sample_t *sample = talp_get_thread_sample(spd);
278 3 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
279
280 /* Update state */
281 3 talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
282 }
283 3 }
284
285 3 void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) {
286 3 const subprocess_descriptor_t *spd = thread_spd;
287 3 talp_info_t *talp_info = spd->talp_info;
288
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
289 /* Update thread sample with the last microsample */
290 3 talp_sample_t *sample = talp_get_thread_sample(spd);
291 3 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
292
293 /* Update state */
294 3 talp_set_sample_state(sample, useful, talp_info->flags.papi);
295 }
296 3 }
297
298 3 void talp_openmp_task_create(void) {
299 3 const subprocess_descriptor_t *spd = thread_spd;
300 3 talp_info_t *talp_info = spd->talp_info;
301
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
302 /* Just update stats */
303 3 talp_sample_t *sample = talp_get_thread_sample(spd);
304 3 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1);
305 }
306 3 }
307
308 3 void talp_openmp_task_complete(void) {
309 3 const subprocess_descriptor_t *spd = thread_spd;
310 3 talp_info_t *talp_info = spd->talp_info;
311
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
312 /* Update thread sample with the last microsample */
313 3 talp_sample_t *sample = talp_get_thread_sample(spd);
314 3 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
315
316 /* Update state (FIXME: tasks outside of parallels?) */
317 3 talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
318 }
319 3 }
320
321 6 void talp_openmp_task_switch(void) {
322 6 const subprocess_descriptor_t *spd = thread_spd;
323 6 talp_info_t *talp_info = spd->talp_info;
324
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (talp_info) {
325 /* Update thread sample with the last microsample */
326 6 talp_sample_t *sample = talp_get_thread_sample(spd);
327 6 talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP);
328
329 /* Update state */
330 6 talp_set_sample_state(sample, useful, talp_info->flags.papi);
331 }
332 6 }
333
334
335 /*********************************************************************************/
336 /* Vtable for handling omptool events */
337 /*********************************************************************************/
338
339 const omptool_event_funcs_t talp_events_vtable = (const omptool_event_funcs_t) {
340 .init = talp_openmp_init,
341 .finalize = talp_openmp_finalize,
342 .into_mpi = NULL,
343 .outof_mpi = NULL,
344 .lend_from_api = NULL,
345 .thread_begin = talp_openmp_thread_begin,
346 .thread_end = talp_openmp_thread_end,
347 .thread_role_shift = NULL,
348 .parallel_begin = talp_openmp_parallel_begin,
349 .parallel_end = talp_openmp_parallel_end,
350 .into_parallel_function = talp_openmp_into_parallel_function,
351 .outof_parallel_function = talp_openmp_outof_parallel_function,
352 .into_parallel_implicit_barrier = talp_openmp_into_parallel_implicit_barrier,
353 .into_parallel_sync = talp_openmp_into_parallel_sync,
354 .outof_parallel_sync = talp_openmp_outof_parallel_sync,
355 .task_create = talp_openmp_task_create,
356 .task_complete = talp_openmp_task_complete,
357 .task_switch = talp_openmp_task_switch,
358 };
359