GCC Code Coverage Report


Directory: src/
File: src/talp/sample.c
Date: 2026-06-05 08:54:23
Exec Total Coverage
Lines: 154 176 87.5%
Functions: 13 13 100.0%
Branches: 37 58 63.8%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2026 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/sample.h"
21
22 #include "support/debug.h"
23 #include "support/dlb_common.h"
24 #include "support/mytime.h"
25 #include "support/tracing.h"
26 #include "talp/backend.h"
27 #include "talp/talp_hwc.h"
28
29 #include <pthread.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33
34 extern __thread bool thread_is_observer;
35
36 static __thread talp_sample_t* _tls_sample = NULL;
37 static __thread bool _is_main_sample = false;
38 static __thread bool _is_main_sample_in_serial_mode = false;
39
40 static void set_state(const talp_info_t *talp_info,
41 talp_sample_t *sample, talp_sample_state_t new_state);
42
43
44 /*********************************************************************************/
45 /* Init / Finalize */
46 /*********************************************************************************/
47
48 23 void talp_sample_init(talp_info_t *talp_info) {
49
50 23 talp_info->sample_registry = (sample_registry_t){
51 .mutex = PTHREAD_MUTEX_INITIALIZER,
52 };
53 23 }
54
55 23 void talp_sample_finalize(talp_info_t *talp_info) {
56
57 /* Warning about _tls_sample in worker threads:
58 * worker threads do not call this function, so currently they are
59 * not deallocating their sample.
60 * In some cases, it might happen that a worker thread exits without
61 * the main thread reducing its sample, so in these cases the sample
62 * needs to outlive the thread.
63 * The main thread could deallocate it at this point, but then the
64 * TLS variable would be broken if TALP is reinitialized again.
65 * For now we will keep it like this and will revisit if needed. */
66
67 /* Deallocate main thread sample */
68 23 free(_tls_sample);
69 23 _tls_sample = NULL;
70
71 /* Deallocate samples list */
72 23 sample_registry_t *registry = &talp_info->sample_registry;
73 23 pthread_mutex_lock(&registry->mutex);
74 {
75 23 free(registry->samples);
76 23 registry->samples = NULL;
77 23 registry->num_samples = 0;
78 }
79 23 pthread_mutex_unlock(&registry->mutex);
80 23 }
81
82
83 /*********************************************************************************/
84 /* Sample getters & setters */
85 /*********************************************************************************/
86
87 4 bool talp_sample_is_main(void) {
88 4 return _is_main_sample;
89 }
90
91 /* Quick test, without locking and without generating a new sample */
92 5354 bool talp_sample_is_mine(const talp_sample_t *sample) {
93
3/4
✓ Branch 0 taken 5354 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 5347 times.
✓ Branch 3 taken 7 times.
5354 return sample != NULL && sample == _tls_sample;
94 }
95
96 /* Sets the TLS variable _is_main_sample_in_serial_mode. This function is
97 * called by the main thread when beginning or ending parallel region of level 1.
98 * FIXME: free agent threads may break this condition.
99 *
100 * Sets whether the main thread is running in serial mode. */
101 4 void talp_sample_set_main_serial_mode(bool serial_mode) {
102
103
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (_is_main_sample) {
104 4 _is_main_sample_in_serial_mode = serial_mode;
105 }
106 4 }
107
108 /* Get the TLS associated sample */
109 16172 talp_sample_t* talp_sample_get(talp_info_t *talp_info) {
110
111 /* Thread already has an allocated sample, return it */
112
2/2
✓ Branch 0 taken 16148 times.
✓ Branch 1 taken 24 times.
16172 if (likely(_tls_sample != NULL)) return _tls_sample;
113
114 /* Observer threads don't have a valid sample */
115
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 if (unlikely(thread_is_observer)) return NULL;
116
117 /* Otherwise, allocate */
118 24 sample_registry_t *registry = &talp_info->sample_registry;
119 24 pthread_mutex_lock(&registry->mutex);
120 {
121 24 int num_samples = ++registry->num_samples;
122 24 void *samples = realloc(registry->samples, sizeof(talp_sample_t*)*num_samples);
123
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (samples) {
124 24 void *new_sample = NULL;
125
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) {
126 24 _tls_sample = new_sample;
127 24 *_tls_sample = (talp_sample_t){0};
128 24 registry->samples = samples;
129 24 registry->samples[num_samples-1] = new_sample;
130
2/2
✓ Branch 0 taken 22 times.
✓ Branch 1 taken 2 times.
24 if (num_samples == 1) {
131 22 _is_main_sample = true;
132 22 _is_main_sample_in_serial_mode = true;
133 }
134 } else {
135 // error
136 free(new_sample);
137 _tls_sample = NULL;
138 }
139 }
140 }
141 24 pthread_mutex_unlock(&registry->mutex);
142
143
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample");
144
145 /* If a thread is created mid-region, its initial time is that of the
146 * innermost open region, otherwise it is the current time */
147 int64_t last_updated_ts;
148
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 22 times.
24 if (talp_info->open_regions) {
149 2 const dlb_monitor_t *monitor = talp_info->open_regions->data;
150 2 last_updated_ts = monitor->start_time;
151 } else {
152 22 last_updated_ts = get_time_in_ns();
153 }
154
155 24 _tls_sample->last_updated_ts = last_updated_ts;
156
157 24 set_state(talp_info, _tls_sample, TALP_STATE_DISABLED);
158
159 #ifdef INSTRUMENTATION_VERSION
160 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
161 long long hwc_values[] = {0, 0};
162 instrument_nevent(2, events, hwc_values);
163 #endif
164
165 24 return _tls_sample;
166 }
167
168
169 /*********************************************************************************/
170 /* Sample update */
171 /*********************************************************************************/
172
173 /* Compute new microsample (time since last update) and update sample values */
174 5383 void talp_sample_update(talp_info_t *talp_info) {
175
176 5383 talp_sample_t *sample = talp_sample_get(talp_info);
177
178 /* Observer threads ignore this function */
179
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5383 times.
5383 if (unlikely(sample == NULL)) return;
180
181 /* Compute duration and set new last_updated_ts */
182 5383 int64_t now = get_time_in_ns();
183 5383 int64_t microsample_duration = now - sample->last_updated_ts;
184 5383 sample->last_updated_ts = now;
185
186 /* Update the appropriate sample timer */
187
5/7
✓ Branch 0 taken 23 times.
✓ Branch 1 taken 5342 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
5383 switch(sample->state) {
188 23 case TALP_STATE_DISABLED:
189 23 break;
190 5342 case TALP_STATE_USEFUL:
191 5342 DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
192 5342 break;
193 7 case TALP_STATE_NOT_USEFUL_MPI:
194 7 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
195
1/2
✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
7 if (_is_main_sample_in_serial_mode) {
196 // Add worker threads' time to special timer
197 7 int num_cpus = talp_info->sample_registry.num_samples;
198 7 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_during_mpi,
199 microsample_duration * (num_cpus-1));
200 }
201 7 break;
202 9 case TALP_STATE_NOT_USEFUL_OMP_IN:
203 9 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
204 9 break;
205 2 case TALP_STATE_NOT_USEFUL_OMP_OUT:
206 2 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
207 2 break;
208 case TALP_STATE_NOT_USEFUL_GPU:
209 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration);
210 break;
211 }
212
213
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5383 times.
5383 if (talp_info->flags.have_hwc) {
214 hwc_measurements_t measurements;
215 if (talp_hwc_collect(&measurements)) {
216 /* Atomically add HWC values to sample structure */
217 DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, measurements.cycles);
218 DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, measurements.instructions);
219 }
220
221 #ifdef INSTRUMENTATION_VERSION
222 // It's safe to emit even if talp_hwc_collect returned false,
223 // struct is zero'ed in that case
224 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
225 long long hwc_values[] = {measurements.cycles, measurements.instructions};
226 instrument_nevent(2, events, hwc_values);
227 #endif
228 }
229 }
230
231 7 void talp_sample_update_foreign(talp_info_t *talp_info, talp_sample_t *sample, int64_t now) {
232
233 /* Observer threads ignore this function */
234
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 7 times.
7 if (unlikely(sample == NULL)) return;
235
236 /* Compute duration and set new last_updated_ts */
237 7 int64_t microsample_duration = now - sample->last_updated_ts;
238 7 sample->last_updated_ts = now;
239
240 /* Update the appropriate sample timer */
241
2/7
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
7 switch(sample->state) {
242 4 case TALP_STATE_DISABLED:
243 4 break;
244 3 case TALP_STATE_USEFUL:
245 3 DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
246 3 break;
247 case TALP_STATE_NOT_USEFUL_MPI:
248 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
249 break;
250 case TALP_STATE_NOT_USEFUL_OMP_IN:
251 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
252 break;
253 case TALP_STATE_NOT_USEFUL_OMP_OUT:
254 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
255 break;
256 case TALP_STATE_NOT_USEFUL_GPU:
257 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration);
258 break;
259 }
260 }
261
262
263 97 static void set_state(const talp_info_t *restrict talp_info,
264 talp_sample_t *restrict sample, talp_sample_state_t new_state) {
265
266
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
97 if (talp_info->flags.have_hwc) {
267 talp_sample_state_t old = sample->state;
268 talp_hwc_on_state_change(old, new_state);
269 }
270
271 97 sample->state = new_state;
272
273 instrument_event(MONITOR_STATE,
274 new_state == TALP_STATE_DISABLED ? MONITOR_STATE_DISABLED
275 : new_state == TALP_STATE_USEFUL ? MONITOR_STATE_USEFUL
276 : new_state == TALP_STATE_NOT_USEFUL_MPI ? MONITOR_STATE_NOT_USEFUL_MPI
277 : new_state == TALP_STATE_NOT_USEFUL_OMP_IN ? MONITOR_STATE_NOT_USEFUL_OMP_IN
278 : new_state == TALP_STATE_NOT_USEFUL_OMP_OUT ? MONITOR_STATE_NOT_USEFUL_OMP_OUT
279 : new_state == TALP_STATE_NOT_USEFUL_GPU ? MONITOR_STATE_NOT_USEFUL_GPU
280 : 0,
281 EVENT_BEGIN);
282 97 }
283
284 73 void talp_sample_set_state(talp_info_t *talp_info, talp_sample_state_t new_state) {
285
286 73 talp_sample_t *sample = talp_sample_get(talp_info);
287 73 set_state(talp_info, sample, new_state);
288 73 }
289
290
291 /*********************************************************************************/
292 /* Sample aggregation */
293 /*********************************************************************************/
294
295 /* Flush and aggregate a single sample into a macrosample */
296 5354 static inline void flush_sample_to_macrosample(talp_sample_t *restrict sample,
297 talp_macrosample_t *restrict macrosample) {
298
299 /* Timers */
300 5354 macrosample->timers.useful +=
301 5354 DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0);
302 5354 macrosample->timers.not_useful_mpi +=
303 5354 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0);
304 5354 macrosample->timers.not_useful_omp_during_mpi +=
305 5354 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_during_mpi, 0);
306 5354 macrosample->timers.not_useful_omp_out +=
307 5354 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0);
308 /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */
309
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5354 times.
5354 ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0,
310 "Inconsistency in TALP sample metric not_useful_omp_in."
311 " Please, report bug at " PACKAGE_BUGREPORT);
312 5354 macrosample->timers.not_useful_gpu +=
313 5354 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_gpu, 0);
314
315 /* Counters */
316 5354 macrosample->counters.cycles +=
317 5354 DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0);
318 5354 macrosample->counters.instructions +=
319 5354 DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0);
320
321 /* Stats */
322 5354 macrosample->stats.num_mpi_calls +=
323 5354 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0);
324 5354 macrosample->stats.num_omp_parallels +=
325 5354 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0);
326 5354 macrosample->stats.num_omp_tasks +=
327 5354 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0);
328 5354 macrosample->stats.num_gpu_runtime_calls +=
329 5354 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_gpu_runtime_calls, 0);
330 5354 }
331
332
333 /* Aggregate all samples.
334 * This function assumes that the current thread's sample was just updated. */
335 5345 void talp_sample_aggregate_all_to_macrosample(
336 talp_info_t *restrict talp_info, talp_macrosample_t *restrict macrosample) {
337
338 5345 sample_registry_t *registry = &talp_info->sample_registry;
339 5345 talp_sample_t *current_sample = talp_sample_get(talp_info);
340 5345 int64_t now = current_sample->last_updated_ts;
341
342 /* Accumulate samples from all threads */
343 5345 pthread_mutex_lock(&registry->mutex);
344 {
345 5345 int num_samples = registry->num_samples;
346 5345 macrosample->num_cpus = num_samples;
347
348 /* Force-update and aggregate all samples */
349
2/2
✓ Branch 0 taken 5351 times.
✓ Branch 1 taken 5345 times.
10696 for (int i = 0; i < num_samples; ++i) {
350 5351 talp_sample_t *sample = registry->samples[i];
351
2/2
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 5345 times.
5351 if (!talp_sample_is_mine(sample)) {
352 6 talp_sample_update_foreign(talp_info, sample, now);
353 }
354 5351 flush_sample_to_macrosample(sample, macrosample);
355 }
356 }
357 5345 pthread_mutex_unlock(&registry->mutex);
358 5345 }
359
360 /* Aggregate a subset of samples.
361 * This function assumes that the current thread's sample was just updated.
362 * OpenMP derived metrics are computed here and added to the main sample. */
363 2 void talp_sample_aggregate_subset_to_macrosample(
364 talp_info_t *restrict talp_info,
365 talp_sample_t **restrict samples,
366 unsigned int nelems,
367 talp_macrosample_t *restrict macrosample) {
368
369 2 sample_registry_t *registry = &talp_info->sample_registry;
370 2 talp_sample_t *sample = talp_sample_get(talp_info);
371 2 int64_t now = sample->last_updated_ts;
372
373 2 int64_t sched_timer = 0;
374 2 int64_t lb_timer = 0;
375
376 2 pthread_mutex_lock(&registry->mutex);
377 {
378 /* Iterate first to force-update all samples and compute the minimum
379 * not-useful-omp-in among them */
380 2 int64_t min_not_useful_omp_in = INT64_MAX;
381
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (unsigned int i = 0; i < nelems; ++i) {
382 3 talp_sample_t *worker_sample = samples[i];
383
2/2
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 2 times.
3 if (!talp_sample_is_mine(worker_sample)) {
384 1 talp_sample_update_foreign(talp_info, worker_sample, now);
385 }
386 3 min_not_useful_omp_in = min_int64(min_not_useful_omp_in,
387 3 DLB_ATOMIC_LD_RLX(&worker_sample->timers.not_useful_omp_in));
388 }
389
390 /* Iterate again to accumulate Load Balance, and to aggregate sample */
391 2 sched_timer = min_not_useful_omp_in * nelems;
392
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (unsigned int i = 0; i < nelems; ++i) {
393 3 talp_sample_t *worker_sample = samples[i];
394 3 lb_timer += DLB_ATOMIC_EXCH_RLX(&worker_sample->timers.not_useful_omp_in, 0)
395 3 - min_not_useful_omp_in;
396 3 flush_sample_to_macrosample(worker_sample, macrosample);
397 }
398 }
399 2 pthread_mutex_unlock(&registry->mutex);
400
401 /* Update derived timers into macrosample */
402 2 macrosample->num_cpus = nelems;
403 2 macrosample->timers.not_useful_omp_in_lb = lb_timer;
404 2 macrosample->timers.not_useful_omp_in_sched = sched_timer;
405 2 }
406