| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2026 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/sample.h" | ||
| 21 | |||
| 22 | #include "support/debug.h" | ||
| 23 | #include "support/dlb_common.h" | ||
| 24 | #include "support/mytime.h" | ||
| 25 | #include "support/tracing.h" | ||
| 26 | #include "talp/backend.h" | ||
| 27 | #include "talp/talp_hwc.h" | ||
| 28 | |||
| 29 | #include <pthread.h> | ||
| 30 | #include <stdlib.h> | ||
| 31 | #include <string.h> | ||
| 32 | |||
| 33 | |||
| 34 | extern __thread bool thread_is_observer; | ||
| 35 | |||
| 36 | static __thread talp_sample_t* _tls_sample = NULL; | ||
| 37 | static __thread bool _is_main_sample = false; | ||
| 38 | static __thread bool _is_main_sample_in_serial_mode = false; | ||
| 39 | |||
| 40 | static void set_state(const talp_info_t *talp_info, | ||
| 41 | talp_sample_t *sample, talp_sample_state_t new_state); | ||
| 42 | |||
| 43 | |||
| 44 | /*********************************************************************************/ | ||
| 45 | /* Init / Finalize */ | ||
| 46 | /*********************************************************************************/ | ||
| 47 | |||
| 48 | 23 | void talp_sample_init(talp_info_t *talp_info) { | |
| 49 | |||
| 50 | 23 | talp_info->sample_registry = (sample_registry_t){ | |
| 51 | .mutex = PTHREAD_MUTEX_INITIALIZER, | ||
| 52 | }; | ||
| 53 | 23 | } | |
| 54 | |||
| 55 | 23 | void talp_sample_finalize(talp_info_t *talp_info) { | |
| 56 | |||
| 57 | /* Warning about _tls_sample in worker threads: | ||
| 58 | * worker threads do not call this function, so currently they are | ||
| 59 | * not deallocating their sample. | ||
| 60 | * In some cases, it might happen that a worker thread exits without | ||
| 61 | * the main thread reducing its sample, so in these cases the sample | ||
| 62 | * needs to outlive the thread. | ||
| 63 | * The main thread could deallocate it at this point, but then the | ||
| 64 | * TLS variable would be broken if TALP is reinitialized again. | ||
| 65 | * For now we will keep it like this and will revisit if needed. */ | ||
| 66 | |||
| 67 | /* Deallocate main thread sample */ | ||
| 68 | 23 | free(_tls_sample); | |
| 69 | 23 | _tls_sample = NULL; | |
| 70 | |||
| 71 | /* Deallocate samples list */ | ||
| 72 | 23 | sample_registry_t *registry = &talp_info->sample_registry; | |
| 73 | 23 | pthread_mutex_lock(®istry->mutex); | |
| 74 | { | ||
| 75 | 23 | free(registry->samples); | |
| 76 | 23 | registry->samples = NULL; | |
| 77 | 23 | registry->num_samples = 0; | |
| 78 | } | ||
| 79 | 23 | pthread_mutex_unlock(®istry->mutex); | |
| 80 | 23 | } | |
| 81 | |||
| 82 | |||
| 83 | /*********************************************************************************/ | ||
| 84 | /* Sample getters & setters */ | ||
| 85 | /*********************************************************************************/ | ||
| 86 | |||
| 87 | 4 | bool talp_sample_is_main(void) { | |
| 88 | 4 | return _is_main_sample; | |
| 89 | } | ||
| 90 | |||
| 91 | /* Quick test, without locking and without generating a new sample */ | ||
| 92 | 5354 | bool talp_sample_is_mine(const talp_sample_t *sample) { | |
| 93 |
3/4✓ Branch 0 taken 5354 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 5347 times.
✓ Branch 3 taken 7 times.
|
5354 | return sample != NULL && sample == _tls_sample; |
| 94 | } | ||
| 95 | |||
| 96 | /* Sets the TLS variable _is_main_sample_in_serial_mode. This function is | ||
| 97 | * called by the main thread when beginning or ending parallel region of level 1. | ||
| 98 | * FIXME: free agent threads may break this condition. | ||
| 99 | * | ||
| 100 | * Sets whether the main thread is running in serial mode. */ | ||
| 101 | 4 | void talp_sample_set_main_serial_mode(bool serial_mode) { | |
| 102 | |||
| 103 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (_is_main_sample) { |
| 104 | 4 | _is_main_sample_in_serial_mode = serial_mode; | |
| 105 | } | ||
| 106 | 4 | } | |
| 107 | |||
| 108 | /* Get the TLS associated sample */ | ||
| 109 | 16172 | talp_sample_t* talp_sample_get(talp_info_t *talp_info) { | |
| 110 | |||
| 111 | /* Thread already has an allocated sample, return it */ | ||
| 112 |
2/2✓ Branch 0 taken 16148 times.
✓ Branch 1 taken 24 times.
|
16172 | if (likely(_tls_sample != NULL)) return _tls_sample; |
| 113 | |||
| 114 | /* Observer threads don't have a valid sample */ | ||
| 115 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
|
24 | if (unlikely(thread_is_observer)) return NULL; |
| 116 | |||
| 117 | /* Otherwise, allocate */ | ||
| 118 | 24 | sample_registry_t *registry = &talp_info->sample_registry; | |
| 119 | 24 | pthread_mutex_lock(®istry->mutex); | |
| 120 | { | ||
| 121 | 24 | int num_samples = ++registry->num_samples; | |
| 122 | 24 | void *samples = realloc(registry->samples, sizeof(talp_sample_t*)*num_samples); | |
| 123 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | if (samples) { |
| 124 | 24 | void *new_sample = NULL; | |
| 125 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) { |
| 126 | 24 | _tls_sample = new_sample; | |
| 127 | 24 | *_tls_sample = (talp_sample_t){0}; | |
| 128 | 24 | registry->samples = samples; | |
| 129 | 24 | registry->samples[num_samples-1] = new_sample; | |
| 130 |
2/2✓ Branch 0 taken 22 times.
✓ Branch 1 taken 2 times.
|
24 | if (num_samples == 1) { |
| 131 | 22 | _is_main_sample = true; | |
| 132 | 22 | _is_main_sample_in_serial_mode = true; | |
| 133 | } | ||
| 134 | } else { | ||
| 135 | // error | ||
| 136 | ✗ | free(new_sample); | |
| 137 | ✗ | _tls_sample = NULL; | |
| 138 | } | ||
| 139 | } | ||
| 140 | } | ||
| 141 | 24 | pthread_mutex_unlock(®istry->mutex); | |
| 142 | |||
| 143 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
|
24 | fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample"); |
| 144 | |||
| 145 | /* If a thread is created mid-region, its initial time is that of the | ||
| 146 | * innermost open region, otherwise it is the current time */ | ||
| 147 | int64_t last_updated_ts; | ||
| 148 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 22 times.
|
24 | if (talp_info->open_regions) { |
| 149 | 2 | const dlb_monitor_t *monitor = talp_info->open_regions->data; | |
| 150 | 2 | last_updated_ts = monitor->start_time; | |
| 151 | } else { | ||
| 152 | 22 | last_updated_ts = get_time_in_ns(); | |
| 153 | } | ||
| 154 | |||
| 155 | 24 | _tls_sample->last_updated_ts = last_updated_ts; | |
| 156 | |||
| 157 | 24 | set_state(talp_info, _tls_sample, TALP_STATE_DISABLED); | |
| 158 | |||
| 159 | #ifdef INSTRUMENTATION_VERSION | ||
| 160 | unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR}; | ||
| 161 | long long hwc_values[] = {0, 0}; | ||
| 162 | instrument_nevent(2, events, hwc_values); | ||
| 163 | #endif | ||
| 164 | |||
| 165 | 24 | return _tls_sample; | |
| 166 | } | ||
| 167 | |||
| 168 | |||
| 169 | /*********************************************************************************/ | ||
| 170 | /* Sample update */ | ||
| 171 | /*********************************************************************************/ | ||
| 172 | |||
| 173 | /* Compute new microsample (time since last update) and update sample values */ | ||
| 174 | 5383 | void talp_sample_update(talp_info_t *talp_info) { | |
| 175 | |||
| 176 | 5383 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 177 | |||
| 178 | /* Observer threads ignore this function */ | ||
| 179 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5383 times.
|
5383 | if (unlikely(sample == NULL)) return; |
| 180 | |||
| 181 | /* Compute duration and set new last_updated_ts */ | ||
| 182 | 5383 | int64_t now = get_time_in_ns(); | |
| 183 | 5383 | int64_t microsample_duration = now - sample->last_updated_ts; | |
| 184 | 5383 | sample->last_updated_ts = now; | |
| 185 | |||
| 186 | /* Update the appropriate sample timer */ | ||
| 187 |
5/7✓ Branch 0 taken 23 times.
✓ Branch 1 taken 5342 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
|
5383 | switch(sample->state) { |
| 188 | 23 | case TALP_STATE_DISABLED: | |
| 189 | 23 | break; | |
| 190 | 5342 | case TALP_STATE_USEFUL: | |
| 191 | 5342 | DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration); | |
| 192 | 5342 | break; | |
| 193 | 7 | case TALP_STATE_NOT_USEFUL_MPI: | |
| 194 | 7 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration); | |
| 195 |
1/2✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
|
7 | if (_is_main_sample_in_serial_mode) { |
| 196 | // Add worker threads' time to special timer | ||
| 197 | 7 | int num_cpus = talp_info->sample_registry.num_samples; | |
| 198 | 7 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_during_mpi, | |
| 199 | microsample_duration * (num_cpus-1)); | ||
| 200 | } | ||
| 201 | 7 | break; | |
| 202 | 9 | case TALP_STATE_NOT_USEFUL_OMP_IN: | |
| 203 | 9 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration); | |
| 204 | 9 | break; | |
| 205 | 2 | case TALP_STATE_NOT_USEFUL_OMP_OUT: | |
| 206 | 2 | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration); | |
| 207 | 2 | break; | |
| 208 | ✗ | case TALP_STATE_NOT_USEFUL_GPU: | |
| 209 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration); | |
| 210 | ✗ | break; | |
| 211 | } | ||
| 212 | |||
| 213 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5383 times.
|
5383 | if (talp_info->flags.have_hwc) { |
| 214 | hwc_measurements_t measurements; | ||
| 215 | ✗ | if (talp_hwc_collect(&measurements)) { | |
| 216 | /* Atomically add HWC values to sample structure */ | ||
| 217 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, measurements.cycles); | |
| 218 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, measurements.instructions); | |
| 219 | } | ||
| 220 | |||
| 221 | #ifdef INSTRUMENTATION_VERSION | ||
| 222 | // It's safe to emit even if talp_hwc_collect returned false, | ||
| 223 | // struct is zero'ed in that case | ||
| 224 | unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR}; | ||
| 225 | long long hwc_values[] = {measurements.cycles, measurements.instructions}; | ||
| 226 | instrument_nevent(2, events, hwc_values); | ||
| 227 | #endif | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | 7 | void talp_sample_update_foreign(talp_info_t *talp_info, talp_sample_t *sample, int64_t now) { | |
| 232 | |||
| 233 | /* Observer threads ignore this function */ | ||
| 234 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 7 times.
|
7 | if (unlikely(sample == NULL)) return; |
| 235 | |||
| 236 | /* Compute duration and set new last_updated_ts */ | ||
| 237 | 7 | int64_t microsample_duration = now - sample->last_updated_ts; | |
| 238 | 7 | sample->last_updated_ts = now; | |
| 239 | |||
| 240 | /* Update the appropriate sample timer */ | ||
| 241 |
2/7✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
|
7 | switch(sample->state) { |
| 242 | 4 | case TALP_STATE_DISABLED: | |
| 243 | 4 | break; | |
| 244 | 3 | case TALP_STATE_USEFUL: | |
| 245 | 3 | DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration); | |
| 246 | 3 | break; | |
| 247 | ✗ | case TALP_STATE_NOT_USEFUL_MPI: | |
| 248 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration); | |
| 249 | ✗ | break; | |
| 250 | ✗ | case TALP_STATE_NOT_USEFUL_OMP_IN: | |
| 251 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration); | |
| 252 | ✗ | break; | |
| 253 | ✗ | case TALP_STATE_NOT_USEFUL_OMP_OUT: | |
| 254 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration); | |
| 255 | ✗ | break; | |
| 256 | ✗ | case TALP_STATE_NOT_USEFUL_GPU: | |
| 257 | ✗ | DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_gpu, microsample_duration); | |
| 258 | ✗ | break; | |
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | |||
| 263 | 97 | static void set_state(const talp_info_t *restrict talp_info, | |
| 264 | talp_sample_t *restrict sample, talp_sample_state_t new_state) { | ||
| 265 | |||
| 266 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 97 times.
|
97 | if (talp_info->flags.have_hwc) { |
| 267 | ✗ | talp_sample_state_t old = sample->state; | |
| 268 | ✗ | talp_hwc_on_state_change(old, new_state); | |
| 269 | } | ||
| 270 | |||
| 271 | 97 | sample->state = new_state; | |
| 272 | |||
| 273 | instrument_event(MONITOR_STATE, | ||
| 274 | new_state == TALP_STATE_DISABLED ? MONITOR_STATE_DISABLED | ||
| 275 | : new_state == TALP_STATE_USEFUL ? MONITOR_STATE_USEFUL | ||
| 276 | : new_state == TALP_STATE_NOT_USEFUL_MPI ? MONITOR_STATE_NOT_USEFUL_MPI | ||
| 277 | : new_state == TALP_STATE_NOT_USEFUL_OMP_IN ? MONITOR_STATE_NOT_USEFUL_OMP_IN | ||
| 278 | : new_state == TALP_STATE_NOT_USEFUL_OMP_OUT ? MONITOR_STATE_NOT_USEFUL_OMP_OUT | ||
| 279 | : new_state == TALP_STATE_NOT_USEFUL_GPU ? MONITOR_STATE_NOT_USEFUL_GPU | ||
| 280 | : 0, | ||
| 281 | EVENT_BEGIN); | ||
| 282 | 97 | } | |
| 283 | |||
| 284 | 73 | void talp_sample_set_state(talp_info_t *talp_info, talp_sample_state_t new_state) { | |
| 285 | |||
| 286 | 73 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 287 | 73 | set_state(talp_info, sample, new_state); | |
| 288 | 73 | } | |
| 289 | |||
| 290 | |||
| 291 | /*********************************************************************************/ | ||
| 292 | /* Sample aggregation */ | ||
| 293 | /*********************************************************************************/ | ||
| 294 | |||
| 295 | /* Flush and aggregate a single sample into a macrosample */ | ||
| 296 | 5354 | static inline void flush_sample_to_macrosample(talp_sample_t *restrict sample, | |
| 297 | talp_macrosample_t *restrict macrosample) { | ||
| 298 | |||
| 299 | /* Timers */ | ||
| 300 | 5354 | macrosample->timers.useful += | |
| 301 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0); | |
| 302 | 5354 | macrosample->timers.not_useful_mpi += | |
| 303 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0); | |
| 304 | 5354 | macrosample->timers.not_useful_omp_during_mpi += | |
| 305 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_during_mpi, 0); | |
| 306 | 5354 | macrosample->timers.not_useful_omp_out += | |
| 307 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0); | |
| 308 | /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */ | ||
| 309 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5354 times.
|
5354 | ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0, |
| 310 | "Inconsistency in TALP sample metric not_useful_omp_in." | ||
| 311 | " Please, report bug at " PACKAGE_BUGREPORT); | ||
| 312 | 5354 | macrosample->timers.not_useful_gpu += | |
| 313 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_gpu, 0); | |
| 314 | |||
| 315 | /* Counters */ | ||
| 316 | 5354 | macrosample->counters.cycles += | |
| 317 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0); | |
| 318 | 5354 | macrosample->counters.instructions += | |
| 319 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0); | |
| 320 | |||
| 321 | /* Stats */ | ||
| 322 | 5354 | macrosample->stats.num_mpi_calls += | |
| 323 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0); | |
| 324 | 5354 | macrosample->stats.num_omp_parallels += | |
| 325 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0); | |
| 326 | 5354 | macrosample->stats.num_omp_tasks += | |
| 327 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0); | |
| 328 | 5354 | macrosample->stats.num_gpu_runtime_calls += | |
| 329 | 5354 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_gpu_runtime_calls, 0); | |
| 330 | 5354 | } | |
| 331 | |||
| 332 | |||
| 333 | /* Aggregate all samples. | ||
| 334 | * This function assumes that the current thread's sample was just updated. */ | ||
| 335 | 5345 | void talp_sample_aggregate_all_to_macrosample( | |
| 336 | talp_info_t *restrict talp_info, talp_macrosample_t *restrict macrosample) { | ||
| 337 | |||
| 338 | 5345 | sample_registry_t *registry = &talp_info->sample_registry; | |
| 339 | 5345 | talp_sample_t *current_sample = talp_sample_get(talp_info); | |
| 340 | 5345 | int64_t now = current_sample->last_updated_ts; | |
| 341 | |||
| 342 | /* Accumulate samples from all threads */ | ||
| 343 | 5345 | pthread_mutex_lock(®istry->mutex); | |
| 344 | { | ||
| 345 | 5345 | int num_samples = registry->num_samples; | |
| 346 | 5345 | macrosample->num_cpus = num_samples; | |
| 347 | |||
| 348 | /* Force-update and aggregate all samples */ | ||
| 349 |
2/2✓ Branch 0 taken 5351 times.
✓ Branch 1 taken 5345 times.
|
10696 | for (int i = 0; i < num_samples; ++i) { |
| 350 | 5351 | talp_sample_t *sample = registry->samples[i]; | |
| 351 |
2/2✓ Branch 1 taken 6 times.
✓ Branch 2 taken 5345 times.
|
5351 | if (!talp_sample_is_mine(sample)) { |
| 352 | 6 | talp_sample_update_foreign(talp_info, sample, now); | |
| 353 | } | ||
| 354 | 5351 | flush_sample_to_macrosample(sample, macrosample); | |
| 355 | } | ||
| 356 | } | ||
| 357 | 5345 | pthread_mutex_unlock(®istry->mutex); | |
| 358 | 5345 | } | |
| 359 | |||
| 360 | /* Aggregate a subset of samples. | ||
| 361 | * This function assumes that the current thread's sample was just updated. | ||
| 362 | * OpenMP derived metrics are computed here and added to the main sample. */ | ||
| 363 | 2 | void talp_sample_aggregate_subset_to_macrosample( | |
| 364 | talp_info_t *restrict talp_info, | ||
| 365 | talp_sample_t **restrict samples, | ||
| 366 | unsigned int nelems, | ||
| 367 | talp_macrosample_t *restrict macrosample) { | ||
| 368 | |||
| 369 | 2 | sample_registry_t *registry = &talp_info->sample_registry; | |
| 370 | 2 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 371 | 2 | int64_t now = sample->last_updated_ts; | |
| 372 | |||
| 373 | 2 | int64_t sched_timer = 0; | |
| 374 | 2 | int64_t lb_timer = 0; | |
| 375 | |||
| 376 | 2 | pthread_mutex_lock(®istry->mutex); | |
| 377 | { | ||
| 378 | /* Iterate first to force-update all samples and compute the minimum | ||
| 379 | * not-useful-omp-in among them */ | ||
| 380 | 2 | int64_t min_not_useful_omp_in = INT64_MAX; | |
| 381 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (unsigned int i = 0; i < nelems; ++i) { |
| 382 | 3 | talp_sample_t *worker_sample = samples[i]; | |
| 383 |
2/2✓ Branch 1 taken 1 times.
✓ Branch 2 taken 2 times.
|
3 | if (!talp_sample_is_mine(worker_sample)) { |
| 384 | 1 | talp_sample_update_foreign(talp_info, worker_sample, now); | |
| 385 | } | ||
| 386 | 3 | min_not_useful_omp_in = min_int64(min_not_useful_omp_in, | |
| 387 | 3 | DLB_ATOMIC_LD_RLX(&worker_sample->timers.not_useful_omp_in)); | |
| 388 | } | ||
| 389 | |||
| 390 | /* Iterate again to accumulate Load Balance, and to aggregate sample */ | ||
| 391 | 2 | sched_timer = min_not_useful_omp_in * nelems; | |
| 392 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (unsigned int i = 0; i < nelems; ++i) { |
| 393 | 3 | talp_sample_t *worker_sample = samples[i]; | |
| 394 | 3 | lb_timer += DLB_ATOMIC_EXCH_RLX(&worker_sample->timers.not_useful_omp_in, 0) | |
| 395 | 3 | - min_not_useful_omp_in; | |
| 396 | 3 | flush_sample_to_macrosample(worker_sample, macrosample); | |
| 397 | } | ||
| 398 | } | ||
| 399 | 2 | pthread_mutex_unlock(®istry->mutex); | |
| 400 | |||
| 401 | /* Update derived timers into macrosample */ | ||
| 402 | 2 | macrosample->num_cpus = nelems; | |
| 403 | 2 | macrosample->timers.not_useful_omp_in_lb = lb_timer; | |
| 404 | 2 | macrosample->timers.not_useful_omp_in_sched = sched_timer; | |
| 405 | 2 | } | |
| 406 |