| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2025 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/talp_openmp.h" | ||
| 21 | |||
| 22 | #include "LB_numThreads/omptool.h" | ||
| 23 | #include "LB_comm/shmem_talp.h" | ||
| 24 | #include "LB_core/DLB_kernel.h" | ||
| 25 | #include "apis/dlb_talp.h" | ||
| 26 | #include "support/debug.h" | ||
| 27 | #include "talp/regions.h" | ||
| 28 | #include "talp/talp.h" | ||
| 29 | #include "talp/talp_types.h" | ||
| 30 | |||
| 31 | #include <unistd.h> | ||
| 32 | |||
| 33 | extern __thread bool thread_is_observer; | ||
| 34 | |||
| 35 | /* Update all open nested regions (so, excluding the innermost) and add the | ||
| 36 | * time from their start time until the sample's last timestamp (which is the time | ||
| 37 | * that has not yet been added to the regions) as omp_serialization_time */ | ||
| 38 | 1 | static void update_serialization_in_nested_regions(const subprocess_descriptor_t *spd, | |
| 39 | const talp_sample_t *sample) { | ||
| 40 | |||
| 41 | 1 | talp_info_t *talp_info = spd->talp_info; | |
| 42 | |||
| 43 | /* Update all open nested regions */ | ||
| 44 | 1 | pthread_mutex_lock(&talp_info->regions_mutex); | |
| 45 | { | ||
| 46 | 2 | GSList *nested_open_regions = talp_info->open_regions | |
| 47 | 1 | ? talp_info->open_regions->next | |
| 48 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | : NULL; |
| 49 | |||
| 50 | 1 | for (GSList *node = nested_open_regions; | |
| 51 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | node != NULL; |
| 52 | 1 | node = node->next) { | |
| 53 | |||
| 54 | 1 | dlb_monitor_t *monitor = node->data; | |
| 55 | 1 | monitor->omp_serialization_time += | |
| 56 | 1 | sample->last_updated_timestamp - monitor->start_time; | |
| 57 | } | ||
| 58 | } | ||
| 59 | 1 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
| 60 | 1 | } | |
| 61 | |||
| 62 | |||
| 63 | /*********************************************************************************/ | ||
| 64 | /* TALP OpenMP functions */ | ||
| 65 | /*********************************************************************************/ | ||
| 66 | |||
| 67 | /* samples involved in parallel level 1 */ | ||
| 68 | static talp_sample_t** parallel_samples_l1 = NULL; | ||
| 69 | static unsigned int parallel_samples_l1_capacity = 0; | ||
| 70 | |||
| 71 | 1 | void talp_openmp_init(pid_t pid, const options_t* options) { | |
| 72 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init"); |
| 73 | |||
| 74 | 1 | const subprocess_descriptor_t *spd = thread_spd; | |
| 75 | 1 | talp_info_t *talp_info = spd->talp_info; | |
| 76 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (talp_info) { |
| 77 | 1 | monitor_data_t *monitor_data = talp_info->monitor->_data; | |
| 78 | 1 | talp_info->flags.have_openmp = true; | |
| 79 | |||
| 80 | /* Fix up number of CPUs for the global region */ | ||
| 81 | 1 | float cpus = CPU_COUNT(&spd->process_mask); | |
| 82 | 1 | talp_info->monitor->avg_cpus = cpus; | |
| 83 | 1 | shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus); | |
| 84 | |||
| 85 | /* Start global region (no-op if already started) */ | ||
| 86 | 1 | region_start(spd, talp_info->monitor); | |
| 87 | |||
| 88 | /* Set useful state */ | ||
| 89 | 1 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 90 | 1 | talp_set_sample_state(sample, useful, talp_info->flags.papi); | |
| 91 | } | ||
| 92 | 1 | } | |
| 93 | |||
| 94 | 1 | void talp_openmp_finalize(void) { | |
| 95 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (parallel_samples_l1 != NULL) { |
| 96 | 1 | free(parallel_samples_l1); | |
| 97 | 1 | parallel_samples_l1 = NULL; | |
| 98 | 1 | parallel_samples_l1_capacity = 0; | |
| 99 | } | ||
| 100 | 1 | } | |
| 101 | |||
| 102 | 2 | void talp_openmp_thread_begin(ompt_thread_t thread_type) { | |
| 103 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
| 104 | 2 | talp_info_t *talp_info = spd->talp_info; | |
| 105 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
| 106 | /* Initial thread is already in useful state, set omp_out for others */ | ||
| 107 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 108 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (sample->state == disabled) { |
| 109 | /* Not initial thread: */ | ||
| 110 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (talp_info->flags.papi) { |
| 111 | ✗ | talp_init_papi_counters(); | |
| 112 | } | ||
| 113 | 1 | talp_set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi); | |
| 114 | |||
| 115 | /* The initial time of the sample is set to match the start time of | ||
| 116 | * the innermost open region, but other nested open regions need to | ||
| 117 | * be fixed */ | ||
| 118 | 1 | update_serialization_in_nested_regions(spd, sample); | |
| 119 | } | ||
| 120 | } | ||
| 121 | 2 | } | |
| 122 | |||
| 123 | 2 | void talp_openmp_thread_end(void) { | |
| 124 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
| 125 | 2 | talp_info_t *talp_info = spd->talp_info; | |
| 126 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (talp_info) { |
| 127 | /* Update thread sample with the last microsample */ | ||
| 128 | 1 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 129 | 1 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 130 | |||
| 131 | /* Update state */ | ||
| 132 | 1 | talp_set_sample_state(sample, disabled, talp_info->flags.papi); | |
| 133 | |||
| 134 | /* Finalize PAPI per-thread state */ | ||
| 135 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (talp_info->flags.papi) { |
| 136 | ✗ | talp_fini_papi_counters(); | |
| 137 | } | ||
| 138 | } | ||
| 139 | 2 | } | |
| 140 | |||
| 141 | 2 | void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) { | |
| 142 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | fatal_cond(parallel_data->requested_parallelism < 1, |
| 143 | "Requested parallel region of invalid size in %s. Please report bug at %s.", | ||
| 144 | __func__, PACKAGE_BUGREPORT); | ||
| 145 | |||
| 146 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
| 147 | 2 | talp_info_t *talp_info = spd->talp_info; | |
| 148 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
| 149 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (parallel_data->level == 1) { |
| 150 | /* Resize samples of parallel 1 if needed */ | ||
| 151 | 2 | unsigned int requested_parallelism = parallel_data->requested_parallelism; | |
| 152 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (requested_parallelism > parallel_samples_l1_capacity) { |
| 153 | 2 | void *ptr = realloc(parallel_samples_l1, | |
| 154 | sizeof(talp_sample_t*)*requested_parallelism); | ||
| 155 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | fatal_cond(!ptr, "realloc failed in %s", __func__); |
| 156 | 2 | parallel_samples_l1 = ptr; | |
| 157 | 2 | parallel_samples_l1_capacity = requested_parallelism; | |
| 158 | } | ||
| 159 | |||
| 160 | /* Assign local data */ | ||
| 161 | 2 | parallel_data->talp_parallel_data = parallel_samples_l1; | |
| 162 | |||
| 163 | ✗ | } else if (parallel_data->level > 1) { | |
| 164 | /* Allocate parallel samples array */ | ||
| 165 | ✗ | unsigned int requested_parallelism = parallel_data->requested_parallelism; | |
| 166 | ✗ | void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism); | |
| 167 | ✗ | fatal_cond(!ptr, "malloc failed in %s", __func__); | |
| 168 | |||
| 169 | /* Assign local data */ | ||
| 170 | ✗ | parallel_data->talp_parallel_data = ptr; | |
| 171 | } | ||
| 172 | |||
| 173 | /* Update stats */ | ||
| 174 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 175 | 2 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1); | |
| 176 | } | ||
| 177 | 2 | } | |
| 178 | |||
| 179 | 2 | void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) { | |
| 180 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
| 181 | 2 | talp_info_t *talp_info = spd->talp_info; | |
| 182 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
| 183 | /* Update thread sample with the last microsample */ | ||
| 184 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 185 | 2 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 186 | |||
| 187 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (parallel_data->level == 1) { |
| 188 | /* Flush and aggregate all samples of the parallel region */ | ||
| 189 | 2 | talp_flush_sample_subset_to_regions(spd, | |
| 190 | 2 | parallel_data->talp_parallel_data, | |
| 191 | parallel_data->actual_parallelism); | ||
| 192 | |||
| 193 | ✗ | } else if (parallel_data->level > 1) { | |
| 194 | /* Flush and aggregate all samples of this parallel except this | ||
| 195 | * thread's sample. The primary thread of a nested parallel region | ||
| 196 | * will keep its samples until it finishes as non-primary | ||
| 197 | * team-worker or reaches the level 1 parallel region */ | ||
| 198 | ✗ | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
| 199 | ✗ | talp_flush_sample_subset_to_regions(spd, | |
| 200 | &parallel_samples[1], | ||
| 201 | ✗ | parallel_data->actual_parallelism-1); | |
| 202 | |||
| 203 | /* free local data */ | ||
| 204 | ✗ | free(parallel_data->talp_parallel_data); | |
| 205 | ✗ | parallel_data->talp_parallel_data = NULL; | |
| 206 | } | ||
| 207 | |||
| 208 | /* Update current thread's state */ | ||
| 209 | 2 | talp_set_sample_state(sample, useful, talp_info->flags.papi); | |
| 210 | |||
| 211 | /* Update the state of the rest of team-worker threads | ||
| 212 | * (note that talp_set_sample_state cannot be used here because we are | ||
| 213 | * impersonating a worker thread) */ | ||
| 214 | 2 | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
| 215 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | for (unsigned int i = 1; i < parallel_data->actual_parallelism; ++i) { |
| 216 | 1 | talp_sample_t *worker_sample = parallel_samples[i]; | |
| 217 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (worker_sample->state == not_useful_omp_in) { |
| 218 | ✗ | worker_sample->state = not_useful_omp_out; | |
| 219 | } | ||
| 220 | } | ||
| 221 | } | ||
| 222 | 2 | } | |
| 223 | |||
| 224 | 3 | void talp_openmp_into_parallel_function( | |
| 225 | omptool_parallel_data_t *parallel_data, unsigned int index) { | ||
| 226 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 227 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 228 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 229 | /* Assign thread sample as team-worker of this parallel */ | ||
| 230 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 231 | 3 | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
| 232 | /* Probably optimized, but try to avoid invalidating | ||
| 233 | * the cache line on reused parallel data */ | ||
| 234 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (parallel_samples[index] != sample) { |
| 235 | 2 | parallel_samples[index] = sample; | |
| 236 | } | ||
| 237 | |||
| 238 | /* Update thread sample with the last microsample */ | ||
| 239 | 3 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 240 | |||
| 241 | /* Update state */ | ||
| 242 | 3 | talp_set_sample_state(sample, useful, talp_info->flags.papi); | |
| 243 | } | ||
| 244 | 3 | } | |
| 245 | |||
| 246 | 1 | void talp_openmp_outof_parallel_function(void) { | |
| 247 | 1 | const subprocess_descriptor_t *spd = thread_spd; | |
| 248 | 1 | talp_info_t *talp_info = spd->talp_info; | |
| 249 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (talp_info) { |
| 250 | /* Update thread sample with the last microsample */ | ||
| 251 | 1 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 252 | 1 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 253 | |||
| 254 | /* Update state */ | ||
| 255 | 1 | talp_set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi); | |
| 256 | } | ||
| 257 | 1 | } | |
| 258 | |||
| 259 | 3 | void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) { | |
| 260 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 261 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 262 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 263 | /* Update thread sample with the last microsample */ | ||
| 264 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 265 | 3 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 266 | |||
| 267 | /* Update state */ | ||
| 268 | 3 | talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
| 269 | } | ||
| 270 | 3 | } | |
| 271 | |||
| 272 | 3 | void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) { | |
| 273 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 274 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 275 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 276 | /* Update thread sample with the last microsample */ | ||
| 277 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 278 | 3 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 279 | |||
| 280 | /* Update state */ | ||
| 281 | 3 | talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
| 282 | } | ||
| 283 | 3 | } | |
| 284 | |||
| 285 | 3 | void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) { | |
| 286 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 287 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 288 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 289 | /* Update thread sample with the last microsample */ | ||
| 290 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 291 | 3 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 292 | |||
| 293 | /* Update state */ | ||
| 294 | 3 | talp_set_sample_state(sample, useful, talp_info->flags.papi); | |
| 295 | } | ||
| 296 | 3 | } | |
| 297 | |||
| 298 | 3 | void talp_openmp_task_create(void) { | |
| 299 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 300 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 301 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 302 | /* Just update stats */ | ||
| 303 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 304 | 3 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1); | |
| 305 | } | ||
| 306 | 3 | } | |
| 307 | |||
| 308 | 3 | void talp_openmp_task_complete(void) { | |
| 309 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
| 310 | 3 | talp_info_t *talp_info = spd->talp_info; | |
| 311 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
| 312 | /* Update thread sample with the last microsample */ | ||
| 313 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 314 | 3 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 315 | |||
| 316 | /* Update state (FIXME: tasks outside of parallels?) */ | ||
| 317 | 3 | talp_set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
| 318 | } | ||
| 319 | 3 | } | |
| 320 | |||
| 321 | 6 | void talp_openmp_task_switch(void) { | |
| 322 | 6 | const subprocess_descriptor_t *spd = thread_spd; | |
| 323 | 6 | talp_info_t *talp_info = spd->talp_info; | |
| 324 |
1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
|
6 | if (talp_info) { |
| 325 | /* Update thread sample with the last microsample */ | ||
| 326 | 6 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
| 327 | 6 | talp_update_sample(sample, talp_info->flags.papi, TALP_NO_TIMESTAMP); | |
| 328 | |||
| 329 | /* Update state */ | ||
| 330 | 6 | talp_set_sample_state(sample, useful, talp_info->flags.papi); | |
| 331 | } | ||
| 332 | 6 | } | |
| 333 | |||
| 334 | |||
| 335 | /*********************************************************************************/ | ||
| 336 | /* Vtable for handling omptool events */ | ||
| 337 | /*********************************************************************************/ | ||
| 338 | |||
| 339 | const omptool_event_funcs_t talp_events_vtable = (const omptool_event_funcs_t) { | ||
| 340 | .init = talp_openmp_init, | ||
| 341 | .finalize = talp_openmp_finalize, | ||
| 342 | .into_mpi = NULL, | ||
| 343 | .outof_mpi = NULL, | ||
| 344 | .lend_from_api = NULL, | ||
| 345 | .thread_begin = talp_openmp_thread_begin, | ||
| 346 | .thread_end = talp_openmp_thread_end, | ||
| 347 | .thread_role_shift = NULL, | ||
| 348 | .parallel_begin = talp_openmp_parallel_begin, | ||
| 349 | .parallel_end = talp_openmp_parallel_end, | ||
| 350 | .into_parallel_function = talp_openmp_into_parallel_function, | ||
| 351 | .outof_parallel_function = talp_openmp_outof_parallel_function, | ||
| 352 | .into_parallel_implicit_barrier = talp_openmp_into_parallel_implicit_barrier, | ||
| 353 | .into_parallel_sync = talp_openmp_into_parallel_sync, | ||
| 354 | .outof_parallel_sync = talp_openmp_outof_parallel_sync, | ||
| 355 | .task_create = talp_openmp_task_create, | ||
| 356 | .task_complete = talp_openmp_task_complete, | ||
| 357 | .task_switch = talp_openmp_task_switch, | ||
| 358 | }; | ||
| 359 |