| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2026 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | #include "talp/talp_mpi.h" | ||
| 21 | |||
| 22 | #include "LB_comm/shmem_talp.h" | ||
| 23 | #include "LB_core/node_barrier.h" | ||
| 24 | #include "LB_core/spd.h" | ||
| 25 | #include "apis/dlb_talp.h" | ||
| 26 | #include "support/atomic.h" | ||
| 27 | #include "support/debug.h" | ||
| 28 | #include "support/mask_utils.h" | ||
| 29 | #include "talp/regions.h" | ||
| 30 | #include "talp/sample.h" | ||
| 31 | #include "talp/talp.h" | ||
| 32 | #include "talp/talp_record.h" | ||
| 33 | #include "talp/talp_types.h" | ||
| 34 | #ifdef MPI_LIB | ||
| 35 | #include "mpi/mpi_core.h" | ||
| 36 | #endif | ||
| 37 | |||
| 38 | #include <stdio.h> | ||
| 39 | #include <string.h> | ||
| 40 | |||
| 41 | extern __thread bool thread_is_observer; | ||
| 42 | |||
| 43 | |||
| 44 | #ifdef MPI_LIB | ||
| 45 | /* Communicate among all MPI processes so that everyone has the same monitoring regions */ | ||
| 46 | static void talp_register_common_mpi_regions(const subprocess_descriptor_t *spd) { | ||
| 47 | /* Note: there's a potential race condition if this function is called | ||
| 48 | * (which happens on talp_mpi_finalize or talp_finalize) while another | ||
| 49 | * thread creates a monitoring region. The solution would be to lock the | ||
| 50 | * entire routine and call a specialized registering function that does not | ||
| 51 | * lock, or use a recursive lock. The situation is strange enough to not | ||
| 52 | * support it */ | ||
| 53 | |||
| 54 | talp_info_t *talp_info = spd->talp_info; | ||
| 55 | |||
| 56 | /* Warn about open regions */ | ||
| 57 | for (GSList *node = talp_info->open_regions; | ||
| 58 | node != NULL; | ||
| 59 | node = node->next) { | ||
| 60 | const dlb_monitor_t *monitor = node->data; | ||
| 61 | warning("Region %s is still open during MPI_Finalize." | ||
| 62 | " Collected data may be incomplete.", | ||
| 63 | monitor->name); | ||
| 64 | } | ||
| 65 | |||
| 66 | /* Gather recvcounts for each process | ||
| 67 | * (Each process may have different number of monitors) */ | ||
| 68 | int nregions = g_tree_nnodes(talp_info->regions); | ||
| 69 | int chars_to_send = nregions * DLB_MONITOR_NAME_MAX; | ||
| 70 | int *recvcounts = malloc(_mpi_size * sizeof(int)); | ||
| 71 | PMPI_Allgather(&chars_to_send, 1, MPI_INT, | ||
| 72 | recvcounts, 1, MPI_INT, getWorldComm()); | ||
| 73 | |||
| 74 | /* Compute total characters to gather via MPI */ | ||
| 75 | int i; | ||
| 76 | int total_chars = 0; | ||
| 77 | for (i=0; i<_mpi_size; ++i) { | ||
| 78 | total_chars += recvcounts[i]; | ||
| 79 | } | ||
| 80 | |||
| 81 | if (total_chars > 0) { | ||
| 82 | /* Prepare sendbuffer */ | ||
| 83 | char *sendbuffer = malloc(nregions * DLB_MONITOR_NAME_MAX * sizeof(char)); | ||
| 84 | char *sendptr = sendbuffer; | ||
| 85 | for (GTreeNode *node = g_tree_node_first(talp_info->regions); | ||
| 86 | node != NULL; | ||
| 87 | node = g_tree_node_next(node)) { | ||
| 88 | const dlb_monitor_t *monitor = g_tree_node_value(node); | ||
| 89 | strcpy(sendptr, monitor->name); | ||
| 90 | sendptr += DLB_MONITOR_NAME_MAX; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* Prepare recvbuffer */ | ||
| 94 | char *recvbuffer = malloc(total_chars * sizeof(char)); | ||
| 95 | |||
| 96 | /* Compute displacements */ | ||
| 97 | int *displs = malloc(_mpi_size * sizeof(int)); | ||
| 98 | int next_disp = 0; | ||
| 99 | for (i=0; i<_mpi_size; ++i) { | ||
| 100 | displs[i] = next_disp; | ||
| 101 | next_disp += recvcounts[i]; | ||
| 102 | } | ||
| 103 | |||
| 104 | /* Gather all regions */ | ||
| 105 | PMPI_Allgatherv(sendbuffer, nregions * DLB_MONITOR_NAME_MAX, MPI_CHAR, | ||
| 106 | recvbuffer, recvcounts, displs, MPI_CHAR, getWorldComm()); | ||
| 107 | |||
| 108 | /* Register all regions. Existing ones will be skipped. */ | ||
| 109 | for (i=0; i<total_chars; i+=DLB_MONITOR_NAME_MAX) { | ||
| 110 | region_register(spd, &recvbuffer[i]); | ||
| 111 | } | ||
| 112 | |||
| 113 | free(sendbuffer); | ||
| 114 | free(recvbuffer); | ||
| 115 | free(displs); | ||
| 116 | } | ||
| 117 | |||
| 118 | free(recvcounts); | ||
| 119 | } | ||
| 120 | #endif | ||
| 121 | |||
| 122 | |||
| 123 | /*********************************************************************************/ | ||
| 124 | /* TALP MPI functions */ | ||
| 125 | /*********************************************************************************/ | ||
| 126 | |||
| 127 | /* Start global monitoring region (if not already started) */ | ||
| 128 | 9 | void talp_mpi_init(const subprocess_descriptor_t *spd) { | |
| 129 | |||
| 130 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
|
9 | ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_init"); |
| 131 | |||
| 132 | 9 | talp_info_t *talp_info = spd->talp_info; | |
| 133 | |||
| 134 |
1/2✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
|
9 | if (talp_info) { |
| 135 | 9 | talp_info->flags.have_mpi = true; | |
| 136 | |||
| 137 | /* Start global region (no-op if already started) */ | ||
| 138 | 9 | region_start(spd, talp_info->monitor); | |
| 139 | |||
| 140 | /* Add MPI_Init statistic and set useful state */ | ||
| 141 | 9 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 142 | 9 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1); | |
| 143 | 9 | talp_sample_set_state(talp_info, TALP_STATE_USEFUL); | |
| 144 | } | ||
| 145 | 9 | } | |
| 146 | |||
| 147 | /* Stop global monitoring region and gather APP data if needed */ | ||
| 148 | 6 | void talp_mpi_finalize(const subprocess_descriptor_t *spd) { | |
| 149 | |||
| 150 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_finalize"); |
| 151 | |||
| 152 | 6 | talp_info_t *talp_info = spd->talp_info; | |
| 153 | |||
| 154 |
2/4✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 6 times.
|
6 | if (talp_info == NULL || !talp_info->flags.have_mpi) return; |
| 155 | |||
| 156 | #ifdef MPI_LIB | ||
| 157 | /* We also need to measure the waiting time of an MPI_Finalize. | ||
| 158 | * For this, we call an MPI_Barrier and the appropriate TALP functions. | ||
| 159 | * The num_mpi_calls variable is also incremented inside those. */ | ||
| 160 | sync_call_flags_t flags = { .is_blocking = true, .is_collective = true }; | ||
| 161 | talp_into_sync_call(spd, flags); | ||
| 162 | PMPI_Barrier(getWorldComm()); | ||
| 163 | talp_out_of_sync_call(spd, flags); | ||
| 164 | #else | ||
| 165 | /* Add MPI_Finalize to the number of MPI calls. | ||
| 166 | * Even though talp_mpi_finalize should never be called if no MPI_LIB, | ||
| 167 | * we keep this case for testing purposes. */ | ||
| 168 | 6 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 169 | 6 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1); | |
| 170 | #endif | ||
| 171 | |||
| 172 | /* Stop global region */ | ||
| 173 | 6 | region_stop(spd, talp_info->monitor); | |
| 174 | |||
| 175 | 6 | monitor_data_t *monitor_data = talp_info->monitor->_data; | |
| 176 | |||
| 177 | /* Update shared memory values */ | ||
| 178 |
3/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 5 times.
|
6 | if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) { |
| 179 | // TODO: is it needed? isn't it updated when stopped? | ||
| 180 | 1 | shmem_talp__set_times(monitor_data->node_shared_id, | |
| 181 | 1 | talp_info->monitor->mpi_time, | |
| 182 | 1 | talp_info->monitor->useful_time); | |
| 183 | } | ||
| 184 | |||
| 185 | /* If TALP partial output is enabled, metrics are not merged here. | ||
| 186 | * Output is written per process in talp_finalize() */ | ||
| 187 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (spd->options.talp_partial_output) return; |
| 188 | |||
| 189 | #ifdef MPI_LIB | ||
| 190 | /* If performing any kind of TALP summary, check that the number of processes | ||
| 191 | * registered in the shared memory matches with the number of MPI processes in the node. | ||
| 192 | * This check is needed to avoid deadlocks on finalize. */ | ||
| 193 | if (spd->options.talp_summary) { | ||
| 194 | verbose(VB_TALP, "Gathering TALP metrics"); | ||
| 195 | /* FIXME: use Named Barrier */ | ||
| 196 | /* if (shmem_barrier__get_num_participants(spd->options.barrier_id) == _mpis_per_node) { */ | ||
| 197 | |||
| 198 | /* Gather data among processes in the node if node summary is enabled */ | ||
| 199 | if (spd->options.talp_summary & SUMMARY_NODE) { | ||
| 200 | talp_record_node_summary(spd); | ||
| 201 | } | ||
| 202 | |||
| 203 | /* Gather data among MPIs if any of these summaries is enabled */ | ||
| 204 | if (spd->options.talp_summary | ||
| 205 | & (SUMMARY_POP_METRICS | SUMMARY_PROCESS)) { | ||
| 206 | /* Ensure everyone has the same monitoring regions */ | ||
| 207 | talp_register_common_mpi_regions(spd); | ||
| 208 | |||
| 209 | /* Finally, reduce data */ | ||
| 210 | for (GTreeNode *node = g_tree_node_first(talp_info->regions); | ||
| 211 | node != NULL; | ||
| 212 | node = g_tree_node_next(node)) { | ||
| 213 | const dlb_monitor_t *monitor = g_tree_node_value(node); | ||
| 214 | if (spd->options.talp_summary & SUMMARY_POP_METRICS) { | ||
| 215 | talp_record_pop_summary(spd, monitor); | ||
| 216 | } | ||
| 217 | if (spd->options.talp_summary & SUMMARY_PROCESS) { | ||
| 218 | talp_record_process_summary(spd, monitor); | ||
| 219 | } | ||
| 220 | } | ||
| 221 | } | ||
| 222 | |||
| 223 | /* Synchronize all processes in node before continuing with DLB finalization */ | ||
| 224 | node_barrier(spd, NULL); | ||
| 225 | /* } else { */ | ||
| 226 | /* warning("The number of MPI processes and processes registered in DLB differ." */ | ||
| 227 | /* " TALP will not print any summary."); */ | ||
| 228 | /* } */ | ||
| 229 | } | ||
| 230 | #endif | ||
| 231 | } | ||
| 232 | |||
| 233 | 8 | void talp_into_sync_call(const subprocess_descriptor_t *spd, sync_call_flags_t flags) { | |
| 234 | |||
| 235 | /* Observer threads may call MPI functions, but TALP must ignore them */ | ||
| 236 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (unlikely(thread_is_observer)) return; |
| 237 | |||
| 238 | 7 | talp_info_t *talp_info = spd->talp_info; | |
| 239 | |||
| 240 |
2/4✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 7 times.
|
7 | if (talp_info == NULL || !talp_info->flags.have_mpi) return; |
| 241 | |||
| 242 | /* Update sample */ | ||
| 243 | 7 | talp_sample_update(talp_info); | |
| 244 | |||
| 245 | /* Into Sync call -> not_useful_mpi */ | ||
| 246 | 7 | talp_sample_set_state(talp_info, TALP_STATE_NOT_USEFUL_MPI); | |
| 247 | } | ||
| 248 | |||
| 249 | 8 | void talp_out_of_sync_call(const subprocess_descriptor_t *spd, sync_call_flags_t flags) { | |
| 250 | |||
| 251 | /* Observer threads may call MPI functions, but TALP must ignore them */ | ||
| 252 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (unlikely(thread_is_observer)) return; |
| 253 | |||
| 254 | 7 | talp_info_t *talp_info = spd->talp_info; | |
| 255 | |||
| 256 |
2/4✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 7 times.
|
7 | if (talp_info == NULL || !talp_info->flags.have_mpi) return; |
| 257 | |||
| 258 | /* Update sample */ | ||
| 259 | 7 | talp_sample_update(talp_info); | |
| 260 | |||
| 261 | /* Add statistic */ | ||
| 262 | 7 | talp_sample_t *sample = talp_sample_get(talp_info); | |
| 263 | 7 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1); | |
| 264 | |||
| 265 | /* Out of Sync call -> useful */ | ||
| 266 | 7 | talp_sample_set_state(talp_info, TALP_STATE_USEFUL); | |
| 267 | |||
| 268 | /* Only when needed, update all regions */ | ||
| 269 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 3 times.
|
7 | if (talp_info->flags.external_profiler |
| 270 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | && talp_sample_is_main() |
| 271 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 3 times.
|
4 | && flags.is_blocking |
| 272 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | && flags.is_collective) { |
| 273 | 1 | talp_aggregate_samples_to_regions(talp_info); | |
| 274 | } | ||
| 275 | } | ||
| 276 |