| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /*********************************************************************************/ | ||
| 2 | /* Copyright 2009-2026 Barcelona Supercomputing Center */ | ||
| 3 | /* */ | ||
| 4 | /* This file is part of the DLB library. */ | ||
| 5 | /* */ | ||
| 6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
| 7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
| 8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
| 9 | /* (at your option) any later version. */ | ||
| 10 | /* */ | ||
| 11 | /* DLB is distributed in the hope that it will be useful, */ | ||
| 12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| 13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| 14 | /* GNU Lesser General Public License for more details. */ | ||
| 15 | /* */ | ||
| 16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
| 17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
| 18 | /*********************************************************************************/ | ||
| 19 | |||
| 20 | /* Tracking Application Live Performance */ | ||
| 21 | |||
| 22 | #include "apis/dlb_talp.h" | ||
| 23 | |||
| 24 | #include "apis/dlb_errors.h" | ||
| 25 | #include "LB_core/spd.h" | ||
| 26 | #include "LB_core/DLB_kernel.h" | ||
| 27 | #include "LB_comm/shmem_cpuinfo.h" | ||
| 28 | #include "LB_comm/shmem_procinfo.h" | ||
| 29 | #include "LB_comm/shmem_talp.h" | ||
| 30 | #include "support/dlb_common.h" | ||
| 31 | #include "support/mask_utils.h" | ||
| 32 | #include "support/mytime.h" | ||
| 33 | #include "talp/regions.h" | ||
| 34 | #include "talp/sample.h" | ||
| 35 | #include "talp/talp.h" | ||
| 36 | |||
| 37 | |||
| 38 | /*********************************************************************************/ | ||
| 39 | /* TALP */ | ||
| 40 | /*********************************************************************************/ | ||
| 41 | |||
| 42 | DLB_EXPORT_SYMBOL | ||
| 43 | 9 | int DLB_TALP_Attach(void) { | |
| 44 | int lewi_color; | ||
| 45 | char shm_key[MAX_OPTION_LENGTH]; | ||
| 46 | char *shm_key_ptr; | ||
| 47 | int shm_size_multiplier; | ||
| 48 | 9 | spd_enter_dlb(thread_spd); | |
| 49 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 3 times.
|
9 | if (!thread_spd->dlb_initialized) { |
| 50 | 6 | set_observer_role(true); | |
| 51 | 6 | options_parse_entry("--lewi-color", &lewi_color); | |
| 52 | 6 | options_parse_entry("--shm-key", shm_key); | |
| 53 | 6 | options_parse_entry("--shm-size-multiplier", &shm_size_multiplier); | |
| 54 | 6 | shm_key_ptr = shm_key; | |
| 55 | } else { | ||
| 56 | 3 | lewi_color = thread_spd->options.lewi_color; | |
| 57 | 3 | shm_key_ptr = thread_spd->options.shm_key; | |
| 58 | 3 | shm_size_multiplier = thread_spd->options.shm_size_multiplier; | |
| 59 | } | ||
| 60 | 9 | shmem_cpuinfo_ext__init(shm_key_ptr, lewi_color); | |
| 61 | 9 | shmem_procinfo_ext__init(shm_key_ptr, shm_size_multiplier); | |
| 62 | 9 | shmem_talp_ext__init(shm_key_ptr, shm_size_multiplier); | |
| 63 | 9 | return DLB_SUCCESS; | |
| 64 | } | ||
| 65 | |||
| 66 | DLB_EXPORT_SYMBOL | ||
| 67 | 9 | int DLB_TALP_Detach(void) { | |
| 68 | 9 | int error = shmem_cpuinfo_ext__finalize(); | |
| 69 |
1/2✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
|
9 | error = error ? error : shmem_procinfo_ext__finalize(); |
| 70 |
1/2✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
|
9 | error = error ? error : shmem_talp_ext__finalize(); |
| 71 | 9 | return error; | |
| 72 | } | ||
| 73 | |||
| 74 | DLB_EXPORT_SYMBOL | ||
| 75 | 1 | int DLB_TALP_GetNumCPUs(int *ncpus) { | |
| 76 | 1 | *ncpus = mu_get_system_size(); | |
| 77 | 1 | return DLB_SUCCESS; | |
| 78 | } | ||
| 79 | |||
| 80 | DLB_EXPORT_SYMBOL | ||
| 81 | 5 | int DLB_TALP_GetPidList(int *pidlist, int *nelems, int max_len) { | |
| 82 | 5 | return shmem_procinfo__getpidlist(pidlist, nelems, max_len); | |
| 83 | } | ||
| 84 | |||
| 85 | DLB_EXPORT_SYMBOL | ||
| 86 | 5 | int DLB_TALP_GetTimes(int pid, double *mpi_time, double *useful_time) { | |
| 87 | |||
| 88 | int error; | ||
| 89 | |||
| 90 |
4/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 3 times.
|
7 | if (pid == 0 || (thread_spd && thread_spd->id == pid)) { |
| 91 | /* Same process */ | ||
| 92 | 2 | const dlb_monitor_t *monitor = region_get_global(thread_spd); | |
| 93 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (monitor != NULL) { |
| 94 | 1 | *mpi_time = nsecs_to_secs(monitor->mpi_time); | |
| 95 | 1 | *useful_time = nsecs_to_secs(monitor->useful_time); | |
| 96 | 1 | error = DLB_SUCCESS; | |
| 97 | } else { | ||
| 98 | 1 | error = DLB_ERR_NOTALP; | |
| 99 | } | ||
| 100 | } else { | ||
| 101 | /* Different process, fetch from shared memory */ | ||
| 102 | talp_region_list_t region; | ||
| 103 | 3 | error = shmem_talp__get_region(®ion, pid, region_get_global_name()); | |
| 104 | |||
| 105 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (error == DLB_SUCCESS) { |
| 106 | 2 | *mpi_time = nsecs_to_secs(region.mpi_time); | |
| 107 | 2 | *useful_time = nsecs_to_secs(region.useful_time); | |
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | 5 | return error; | |
| 112 | } | ||
| 113 | |||
| 114 | DLB_EXPORT_SYMBOL | ||
| 115 | 4 | int DLB_TALP_GetNodeTimes(const char *name, dlb_node_times_t *node_times_list, | |
| 116 | int *nelems, int max_len) { | ||
| 117 | |||
| 118 | int error; | ||
| 119 | |||
| 120 |
2/2✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
|
4 | if (shmem_talp__initialized()) { |
| 121 | /* Only if a worker process started with --talp-external-profiler */ | ||
| 122 | 3 | int shmem_max_regions = shmem_talp__get_max_regions(); | |
| 123 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | if (max_len > shmem_max_regions) { |
| 124 | 1 | max_len = shmem_max_regions; | |
| 125 | } | ||
| 126 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (name == DLB_GLOBAL_REGION) { |
| 127 | 3 | name = region_get_global_name(); | |
| 128 | } | ||
| 129 | 3 | talp_region_list_t *region_list = malloc(sizeof(talp_region_list_t)*max_len); | |
| 130 | 3 | error = shmem_talp__get_regionlist(region_list, nelems, max_len, name); | |
| 131 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (error == DLB_SUCCESS) { |
| 132 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
|
8 | for (int i=0; i<*nelems; ++i) { |
| 133 | 5 | node_times_list[i] = (const dlb_node_times_t) { | |
| 134 | 5 | .pid = region_list[i].pid, | |
| 135 | 5 | .mpi_time = region_list[i].mpi_time, | |
| 136 | 5 | .useful_time = region_list[i].useful_time, | |
| 137 | }; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | 3 | free(region_list); | |
| 141 | } else { | ||
| 142 | /* shmem does not exist */ | ||
| 143 | 1 | error = DLB_ERR_NOSHMEM; | |
| 144 | } | ||
| 145 | |||
| 146 | 4 | return error; | |
| 147 | } | ||
| 148 | |||
| 149 | DLB_EXPORT_SYMBOL | ||
| 150 | 2 | int DLB_TALP_QueryPOPNodeMetrics(const char *name, dlb_node_metrics_t *node_metrics) { | |
| 151 |
2/2✓ Branch 1 taken 1 times.
✓ Branch 2 taken 1 times.
|
2 | if (shmem_talp__initialized()) { |
| 152 | /* Only if a worker process started with --talp-external-profiler */ | ||
| 153 | 1 | return talp_query_pop_node_metrics(name, node_metrics); | |
| 154 | } else { | ||
| 155 | 1 | return DLB_ERR_NOSHMEM; | |
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | |||
| 160 | /*********************************************************************************/ | ||
| 161 | /* TALP Monitoring Regions */ | ||
| 162 | /*********************************************************************************/ | ||
| 163 | |||
| 164 | DLB_EXPORT_SYMBOL | ||
| 165 | 4 | dlb_monitor_t* DLB_MonitoringRegionGetGlobal(void) { | |
| 166 | 4 | spd_enter_dlb(thread_spd); | |
| 167 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 3 times.
|
4 | if (unlikely(!thread_spd->talp_info)) { |
| 168 | 1 | return NULL; | |
| 169 | } | ||
| 170 | 3 | return region_get_global(thread_spd); | |
| 171 | } | ||
| 172 | |||
| 173 | DLB_EXPORT_SYMBOL | ||
| 174 | DLB_ALIAS(dlb_monitor_t*, DLB_MonitoringRegionGetImplicit, (void), (), \ | ||
| 175 | DLB_MonitoringRegionGetGlobal) | ||
| 176 | |||
| 177 | DLB_EXPORT_SYMBOL | ||
| 178 | DLB_ALIAS(const dlb_monitor_t*, DLB_MonitoringRegionGetMPIRegion, (void), (), \ | ||
| 179 | DLB_MonitoringRegionGetGlobal) | ||
| 180 | |||
| 181 | DLB_EXPORT_SYMBOL | ||
| 182 | 12 | dlb_monitor_t* DLB_MonitoringRegionRegister(const char *name){ | |
| 183 | 12 | spd_enter_dlb(thread_spd); | |
| 184 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 10 times.
|
12 | if (unlikely(!thread_spd->talp_info)) { |
| 185 | 2 | return NULL; | |
| 186 | } | ||
| 187 | 10 | return region_register(thread_spd, name); | |
| 188 | } | ||
| 189 | |||
| 190 | DLB_EXPORT_SYMBOL | ||
| 191 | 3 | int DLB_MonitoringRegionReset(dlb_monitor_t *handle){ | |
| 192 | 3 | spd_enter_dlb(thread_spd); | |
| 193 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (unlikely(!thread_spd->talp_info)) { |
| 194 | 2 | return DLB_ERR_NOTALP; | |
| 195 | } | ||
| 196 | 1 | return region_reset(thread_spd, handle); | |
| 197 | } | ||
| 198 | |||
| 199 | DLB_EXPORT_SYMBOL | ||
| 200 | 311 | int DLB_MonitoringRegionStart(dlb_monitor_t *handle){ | |
| 201 | 311 | spd_enter_dlb(thread_spd); | |
| 202 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 309 times.
|
311 | if (unlikely(!thread_spd->talp_info)) { |
| 203 | 2 | return DLB_ERR_NOTALP; | |
| 204 | } | ||
| 205 | 309 | return region_start(thread_spd, handle); | |
| 206 | } | ||
| 207 | |||
| 208 | DLB_EXPORT_SYMBOL | ||
| 209 | 411 | int DLB_MonitoringRegionStop(dlb_monitor_t *handle){ | |
| 210 | 411 | spd_enter_dlb(thread_spd); | |
| 211 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 409 times.
|
411 | if (unlikely(!thread_spd->talp_info)) { |
| 212 | 2 | return DLB_ERR_NOTALP; | |
| 213 | } | ||
| 214 | 409 | return region_stop(thread_spd, handle); | |
| 215 | } | ||
| 216 | |||
| 217 | DLB_EXPORT_SYMBOL | ||
| 218 | 6 | int DLB_MonitoringRegionReport(const dlb_monitor_t *handle){ | |
| 219 | 6 | spd_enter_dlb(thread_spd); | |
| 220 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
|
6 | if (unlikely(!thread_spd->talp_info)) { |
| 221 | 2 | return DLB_ERR_NOTALP; | |
| 222 | } | ||
| 223 | 4 | return region_report(thread_spd, handle); | |
| 224 | } | ||
| 225 | |||
| 226 | DLB_EXPORT_SYMBOL | ||
| 227 | 4 | int DLB_MonitoringRegionsUpdate(void) { | |
| 228 | 4 | spd_enter_dlb(thread_spd); | |
| 229 | |||
| 230 | 4 | void *talp_info = thread_spd->talp_info; | |
| 231 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 3 times.
|
4 | if (unlikely(!talp_info)) { |
| 232 | 1 | return DLB_ERR_NOTALP; | |
| 233 | } | ||
| 234 | |||
| 235 | 3 | talp_sample_update(talp_info); | |
| 236 | 3 | return talp_aggregate_samples_to_regions(talp_info); | |
| 237 | } | ||
| 238 | |||
| 239 | DLB_EXPORT_SYMBOL | ||
| 240 | 1 | int DLB_TALP_CollectPOPMetrics(dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) { | |
| 241 | 1 | spd_enter_dlb(thread_spd); | |
| 242 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (unlikely(!thread_spd->talp_info)) { |
| 243 | 1 | return DLB_ERR_NOTALP; | |
| 244 | } | ||
| 245 | ✗ | return talp_collect_pop_metrics(thread_spd, monitor, pop_metrics); | |
| 246 | } | ||
| 247 | |||
| 248 | DLB_EXPORT_SYMBOL | ||
| 249 | 1 | int DLB_TALP_CollectPOPNodeMetrics(dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) { | |
| 250 | 1 | spd_enter_dlb(thread_spd); | |
| 251 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (unlikely(!thread_spd->talp_info)) { |
| 252 | 1 | return DLB_ERR_NOTALP; | |
| 253 | } | ||
| 254 | ✗ | if (unlikely(!thread_spd->options.barrier)) { | |
| 255 | ✗ | return DLB_ERR_NOCOMP; | |
| 256 | } | ||
| 257 | ✗ | return talp_collect_pop_node_metrics(thread_spd, monitor, node_metrics); | |
| 258 | } | ||
| 259 | |||
| 260 | DLB_EXPORT_SYMBOL | ||
| 261 | DLB_ALIAS(int, DLB_TALP_CollectNodeMetrics, \ | ||
| 262 | (dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics), \ | ||
| 263 | (monitor, node_metrics), \ | ||
| 264 | DLB_TALP_CollectPOPNodeMetrics) | ||
| 265 |