GCC Code Coverage Report


Directory: src/
File: src/LB_core/DLB_talp.c
Date: 2024-11-22 17:07:10
            Exec   Total   Coverage
Lines:       614     634      96.8%
Functions:    49      51      96.1%
Branches:    202     272      74.3%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2024 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "LB_core/DLB_talp.h"
25
26 #include "LB_core/node_barrier.h"
27 #include "LB_core/spd.h"
28 #include "LB_core/talp_types.h"
29 #include "apis/dlb_talp.h"
30 #include "apis/dlb_errors.h"
31 #include "support/debug.h"
32 #include "support/error.h"
33 #include "support/mytime.h"
34 #include "support/tracing.h"
35 #include "support/options.h"
36 #include "support/mask_utils.h"
37 #include "support/perf_metrics.h"
38 #include "support/talp_output.h"
39 #include "LB_comm/shmem_talp.h"
40 #include "LB_numThreads/omptool.h"
41 #ifdef MPI_LIB
42 #include "LB_MPI/process_MPI.h"
43 #endif
44
45 #include <sched.h>
46 #include <string.h>
47 #include <stdlib.h>
48 #include <pthread.h>
49
50 #ifdef PAPI_LIB
51 #include <papi.h>
52 #endif
53
54 extern __thread bool thread_is_observer;
55
56 static inline void set_sample_state(talp_sample_t *sample, enum talp_sample_state state,
57 bool papi);
58 static void talp_dealloc_samples(const subprocess_descriptor_t *spd);
59 static void talp_record_monitor(const subprocess_descriptor_t *spd,
60 const dlb_monitor_t *monitor);
61 static void talp_aggregate_samples(const subprocess_descriptor_t *spd);
62 static talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd);
63
64 #ifdef MPI_LIB
65 static MPI_Datatype mpi_int64_type;
66 #endif
67
68 #ifdef PAPI_LIB
69 static __thread int EventSet = PAPI_NULL;
70 #endif
71
72 const char *global_region_name = "Global";
73
74
75 /*********************************************************************************/
76 /* TALP Monitoring Regions */
77 /*********************************************************************************/
78
79 /* Helper function for GTree: Compare region names */
80 19817 static gint key_compare_func(gconstpointer a, gconstpointer b) {
81 19817 return strncmp(a, b, DLB_MONITOR_NAME_MAX-1);
82 }
83
84 /* Helper function for GTree: deallocate */
85 1039 static void dealloc_region(gpointer data) {
86
87 1039 dlb_monitor_t *monitor = data;
88
89 /* Free private data */
90 1039 monitor_data_t *monitor_data = monitor->_data;
91 1039 free(monitor_data);
92 1039 monitor_data = NULL;
93
94 /* Free name */
95 1039 free((char*)monitor->name);
96 1039 monitor->name = NULL;
97
98 /* Free monitor */
99 1039 free(monitor);
100 1039 }
101
102 /* Unique region ids */
103 1039 static int get_new_monitor_id(void) {
104 static atomic_int id = 0;
105 1039 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
106 }
107
108 /* Unique ids for anonymous regions */
109 4 static int get_new_anonymous_id(void) {
110 static atomic_int id = 0;
111 4 return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1);
112 }
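
DLB_ATOMIC_ADD_FETCH_RLX in the two id generators above presumably wraps the C11 add-and-fetch atomic with relaxed memory ordering; a minimal self-contained sketch of the same unique-id pattern, assuming plain C11 atomics (the macro name is DLB's, the rest is standard):

    #include <stdatomic.h>

    /* Process-wide unique id, starting at 1. Relaxed ordering suffices
     * because the id only needs to be unique, not ordered with respect
     * to any other memory operation. */
    static int get_new_id(void) {
        static atomic_int id = 0;
        return atomic_fetch_add_explicit(&id, 1, memory_order_relaxed) + 1;
    }
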
113
114 /* Return true if the region is to be enabled */
115 1039 static bool parse_region_select(const char *region_select, const char *region_name) {
116 /* Default case, all regions enabled */
117
1/2
✓ Branch 0 taken 1039 times.
✗ Branch 1 not taken.
1039 if (region_select == NULL
118
2/2
✓ Branch 0 taken 1011 times.
✓ Branch 1 taken 28 times.
1039 || region_select[0] == '\0'
119
2/2
✓ Branch 0 taken 1004 times.
✓ Branch 1 taken 7 times.
1011 || strcmp(region_select, "all") == 0) {
120 1032 return true;
121 }
122
123 /* Special case, all regions disabled */
124
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 6 times.
7 if (strcmp(region_select, "none") == 0) {
125 1 return false;
126 }
127
128 /* Break region_select into tokens and find region_name */
129 6 bool found_in_select = false;
130 6 size_t len = strlen(region_select);
131 6 char *region_select_copy = malloc(sizeof(char)*(len+1));
132 6 strcpy(region_select_copy, region_select);
133 char *saveptr;
134 6 char *token = strtok_r(region_select_copy, ",", &saveptr);
135
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 3 times.
12 while (token) {
136 /* Region name is found */
137
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 7 times.
9 if (strcmp(token, region_name) == 0) {
138 2 found_in_select = true;
139 2 break;
140 }
141
142 /* Region name is same as global region, and same as token (ignoring case) */
143
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 4 times.
7 if (strcasecmp(region_name, global_region_name) == 0
144
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 && strcasecmp(token, global_region_name) == 0) {
145 1 found_in_select = true;
146 1 break;
147 }
148
149 /* next token */
150 6 token = strtok_r(NULL, ",", &saveptr);
151 }
152 6 free(region_select_copy);
153 6 return found_in_select;
154 }
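
For reference, following the logic above with a hypothetical region named "foo": a --talp-region-select of NULL, "" or "all" enables it; "none" disables it; a comma-separated list enables it only if it contains the exact token "foo" (e.g. "bar,foo" enables it, "bar,baz" does not). The global region is special-cased: any token equal to "Global" ignoring case (e.g. "global") enables it.
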
155
156 1039 static void monitoring_region_initialize(dlb_monitor_t *monitor, int id, const
157 char *name, pid_t pid, float avg_cpus, const char *region_select, bool have_shmem) {
158 /* Initialize private monitor data */
159 1039 monitor_data_t *monitor_data = malloc(sizeof(monitor_data_t));
160 1039 *monitor_data = (const monitor_data_t) {
161 .id = id,
162 .node_shared_id = -1,
163 };
164
165 /* Parse --talp-region-select if needed */
166 1039 monitor_data->flags.enabled = parse_region_select(region_select, name);
167
168 /* Allocate monitor name */
169 1039 char *allocated_name = malloc(DLB_MONITOR_NAME_MAX*sizeof(char));
170 1039 snprintf(allocated_name, DLB_MONITOR_NAME_MAX, "%s", name);
171
172 /* Initialize monitor */
173 1039 *monitor = (const dlb_monitor_t) {
174 .name = allocated_name,
175 .avg_cpus = avg_cpus,
176 ._data = monitor_data,
177 };
178
179 /* Register name in the instrumentation tool */
180 instrument_register_event(MONITOR_REGION, monitor_data->id, name);
181
182 /* Register region in shmem */
183
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1036 times.
1039 if (have_shmem) {
184 3 int err = shmem_talp__register(pid, avg_cpus, monitor->name, &monitor_data->node_shared_id);
185
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
3 if (err == DLB_ERR_NOMEM) {
186 warning("Region %s has been correctly registered but cannot be shared among other"
187 " processes due to lack of space in the TALP shared memory. Features like"
188 " node report or gathering data from external processes may not work for"
189 " this region. If needed, increase the TALP shared memory capacity using"
190 " the flag --talp-regions-per-proc. Run dlb -hh for more info.",
191 monitor->name);
192
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
3 } else if (err < DLB_SUCCESS) {
193 fatal("Unknown error registering region %s, please report bug at %s",
194 monitor->name, PACKAGE_BUGREPORT);
195 }
196 }
197 1039 }
198
199 7 struct dlb_monitor_t* monitoring_region_get_global_region(
200 const subprocess_descriptor_t *spd) {
201 7 talp_info_t *talp_info = spd->talp_info;
202
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 1 times.
7 return talp_info ? talp_info->monitor : NULL;
203 }
204
205 6 const char* monitoring_region_get_global_region_name(void) {
206 6 return global_region_name;
207 }
208
209 1045 dlb_monitor_t* monitoring_region_register(const subprocess_descriptor_t *spd,
210 const char* name) {
211
212 1045 talp_info_t *talp_info = spd->talp_info;
213
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1045 times.
1045 if (talp_info == NULL) return NULL;
214
215 1045 dlb_monitor_t *monitor = NULL;
216
4/4
✓ Branch 0 taken 1042 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1041 times.
1045 bool anonymous_region = (name == NULL || *name == '\0');
217
4/4
✓ Branch 0 taken 1041 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 13 times.
✓ Branch 3 taken 1028 times.
1045 bool global_region = !anonymous_region && name == global_region_name;
218
219 /* Check again if the pointers are different but the string content is the
220 * same as the global region, ignoring case */
221
2/2
✓ Branch 0 taken 1041 times.
✓ Branch 1 taken 4 times.
1045 if (!anonymous_region
222
2/2
✓ Branch 0 taken 1028 times.
✓ Branch 1 taken 13 times.
1041 && !global_region
223
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1025 times.
1028 && strncasecmp(global_region_name, name, DLB_MONITOR_NAME_MAX-1) == 0) {
224 3 name = global_region_name;
225 3 global_region = true;
226 }
227
228 /* Return the monitor if it is already registered */
229
2/2
✓ Branch 0 taken 1041 times.
✓ Branch 1 taken 4 times.
1045 if (!anonymous_region) {
230 1041 pthread_mutex_lock(&talp_info->regions_mutex);
231 {
232 1041 monitor = g_tree_lookup(talp_info->regions, name);
233 }
234 1041 pthread_mutex_unlock(&talp_info->regions_mutex);
235
236
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 1035 times.
1041 if (monitor != NULL) {
237 6 return monitor;
238 }
239 }
240
241 /* Otherwise, create new monitoring region */
242 1039 monitor = malloc(sizeof(dlb_monitor_t));
243
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1039 times.
1039 fatal_cond(!monitor, "Could not register a new monitoring region."
244 " Please report at "PACKAGE_BUGREPORT);
245
246 /* Determine the initial number of assigned CPUs for the region */
247 1039 float avg_cpus = CPU_COUNT(&spd->process_mask);
248
249 /* Construct name if anonymous region */
250 char monitor_name[DLB_MONITOR_NAME_MAX];
251
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1035 times.
1039 if (anonymous_region) {
252 4 snprintf(monitor_name, DLB_MONITOR_NAME_MAX, "Anonymous Region %d",
253 get_new_anonymous_id());
254 4 name = monitor_name;
255 }
256
257 /* Initialize values */
258 2078 bool have_shmem = talp_info->flags.have_shmem
259
3/6
✓ Branch 0 taken 1036 times.
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1036 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
1039 || (talp_info->flags.have_minimal_shmem && global_region);
260 1039 monitoring_region_initialize(monitor, get_new_monitor_id(), name,
261 1039 spd->id, avg_cpus, spd->options.talp_region_select, have_shmem);
262
263 /* Finally, insert */
264 1039 pthread_mutex_lock(&talp_info->regions_mutex);
265 {
266 1039 g_tree_insert(talp_info->regions, (gpointer)monitor->name, monitor);
267 }
268 1039 pthread_mutex_unlock(&talp_info->regions_mutex);
269
270 1039 return monitor;
271 }
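
The lookup-or-create idiom above is built on GLib's balanced binary tree. A minimal sketch of the same registry pattern, assuming GLib 2.x (g_tree_new_full, g_tree_lookup and g_tree_insert are the GLib calls also used in this file; the stored value is simplified to a string):

    #include <glib.h>
    #include <string.h>

    static gint cmp(gconstpointer a, gconstpointer b) {
        return strcmp(a, b);
    }

    /* Return the entry for 'name', creating and inserting it if absent */
    static gpointer lookup_or_create(GTree *tree, const char *name) {
        gpointer value = g_tree_lookup(tree, name);
        if (value == NULL) {
            value = g_strdup(name);       /* stand-in for a new monitor */
            g_tree_insert(tree, value, value);
        }
        return value;
    }

    int main(void) {
        GTree *tree = g_tree_new_full((GCompareDataFunc)cmp, NULL, NULL, g_free);
        gpointer a = lookup_or_create(tree, "Region A");
        gpointer b = lookup_or_create(tree, "Region A");
        /* As in monitoring_region_register, the second call returns the
         * entry created by the first one */
        int rc = (a == b) ? 0 : 1;
        g_tree_destroy(tree);
        return rc;
    }
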
272
273 3 int monitoring_region_reset(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
274
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 if (monitor == DLB_GLOBAL_REGION) {
275 1 talp_info_t *talp_info = spd->talp_info;
276 1 monitor = talp_info->monitor;
277 }
278
279 /* Reset everything except these fields: */
280 3 *monitor = (const dlb_monitor_t) {
281 3 .name = monitor->name,
282 3 .num_resets = monitor->num_resets + 1,
283 3 ._data = monitor->_data,
284 };
285
286 3 monitor_data_t *monitor_data = monitor->_data;
287 3 monitor_data->flags.started = false;
288
289 3 return DLB_SUCCESS;
290 }
291
292 1457 int monitoring_region_start(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
293 /* Observer threads don't have a valid sample so they cannot start/stop regions */
294
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1456 times.
1457 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
295
296 1456 talp_info_t *talp_info = spd->talp_info;
297
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1454 times.
1456 if (monitor == DLB_GLOBAL_REGION) {
298 2 monitor = talp_info->monitor;
299 }
300
301 int error;
302 1456 monitor_data_t *monitor_data = monitor->_data;
303
304
4/4
✓ Branch 0 taken 1454 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1450 times.
✓ Branch 3 taken 4 times.
1456 if (!monitor_data->flags.started && monitor_data->flags.enabled) {
305 /* Gather samples from all threads and update regions */
306 1450 talp_aggregate_samples(spd);
307
308
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1450 times.
1450 verbose(VB_TALP, "Starting region %s", monitor->name);
309 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_BEGIN);
310
311 /* Thread sample was just updated, use timestamp as starting time */
312 1450 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
313 1450 monitor->start_time = thread_sample->last_updated_timestamp;
314 1450 monitor->stop_time = 0;
315
316 1450 pthread_mutex_lock(&talp_info->regions_mutex);
317 {
318 1450 monitor_data->flags.started = true;
319 1450 talp_info->open_regions = g_slist_prepend(talp_info->open_regions, monitor);
320 }
321 1450 pthread_mutex_unlock(&talp_info->regions_mutex);
322
323 /* Normally, the sample state will be 'useful' at this point, but in
324 * certain cases where neither talp_mpi_init nor talp_openmp_init has
325 * been called, this is necessary */
326
2/2
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 1440 times.
1450 if (thread_sample->state != useful) {
327 10 set_sample_state(thread_sample, useful, talp_info->flags.papi);
328 }
329
330 1450 error = DLB_SUCCESS;
331 } else {
332 6 error = DLB_NOUPDT;
333 }
334
335 1456 return error;
336 }
337
338 1461 int monitoring_region_stop(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
339 /* Observer threads don't have a valid sample so they cannot start/stop regions */
340
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1459 times.
1461 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
341
342 1459 talp_info_t *talp_info = spd->talp_info;
343
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1457 times.
1459 if (monitor == DLB_GLOBAL_REGION) {
344 2 monitor = talp_info->monitor;
345
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 1451 times.
1457 } else if (monitor == DLB_LAST_OPEN_REGION) {
346
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 1 times.
6 if (talp_info->open_regions != NULL) {
347 5 monitor = talp_info->open_regions->data;
348 } else {
349 1 return DLB_ERR_NOENT;
350 }
351 }
352
353 int error;
354 1458 monitor_data_t *monitor_data = monitor->_data;
355
356
2/2
✓ Branch 0 taken 1449 times.
✓ Branch 1 taken 9 times.
1458 if (monitor_data->flags.started) {
357 /* Gather samples from all threads and update regions */
358 1449 talp_aggregate_samples(spd);
359
360 /* Stop timer */
361 1449 talp_sample_t *thread_sample = talp_get_thread_sample(spd);
362 1449 monitor->stop_time = thread_sample->last_updated_timestamp;
363 1449 monitor->elapsed_time += monitor->stop_time - monitor->start_time;
364 1449 ++(monitor->num_measurements);
365
366 1449 pthread_mutex_lock(&talp_info->regions_mutex);
367 {
368 1449 monitor_data->flags.started = false;
369 1449 talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
370 }
371 1449 pthread_mutex_unlock(&talp_info->regions_mutex);
372
373
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1449 times.
1449 verbose(VB_TALP, "Stopping region %s", monitor->name);
374 instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
375 1449 error = DLB_SUCCESS;
376 } else {
377 9 error = DLB_NOUPDT;
378 }
379
380 1458 return error;
381 }
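
From the application side, these start/stop paths are reached through the public TALP API declared in apis/dlb_talp.h. A minimal usage sketch, assuming DLB is initialized elsewhere (e.g. via DLB_Init or MPI interposition) and that the DLB_MonitoringRegion* prototypes match that header:

    #include <dlb_talp.h>

    static void compute_step(void) { /* application work goes here */ }

    int main(void) {
        /* Register (or retrieve, if the name already exists) a handle,
         * then bracket the code to be measured */
        dlb_monitor_t *region = DLB_MonitoringRegionRegister("Compute");
        DLB_MonitoringRegionStart(region);
        compute_step();
        DLB_MonitoringRegionStop(region);
        DLB_MonitoringRegionReport(region);
        return 0;
    }
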
382
383 9 bool monitoring_region_is_started(const dlb_monitor_t *monitor) {
384 9 return ((monitor_data_t*)monitor->_data)->flags.started;
385 }
386
387 1 void monitoring_region_set_internal(struct dlb_monitor_t *monitor, bool internal) {
388 1 ((monitor_data_t*)monitor->_data)->flags.internal = internal;
389 1 }
390
391 8 int monitoring_region_report(const subprocess_descriptor_t *spd, const dlb_monitor_t *monitor) {
392 8 talp_info_t *talp_info = spd->talp_info;
393
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (monitor == DLB_GLOBAL_REGION) {
394 1 monitor = talp_info->monitor;
395 }
396
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (((monitor_data_t*)monitor->_data)->flags.internal) {
397 1 return DLB_NOUPDT;
398 }
399
400 #ifdef PAPI_LIB
401 bool have_papi = talp_info->flags.papi;
402 #else
403 7 bool have_papi = false;
404 #endif
405 7 talp_output_print_monitoring_region(monitor, mu_to_str(&spd->process_mask),
406 7 talp_info->flags.have_mpi, talp_info->flags.have_openmp, have_papi);
407
408 7 return DLB_SUCCESS;
409 }
410
411 5 int monitoring_regions_force_update(const subprocess_descriptor_t *spd) {
412 /* Observer threads don't have a valid sample so they cannot start/stop regions */
413
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
5 if (unlikely(thread_is_observer)) return DLB_ERR_PERM;
414
415 4 talp_aggregate_samples(spd);
416 4 return DLB_SUCCESS;
417 }
418
419 /* Update all monitoring regions with the macrosample */
420 2909 static void monitoring_regions_update_all(const subprocess_descriptor_t *spd,
421 const talp_macrosample_t *macrosample, int num_cpus) {
422 2909 talp_info_t *talp_info = spd->talp_info;
423
424 /* Update all open regions */
425 2909 pthread_mutex_lock(&talp_info->regions_mutex);
426 {
427 2909 for (GSList *node = talp_info->open_regions;
428
2/2
✓ Branch 0 taken 2298 times.
✓ Branch 1 taken 2909 times.
5207 node != NULL;
429 2298 node = node->next) {
430 2298 dlb_monitor_t *monitor = node->data;
431 2298 monitor_data_t *monitor_data = monitor->_data;
432
433 /* Update number of CPUs if needed */
434 2298 monitor->num_cpus = max_int(monitor->num_cpus, num_cpus);
435
436 /* Timers */
437 2298 monitor->useful_time += macrosample->timers.useful;
438 2298 monitor->mpi_time += macrosample->timers.not_useful_mpi;
439 2298 monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb;
440 2298 monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched;
441 2298 monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out;
442 #ifdef PAPI_LIB
443 /* Counters */
444 monitor->cycles += macrosample->counters.cycles;
445 monitor->instructions += macrosample->counters.instructions;
446 #endif
447 /* Stats */
448 2298 monitor->num_mpi_calls += macrosample->stats.num_mpi_calls;
449 2298 monitor->num_omp_parallels += macrosample->stats.num_omp_parallels;
450 2298 monitor->num_omp_tasks += macrosample->stats.num_omp_tasks;
451
452 /* Update shared memory only if requested */
453
2/2
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2287 times.
2298 if (talp_info->flags.external_profiler) {
454 11 shmem_talp__set_times(monitor_data->node_shared_id,
455 monitor->mpi_time,
456 monitor->useful_time);
457 }
458 }
459 }
460 2909 pthread_mutex_unlock(&talp_info->regions_mutex);
461 2909 }
462
463 /* Update all open nested regions (i.e., excluding the innermost) and add,
464 * as omp_serialization_time, the time from each region's start until the
465 * sample's last timestamp (i.e., the time not yet added to the regions) */
466 1 static void monitoring_regions_update_nested(const subprocess_descriptor_t *spd,
467 const talp_sample_t *sample) {
468
469 1 talp_info_t *talp_info = spd->talp_info;
470
471 /* Update all open nested regions */
472 1 pthread_mutex_lock(&talp_info->regions_mutex);
473 {
474 2 GSList *nested_open_regions = talp_info->open_regions
475 1 ? talp_info->open_regions->next
476
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 : NULL;
477
478 1 for (GSList *node = nested_open_regions;
479
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 node != NULL;
480 1 node = node->next) {
481
482 1 dlb_monitor_t *monitor = node->data;
483 1 monitor->omp_serialization_time +=
484 1 sample->last_updated_timestamp - monitor->start_time;
485 }
486 }
487 1 pthread_mutex_unlock(&talp_info->regions_mutex);
488 1 }
489
490
491 /*********************************************************************************/
492 /* Init / Finalize */
493 /*********************************************************************************/
494
495 /* Executed once */
496 static inline int init_papi(void) {
497 #ifdef PAPI_LIB
498 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
499 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
500
501 /* Library init */
502 if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
503 warning("PAPI Library versions differ");
504 return -1;
505 }
506
507 /* Activate thread tracing */
508 int error = PAPI_thread_init(pthread_self);
509 if (error != PAPI_OK) {
510 warning("PAPI Error during thread initialization. %d: %s",
511 error, PAPI_strerror(error));
512 return -1;
513 }
514 #endif
515 return 0;
516 }
517
518 /* Executed once per thread */
519 static inline int init_papi_counters(void) {
520 #ifdef PAPI_LIB
521 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
522 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
523
524 int error = PAPI_register_thread();
525 if (error != PAPI_OK) {
526 warning("PAPI Error during thread registration. %d: %s",
527 error, PAPI_strerror(error));
528 return -1;
529 }
530
531 /* Eventset creation */
532 EventSet = PAPI_NULL;
533 error = PAPI_create_eventset(&EventSet);
534 if (error != PAPI_OK) {
535 warning("PAPI Error during eventset creation. %d: %s",
536 error, PAPI_strerror(error));
537 return -1;
538 }
539
540 int Events[2] = {PAPI_TOT_CYC, PAPI_TOT_INS};
541 error = PAPI_add_events(EventSet, Events, 2);
542 if (error != PAPI_OK) {
543 warning("PAPI Error adding events. %d: %s",
544 error, PAPI_strerror(error));
545 return -1;
546 }
547
548 /* Start tracing */
549 error = PAPI_start(EventSet);
550 if (error != PAPI_OK) {
551 warning("PAPI Error during tracing initialization: %d: %s",
552 error, PAPI_strerror(error));
553 return -1;
554 }
555 #endif
556 return 0;
557 }
558
559 static inline void reset_papi_counters(void) {
560 #ifdef PAPI_LIB
561 ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
562 "Error invoking %s when PAPI has been disabled", __FUNCTION__);
563
564 int error = PAPI_reset(EventSet);
565 if (error != PAPI_OK) verbose(VB_TALP, "Error resetting counters");
566 #endif
567 }
568
569 13 void talp_init(subprocess_descriptor_t *spd) {
570
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 ensure(!spd->talp_info, "TALP already initialized");
571
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 ensure(!thread_is_observer, "An observer thread cannot call talp_init");
572
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 10 times.
13 verbose(VB_TALP, "Initializing TALP module");
573
574 /* Initialize talp info */
575 13 talp_info_t *talp_info = malloc(sizeof(talp_info_t));
576 13 *talp_info = (const talp_info_t) {
577 .flags = {
578 13 .external_profiler = spd->options.talp_external_profiler,
579 13 .papi = spd->options.talp_papi,
580 13 .have_shmem = spd->options.talp_external_profiler,
581 13 .have_minimal_shmem = !spd->options.talp_external_profiler
582
3/4
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 11 times.
13 && spd->options.talp_summary & SUMMARY_NODE,
583 },
584 13 .regions = g_tree_new_full(
585 (GCompareDataFunc)key_compare_func,
586 NULL, NULL, dealloc_region),
587 .regions_mutex = PTHREAD_MUTEX_INITIALIZER,
588 .samples_mutex = PTHREAD_MUTEX_INITIALIZER,
589 };
590 13 spd->talp_info = talp_info;
591
592 /* Initialize shared memory */
593 13 int regions_per_proc = talp_info->flags.have_shmem
594 13 ? spd->options.talp_regions_per_proc
595
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 11 times.
13 : talp_info->flags.have_minimal_shmem
596 11 ? 1
597 11 : 0;
598
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 11 times.
13 if (regions_per_proc > 0) {
599 2 shmem_talp__init(spd->options.shm_key, regions_per_proc);
600 }
601
602 /* Initialize global region monitor
603 * (at this point we don't know how many CPUs, it will be fixed in talp_openmp_init) */
604 13 talp_info->monitor = monitoring_region_register(spd, global_region_name);
605
606
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 verbose(VB_TALP, "TALP module with workers mask: %s", mu_to_str(&spd->process_mask));
607
608 /* Initialize and start running PAPI */
609
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 9 times.
13 if (talp_info->flags.papi) {
610 #ifdef PAPI_LIB
611 if (init_papi() != 0 || init_papi_counters() != 0) {
612 warning("PAPI initialization has failed, disabling option.");
613 talp_info->flags.papi = false;
614 }
615 #else
616 4 warning("DLB has not been configured with PAPI support, disabling option.");
617 4 talp_info->flags.papi = false;
618 #endif
619 }
620 13 }
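
For reference, the spd->options.talp_* fields read above are typically populated from the DLB_ARGS environment variable at run time, e.g. DLB_ARGS="--talp --talp-papi --talp-summary=pop-metrics" (flag names as suggested by the option fields in this file; run dlb -hh for the authoritative list).
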
621
622 13 void talp_finalize(subprocess_descriptor_t *spd) {
623
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 ensure(spd->talp_info, "TALP is not initialized");
624
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 ensure(!thread_is_observer, "An observer thread cannot call talp_finalize");
625
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 13 times.
13 verbose(VB_TALP, "Finalizing TALP module");
626
627 13 talp_info_t *talp_info = spd->talp_info;
628
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 5 times.
13 if (!talp_info->flags.have_mpi) {
629 /* If we don't have MPI support, regions may still be open and not yet
630 * recorded to talp_output. Do that now. */
631
632 /* Stop open regions
633 * (Note that monitoring_region_stop needs to acquire the regions_mutex
634 * lock, so we need to iterate without holding it) */
635
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
10 while(talp_info->open_regions != NULL) {
636 2 dlb_monitor_t *monitor = talp_info->open_regions->data;
637 2 monitoring_region_stop(spd, monitor);
638 }
639
640 8 pthread_mutex_lock(&talp_info->regions_mutex);
641 {
642 /* Record all regions */
643 8 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
644
2/2
✓ Branch 0 taken 13 times.
✓ Branch 1 taken 8 times.
21 node != NULL;
645 13 node = g_tree_node_next(node)) {
646 13 const dlb_monitor_t *monitor = g_tree_node_value(node);
647 13 talp_record_monitor(spd, monitor);
648 }
649 }
650 8 pthread_mutex_unlock(&talp_info->regions_mutex);
651 }
652
653 /* Print/write all collected summaries */
654 13 talp_output_finalize(spd->options.talp_output_file);
655
656 /* Deallocate samples structure */
657 13 talp_dealloc_samples(spd);
658
659 /* Finalize shared memory */
660
3/4
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 11 times.
13 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
661 2 shmem_talp__finalize(spd->id);
662 }
663
664 #ifdef PAPI_LIB
665 if (talp_info->flags.papi) {
666 PAPI_shutdown();
667 }
668 #endif
669
670 /* Deallocate monitoring regions and talp_info */
671 13 pthread_mutex_lock(&talp_info->regions_mutex);
672 {
673 /* Destroy GTree, each node is deallocated with the function dealloc_region */
674 13 g_tree_destroy(talp_info->regions);
675 13 talp_info->regions = NULL;
676 13 talp_info->monitor = NULL;
677
678 /* Destroy list of open regions */
679 13 g_slist_free(talp_info->open_regions);
680 13 talp_info->open_regions = NULL;
681 }
682 13 pthread_mutex_unlock(&talp_info->regions_mutex);
683 13 free(talp_info);
684 13 spd->talp_info = NULL;
685 13 }
686
687
688 /*********************************************************************************/
689 /* Sample functions */
690 /*********************************************************************************/
691
692 static __thread talp_sample_t* _tls_sample = NULL;
693
694 /* WARNING: this function may only be called when updating own thread's sample */
695 67 static inline void set_sample_state(talp_sample_t *sample, enum talp_sample_state state,
696 bool papi) {
697 67 sample->state = state;
698
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 67 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
67 if (papi && state == useful) {
699 reset_papi_counters();
700 }
701 instrument_event(MONITOR_STATE,
702 state == disabled ? MONITOR_STATE_DISABLED
703 : state == useful ? MONITOR_STATE_USEFUL
704 : state == not_useful_mpi ? MONITOR_STATE_NOT_USEFUL_MPI
705 : state == not_useful_omp_in ? MONITOR_STATE_NOT_USEFUL_OMP_IN
706 : state == not_useful_omp_out ? MONITOR_STATE_NOT_USEFUL_OMP_OUT
707 : 0,
708 EVENT_BEGIN);
709 67 }
710
711 /* Get the TLS associated sample */
712 2957 static talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd) {
713 /* Thread already has an allocated sample, return it */
714
2/2
✓ Branch 0 taken 2946 times.
✓ Branch 1 taken 11 times.
2957 if (likely(_tls_sample != NULL)) return _tls_sample;
715
716 /* Observer threads don't have a valid sample */
717
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11 times.
11 if (unlikely(thread_is_observer)) return NULL;
718
719 /* Otherwise, allocate */
720 11 talp_info_t *talp_info = spd->talp_info;
721 11 pthread_mutex_lock(&talp_info->samples_mutex);
722 {
723 11 int ncpus = ++talp_info->ncpus;
724 11 void *samples = realloc(talp_info->samples, sizeof(talp_sample_t*)*ncpus);
725
1/2
✓ Branch 0 taken 11 times.
✗ Branch 1 not taken.
11 if (samples) {
726 11 talp_info->samples = samples;
727 void *new_sample;
728
1/2
✓ Branch 0 taken 11 times.
✗ Branch 1 not taken.
11 if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) {
729 11 _tls_sample = new_sample;
730 11 talp_info->samples[ncpus-1] = new_sample;
731 }
732 }
733 }
734 11 pthread_mutex_unlock(&talp_info->samples_mutex);
735
736
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 11 times.
11 fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample");
737
738 /* If a thread is created mid-region, its initial time is that of the
739 * innermost open region, otherwise it is the current time */
740 int64_t last_updated_timestamp;
741
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 10 times.
11 if (talp_info->open_regions) {
742 1 const dlb_monitor_t *monitor = talp_info->open_regions->data;
743 1 last_updated_timestamp = monitor->start_time;
744 } else {
745 10 last_updated_timestamp = get_time_in_ns();
746 }
747
748 11 *_tls_sample = (const talp_sample_t) {
749 .last_updated_timestamp = last_updated_timestamp,
750 };
751
752 11 set_sample_state(_tls_sample, disabled, talp_info->flags.papi);
753
754 #ifdef INSTRUMENTATION_VERSION
755 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
756 long long papi_values[] = {0, 0};
757 instrument_nevent(2, events, papi_values);
758 #endif
759
760 11 return _tls_sample;
761 }
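
The allocation above gives each thread a sample on its own cache line so that concurrent timer updates do not false-share. A minimal sketch of the same idiom, assuming a 64-byte line where DLB's DLB_CACHE_LINE constant is not available:

    #define _POSIX_C_SOURCE 200112L   /* for posix_memalign */
    #include <stdlib.h>

    #define CACHE_LINE 64             /* assumption; DLB uses DLB_CACHE_LINE */

    typedef struct { long timers[4]; } sample_t;

    static __thread sample_t *tls_sample = NULL;

    /* Lazily allocate a cache-line-aligned, thread-local sample */
    static sample_t *get_sample(void) {
        if (tls_sample == NULL) {
            void *p;
            if (posix_memalign(&p, CACHE_LINE, sizeof(sample_t)) == 0) {
                tls_sample = p;
                *tls_sample = (sample_t){{0}};
            }
        }
        return tls_sample;
    }
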
762
763 13 static void talp_dealloc_samples(const subprocess_descriptor_t *spd) {
764 /* This only nullifies the current thread pointer, but this function is
765 * only really important when TALP is finalized and then initialized again
766 */
767 13 _tls_sample = NULL;
768
769 13 talp_info_t *talp_info = spd->talp_info;
770 13 pthread_mutex_lock(&talp_info->samples_mutex);
771 {
772 13 free(talp_info->samples);
773 13 talp_info->samples = NULL;
774 }
775 13 pthread_mutex_unlock(&talp_info->samples_mutex);
776 13 }
777
778 enum { NO_TIMESTAMP = 0 };
779 /* Compute new microsample (time since last update) and update sample values */
780 2937 static void talp_update_sample(talp_sample_t *sample, bool papi, int64_t timestamp) {
781 /* Observer threads ignore this function */
782
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2937 times.
2937 if (unlikely(sample == NULL)) return;
783
784 /* Compute duration and set new last_updated_timestamp */
785
2/2
✓ Branch 0 taken 34 times.
✓ Branch 1 taken 2903 times.
2937 int64_t now = timestamp == NO_TIMESTAMP ? get_time_in_ns() : timestamp;
786 2937 int64_t microsample_duration = now - sample->last_updated_timestamp;
787 2937 sample->last_updated_timestamp = now;
788
789 /* Update the appropriate sample timer */
790
5/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2914 times.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 11 times.
✓ Branch 4 taken 1 times.
✗ Branch 5 not taken.
2937 switch(sample->state) {
791 4 case disabled:
792 4 break;
793 2914 case useful:
794 2914 DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
795 2914 break;
796 7 case not_useful_mpi:
797 7 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
798 7 break;
799 11 case not_useful_omp_in:
800 11 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
801 11 break;
802 1 case not_useful_omp_out:
803 1 DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
804 1 break;
805 }
806
807 #ifdef PAPI_LIB
808 if (papi) {
809 /* Only read counters if we are updating this thread's sample */
810 if (sample == talp_get_thread_sample(thread_spd)) {
811 if (sample->state == useful) {
812 /* Read */
813 long long papi_values[2];
814 int error = PAPI_read(EventSet, papi_values);
815 if (error != PAPI_OK) {
816 verbose(VB_TALP, "stop return code: %d, %s", error, PAPI_strerror(error));
817 }
818
819 /* Atomically add papi_values to sample structure */
820 DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, papi_values[0]);
821 DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, papi_values[1]);
822
823 #ifdef INSTRUMENTATION_VERSION
824 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
825 instrument_nevent(2, events, papi_values);
826 #endif
827
828 /* Counters are reset here and each time the sample is set to useful */
829 reset_papi_counters();
830 }
831 else {
832 #ifdef INSTRUMENTATION_VERSION
833 /* Emit 0's to distinguish useful chunks in traces */
834 unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
835 long long papi_values[] = {0, 0};
836 instrument_nevent(2, events, papi_values);
837 #endif
838 }
839 }
840 }
841 #endif
842 }
843
844 /* Flush and aggregate a single sample into a macrosample */
845 2903 static inline void talp_aggregate_sample(talp_sample_t *sample,
846 talp_macrosample_t *macrosample) {
847 /* Timers */
848 2903 macrosample->timers.useful +=
849 2903 DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0);
850 2903 macrosample->timers.not_useful_mpi +=
851 2903 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0);
852 2903 macrosample->timers.not_useful_omp_out +=
853 2903 DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0);
854 /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */
855
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2903 times.
2903 ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0,
856 "Inconsistency in TALP sample metric not_useful_omp_in."
857 " Please, report bug at " PACKAGE_BUGREPORT);
858
859 #ifdef PAPI_LIB
860 /* Counters */
861 macrosample->counters.cycles +=
862 DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0);
863 macrosample->counters.instructions +=
864 DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0);
865 #endif
866
867 /* Stats */
868 2903 macrosample->stats.num_mpi_calls +=
869 2903 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0);
870 2903 macrosample->stats.num_omp_parallels +=
871 2903 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0);
872 2903 macrosample->stats.num_omp_tasks +=
873 2903 DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0);
874 2903 }
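
The flush relies on an atomic exchange so that each timer is read and zeroed in a single step while worker threads may still be adding to it; DLB_ATOMIC_EXCH_RLX presumably maps to C11 atomic_exchange_explicit with relaxed ordering. A minimal sketch:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Drain a per-thread timer into an accumulator without losing updates
     * that race with the flush: exchange returns the old value, leaves 0 */
    static void flush_timer(_Atomic int64_t *timer, int64_t *accumulated) {
        *accumulated += atomic_exchange_explicit(timer, 0, memory_order_relaxed);
    }
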
875
876 /* Accumulate values from samples of all threads and update regions */
877 2907 static void talp_aggregate_samples(const subprocess_descriptor_t *spd) {
878
879 int num_cpus;
880 2907 talp_info_t *talp_info = spd->talp_info;
881
882 /* Accumulate samples from all threads */
883 2907 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
884 2907 pthread_mutex_lock(&talp_info->samples_mutex);
885 {
886 2907 num_cpus = talp_info->ncpus;
887
888 /* Force-update and aggregate all samples */
889 2907 int64_t timestamp = get_time_in_ns();
890
2/2
✓ Branch 0 taken 2900 times.
✓ Branch 1 taken 2907 times.
5807 for (int i = 0; i < num_cpus; ++i) {
891 2900 talp_update_sample(talp_info->samples[i], talp_info->flags.papi, timestamp);
892 2900 talp_aggregate_sample(talp_info->samples[i], &macrosample);
893 }
894 }
895 2907 pthread_mutex_unlock(&talp_info->samples_mutex);
896
897 /* Update all started regions */
898 2907 monitoring_regions_update_all(spd, &macrosample, num_cpus);
899 2907 }
900
901 /* Accumulate values from only the subset of samples that belong to a parallel
902 * region. Load Balance and Scheduling are computed here from those samples. */
903 2 static void talp_aggregate_samples_parallel(const subprocess_descriptor_t *spd,
904 talp_sample_t **samples, unsigned int nelems) {
905
906 2 talp_info_t *talp_info = spd->talp_info;
907 2 talp_macrosample_t macrosample = (const talp_macrosample_t) {};
908 2 pthread_mutex_lock(&talp_info->samples_mutex);
909 {
910 /* Iterate first to force-update all samples and compute the minimum
911 * not-useful-omp-in among them */
912 2 int64_t timestamp = get_time_in_ns();
913 2 int64_t min_not_useful_omp_in = INT64_MAX;
914 unsigned int i;
915
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
916 3 talp_update_sample(samples[i], talp_info->flags.papi, timestamp);
917 3 min_not_useful_omp_in = min_int64(min_not_useful_omp_in,
918 3 DLB_ATOMIC_LD_RLX(&samples[i]->timers.not_useful_omp_in));
919 }
920
921 /* Iterate again to accumulate Load Balance, and to aggregate sample */
922 2 int64_t sched_timer = min_not_useful_omp_in * nelems;
923 2 int64_t lb_timer = 0;
924
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 for (i=0; i<nelems; ++i) {
925 3 lb_timer += DLB_ATOMIC_EXCH_RLX(&samples[i]->timers.not_useful_omp_in, 0)
926 3 - min_not_useful_omp_in;
927 3 talp_aggregate_sample(samples[i], &macrosample);
928 }
929
930 /* Update derived timers into macrosample */
931 2 macrosample.timers.not_useful_omp_in_lb = lb_timer;
932 2 macrosample.timers.not_useful_omp_in_sched = sched_timer;
933 }
934 2 pthread_mutex_unlock(&talp_info->samples_mutex);
935
936 /* Update all started regions */
937 2 monitoring_regions_update_all(spd, &macrosample, nelems);
938 2 }
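
A short worked example of the split above, with hypothetical numbers: three samples whose not_useful_omp_in timers hold 4, 6 and 9 ns give a minimum of 4 ns, so the scheduling timer is 4 * 3 = 12 ns and the load-balance timer is (4-4) + (6-4) + (9-4) = 7 ns; together they account for the full 4 + 6 + 9 = 19 ns of not-useful time inside the parallel region.
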
939
940
941 /*********************************************************************************/
942 /* TALP Record in serial (non-MPI) mode */
943 /*********************************************************************************/
944
945 /* For any given monitor, record metrics considering only this (sub-)process */
946 13 static void talp_record_monitor(const subprocess_descriptor_t *spd,
947 const dlb_monitor_t *monitor) {
948
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 11 times.
13 if (spd->options.talp_summary & SUMMARY_PROCESS) {
949
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 verbose(VB_TALP, "TALP process summary: recording region %s", monitor->name);
950
951 2 process_record_t process_record = {
952 .rank = 0,
953 2 .pid = spd->id,
954 .monitor = *monitor,
955 };
956
957 /* Fill hostname and CPU mask strings in process_record */
958 2 gethostname(process_record.hostname, HOST_NAME_MAX);
959 2 snprintf(process_record.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
960 mu_to_str(&spd->process_mask));
961 2 mu_get_quoted_mask(&spd->process_mask, process_record.cpuset_quoted,
962 TALP_OUTPUT_CPUSET_MAX);
963
964 /* Add record */
965 2 talp_output_record_process(monitor->name, &process_record, 1);
966 }
967
968
1/2
✓ Branch 0 taken 13 times.
✗ Branch 1 not taken.
13 if (spd->options.talp_summary & SUMMARY_POP_METRICS) {
969
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 5 times.
13 if (monitor->elapsed_time > 0) {
970
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
8 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
971
972 8 talp_info_t *talp_info = spd->talp_info;
973 8 const pop_base_metrics_t base_metrics = {
974 8 .num_cpus = monitor->num_cpus,
975 .num_mpi_ranks = 0,
976 .num_nodes = 1,
977 8 .avg_cpus = monitor->avg_cpus,
978 8 .cycles = (double)monitor->cycles,
979 8 .instructions = (double)monitor->instructions,
980 8 .num_measurements = monitor->num_measurements,
981 8 .num_mpi_calls = monitor->num_mpi_calls,
982 8 .num_omp_parallels = monitor->num_omp_parallels,
983 8 .num_omp_tasks = monitor->num_omp_tasks,
984 8 .elapsed_time = monitor->elapsed_time,
985 8 .useful_time = monitor->useful_time,
986 8 .mpi_time = monitor->mpi_time,
987 8 .omp_load_imbalance_time = monitor->omp_load_imbalance_time,
988 8 .omp_scheduling_time = monitor->omp_scheduling_time,
989 8 .omp_serialization_time = monitor->omp_serialization_time,
990 8 .useful_normd_app = (double)monitor->useful_time / monitor->num_cpus,
991 8 .mpi_normd_app = (double)monitor->mpi_time / monitor->num_cpus,
992 8 .max_useful_normd_proc = (double)monitor->useful_time / monitor->num_cpus,
993 8 .max_useful_normd_node = (double)monitor->useful_time / monitor->num_cpus,
994 8 .mpi_normd_of_max_useful = (double)monitor->mpi_time / monitor->num_cpus,
995 };
996
997 dlb_pop_metrics_t pop_metrics;
998 8 talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
999 8 talp_output_record_pop_metrics(&pop_metrics);
1000
1001
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
8 if(monitor == talp_info->monitor) {
1002 3 talp_output_record_resources(monitor->num_cpus,
1003 /* num_nodes */ 1, /* num_ranks */ 0);
1004 }
1005
1006 } else {
1007
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
5 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
1008 5 dlb_pop_metrics_t pop_metrics = {0};
1009 5 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
1010 5 talp_output_record_pop_metrics(&pop_metrics);
1011 }
1012 }
1013 13 }
1014
1015
1016 /*********************************************************************************/
1017 /* TALP Record in MPI mode */
1018 /*********************************************************************************/
1019
1020 #ifdef MPI_LIB
1021 /* Compute Node summary of all Global Monitors and record data */
1022 static void talp_record_node_summary(const subprocess_descriptor_t *spd) {
1023
1024 node_record_t *node_summary = NULL;
1025 size_t node_summary_size = 0;
1026
1027 /* Perform a barrier so that all processes in the node have arrived at
1028 * MPI_Finalize */
1029 node_barrier(spd, NULL);
1030
1031 /* Node process 0 reduces all global regions from all processes in the node */
1032 if (_process_id == 0) {
1033 /* Obtain a list of regions associated with the Global Region Name, sorted by PID */
1034 int max_procs = mu_get_system_size();
1035 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
1036 int nelems;
1037 shmem_talp__get_regionlist(region_list, &nelems, max_procs, global_region_name);
1038
1039 /* Allocate and initialize node summary structure */
1040 node_summary_size = sizeof(node_record_t) + sizeof(process_in_node_record_t) * nelems;
1041 node_summary = malloc(node_summary_size);
1042 *node_summary = (const node_record_t) {
1043 .node_id = _node_id,
1044 .nelems = nelems,
1045 };
1046
1047 /* Iterate the PID list and gather times of every process */
1048 for (int i = 0; i < nelems; ++i) {
1049 int64_t mpi_time = region_list[i].mpi_time;
1050 int64_t useful_time = region_list[i].useful_time;
1051
1052 /* Save times in local structure */
1053 node_summary->processes[i].pid = region_list[i].pid;
1054 node_summary->processes[i].mpi_time = mpi_time;
1055 node_summary->processes[i].useful_time = useful_time;
1056
1057 /* Accumulate total and max values */
1058 node_summary->avg_useful_time += useful_time;
1059 node_summary->avg_mpi_time += mpi_time;
1060 node_summary->max_useful_time = max_int64(useful_time, node_summary->max_useful_time);
1061 node_summary->max_mpi_time = max_int64(mpi_time, node_summary->max_mpi_time);
1062 }
1063 free(region_list);
1064
1065 /* Compute average values */
1066 node_summary->avg_useful_time /= node_summary->nelems;
1067 node_summary->avg_mpi_time /= node_summary->nelems;
1068 }
1069
1070 /* Perform a final barrier so that all processes wait until _process_id 0
1071 * has gathered all the data */
1072 node_barrier(spd, NULL);
1073
1074 /* All main processes from each node send data to rank 0 */
1075 if (_process_id == 0) {
1076 verbose(VB_TALP, "Node summary: gathering data");
1077
1078 /* MPI type: pid_t */
1079 MPI_Datatype mpi_pid_type;
1080 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
1081
1082 /* MPI struct type: process_in_node_record_t */
1083 MPI_Datatype mpi_process_info_type;
1084 {
1085 int count = 3;
1086 int blocklengths[] = {1, 1, 1};
1087 MPI_Aint displacements[] = {
1088 offsetof(process_in_node_record_t, pid),
1089 offsetof(process_in_node_record_t, mpi_time),
1090 offsetof(process_in_node_record_t, useful_time)};
1091 MPI_Datatype types[] = {mpi_pid_type, mpi_int64_type, mpi_int64_type};
1092 MPI_Datatype tmp_type;
1093 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1094 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_in_node_record_t),
1095 &mpi_process_info_type);
1096 PMPI_Type_commit(&mpi_process_info_type);
1097 }
1098
1099 /* MPI struct type: node_record_t */
1100 MPI_Datatype mpi_node_record_type;
1101 {
1102 int count = 7;
1103 int blocklengths[] = {1, 1, 1, 1, 1, 1, node_summary->nelems};
1104 MPI_Aint displacements[] = {
1105 offsetof(node_record_t, node_id),
1106 offsetof(node_record_t, nelems),
1107 offsetof(node_record_t, avg_useful_time),
1108 offsetof(node_record_t, avg_mpi_time),
1109 offsetof(node_record_t, max_useful_time),
1110 offsetof(node_record_t, max_mpi_time),
1111 offsetof(node_record_t, processes)};
1112 MPI_Datatype types[] = {MPI_INT, MPI_INT, mpi_int64_type, mpi_int64_type,
1113 mpi_int64_type, mpi_int64_type, mpi_process_info_type};
1114 MPI_Datatype tmp_type;
1115 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1116 PMPI_Type_create_resized(tmp_type, 0, node_summary_size, &mpi_node_record_type);
1117 PMPI_Type_commit(&mpi_node_record_type);
1118 }
1119
1120 /* Gather data */
1121 void *recvbuf = NULL;
1122 if (_mpi_rank == 0) {
1123 recvbuf = malloc(_num_nodes * node_summary_size);
1124 }
1125 PMPI_Gather(node_summary, 1, mpi_node_record_type,
1126 recvbuf, 1, mpi_node_record_type,
1127 0, getInterNodeComm());
1128
1129 /* Free send buffer and MPI Datatypes */
1130 free(node_summary);
1131 PMPI_Type_free(&mpi_process_info_type);
1132 PMPI_Type_free(&mpi_node_record_type);
1133
1134 /* Add records */
1135 if (_mpi_rank == 0) {
1136 for (int node_id = 0; node_id < _num_nodes; ++node_id) {
1137 verbose(VB_TALP, "Node summary: recording node %d", node_id);
1138 node_summary = recvbuf + node_summary_size*node_id;
1139 ensure( node_id == node_summary->node_id, "Node id error in %s", __func__ );
1140 talp_output_record_node(node_summary);
1141 }
1142 free(recvbuf);
1143 }
1144 }
1145 }
1146 #endif
1147
1148 #ifdef MPI_LIB
1149 /* Gather PROCESS data of a monitor among all ranks and record it in rank 0 */
1150 static void talp_record_process_summary(const subprocess_descriptor_t *spd,
1151 const dlb_monitor_t *monitor) {
1152
1153 /* Internal monitors will not be recorded */
1154 if (((monitor_data_t*)monitor->_data)->flags.internal) {
1155 return;
1156 }
1157
1158 if (_mpi_rank == 0) {
1159 verbose(VB_TALP, "Process summary: gathering region %s", monitor->name);
1160 }
1161
1162 process_record_t process_record_send = {
1163 .rank = _mpi_rank,
1164 .pid = spd->id,
1165 .node_id = _node_id,
1166 .monitor = *monitor,
1167 };
1168
1169 /* Invalidate pointers of the copied monitor */
1170 process_record_send.monitor.name = NULL;
1171 process_record_send.monitor._data = NULL;
1172
1173 /* Fill hostname and CPU mask strings in process_record_send */
1174 gethostname(process_record_send.hostname, HOST_NAME_MAX);
1175 snprintf(process_record_send.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
1176 mu_to_str(&spd->process_mask));
1177 mu_get_quoted_mask(&spd->process_mask, process_record_send.cpuset_quoted,
1178 TALP_OUTPUT_CPUSET_MAX);
1179
1180 /* MPI type: pid_t */
1181 MPI_Datatype mpi_pid_type;
1182 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
1183
1184 /* Note: obviously, it doesn't make sense to send addresses via MPI, but we
1185 * are sending the whole dlb_monitor_t, so... Addresses are discarded
1186 * either way. */
1187
1188 /* MPI type: void* */
1189 MPI_Datatype address_type;
1190 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(void*), &address_type);
1191
1192 /* MPI struct type: dlb_monitor_t */
1193 MPI_Datatype mpi_dlb_monitor_type;
1194 {
1195 enum {count = 19};
1196 int blocklengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
1197 MPI_Aint displacements[] = {
1198 offsetof(dlb_monitor_t, name),
1199 offsetof(dlb_monitor_t, num_cpus),
1200 offsetof(dlb_monitor_t, avg_cpus),
1201 offsetof(dlb_monitor_t, cycles),
1202 offsetof(dlb_monitor_t, instructions),
1203 offsetof(dlb_monitor_t, num_measurements),
1204 offsetof(dlb_monitor_t, num_resets),
1205 offsetof(dlb_monitor_t, num_mpi_calls),
1206 offsetof(dlb_monitor_t, num_omp_parallels),
1207 offsetof(dlb_monitor_t, num_omp_tasks),
1208 offsetof(dlb_monitor_t, start_time),
1209 offsetof(dlb_monitor_t, stop_time),
1210 offsetof(dlb_monitor_t, elapsed_time),
1211 offsetof(dlb_monitor_t, useful_time),
1212 offsetof(dlb_monitor_t, mpi_time),
1213 offsetof(dlb_monitor_t, omp_load_imbalance_time),
1214 offsetof(dlb_monitor_t, omp_scheduling_time),
1215 offsetof(dlb_monitor_t, omp_serialization_time),
1216 offsetof(dlb_monitor_t, _data)};
1217 MPI_Datatype types[] = {address_type, MPI_INT, MPI_FLOAT,
1218 mpi_int64_type, mpi_int64_type, MPI_INT, MPI_INT, mpi_int64_type,
1219 mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type,
1220 mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type,
1221 mpi_int64_type, mpi_int64_type, address_type};
1222 MPI_Datatype tmp_type;
1223 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1224 PMPI_Type_create_resized(tmp_type, 0, sizeof(dlb_monitor_t), &mpi_dlb_monitor_type);
1225 PMPI_Type_commit(&mpi_dlb_monitor_type);
1226
1227 static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count);
1228 static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count);
1229 static_ensure(sizeof(types)/sizeof(types[0]) == count);
1230 }
1231
1232 /* MPI struct type: process_record_t */
1233 MPI_Datatype mpi_process_record_type;
1234 {
1235 int count = 7;
1236 int blocklengths[] = {1, 1, 1, HOST_NAME_MAX,
1237 TALP_OUTPUT_CPUSET_MAX, TALP_OUTPUT_CPUSET_MAX, 1};
1238 MPI_Aint displacements[] = {
1239 offsetof(process_record_t, rank),
1240 offsetof(process_record_t, pid),
1241 offsetof(process_record_t, node_id),
1242 offsetof(process_record_t, hostname),
1243 offsetof(process_record_t, cpuset),
1244 offsetof(process_record_t, cpuset_quoted),
1245 offsetof(process_record_t, monitor)};
1246 MPI_Datatype types[] = {MPI_INT, mpi_pid_type, MPI_INT, MPI_CHAR, MPI_CHAR,
1247 MPI_CHAR, mpi_dlb_monitor_type};
1248 MPI_Datatype tmp_type;
1249 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1250 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_record_t),
1251 &mpi_process_record_type);
1252 PMPI_Type_commit(&mpi_process_record_type);
1253 }
1254
1255 /* Gather data */
1256 process_record_t *recvbuf = NULL;
1257 if (_mpi_rank == 0) {
1258 recvbuf = malloc(_mpi_size * sizeof(process_record_t));
1259 }
1260 PMPI_Gather(&process_record_send, 1, mpi_process_record_type,
1261 recvbuf, 1, mpi_process_record_type,
1262 0, getWorldComm());
1263
1264 /* Add records */
1265 if (_mpi_rank == 0) {
1266 for (int rank = 0; rank < _mpi_size; ++rank) {
1267 verbose(VB_TALP, "Process summary: recording region %s on rank %d",
1268 monitor->name, rank);
1269 talp_output_record_process(monitor->name, &recvbuf[rank], _mpi_size);
1270 }
1271 free(recvbuf);
1272 }
1273
1274 /* Free MPI types */
1275 PMPI_Type_free(&mpi_dlb_monitor_type);
1276 PMPI_Type_free(&mpi_process_record_type);
1277 }
1278 #endif
1279
1280 #ifdef MPI_LIB
1281
1282 /* The following node and app reductions are needed to compute POP metrics: */
1283
1284 /*** Node reduction ***/
1285
1286 /* Data type to reduce among processes in node */
1287 typedef struct node_reduction {
1288 bool node_used;
1289 int cpus_node;
1290 int64_t sum_useful;
1291 } node_reduction_t;
1292
1293 /* Function called in the MPI node reduction */
1294 static void talp_mpi_node_reduction_fn(void *invec, void *inoutvec, int *len,
1295 MPI_Datatype *datatype) {
1296 node_reduction_t *in = invec;
1297 node_reduction_t *inout = inoutvec;
1298
1299 int _len = *len;
1300 for (int i = 0; i < _len; ++i) {
1301 if (in[i].node_used) {
1302 inout[i].node_used = true;
1303 inout[i].cpus_node += in[i].cpus_node;
1304 inout[i].sum_useful += in[i].sum_useful;
1305 }
1306 }
1307 }
1308
1309 /* Function to perform the reduction at node level */
1310 static void talp_reduce_pop_metrics_node_reduction(node_reduction_t *node_reduction,
1311 const dlb_monitor_t *monitor) {
1312
1313 const node_reduction_t node_reduction_send = {
1314 .node_used = monitor->num_measurements > 0,
1315 .cpus_node = monitor->num_cpus,
1316 .sum_useful = monitor->useful_time,
1317 };
1318
1319 /* MPI struct type: node_reduction_t */
1320 MPI_Datatype mpi_node_reduction_type;
1321 {
1322 int count = 3;
1323 int blocklengths[] = {1, 1, 1};
1324 MPI_Aint displacements[] = {
1325 offsetof(node_reduction_t, node_used),
1326 offsetof(node_reduction_t, cpus_node),
1327 offsetof(node_reduction_t, sum_useful)};
1328 MPI_Datatype types[] = {MPI_C_BOOL, MPI_INT, mpi_int64_type};
1329 MPI_Datatype tmp_type;
1330 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1331 PMPI_Type_create_resized(tmp_type, 0, sizeof(node_reduction_t),
1332 &mpi_node_reduction_type);
1333 PMPI_Type_commit(&mpi_node_reduction_type);
1334 }
1335
1336 /* Define MPI operation */
1337 MPI_Op node_reduction_op;
1338 PMPI_Op_create(talp_mpi_node_reduction_fn, true, &node_reduction_op);
1339
1340 /* MPI reduction */
1341 PMPI_Reduce(&node_reduction_send, node_reduction, 1,
1342 mpi_node_reduction_type, node_reduction_op,
1343 0, getNodeComm());
1344
1345 /* Free MPI types */
1346 PMPI_Type_free(&mpi_node_reduction_type);
1347 PMPI_Op_free(&node_reduction_op);
1348 }
1349
1350 /*** App reduction ***/
1351
1352 /* Data type to reduce among processes in application */
1353 typedef struct app_reduction_t {
1354 int num_cpus;
1355 int num_nodes;
1356 float avg_cpus;
1357 double cycles;
1358 double instructions;
1359 int64_t num_measurements;
1360 int64_t num_mpi_calls;
1361 int64_t num_omp_parallels;
1362 int64_t num_omp_tasks;
1363 int64_t elapsed_time;
1364 int64_t useful_time;
1365 int64_t mpi_time;
1366 int64_t omp_load_imbalance_time;
1367 int64_t omp_scheduling_time;
1368 int64_t omp_serialization_time;
1369 double max_useful_normd_proc;
1370 double max_useful_normd_node;
1371 double mpi_normd_of_max_useful;
1372 } app_reduction_t;
1373
1374 /* Function called in the MPI app reduction */
1375 static void talp_mpi_reduction_fn(void *invec, void *inoutvec, int *len,
1376 MPI_Datatype *datatype) {
1377 app_reduction_t *in = invec;
1378 app_reduction_t *inout = inoutvec;
1379
1380 int _len = *len;
1381 for (int i = 0; i < _len; ++i) {
1382 inout[i].num_cpus += in[i].num_cpus;
1383 inout[i].num_nodes += in[i].num_nodes;
1384 inout[i].avg_cpus += in[i].avg_cpus;
1385 inout[i].cycles += in[i].cycles;
1386 inout[i].instructions += in[i].instructions;
1387 inout[i].num_measurements += in[i].num_measurements;
1388 inout[i].num_mpi_calls += in[i].num_mpi_calls;
1389 inout[i].num_omp_parallels += in[i].num_omp_parallels;
1390 inout[i].num_omp_tasks += in[i].num_omp_tasks;
1391 inout[i].elapsed_time = max_int64(inout[i].elapsed_time, in[i].elapsed_time);
1392 inout[i].useful_time += in[i].useful_time;
1393 inout[i].mpi_time += in[i].mpi_time;
1394 inout[i].omp_load_imbalance_time += in[i].omp_load_imbalance_time;
1395 inout[i].omp_scheduling_time += in[i].omp_scheduling_time;
1396 inout[i].omp_serialization_time += in[i].omp_serialization_time;
1397 inout[i].max_useful_normd_node =
1398 max_int64(inout[i].max_useful_normd_node, in[i].max_useful_normd_node);
1399 if (in[i].max_useful_normd_proc > inout[i].max_useful_normd_proc) {
1400 inout[i].max_useful_normd_proc = in[i].max_useful_normd_proc;
1401 inout[i].mpi_normd_of_max_useful = in[i].mpi_normd_of_max_useful;
1402 }
1403 }
1404 }
1405
1406 /* Function to perform the reduction at application level */
1407 static void talp_reduce_pop_metrics_app_reduction(app_reduction_t *app_reduction,
1408 const node_reduction_t *node_reduction, const dlb_monitor_t *monitor,
1409 bool all_to_all) {
1410
1411 double max_useful_normd_proc = monitor->num_cpus == 0 ? 0.0
1412 : (double)monitor->useful_time / monitor->num_cpus;
1413 double max_useful_normd_node = _process_id != 0 ? 0.0
1414 : node_reduction->cpus_node == 0 ? 0.0
1415 : (double)node_reduction->sum_useful / node_reduction->cpus_node;
1416 double mpi_normd_of_max_useful = monitor->num_cpus == 0 ? 0.0
1417 : (double)monitor->mpi_time / monitor->num_cpus;
1418
1419 const app_reduction_t app_reduction_send = {
1420 .num_cpus = monitor->num_cpus,
1421 .num_nodes = _process_id == 0 && node_reduction->node_used ? 1 : 0,
1422 .avg_cpus = monitor->avg_cpus,
1423 .cycles = (double)monitor->cycles,
1424 .instructions = (double)monitor->instructions,
1425 .num_measurements = monitor->num_measurements,
1426 .num_mpi_calls = monitor->num_mpi_calls,
1427 .num_omp_parallels = monitor->num_omp_parallels,
1428 .num_omp_tasks = monitor->num_omp_tasks,
1429 .elapsed_time = monitor->elapsed_time,
1430 .useful_time = monitor->useful_time,
1431 .mpi_time = monitor->mpi_time,
1432 .omp_load_imbalance_time = monitor->omp_load_imbalance_time,
1433 .omp_scheduling_time = monitor->omp_scheduling_time,
1434 .omp_serialization_time = monitor->omp_serialization_time,
1435 .max_useful_normd_proc = max_useful_normd_proc,
1436 .max_useful_normd_node = max_useful_normd_node,
1437 .mpi_normd_of_max_useful = mpi_normd_of_max_useful,
1438 };
1439
1440 /* MPI struct type: app_reduction_t */
1441 MPI_Datatype mpi_app_reduction_type;
1442 {
1443 enum {count = 18};
1444 int blocklengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
1445 MPI_Aint displacements[] = {
1446 offsetof(app_reduction_t, num_cpus),
1447 offsetof(app_reduction_t, num_nodes),
1448 offsetof(app_reduction_t, avg_cpus),
1449 offsetof(app_reduction_t, cycles),
1450 offsetof(app_reduction_t, instructions),
1451 offsetof(app_reduction_t, num_measurements),
1452 offsetof(app_reduction_t, num_mpi_calls),
1453 offsetof(app_reduction_t, num_omp_parallels),
1454 offsetof(app_reduction_t, num_omp_tasks),
1455 offsetof(app_reduction_t, elapsed_time),
1456 offsetof(app_reduction_t, useful_time),
1457 offsetof(app_reduction_t, mpi_time),
1458 offsetof(app_reduction_t, omp_load_imbalance_time),
1459 offsetof(app_reduction_t, omp_scheduling_time),
1460 offsetof(app_reduction_t, omp_serialization_time),
1461 offsetof(app_reduction_t, max_useful_normd_proc),
1462 offsetof(app_reduction_t, max_useful_normd_node),
1463 offsetof(app_reduction_t, mpi_normd_of_max_useful)};
1464 MPI_Datatype types[] = {MPI_INT, MPI_INT, MPI_FLOAT, MPI_DOUBLE,
1465 MPI_DOUBLE, mpi_int64_type, mpi_int64_type, mpi_int64_type,
1466 mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type,
1467 mpi_int64_type, mpi_int64_type, mpi_int64_type, MPI_DOUBLE,
1468 MPI_DOUBLE, MPI_DOUBLE};
1469 MPI_Datatype tmp_type;
1470 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
1471 PMPI_Type_create_resized(tmp_type, 0, sizeof(app_reduction_t),
1472 &mpi_app_reduction_type);
1473 PMPI_Type_commit(&mpi_app_reduction_type);
1474
1475 static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count);
1476 static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count);
1477 static_ensure(sizeof(types)/sizeof(types[0]) == count);
1478 }
1479
1480 /* Define MPI operation */
1481 MPI_Op app_reduction_op;
1482 PMPI_Op_create(talp_mpi_reduction_fn, true, &app_reduction_op);
1483
1484 /* MPI reduction */
1485 if (!all_to_all) {
1486 PMPI_Reduce(&app_reduction_send, app_reduction, 1,
1487 mpi_app_reduction_type, app_reduction_op,
1488 0, getWorldComm());
1489 } else {
1490 PMPI_Allreduce(&app_reduction_send, app_reduction, 1,
1491 mpi_app_reduction_type, app_reduction_op,
1492 getWorldComm());
1493 }
1494
1495 /* Free MPI types */
1496 PMPI_Type_free(&mpi_app_reduction_type);
1497 PMPI_Op_free(&app_reduction_op);
1498 }
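/* Editor's note: the static_ensure calls above guard, at compile time, that
 * the three arrays stay in sync with count. Its definition lives in DLB's
 * support headers and is not shown here; a minimal C11 equivalent
 * (an assumption, for illustration only) would be: */

#define static_ensure(cond) _Static_assert((cond), #cond)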
1499
1500 static void talp_reduce_pop_metrics(pop_base_metrics_t *base_metrics,
1501 const dlb_monitor_t *monitor, bool all_to_all) {
1502
1503 /* First, reduce some values among processes in the node,
1504 * needed to compute pop metrics */
1505 node_reduction_t node_reduction = {0};
1506 talp_reduce_pop_metrics_node_reduction(&node_reduction, monitor);
1507
1508 /* With the node reduction done, reduce again among all processes */
1509 app_reduction_t app_reduction = {0};
1510 talp_reduce_pop_metrics_app_reduction(&app_reduction, &node_reduction,
1511 monitor, all_to_all);
1512
1513 /* Finally, fill output base_metrics... */
1514
1515 int num_mpi_ranks;
1516 PMPI_Comm_size(getWorldComm(), &num_mpi_ranks);
1517
1518 /* These values do not need a specific MPI reduction and can be deduced
1519 * from the already reduced number of CPUs */
1520 double useful_normd_app = app_reduction.num_cpus == 0 ? 0.0
1521 : (double)app_reduction.useful_time / app_reduction.num_cpus;
1522 double mpi_normd_app = app_reduction.num_cpus == 0 ? 0.0
1523 : (double)app_reduction.mpi_time / app_reduction.num_cpus;
1524
1525 *base_metrics = (const pop_base_metrics_t) {
1526 .num_cpus = app_reduction.num_cpus,
1527 .num_mpi_ranks = num_mpi_ranks,
1528 .num_nodes = app_reduction.num_nodes,
1529 .avg_cpus = app_reduction.avg_cpus,
1530 .cycles = app_reduction.cycles,
1531 .instructions = app_reduction.instructions,
1532 .num_measurements = app_reduction.num_measurements,
1533 .num_mpi_calls = app_reduction.num_mpi_calls,
1534 .num_omp_parallels = app_reduction.num_omp_parallels,
1535 .num_omp_tasks = app_reduction.num_omp_tasks,
1536 .elapsed_time = app_reduction.elapsed_time,
1537 .useful_time = app_reduction.useful_time,
1538 .mpi_time = app_reduction.mpi_time,
1539 .omp_load_imbalance_time = app_reduction.omp_load_imbalance_time,
1540 .omp_scheduling_time = app_reduction.omp_scheduling_time,
1541 .omp_serialization_time = app_reduction.omp_serialization_time,
1542 .useful_normd_app = useful_normd_app,
1543 .mpi_normd_app = mpi_normd_app,
1544 .max_useful_normd_proc = app_reduction.max_useful_normd_proc,
1545 .max_useful_normd_node = app_reduction.max_useful_normd_node,
1546 .mpi_normd_of_max_useful = app_reduction.mpi_normd_of_max_useful,
1547 };
1548 }
1549 #endif
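/* Editor's worked example: if the app reduction sums useful_time = 16e9 ns
 * over num_cpus = 8, then useful_normd_app = 16e9 / 8 = 2e9 ns, i.e. the
 * useful time contributed by an average CPU. mpi_normd_app is analogous. */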
1550
1551 #ifdef MPI_LIB
1552 /* Gather POP METRICS data of a monitor among all ranks and record it in rank 0 */
1553 static void talp_record_pop_summary(const subprocess_descriptor_t *spd,
1554 const dlb_monitor_t *monitor) {
1555
1556 /* Internal monitors will not be recorded */
1557 if (((monitor_data_t*)monitor->_data)->flags.internal) {
1558 return;
1559 }
1560
1561 if (_mpi_rank == 0) {
1562 verbose(VB_TALP, "TALP summary: gathering region %s", monitor->name);
1563 }
1564
1565 talp_info_t *talp_info = spd->talp_info;
1566
1567 /* Reduce monitor among all MPI ranks into MPI rank 0 */
1568 pop_base_metrics_t base_metrics;
1569 talp_reduce_pop_metrics(&base_metrics, monitor, false);
1570
1571 if (_mpi_rank == 0) {
1572 if (base_metrics.elapsed_time > 0) {
1573
1574 /* Only the global region records the resources */
1575 if (monitor == talp_info->monitor) {
1576 talp_output_record_resources(base_metrics.num_cpus,
1577 base_metrics.num_nodes, base_metrics.num_mpi_ranks);
1578 }
1579
1580 /* Construct pop_metrics out of base metrics */
1581 dlb_pop_metrics_t pop_metrics;
1582 talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
1583
1584 /* Record */
1585 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
1586 talp_output_record_pop_metrics(&pop_metrics);
1587
1588 } else {
1589 /* Record empty */
1590 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
1591 dlb_pop_metrics_t pop_metrics = {0};
1592 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
1593 talp_output_record_pop_metrics(&pop_metrics);
1594 }
1595 }
1596 }
1597 #endif
1598
1599 #ifdef MPI_LIB
1600 /* Communicate among all MPI processes so that everyone has the same monitoring regions */
1601 static void talp_register_common_mpi_regions(const subprocess_descriptor_t *spd) {
1602 /* Note: there's a potential race condition if this function is called
1603 * (which happens on talp_mpi_finalize or talp_finalize) while another
1604 * thread creates a monitoring region. The solution would be to lock the
1605 * entire routine and call a specialized registering function that does not
1606 * lock, or use a recursive lock. This scenario is unusual enough that
1607 * it is deliberately not supported. */
1608
1609 talp_info_t *talp_info = spd->talp_info;
1610
1611 /* Warn about open regions */
1612 for (GSList *node = talp_info->open_regions;
1613 node != NULL;
1614 node = node->next) {
1615 const dlb_monitor_t *monitor = node->data;
1616 warning("Region %s is still open during MPI_Finalize."
1617 " Collected data may be incomplete.",
1618 monitor->name);
1619 }
1620
1621 /* Gather recvcounts for each process
1622 * (each process may have a different number of monitors) */
1623 int nregions = g_tree_nnodes(talp_info->regions);
1624 int chars_to_send = nregions * DLB_MONITOR_NAME_MAX;
1625 int *recvcounts = malloc(_mpi_size * sizeof(int));
1626 PMPI_Allgather(&chars_to_send, 1, MPI_INT,
1627 recvcounts, 1, MPI_INT, getWorldComm());
1628
1629 /* Compute total characters to gather via MPI */
1630 int i;
1631 int total_chars = 0;
1632 for (i=0; i<_mpi_size; ++i) {
1633 total_chars += recvcounts[i];
1634 }
1635
1636 if (total_chars > 0) {
1637 /* Prepare sendbuffer */
1638 char *sendbuffer = malloc(nregions * DLB_MONITOR_NAME_MAX * sizeof(char));
1639 char *sendptr = sendbuffer;
1640 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
1641 node != NULL;
1642 node = g_tree_node_next(node)) {
1643 const dlb_monitor_t *monitor = g_tree_node_value(node);
1644 strcpy(sendptr, monitor->name);
1645 sendptr += DLB_MONITOR_NAME_MAX;
1646 }
1647
1648 /* Prepare recvbuffer */
1649 char *recvbuffer = malloc(total_chars * sizeof(char));
1650
1651 /* Compute displacements */
1652 int *displs = malloc(_mpi_size * sizeof(int));
1653 int next_disp = 0;
1654 for (i=0; i<_mpi_size; ++i) {
1655 displs[i] = next_disp;
1656 next_disp += recvcounts[i];
1657 }
1658
1659 /* Gather all regions */
1660 PMPI_Allgatherv(sendbuffer, nregions * DLB_MONITOR_NAME_MAX, MPI_CHAR,
1661 recvbuffer, recvcounts, displs, MPI_CHAR, getWorldComm());
1662
1663 /* Register all regions. Existing ones will be skipped. */
1664 for (i=0; i<total_chars; i+=DLB_MONITOR_NAME_MAX) {
1665 monitoring_region_register(spd, &recvbuffer[i]);
1666 }
1667
1668 free(sendbuffer);
1669 free(recvbuffer);
1670 free(displs);
1671 }
1672
1673 free(recvcounts);
1674 }
1675 #endif
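/* Editor's note: the routine above uses the classic recvcounts/displs idiom
 * for gathering variable-length blocks. A self-contained sketch with
 * hypothetical names (error handling omitted): */

#include <mpi.h>
#include <stdlib.h>

static char* gather_all_blocks(const char *sendbuf, int nbytes, MPI_Comm comm,
        int *total_out) {
    int size;
    MPI_Comm_size(comm, &size);

    /* Every rank announces how many bytes it will contribute */
    int *recvcounts = malloc(size * sizeof(int));
    MPI_Allgather(&nbytes, 1, MPI_INT, recvcounts, 1, MPI_INT, comm);

    /* Displacements are the exclusive prefix sum of the counts */
    int *displs = malloc(size * sizeof(int));
    int total = 0;
    for (int i = 0; i < size; ++i) {
        displs[i] = total;
        total += recvcounts[i];
    }

    char *recvbuf = malloc(total);
    MPI_Allgatherv(sendbuf, nbytes, MPI_CHAR,
            recvbuf, recvcounts, displs, MPI_CHAR, comm);

    free(displs);
    free(recvcounts);
    *total_out = total;
    return recvbuf;   /* caller frees */
}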
1676
1677
1678 /*********************************************************************************/
1679 /* TALP MPI functions */
1680 /*********************************************************************************/
1681
1682 /* Start global monitoring region (if not already started) */
1683 6 void talp_mpi_init(const subprocess_descriptor_t *spd) {
1684
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_init");
1685
1686 /* Initialize MPI type */
1687 #if MPI_LIB
1688 # if MPI_VERSION >= 3
1689 mpi_int64_type = MPI_INT64_T;
1690 # else
1691 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(int64_t), &mpi_int64_type);
1692 # endif
1693 #endif
1694
1695 6 talp_info_t *talp_info = spd->talp_info;
1696
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (talp_info) {
1697 6 talp_info->flags.have_mpi = true;
1698
1699 /* Start global region (no-op if already started) */
1700 6 monitoring_region_start(spd, talp_info->monitor);
1701
1702 /* Add MPI_Init statistic and set useful state */
1703 6 talp_sample_t *sample = talp_get_thread_sample(spd);
1704 6 DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1);
1705 6 set_sample_state(sample, useful, talp_info->flags.papi);
1706 }
1707 6 }
1708
1709 /* Stop global monitoring region and gather APP data if needed */
1710 6 void talp_mpi_finalize(const subprocess_descriptor_t *spd) {
1711
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_finalize");
1712 6 talp_info_t *talp_info = spd->talp_info;
1713
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (talp_info) {
1714 /* Add MPI_Finalize */
1715 6 talp_sample_t *sample = talp_get_thread_sample(spd);
1716 6 DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1);
1717
1718 /* Stop global region */
1719 6 monitoring_region_stop(spd, talp_info->monitor);
1720
1721 6 monitor_data_t *monitor_data = talp_info->monitor->_data;
1722
1723 /* Update shared memory values */
1724
3/4
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 5 times.
6 if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
1725 // TODO: is it needed? isn't it updated when stopped?
1726 1 shmem_talp__set_times(monitor_data->node_shared_id,
1727 1 talp_info->monitor->mpi_time,
1728 1 talp_info->monitor->useful_time);
1729 }
1730
1731 #ifdef MPI_LIB
1732 /* If performing any kind of TALP summary, check that the number of processes
1733 * registered in the shared memory matches the number of MPI processes in the node.
1734 * This check is needed to avoid deadlocks on finalize. */
1735 if (spd->options.talp_summary) {
1736 verbose(VB_TALP, "Gathering TALP metrics");
1737 /* FIXME: use Named Barrier */
1738 /* if (shmem_barrier__get_num_participants(spd->options.barrier_id) == _mpis_per_node) { */
1739
1740 /* Gather data among processes in the node if node summary is enabled */
1741 if (spd->options.talp_summary & SUMMARY_NODE) {
1742 talp_record_node_summary(spd);
1743 }
1744
1745 /* Gather data among MPIs if any of these summaries is enabled */
1746 if (spd->options.talp_summary
1747 & (SUMMARY_POP_METRICS | SUMMARY_PROCESS)) {
1748 /* Ensure everyone has the same monitoring regions */
1749 talp_register_common_mpi_regions(spd);
1750
1751 /* Finally, reduce data */
1752 for (GTreeNode *node = g_tree_node_first(talp_info->regions);
1753 node != NULL;
1754 node = g_tree_node_next(node)) {
1755 const dlb_monitor_t *monitor = g_tree_node_value(node);
1756 if (spd->options.talp_summary & SUMMARY_POP_METRICS) {
1757 talp_record_pop_summary(spd, monitor);
1758 }
1759 if (spd->options.talp_summary & SUMMARY_PROCESS) {
1760 talp_record_process_summary(spd, monitor);
1761 }
1762 }
1763 }
1764
1765 /* Synchronize all processes in node before continuing with DLB finalization */
1766 node_barrier(spd, NULL);
1767 /* } else { */
1768 /* warning("The number of MPI processes and processes registered in DLB differ." */
1769 /* " TALP will not print any summary."); */
1770 /* } */
1771 }
1772 #endif
1773 }
1774 6 }
1775
1776 /* Decide whether to update the current sample (the likely, cheaper path)
1777 * or to aggregate all samples.
1778 * We will only aggregate all samples if the external profiler is enabled
1779 * and this MPI call is a blocking collective call. */
1780 14 static inline void update_sample_on_sync_call(const subprocess_descriptor_t *spd,
1781 const talp_info_t *talp_info, talp_sample_t *sample, bool is_blocking_collective) {
1782
1783
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
14 if (!talp_info->flags.external_profiler || !is_blocking_collective) {
1784 /* Likely scenario, just update the sample */
1785 10 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
1786 } else {
1787 /* If talp_info->flags.external_profiler && is_blocking_collective:
1788 * aggregate samples and update all monitoring regions */
1789 4 talp_aggregate_samples(spd);
1790 }
1791 14 }
1792
1793 8 void talp_into_sync_call(const subprocess_descriptor_t *spd, bool is_blocking_collective) {
1794 /* Observer threads may call MPI functions, but TALP must ignore them */
1795
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (unlikely(thread_is_observer)) return;
1796
1797 7 const talp_info_t *talp_info = spd->talp_info;
1798
1/2
✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
7 if (talp_info) {
1799 /* Update sample */
1800 7 talp_sample_t *sample = talp_get_thread_sample(spd);
1801 7 update_sample_on_sync_call(spd, talp_info, sample, is_blocking_collective);
1802
1803 /* Into Sync call -> not_useful_mpi */
1804 7 set_sample_state(sample, not_useful_mpi, talp_info->flags.papi);
1805 }
1806 }
1807
1808 8 void talp_out_of_sync_call(const subprocess_descriptor_t *spd, bool is_blocking_collective) {
1809 /* Observer threads may call MPI functions, but TALP must ignore them */
1810
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
8 if (unlikely(thread_is_observer)) return;
1811
1812 7 const talp_info_t *talp_info = spd->talp_info;
1813
1/2
✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
7 if (talp_info) {
1814 /* Update sample */
1815 7 talp_sample_t *sample = talp_get_thread_sample(spd);
1816 7 DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1);
1817 7 update_sample_on_sync_call(spd, talp_info, sample, is_blocking_collective);
1818
1819 /* Out of Sync call -> useful */
1820 7 set_sample_state(sample, useful, talp_info->flags.papi);
1821 }
1822 }
1823
1824
1825 /*********************************************************************************/
1826 /* TALP OpenMP functions */
1827 /*********************************************************************************/
1828
1829 /* samples involved in parallel level 1 */
1830 static talp_sample_t** parallel_samples_l1 = NULL;
1831 static unsigned int parallel_samples_l1_capacity = 0;
1832
1833 1 void talp_openmp_init(pid_t pid, const options_t* options) {
1834
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init");
1835
1836 1 const subprocess_descriptor_t *spd = thread_spd;
1837 1 talp_info_t *talp_info = spd->talp_info;
1838
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (talp_info) {
1839 1 monitor_data_t *monitor_data = talp_info->monitor->_data;
1840 1 talp_info->flags.have_openmp = true;
1841
1842 /* Fix up number of CPUs for the global region */
1843 1 float cpus = CPU_COUNT(&spd->process_mask);
1844 1 talp_info->monitor->avg_cpus = cpus;
1845 1 shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus);
1846
1847 /* Start global region (no-op if already started) */
1848 1 monitoring_region_start(spd, talp_info->monitor);
1849
1850 /* Set useful state */
1851 1 talp_sample_t *sample = talp_get_thread_sample(spd);
1852 1 set_sample_state(sample, useful, talp_info->flags.papi);
1853 }
1854 1 }
1855
1856 1 void talp_openmp_finalize(void) {
1857
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (parallel_samples_l1 != NULL) {
1858 1 free(parallel_samples_l1);
1859 1 parallel_samples_l1 = NULL;
1860 1 parallel_samples_l1_capacity = 0;
1861 }
1862 1 }
1863
1864 2 void talp_openmp_thread_begin() {
1865 2 const subprocess_descriptor_t *spd = thread_spd;
1866 2 talp_info_t *talp_info = spd->talp_info;
1867
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
1868 /* Initial thread is already in useful state, set omp_out for others */
1869 2 talp_sample_t *sample = talp_get_thread_sample(spd);
1870
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (sample->state == disabled) {
1871 /* Not initial thread: */
1872
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (talp_info->flags.papi) {
1873 init_papi_counters();
1874 }
1875 1 set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi);
1876
1877 /* The initial time of the sample is set to match the start time of
1878 * the innermost open region; the other nested open regions need to
1879 * be adjusted */
1880 1 monitoring_regions_update_nested(spd, sample);
1881 }
1882 }
1883 2 }
1884
1885 2 void talp_openmp_thread_end(void) {
1886 2 const subprocess_descriptor_t *spd = thread_spd;
1887 2 talp_info_t *talp_info = spd->talp_info;
1888
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 if (talp_info) {
1889 /* Update thread sample with the last microsample */
1890 1 talp_sample_t *sample = talp_get_thread_sample(spd);
1891 1 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
1892
1893 /* Update state */
1894 1 set_sample_state(sample, disabled, talp_info->flags.papi);
1895 }
1896 2 }
1897
1898 2 void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) {
1899
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(parallel_data->requested_parallelism < 1,
1900 "Requested parallel region of invalid size in %s. Please report bug at %s.",
1901 __func__, PACKAGE_BUGREPORT);
1902
1903 2 const subprocess_descriptor_t *spd = thread_spd;
1904 2 talp_info_t *talp_info = spd->talp_info;
1905
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
1906
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_data->level == 1) {
1907 /* Resize samples of parallel 1 if needed */
1908 2 unsigned int requested_parallelism = parallel_data->requested_parallelism;
1909
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (requested_parallelism > parallel_samples_l1_capacity) {
1910 2 void *ptr = realloc(parallel_samples_l1,
1911 sizeof(talp_sample_t*)*requested_parallelism);
1912
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 fatal_cond(!ptr, "realloc failed in %s", __func__);
1913 2 parallel_samples_l1 = ptr;
1914 2 parallel_samples_l1_capacity = requested_parallelism;
1915 }
1916
1917 /* Assign local data */
1918 2 parallel_data->talp_parallel_data = parallel_samples_l1;
1919
1920 } else if (parallel_data->level > 1) {
1921 /* Allocate parallel samples array */
1922 unsigned int requested_parallelism = parallel_data->requested_parallelism;
1923 void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism);
1924 fatal_cond(!ptr, "malloc failed in %s", __func__);
1925
1926 /* Assign local data */
1927 parallel_data->talp_parallel_data = ptr;
1928 }
1929
1930 /* Update stats */
1931 2 talp_sample_t *sample = talp_get_thread_sample(spd);
1932 2 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1);
1933 }
1934 2 }
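/* Editor's note: level-1 parallels reuse a process-wide, grow-only scratch
 * array so that steady-state parallel regions allocate nothing. The pattern,
 * isolated, with hypothetical names: */

#include <stdlib.h>

static void **scratch = NULL;
static unsigned int scratch_capacity = 0;

/* Returns a buffer of at least n pointers; capacity never shrinks */
static void** ensure_scratch(unsigned int n) {
    if (n > scratch_capacity) {
        void **ptr = realloc(scratch, n * sizeof(void*));
        if (ptr == NULL) return NULL;   /* caller decides how to fail */
        scratch = ptr;
        scratch_capacity = n;
    }
    return scratch;
}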
1935
1936 2 void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) {
1937 2 const subprocess_descriptor_t *spd = thread_spd;
1938 2 talp_info_t *talp_info = spd->talp_info;
1939
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (talp_info) {
1940 /* Update thread sample with the last microsample */
1941 2 talp_sample_t *sample = talp_get_thread_sample(spd);
1942 2 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
1943
1944
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (parallel_data->level == 1) {
1945 /* Flush and aggregate all samples of the parallel region */
1946 2 talp_aggregate_samples_parallel(spd,
1947 2 parallel_data->talp_parallel_data,
1948 parallel_data->actual_parallelism);
1949
1950 } else if (parallel_data->level > 1) {
1951 /* Flush and aggregate all samples of this parallel except this
1952 * thread's sample. The primary thread of a nested parallel region
1953 * keeps its sample until it finishes as a non-primary
1954 * team worker or reaches the level-1 parallel region */
1955 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
1956 talp_aggregate_samples_parallel(spd,
1957 &parallel_samples[1],
1958 parallel_data->actual_parallelism-1);
1959
1960 /* free local data */
1961 free(parallel_data->talp_parallel_data);
1962 parallel_data->talp_parallel_data = NULL;
1963 }
1964
1965 /* Update the current thread's state */
1966 2 set_sample_state(sample, useful, talp_info->flags.papi);
1967
1968 /* Update the state of the rest of team-worker threads
1969 * (note that set_sample_state cannot be used here because we are
1970 * impersonating a worker thread) */
1971 2 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
1972
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
3 for (unsigned int i = 1; i < parallel_data->actual_parallelism; ++i) {
1973 1 talp_sample_t *worker_sample = parallel_samples[i];
1974
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (worker_sample->state == not_useful_omp_in) {
1975 worker_sample->state = not_useful_omp_out;
1976 }
1977 }
1978 }
1979 2 }
1980
1981 3 void talp_openmp_into_parallel_function(
1982 omptool_parallel_data_t *parallel_data, unsigned int index) {
1983 3 const subprocess_descriptor_t *spd = thread_spd;
1984 3 talp_info_t *talp_info = spd->talp_info;
1985
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
1986 /* Assign thread sample as team-worker of this parallel */
1987 3 talp_sample_t *sample = talp_get_thread_sample(spd);
1988 3 talp_sample_t **parallel_samples = parallel_data->talp_parallel_data;
1989 /* The compiler probably optimizes this, but skip the store to avoid
1990 * dirtying the cache line when the parallel data is reused */
1991
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
3 if (parallel_samples[index] != sample) {
1992 2 parallel_samples[index] = sample;
1993 }
1994
1995 /* Update thread sample with the last microsample */
1996 3 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
1997
1998 /* Update state */
1999 3 set_sample_state(sample, useful, talp_info->flags.papi);
2000 }
2001 3 }
2002
2003 3 void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) {
2004 3 const subprocess_descriptor_t *spd = thread_spd;
2005 3 talp_info_t *talp_info = spd->talp_info;
2006
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
2007 /* Update thread sample with the last microsample */
2008 3 talp_sample_t *sample = talp_get_thread_sample(spd);
2009 3 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
2010
2011 /* Update state */
2012 3 set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
2013 }
2014 3 }
2015
2016 3 void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) {
2017 3 const subprocess_descriptor_t *spd = thread_spd;
2018 3 talp_info_t *talp_info = spd->talp_info;
2019
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
2020 /* Update thread sample with the last microsample */
2021 3 talp_sample_t *sample = talp_get_thread_sample(spd);
2022 3 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
2023
2024 /* Update state */
2025 3 set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
2026 }
2027 3 }
2028
2029 3 void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) {
2030 3 const subprocess_descriptor_t *spd = thread_spd;
2031 3 talp_info_t *talp_info = spd->talp_info;
2032
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
2033 /* Update thread sample with the last microsample */
2034 3 talp_sample_t *sample = talp_get_thread_sample(spd);
2035 3 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
2036
2037 /* Update state */
2038 3 set_sample_state(sample, useful, talp_info->flags.papi);
2039 }
2040 3 }
2041
2042 3 void talp_openmp_task_create(void) {
2043 3 const subprocess_descriptor_t *spd = thread_spd;
2044 3 talp_info_t *talp_info = spd->talp_info;
2045
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
2046 /* Just update stats */
2047 3 talp_sample_t *sample = talp_get_thread_sample(spd);
2048 3 DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1);
2049 }
2050 3 }
2051
2052 3 void talp_openmp_task_complete(void) {
2053 3 const subprocess_descriptor_t *spd = thread_spd;
2054 3 talp_info_t *talp_info = spd->talp_info;
2055
1/2
✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
3 if (talp_info) {
2056 /* Update thread sample with the last microsample */
2057 3 talp_sample_t *sample = talp_get_thread_sample(spd);
2058 3 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
2059
2060 /* Update state (FIXME: tasks outside of parallels?) */
2061 3 set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi);
2062 }
2063 3 }
2064
2065 6 void talp_openmp_task_switch(void) {
2066 6 const subprocess_descriptor_t *spd = thread_spd;
2067 6 talp_info_t *talp_info = spd->talp_info;
2068
1/2
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
6 if (talp_info) {
2069 /* Update thread sample with the last microsample */
2070 6 talp_sample_t *sample = talp_get_thread_sample(spd);
2071 6 talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP);
2072
2073 /* Update state */
2074 6 set_sample_state(sample, useful, talp_info->flags.papi);
2075 }
2076 6 }
2077
2078 /*********************************************************************************/
2079 /* TALP collect functions for 3rd party programs: */
2080 /* - It is also safe to call them from a 1st party program */
2081 /* - Requires --talp-external-profiler to be set in the application */
2082 /* - Does not need to synchronize with the application */
2083 /*********************************************************************************/
2084
2085 /* Function that may be called from a third-party process to compute
2086 * node_metrics for a given region */
2087 5 int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) {
2088
2089
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
5 if (name == NULL) {
2090 1 name = global_region_name;
2091 }
2092
2093 5 int error = DLB_SUCCESS;
2094 5 int64_t total_mpi_time = 0;
2095 5 int64_t total_useful_time = 0;
2096 5 int64_t max_mpi_time = 0;
2097 5 int64_t max_useful_time = 0;
2098
2099 /* Obtain a list of regions in the node associated with given region */
2100 5 int max_procs = mu_get_system_size();
2101 5 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
2102 int nelems;
2103 5 shmem_talp__get_regionlist(region_list, &nelems, max_procs, name);
2104
2105 /* Count how many processes have started the region */
2106 5 int processes_per_node = 0;
2107
2108 /* Iterate the PID list and gather times of every process */
2109 int i;
2110
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 5 times.
11 for (i = 0; i < nelems; ++i) {
2111 6 int64_t mpi_time = region_list[i].mpi_time;
2112 6 int64_t useful_time = region_list[i].useful_time;
2113
2114 /* Accumulate total and max values */
2115
3/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
6 if (mpi_time > 0 || useful_time > 0) {
2116 6 ++processes_per_node;
2117 6 total_mpi_time += mpi_time;
2118 6 total_useful_time += useful_time;
2119 6 max_mpi_time = max_int64(mpi_time, max_mpi_time);
2120 6 max_useful_time = max_int64(useful_time, max_useful_time);
2121 }
2122 }
2123 5 free(region_list);
2124
2125 #if MPI_LIB
2126 int node_id = _node_id;
2127 #else
2128 5 int node_id = 0;
2129 #endif
2130
2131
1/2
✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
5 if (processes_per_node > 0) {
2132 /* Compute POP metrics with some inferred values */
2133 perf_metrics_mpi_t metrics;
2134 5 perf_metrics__infer_mpi_model(
2135 &metrics,
2136 processes_per_node,
2137 total_useful_time,
2138 total_mpi_time,
2139 max_useful_time);
2140
2141 /* Initialize structure */
2142 5 *node_metrics = (const dlb_node_metrics_t) {
2143 .node_id = node_id,
2144 .processes_per_node = processes_per_node,
2145 .total_useful_time = total_useful_time,
2146 .total_mpi_time = total_mpi_time,
2147 .max_useful_time = max_useful_time,
2148 .max_mpi_time = max_mpi_time,
2149 5 .parallel_efficiency = metrics.parallel_efficiency,
2150 5 .communication_efficiency = metrics.communication_efficiency,
2151 5 .load_balance = metrics.load_balance,
2152 };
2153 5 snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name);
2154 } else {
2155 error = DLB_ERR_NOENT;
2156 }
2157
2158 5 return error;
2159 }
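/* Editor's note: perf_metrics__infer_mpi_model is defined in
 * support/perf_metrics.h and is not shown here, so the formulas below are an
 * assumption for illustration only, based on the usual POP definitions:
 *
 *   double avg_useful = (double)total_useful_time / processes_per_node;
 *   double elapsed    = (double)(total_useful_time + total_mpi_time)
 *                       / processes_per_node;            (inferred runtime)
 *   load_balance             = avg_useful / max_useful_time;
 *   communication_efficiency = max_useful_time / elapsed;
 *   parallel_efficiency      = load_balance * communication_efficiency;
 */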
2160
2161
2162
2163 /*********************************************************************************/
2164 /* TALP collect functions for 1st party programs */
2165 /* - Requires synchronization (MPI or node barrier) among all processes */
2166 /*********************************************************************************/
2167
2168 /* Perform MPI collective calls and compute the current POP metrics for the
2169 * specified monitor. If monitor is NULL, the global monitoring region is
2170 * assumed.
2171 * Pre-conditions:
2172 * - the given monitor must have been registered in all MPI ranks
2173 * - pop_metrics is an allocated structure
2174 */
2175 1 int talp_collect_pop_metrics(const subprocess_descriptor_t *spd,
2176 dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) {
2177 #ifdef MPI_LIB
2178 talp_info_t *talp_info = spd->talp_info;
2179 if (monitor == NULL) {
2180 monitor = talp_info->monitor;
2181 }
2182
2183 /* Stop monitor so that metrics are updated */
2184 bool resume_region = monitoring_region_stop(spd, monitor) == DLB_SUCCESS;
2185
2186 /* Reduce monitor among all MPI ranks; everybody collects (all-to-all) */
2187 pop_base_metrics_t base_metrics;
2188 talp_reduce_pop_metrics(&base_metrics, monitor, true);
2189
2190 /* Construct output pop_metrics out of base metrics */
2191 talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, pop_metrics);
2192
2193 /* Resume monitor */
2194 if (resume_region) {
2195 monitoring_region_start(spd, monitor);
2196 }
2197
2198 return DLB_SUCCESS;
2199 #else
2200 1 return DLB_ERR_NOCOMP;
2201 #endif
2202 }
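/* Editor's usage sketch (assumes an MPI build; thread_spd is DLB's pointer to
 * the current subprocess descriptor): collect POP metrics for the global
 * region by passing monitor == NULL. */

dlb_pop_metrics_t pop_metrics;
if (talp_collect_pop_metrics(thread_spd, NULL, &pop_metrics) == DLB_SUCCESS) {
    /* pop_metrics.name holds the region name; the remaining fields carry
     * the metrics reduced across all MPI ranks */
}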
2203
2204 /* Node-collective function to compute node_metrics for a given region */
2205 4 int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd,
2206 dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) {
2207
2208 4 talp_info_t *talp_info = spd->talp_info;
2209
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 monitor = monitor ? monitor : talp_info->monitor;
2210 4 monitor_data_t *monitor_data = monitor->_data;
2211
2212 /* Stop monitor so that metrics are updated */
2213 4 bool resume_region = monitoring_region_stop(spd, monitor) == DLB_SUCCESS;
2214
2215 /* This functionality needs a shared memory, create a temporary one if needed */
2216
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (!talp_info->flags.have_shmem) {
2217 4 shmem_talp__init(spd->options.shm_key, 1);
2218 4 shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name,
2219 &monitor_data->node_shared_id);
2220 }
2221
2222 /* Update the shared memory with this process' metrics */
2223 4 shmem_talp__set_times(monitor_data->node_shared_id,
2224 monitor->mpi_time,
2225 monitor->useful_time);
2226
2227 /* Perform a node barrier to ensure everyone has updated their metrics */
2228 4 node_barrier(spd, NULL);
2229
2230 /* Compute node metrics for that region name */
2231 4 talp_query_pop_node_metrics(monitor->name, node_metrics);
2232
2233 /* Remove shared memory if it was a temporary one */
2234
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (!talp_info->flags.have_shmem) {
2235 4 shmem_talp__finalize(spd->id);
2236 }
2237
2238 /* Resume monitor */
2239
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 3 times.
4 if (resume_region) {
2240 1 monitoring_region_start(spd, monitor);
2241 }
2242
2243 4 return DLB_SUCCESS;
2244 }
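/* Editor's usage sketch: node-collective variant for the global region. All
 * processes in the node must make this call, or the node barrier blocks. */

dlb_node_metrics_t node_metrics;
if (talp_collect_pop_node_metrics(thread_spd, NULL, &node_metrics) == DLB_SUCCESS) {
    /* node_metrics now holds per-node totals, maxima, and efficiencies */
}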
2245