Line | Branch | Exec | Source |
---|---|---|---|
1 | /*********************************************************************************/ | ||
2 | /* Copyright 2009-2024 Barcelona Supercomputing Center */ | ||
3 | /* */ | ||
4 | /* This file is part of the DLB library. */ | ||
5 | /* */ | ||
6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
9 | /* (at your option) any later version. */ | ||
10 | /* */ | ||
11 | /* DLB is distributed in the hope that it will be useful, */ | ||
12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
14 | /* GNU Lesser General Public License for more details. */ | ||
15 | /* */ | ||
16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
18 | /*********************************************************************************/ | ||
19 | |||
20 | #ifdef HAVE_CONFIG_H | ||
21 | #include <config.h> | ||
22 | #endif | ||
23 | |||
24 | #include "LB_core/DLB_talp.h" | ||
25 | |||
26 | #include "LB_core/node_barrier.h" | ||
27 | #include "LB_core/spd.h" | ||
28 | #include "LB_core/talp_types.h" | ||
29 | #include "apis/dlb_talp.h" | ||
30 | #include "apis/dlb_errors.h" | ||
31 | #include "support/debug.h" | ||
32 | #include "support/error.h" | ||
33 | #include "support/mytime.h" | ||
34 | #include "support/tracing.h" | ||
35 | #include "support/options.h" | ||
36 | #include "support/mask_utils.h" | ||
37 | #include "support/perf_metrics.h" | ||
38 | #include "support/talp_output.h" | ||
39 | #include "LB_comm/shmem_talp.h" | ||
40 | #include "LB_numThreads/omptool.h" | ||
41 | #ifdef MPI_LIB | ||
42 | #include "LB_MPI/process_MPI.h" | ||
43 | #endif | ||
44 | |||
45 | #include <sched.h> | ||
46 | #include <string.h> | ||
47 | #include <stdlib.h> | ||
48 | #include <pthread.h> | ||
49 | |||
50 | #ifdef PAPI_LIB | ||
51 | #include <papi.h> | ||
52 | #endif | ||
53 | |||
54 | extern __thread bool thread_is_observer; | ||
55 | |||
56 | static inline void set_sample_state(talp_sample_t *sample, enum talp_sample_state state, | ||
57 | bool papi); | ||
58 | static void talp_dealloc_samples(const subprocess_descriptor_t *spd); | ||
59 | static void talp_record_monitor(const subprocess_descriptor_t *spd, | ||
60 | const dlb_monitor_t *monitor); | ||
61 | static void talp_aggregate_samples(const subprocess_descriptor_t *spd); | ||
62 | static talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd); | ||
63 | |||
64 | #if MPI_LIB | ||
65 | static MPI_Datatype mpi_int64_type; | ||
66 | #endif | ||
67 | |||
68 | #if PAPI_LIB | ||
69 | static __thread int EventSet = PAPI_NULL; | ||
70 | #endif | ||
71 | |||
72 | const char *global_region_name = "Global"; | ||
73 | |||
74 | |||
75 | /*********************************************************************************/ | ||
76 | /* TALP Monitoring Regions */ | ||
77 | /*********************************************************************************/ | ||
78 | |||
79 | /* Helper function for GTree: Compare region names */ | ||
80 | 19817 | static gint key_compare_func(gconstpointer a, gconstpointer b) { | |
81 | 19817 | return strncmp(a, b, DLB_MONITOR_NAME_MAX-1); | |
82 | } | ||
83 | |||
84 | /* Helper function for GTree: deallocate */ | ||
85 | 1039 | static void dealloc_region(gpointer data) { | |
86 | |||
87 | 1039 | dlb_monitor_t *monitor = data; | |
88 | |||
89 | /* Free private data */ | ||
90 | 1039 | monitor_data_t *monitor_data = monitor->_data; | |
91 | 1039 | free(monitor_data); | |
92 | 1039 | monitor_data = NULL; | |
93 | |||
94 | /* Free name */ | ||
95 | 1039 | free((char*)monitor->name); | |
96 | 1039 | monitor->name = NULL; | |
97 | |||
98 | /* Free monitor */ | ||
99 | 1039 | free(monitor); | |
100 | 1039 | } | |
101 | |||
102 | /* Unique region ids */ | ||
103 | 1039 | static int get_new_monitor_id(void) { | |
104 | static atomic_int id = 0; | ||
105 | 1039 | return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1); | |
106 | } | ||
107 | |||
108 | /* Unique anonymous regions */ | ||
109 | 4 | static int get_new_anonymous_id(void) { | |
110 | static atomic_int id = 0; | ||
111 | 4 | return DLB_ATOMIC_ADD_FETCH_RLX(&id, 1); | |
112 | } | ||
113 | |||
114 | /* Return true if the region is to be enabled */ | ||
115 | 1039 | static bool parse_region_select(const char *region_select, const char *region_name) { | |
116 | /* Default case, all regions enabled */ | ||
117 |
1/2✓ Branch 0 taken 1039 times.
✗ Branch 1 not taken.
|
1039 | if (region_select == NULL |
118 |
2/2✓ Branch 0 taken 1011 times.
✓ Branch 1 taken 28 times.
|
1039 | || region_select[0] == '\0' |
119 |
2/2✓ Branch 0 taken 1004 times.
✓ Branch 1 taken 7 times.
|
1011 | || strcmp(region_select, "all") == 0) { |
120 | 1032 | return true; | |
121 | } | ||
122 | |||
123 | /* Special case, all regions disabled */ | ||
124 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 6 times.
|
7 | if (strcmp(region_select, "none") == 0) { |
125 | 1 | return false; | |
126 | } | ||
127 | |||
128 | /* Break region_select into tokens and find region_name */ | ||
129 | 6 | bool found_in_select = false; | |
130 | 6 | size_t len = strlen(region_select); | |
131 | 6 | char *region_select_copy = malloc(sizeof(char)*(len+1)); | |
132 | 6 | strcpy(region_select_copy, region_select); | |
133 | char *saveptr; | ||
134 | 6 | char *token = strtok_r(region_select_copy, ",", &saveptr); | |
135 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 3 times.
|
12 | while (token) { |
136 | /* Region name is found */ | ||
137 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 7 times.
|
9 | if (strcmp(token, region_name) == 0) { |
138 | 2 | found_in_select = true; | |
139 | 2 | break; | |
140 | } | ||
141 | |||
142 | /* Region name is same as global region, and same as token (ignoring case) */ | ||
143 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 4 times.
|
7 | if (strcasecmp(region_name, global_region_name) == 0 |
144 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | && strcasecmp(token, global_region_name) == 0) { |
145 | 1 | found_in_select = true; | |
146 | 1 | break; | |
147 | } | ||
148 | |||
149 | /* next token */ | ||
150 | 6 | token = strtok_r(NULL, ",", &saveptr); | |
151 | } | ||
152 | 6 | free(region_select_copy); | |
153 | 6 | return found_in_select; | |
154 | } | ||
155 | |||
156 | 1039 | static void monitoring_region_initialize(dlb_monitor_t *monitor, int id, const | |
157 | char *name, pid_t pid, float avg_cpus, const char *region_select, bool have_shmem) { | ||
158 | /* Initialize private monitor data */ | ||
159 | 1039 | monitor_data_t *monitor_data = malloc(sizeof(monitor_data_t)); | |
160 | 1039 | *monitor_data = (const monitor_data_t) { | |
161 | .id = id, | ||
162 | .node_shared_id = -1, | ||
163 | }; | ||
164 | |||
165 | /* Parse --talp-region-select if needed */ | ||
166 | 1039 | monitor_data->flags.enabled = parse_region_select(region_select, name); | |
167 | |||
168 | /* Allocate monitor name */ | ||
169 | 1039 | char *allocated_name = malloc(DLB_MONITOR_NAME_MAX*sizeof(char)); | |
170 | 1039 | snprintf(allocated_name, DLB_MONITOR_NAME_MAX, "%s", name); | |
171 | |||
172 | /* Initialize monitor */ | ||
173 | 1039 | *monitor = (const dlb_monitor_t) { | |
174 | .name = allocated_name, | ||
175 | .avg_cpus = avg_cpus, | ||
176 | ._data = monitor_data, | ||
177 | }; | ||
178 | |||
179 | /* Register name in the instrumentation tool */ | ||
180 | instrument_register_event(MONITOR_REGION, monitor_data->id, name); | ||
181 | |||
182 | /* Register region in shmem */ | ||
183 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1036 times.
|
1039 | if (have_shmem) { |
184 | 3 | int err = shmem_talp__register(pid, avg_cpus, monitor->name, &monitor_data->node_shared_id); | |
185 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
|
3 | if (err == DLB_ERR_NOMEM) { |
186 | ✗ | warning("Region %s has been correctly registered but cannot be shared among other" | |
187 | " processes due to lack of space in the TALP shared memory. Features like" | ||
188 | " node report or gathering data from external processes may not work for" | ||
189 | " this region. If needed, increase the TALP shared memory capacity using" | ||
190 | " the flag --talp-regions-per-proc. Run dlb -hh for more info.", | ||
191 | monitor->name); | ||
192 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
|
3 | } else if (err < DLB_SUCCESS) { |
193 | ✗ | fatal("Unknown error registering region %s, please report bug at %s", | |
194 | monitor->name, PACKAGE_BUGREPORT); | ||
195 | } | ||
196 | } | ||
197 | 1039 | } | |
198 | |||
199 | 7 | struct dlb_monitor_t* monitoring_region_get_global_region( | |
200 | const subprocess_descriptor_t *spd) { | ||
201 | 7 | talp_info_t *talp_info = spd->talp_info; | |
202 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 1 times.
|
7 | return talp_info ? talp_info->monitor : NULL; |
203 | } | ||
204 | |||
205 | 6 | const char* monitoring_region_get_global_region_name(void) { | |
206 | 6 | return global_region_name; | |
207 | } | ||
208 | |||
/* Look up a monitoring region by name, or create and register a new one.
 *
 *  spd:  subprocess descriptor; its talp_info must be initialized
 *  name: region name; NULL or "" requests a new anonymous region, and any
 *        name matching the global region name (case-insensitive) is aliased
 *        to the global region
 *
 * Returns the (possibly pre-existing) region, or NULL if TALP is not
 * initialized. The returned monitor is owned by talp_info->regions and is
 * deallocated in talp_finalize via dealloc_region. */
dlb_monitor_t* monitoring_region_register(const subprocess_descriptor_t *spd,
        const char* name) {

    talp_info_t *talp_info = spd->talp_info;
    if (talp_info == NULL) return NULL;

    dlb_monitor_t *monitor = NULL;
    bool anonymous_region = (name == NULL || *name == '\0');
    /* Fast path: caller passed the exact global_region_name pointer */
    bool global_region = !anonymous_region && name == global_region_name;

    /* Check again if the pointers are different but the string content is the
     * same as the global region, ignoring case */
    if (!anonymous_region
            && !global_region
            && strncasecmp(global_region_name, name, DLB_MONITOR_NAME_MAX-1) == 0) {
        name = global_region_name;
        global_region = true;
    }

    /* Found monitor if already registered */
    if (!anonymous_region) {
        pthread_mutex_lock(&talp_info->regions_mutex);
        {
            monitor = g_tree_lookup(talp_info->regions, name);
        }
        pthread_mutex_unlock(&talp_info->regions_mutex);

        if (monitor != NULL) {
            return monitor;
        }
    }

    /* Otherwise, create new monitoring region */
    monitor = malloc(sizeof(dlb_monitor_t));
    fatal_cond(!monitor, "Could not register a new monitoring region."
            " Please report at "PACKAGE_BUGREPORT);

    /* Determine the initial number of assigned CPUs for the region */
    float avg_cpus = CPU_COUNT(&spd->process_mask);

    /* Construct name if anonymous region */
    char monitor_name[DLB_MONITOR_NAME_MAX];
    if (anonymous_region) {
        snprintf(monitor_name, DLB_MONITOR_NAME_MAX, "Anonymous Region %d",
                get_new_anonymous_id());
        name = monitor_name;
    }

    /* Initialize values.
     * The region gets a shmem slot either when the full shmem is enabled, or
     * when only the minimal (global-region-only) shmem is enabled and this is
     * the global region. */
    bool have_shmem = talp_info->flags.have_shmem
        || (talp_info->flags.have_minimal_shmem && global_region);
    monitoring_region_initialize(monitor, get_new_monitor_id(), name,
            spd->id, avg_cpus, spd->options.talp_region_select, have_shmem);

    /* Finally, insert.
     * Note: lookup and insert are done under separate lock acquisitions;
     * monitoring_region_initialize is deliberately called outside the lock. */
    pthread_mutex_lock(&talp_info->regions_mutex);
    {
        g_tree_insert(talp_info->regions, (gpointer)monitor->name, monitor);
    }
    pthread_mutex_unlock(&talp_info->regions_mutex);

    return monitor;
}
272 | |||
273 | 3 | int monitoring_region_reset(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) { | |
274 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | if (monitor == DLB_GLOBAL_REGION) { |
275 | 1 | talp_info_t *talp_info = spd->talp_info; | |
276 | 1 | monitor = talp_info->monitor; | |
277 | } | ||
278 | |||
279 | /* Reset everything except these fields: */ | ||
280 | 3 | *monitor = (const dlb_monitor_t) { | |
281 | 3 | .name = monitor->name, | |
282 | 3 | .num_resets = monitor->num_resets + 1, | |
283 | 3 | ._data = monitor->_data, | |
284 | }; | ||
285 | |||
286 | 3 | monitor_data_t *monitor_data = monitor->_data; | |
287 | 3 | monitor_data->flags.started = false; | |
288 | |||
289 | 3 | return DLB_SUCCESS; | |
290 | } | ||
291 | |||
/* Start (open) a monitoring region: aggregate pending thread samples, record
 * the start timestamp, and add the region to the list of open regions.
 *
 * Returns DLB_SUCCESS on success, DLB_NOUPDT if the region was already
 * started or is disabled by --talp-region-select, and DLB_ERR_PERM when
 * called from an observer thread. */
int monitoring_region_start(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
    /* Observer threads don't have a valid sample so they cannot start/stop regions */
    if (unlikely(thread_is_observer)) return DLB_ERR_PERM;

    talp_info_t *talp_info = spd->talp_info;
    if (monitor == DLB_GLOBAL_REGION) {
        monitor = talp_info->monitor;
    }

    int error;
    monitor_data_t *monitor_data = monitor->_data;

    if (!monitor_data->flags.started && monitor_data->flags.enabled) {
        /* Gather samples from all threads and update regions */
        talp_aggregate_samples(spd);

        verbose(VB_TALP, "Starting region %s", monitor->name);
        instrument_event(MONITOR_REGION, monitor_data->id, EVENT_BEGIN);

        /* Thread sample was just updated, use timestamp as starting time */
        talp_sample_t *thread_sample = talp_get_thread_sample(spd);
        monitor->start_time = thread_sample->last_updated_timestamp;
        monitor->stop_time = 0;

        /* Publish the region as started under the regions lock */
        pthread_mutex_lock(&talp_info->regions_mutex);
        {
            monitor_data->flags.started = true;
            /* Prepend: the head of open_regions is always the innermost region */
            talp_info->open_regions = g_slist_prepend(talp_info->open_regions, monitor);
        }
        pthread_mutex_unlock(&talp_info->regions_mutex);

        /* Normally, the sample state will be 'useful' at this point, but on
         * certain cases where neither talp_mpi_init nor talp_openmp_init have
         * been called, this is necessary */
        if (thread_sample->state != useful) {
            set_sample_state(thread_sample, useful, talp_info->flags.papi);
        }

        error = DLB_SUCCESS;
    } else {
        error = DLB_NOUPDT;
    }

    return error;
}
337 | |||
/* Stop (close) a monitoring region: aggregate pending thread samples, record
 * the stop timestamp, accumulate elapsed time, and remove the region from
 * the list of open regions.
 *
 * Accepts DLB_GLOBAL_REGION and DLB_LAST_OPEN_REGION as pseudo-handles.
 * Returns DLB_SUCCESS on success, DLB_NOUPDT if the region was not started,
 * DLB_ERR_NOENT if DLB_LAST_OPEN_REGION was given but no region is open, and
 * DLB_ERR_PERM when called from an observer thread. */
int monitoring_region_stop(const subprocess_descriptor_t *spd, dlb_monitor_t *monitor) {
    /* Observer threads don't have a valid sample so they cannot start/stop regions */
    if (unlikely(thread_is_observer)) return DLB_ERR_PERM;

    talp_info_t *talp_info = spd->talp_info;
    if (monitor == DLB_GLOBAL_REGION) {
        monitor = talp_info->monitor;
    } else if (monitor == DLB_LAST_OPEN_REGION) {
        /* The head of open_regions is the most recently started region */
        if (talp_info->open_regions != NULL) {
            monitor = talp_info->open_regions->data;
        } else {
            return DLB_ERR_NOENT;
        }
    }

    int error;
    monitor_data_t *monitor_data = monitor->_data;

    if (monitor_data->flags.started) {
        /* Gather samples from all threads and update regions */
        talp_aggregate_samples(spd);

        /* Stop timer */
        talp_sample_t *thread_sample = talp_get_thread_sample(spd);
        monitor->stop_time = thread_sample->last_updated_timestamp;
        monitor->elapsed_time += monitor->stop_time - monitor->start_time;
        ++(monitor->num_measurements);

        /* Unpublish the region under the regions lock */
        pthread_mutex_lock(&talp_info->regions_mutex);
        {
            monitor_data->flags.started = false;
            talp_info->open_regions = g_slist_remove(talp_info->open_regions, monitor);
        }
        pthread_mutex_unlock(&talp_info->regions_mutex);

        verbose(VB_TALP, "Stopping region %s", monitor->name);
        instrument_event(MONITOR_REGION, monitor_data->id, EVENT_END);
        error = DLB_SUCCESS;
    } else {
        error = DLB_NOUPDT;
    }

    return error;
}
382 | |||
383 | 9 | bool monitoring_region_is_started(const dlb_monitor_t *monitor) { | |
384 | 9 | return ((monitor_data_t*)monitor->_data)->flags.started; | |
385 | } | ||
386 | |||
387 | 1 | void monitoring_region_set_internal(struct dlb_monitor_t *monitor, bool internal) { | |
388 | 1 | ((monitor_data_t*)monitor->_data)->flags.internal = internal; | |
389 | 1 | } | |
390 | |||
391 | 8 | int monitoring_region_report(const subprocess_descriptor_t *spd, const dlb_monitor_t *monitor) { | |
392 | 8 | talp_info_t *talp_info = spd->talp_info; | |
393 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (monitor == DLB_GLOBAL_REGION) { |
394 | 1 | monitor = talp_info->monitor; | |
395 | } | ||
396 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (((monitor_data_t*)monitor->_data)->flags.internal) { |
397 | 1 | return DLB_NOUPDT; | |
398 | } | ||
399 | |||
400 | #ifdef PAPI_LIB | ||
401 | bool have_papi = talp_info->flags.papi; | ||
402 | #else | ||
403 | 7 | bool have_papi = false; | |
404 | #endif | ||
405 | 7 | talp_output_print_monitoring_region(monitor, mu_to_str(&spd->process_mask), | |
406 | 7 | talp_info->flags.have_mpi, talp_info->flags.have_openmp, have_papi); | |
407 | |||
408 | 7 | return DLB_SUCCESS; | |
409 | } | ||
410 | |||
411 | 5 | int monitoring_regions_force_update(const subprocess_descriptor_t *spd) { | |
412 | /* Observer threads don't have a valid sample so they cannot start/stop regions */ | ||
413 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
|
5 | if (unlikely(thread_is_observer)) return DLB_ERR_PERM; |
414 | |||
415 | 4 | talp_aggregate_samples(spd); | |
416 | 4 | return DLB_SUCCESS; | |
417 | } | ||
418 | |||
419 | /* Update all monitoring regions with the macrosample */ | ||
420 | 2909 | static void monitoring_regions_update_all(const subprocess_descriptor_t *spd, | |
421 | const talp_macrosample_t *macrosample, int num_cpus) { | ||
422 | 2909 | talp_info_t *talp_info = spd->talp_info; | |
423 | |||
424 | /* Update all open regions */ | ||
425 | 2909 | pthread_mutex_lock(&talp_info->regions_mutex); | |
426 | { | ||
427 | 2909 | for (GSList *node = talp_info->open_regions; | |
428 |
2/2✓ Branch 0 taken 2298 times.
✓ Branch 1 taken 2909 times.
|
5207 | node != NULL; |
429 | 2298 | node = node->next) { | |
430 | 2298 | dlb_monitor_t *monitor = node->data; | |
431 | 2298 | monitor_data_t *monitor_data = monitor->_data; | |
432 | |||
433 | /* Update number of CPUs if needed */ | ||
434 | 2298 | monitor->num_cpus = max_int(monitor->num_cpus, num_cpus); | |
435 | |||
436 | /* Timers */ | ||
437 | 2298 | monitor->useful_time += macrosample->timers.useful; | |
438 | 2298 | monitor->mpi_time += macrosample->timers.not_useful_mpi; | |
439 | 2298 | monitor->omp_load_imbalance_time += macrosample->timers.not_useful_omp_in_lb; | |
440 | 2298 | monitor->omp_scheduling_time += macrosample->timers.not_useful_omp_in_sched; | |
441 | 2298 | monitor->omp_serialization_time += macrosample->timers.not_useful_omp_out; | |
442 | #ifdef PAPI_LIB | ||
443 | /* Counters */ | ||
444 | monitor->cycles += macrosample->counters.cycles; | ||
445 | monitor->instructions += macrosample->counters.instructions; | ||
446 | #endif | ||
447 | /* Stats */ | ||
448 | 2298 | monitor->num_mpi_calls += macrosample->stats.num_mpi_calls; | |
449 | 2298 | monitor->num_omp_parallels += macrosample->stats.num_omp_parallels; | |
450 | 2298 | monitor->num_omp_tasks += macrosample->stats.num_omp_tasks; | |
451 | |||
452 | /* Update shared memory only if requested */ | ||
453 |
2/2✓ Branch 0 taken 11 times.
✓ Branch 1 taken 2287 times.
|
2298 | if (talp_info->flags.external_profiler) { |
454 | 11 | shmem_talp__set_times(monitor_data->node_shared_id, | |
455 | monitor->mpi_time, | ||
456 | monitor->useful_time); | ||
457 | } | ||
458 | } | ||
459 | } | ||
460 | 2909 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
461 | 2909 | } | |
462 | |||
463 | /* Update all open nested regions (so, excluding the innermost) and add the | ||
464 | * time since its start time until the sample last timestamp (which is the time | ||
465 | * that has yet not been added to the regions) as omp_serialization_time */ | ||
466 | 1 | static void monitoring_regions_update_nested(const subprocess_descriptor_t *spd, | |
467 | const talp_sample_t *sample) { | ||
468 | |||
469 | 1 | talp_info_t *talp_info = spd->talp_info; | |
470 | |||
471 | /* Update all open nested regions */ | ||
472 | 1 | pthread_mutex_lock(&talp_info->regions_mutex); | |
473 | { | ||
474 | 2 | GSList *nested_open_regions = talp_info->open_regions | |
475 | 1 | ? talp_info->open_regions->next | |
476 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | : NULL; |
477 | |||
478 | 1 | for (GSList *node = nested_open_regions; | |
479 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | node != NULL; |
480 | 1 | node = node->next) { | |
481 | |||
482 | 1 | dlb_monitor_t *monitor = node->data; | |
483 | 1 | monitor->omp_serialization_time += | |
484 | 1 | sample->last_updated_timestamp - monitor->start_time; | |
485 | } | ||
486 | } | ||
487 | 1 | pthread_mutex_unlock(&talp_info->regions_mutex); | |
488 | 1 | } | |
489 | |||
490 | |||
491 | /*********************************************************************************/ | ||
492 | /* Init / Finalize */ | ||
493 | /*********************************************************************************/ | ||
494 | |||
/* Executed once per process: initialize the PAPI library and enable
 * per-thread tracing. Returns 0 on success, -1 on any PAPI error (a warning
 * is emitted). Compiled to a no-op returning 0 when PAPI_LIB is not set. */
static inline int init_papi(void) {
#ifdef PAPI_LIB
    ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
            "Error invoking %s when PAPI has been disabled", __FUNCTION__);

    /* Library init */
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        warning("PAPI Library versions differ");
        return -1;
    }

    /* Activate thread tracing */
    int error = PAPI_thread_init(pthread_self);
    if (error != PAPI_OK) {
        warning("PAPI Error during thread initialization. %d: %s",
                error, PAPI_strerror(error));
        return -1;
    }
#endif
    return 0;
}
517 | |||
/* Executed once per thread: register the thread with PAPI, create the
 * thread-local EventSet with PAPI_TOT_CYC and PAPI_TOT_INS, and start
 * counting. Returns 0 on success, -1 on any PAPI error (a warning is
 * emitted). Compiled to a no-op returning 0 when PAPI_LIB is not set. */
static inline int init_papi_counters(void) {
#ifdef PAPI_LIB
    ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
            "Error invoking %s when PAPI has been disabled", __FUNCTION__);

    int error = PAPI_register_thread();
    if (error != PAPI_OK) {
        warning("PAPI Error during thread registration. %d: %s",
                error, PAPI_strerror(error));
        return -1;
    }

    /* Eventset creation */
    EventSet = PAPI_NULL;
    error = PAPI_create_eventset(&EventSet);
    if (error != PAPI_OK) {
        warning("PAPI Error during eventset creation. %d: %s",
                error, PAPI_strerror(error));
        return -1;
    }

    /* Count total cycles and total instructions */
    int Events[2] = {PAPI_TOT_CYC, PAPI_TOT_INS};
    error = PAPI_add_events(EventSet, Events, 2);
    if (error != PAPI_OK) {
        warning("PAPI Error adding events. %d: %s",
                error, PAPI_strerror(error));
        return -1;
    }

    /* Start tracing */
    error = PAPI_start(EventSet);
    if (error != PAPI_OK) {
        warning("PAPI Error during tracing initialization: %d: %s",
                error, PAPI_strerror(error));
        return -1;
    }
#endif
    return 0;
}
558 | |||
/* Reset the calling thread's PAPI EventSet counters to zero (used when a
 * sample transitions to the 'useful' state). No-op when PAPI_LIB is not set. */
static inline void reset_papi_counters(void) {
#ifdef PAPI_LIB
    ensure( ((talp_info_t*)thread_spd->talp_info)->flags.papi,
            "Error invoking %s when PAPI has been disabled", __FUNCTION__);

    int error = PAPI_reset(EventSet);
    if (error != PAPI_OK) verbose(VB_TALP, "Error resetting counters");
#endif
}
568 | |||
/* Initialize the TALP module for one subprocess:
 *  - allocate and populate talp_info from spd->options
 *  - initialize the TALP shared memory when needed (full shmem for the
 *    external profiler, or a minimal one-region shmem for the node summary)
 *  - register the implicit global region
 *  - initialize PAPI if requested and available
 * Must not be called twice for the same spd, nor from an observer thread. */
void talp_init(subprocess_descriptor_t *spd) {
    ensure(!spd->talp_info, "TALP already initialized");
    ensure(!thread_is_observer, "An observer thread cannot call talp_init");
    verbose(VB_TALP, "Initializing TALP module");

    /* Initialize talp info */
    talp_info_t *talp_info = malloc(sizeof(talp_info_t));
    *talp_info = (const talp_info_t) {
        .flags = {
            .external_profiler = spd->options.talp_external_profiler,
            .papi = spd->options.talp_papi,
            /* Full shmem only when an external profiler will read it */
            .have_shmem = spd->options.talp_external_profiler,
            /* Otherwise, a minimal shmem if the node summary was requested */
            .have_minimal_shmem = !spd->options.talp_external_profiler
                && spd->options.talp_summary & SUMMARY_NODE,
        },
        .regions = g_tree_new_full(
                (GCompareDataFunc)key_compare_func,
                NULL, NULL, dealloc_region),
        .regions_mutex = PTHREAD_MUTEX_INITIALIZER,
        .samples_mutex = PTHREAD_MUTEX_INITIALIZER,
    };
    spd->talp_info = talp_info;

    /* Initialize shared memory: full capacity, 1 region, or none */
    int regions_per_proc = talp_info->flags.have_shmem
        ? spd->options.talp_regions_per_proc
        : talp_info->flags.have_minimal_shmem
            ? 1
            : 0;
    if (regions_per_proc > 0) {
        shmem_talp__init(spd->options.shm_key, regions_per_proc);
    }

    /* Initialize global region monitor
     * (at this point we don't know how many CPUs, it will be fixed in talp_openmp_init) */
    talp_info->monitor = monitoring_region_register(spd, global_region_name);

    verbose(VB_TALP, "TALP module with workers mask: %s", mu_to_str(&spd->process_mask));

    /* Initialize and start running PAPI; on failure the option is disabled */
    if (talp_info->flags.papi) {
#ifdef PAPI_LIB
        if (init_papi() != 0 || init_papi_counters() != 0) {
            warning("PAPI initialization has failed, disabling option.");
            talp_info->flags.papi = false;
        }
#else
        warning("DLB has not been configured with PAPI support, disabling option.");
        talp_info->flags.papi = false;
#endif
    }
}
621 | |||
/* Finalize the TALP module for one subprocess:
 *  - without MPI support: stop any still-open regions and record every
 *    registered region into talp_output (with MPI this is done elsewhere)
 *  - flush all collected summaries to the output file
 *  - deallocate samples, shmem, PAPI, the regions tree, and talp_info
 * Must be called after talp_init, and not from an observer thread. */
void talp_finalize(subprocess_descriptor_t *spd) {
    ensure(spd->talp_info, "TALP is not initialized");
    ensure(!thread_is_observer, "An observer thread cannot call talp_finalize");
    verbose(VB_TALP, "Finalizing TALP module");

    talp_info_t *talp_info = spd->talp_info;
    if (!talp_info->flags.have_mpi) {
        /* If we don't have MPI support, regions may be still running and
         * without being recorded to talp_output. Do that now. */

        /* Stop open regions
         * (Note that monitoring_region_stop needs to acquire the regions_mutex
         * lock, so we need to iterate without it; each stop removes the head
         * of the list until it is empty) */
        while(talp_info->open_regions != NULL) {
            dlb_monitor_t *monitor = talp_info->open_regions->data;
            monitoring_region_stop(spd, monitor);
        }

        pthread_mutex_lock(&talp_info->regions_mutex);
        {
            /* Record all regions */
            for (GTreeNode *node = g_tree_node_first(talp_info->regions);
                    node != NULL;
                    node = g_tree_node_next(node)) {
                const dlb_monitor_t *monitor = g_tree_node_value(node);
                talp_record_monitor(spd, monitor);
            }
        }
        pthread_mutex_unlock(&talp_info->regions_mutex);
    }

    /* Print/write all collected summaries */
    talp_output_finalize(spd->options.talp_output_file);

    /* Deallocate samples structure */
    talp_dealloc_samples(spd);

    /* Finalize shared memory */
    if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
        shmem_talp__finalize(spd->id);
    }

#ifdef PAPI_LIB
    if (talp_info->flags.papi) {
        PAPI_shutdown();
    }
#endif

    /* Deallocate monitoring regions and talp_info */
    pthread_mutex_lock(&talp_info->regions_mutex);
    {
        /* Destroy GTree, each node is deallocated with the function dealloc_region */
        g_tree_destroy(talp_info->regions);
        talp_info->regions = NULL;
        talp_info->monitor = NULL;

        /* Destroy list of open regions */
        g_slist_free(talp_info->open_regions);
        talp_info->open_regions = NULL;
    }
    pthread_mutex_unlock(&talp_info->regions_mutex);
    free(talp_info);
    spd->talp_info = NULL;
}
686 | |||
687 | |||
688 | /*********************************************************************************/ | ||
689 | /* Sample functions */ | ||
690 | /*********************************************************************************/ | ||
691 | |||
692 | static __thread talp_sample_t* _tls_sample = NULL; | ||
693 | |||
694 | /* WARNING: this function may only be called when updating own thread's sample */ | ||
695 | 67 | static inline void set_sample_state(talp_sample_t *sample, enum talp_sample_state state, | |
696 | bool papi) { | ||
697 | 67 | sample->state = state; | |
698 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 67 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
67 | if (papi && state == useful) { |
699 | ✗ | reset_papi_counters(); | |
700 | } | ||
701 | instrument_event(MONITOR_STATE, | ||
702 | state == disabled ? MONITOR_STATE_DISABLED | ||
703 | : state == useful ? MONITOR_STATE_USEFUL | ||
704 | : state == not_useful_mpi ? MONITOR_STATE_NOT_USEFUL_MPI | ||
705 | : state == not_useful_omp_in ? MONITOR_STATE_NOT_USEFUL_OMP_IN | ||
706 | : state == not_useful_omp_out ? MONITOR_STATE_NOT_USEFUL_OMP_OUT | ||
707 | : 0, | ||
708 | EVENT_BEGIN); | ||
709 | 67 | } | |
710 | |||
/* Get the TLS associated sample.
 *
 * Returns the calling thread's talp_sample_t, allocating and registering it
 * in talp_info->samples on first use. Observer threads have no sample and
 * get NULL. Aborts via fatal_cond() if the allocation fails. */
static talp_sample_t* talp_get_thread_sample(const subprocess_descriptor_t *spd) {
    /* Thread already has an allocated sample, return it */
    if (likely(_tls_sample != NULL)) return _tls_sample;

    /* Observer threads don't have a valid sample */
    if (unlikely(thread_is_observer)) return NULL;

    /* Otherwise, allocate: grow the process-wide pointer array by one slot
     * and append a new cache-line-aligned sample for this thread */
    talp_info_t *talp_info = spd->talp_info;
    pthread_mutex_lock(&talp_info->samples_mutex);
    {
        int ncpus = ++talp_info->ncpus;
        void *samples = realloc(talp_info->samples, sizeof(talp_sample_t*)*ncpus);
        if (samples) {
            talp_info->samples = samples;
            void *new_sample;
            /* DLB_CACHE_LINE alignment keeps each thread's sample on its own
             * cache line (samples are written concurrently by their owners) */
            if (posix_memalign(&new_sample, DLB_CACHE_LINE, sizeof(talp_sample_t)) == 0) {
                _tls_sample = new_sample;
                talp_info->samples[ncpus-1] = new_sample;
            }
        }
    }
    pthread_mutex_unlock(&talp_info->samples_mutex);

    /* Either realloc or posix_memalign failed: cannot continue */
    fatal_cond(_tls_sample == NULL, "TALP: could not allocate thread sample");

    /* If a thread is created mid-region, its initial time is that of the
     * innermost open region, otherwise it is the current time */
    int64_t last_updated_timestamp;
    if (talp_info->open_regions) {
        const dlb_monitor_t *monitor = talp_info->open_regions->data;
        last_updated_timestamp = monitor->start_time;
    } else {
        last_updated_timestamp = get_time_in_ns();
    }

    /* Zero-initialize everything except the initial timestamp */
    *_tls_sample = (const talp_sample_t) {
        .last_updated_timestamp = last_updated_timestamp,
    };

    /* New samples start disabled; a region start will enable them */
    set_sample_state(_tls_sample, disabled, talp_info->flags.papi);

#ifdef INSTRUMENTATION_VERSION
    /* Emit zeroed counter events so traces start from a known baseline */
    unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
    long long papi_values[] = {0, 0};
    instrument_nevent(2, events, papi_values);
#endif

    return _tls_sample;
}
762 | |||
763 | 13 | static void talp_dealloc_samples(const subprocess_descriptor_t *spd) { | |
764 | /* This only nullifies the current thread pointer, but this function is | ||
765 | * only really important when TALP is finalized and then initialized again | ||
766 | */ | ||
767 | 13 | _tls_sample = NULL; | |
768 | |||
769 | 13 | talp_info_t *talp_info = spd->talp_info; | |
770 | 13 | pthread_mutex_lock(&talp_info->samples_mutex); | |
771 | { | ||
772 | 13 | free(talp_info->samples); | |
773 | 13 | talp_info->samples = NULL; | |
774 | } | ||
775 | 13 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
776 | 13 | } | |
777 | |||
/* Sentinel timestamp: tells talp_update_sample to read the clock itself */
enum { NO_TIMESTAMP = 0 };

/* Compute new microsample (time since last update) and update sample values.
 *
 *  sample:    thread sample to update; NULL for observer threads (no-op)
 *  papi:      whether PAPI hardware counters must be read
 *  timestamp: reference time in ns, or NO_TIMESTAMP to call get_time_in_ns()
 *
 * The elapsed time since the sample's last update is charged to the timer
 * that matches the sample's current state. */
static void talp_update_sample(talp_sample_t *sample, bool papi, int64_t timestamp) {
    /* Observer threads ignore this function */
    if (unlikely(sample == NULL)) return;

    /* Compute duration and set new last_updated_timestamp */
    int64_t now = timestamp == NO_TIMESTAMP ? get_time_in_ns() : timestamp;
    int64_t microsample_duration = now - sample->last_updated_timestamp;
    sample->last_updated_timestamp = now;

    /* Update the appropriate sample timer
     * (no default case: all talp_sample_state values are enumerated) */
    switch(sample->state) {
        case disabled:
            /* Disabled time is deliberately not accounted anywhere */
            break;
        case useful:
            DLB_ATOMIC_ADD_RLX(&sample->timers.useful, microsample_duration);
            break;
        case not_useful_mpi:
            DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_mpi, microsample_duration);
            break;
        case not_useful_omp_in:
            DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_in, microsample_duration);
            break;
        case not_useful_omp_out:
            DLB_ATOMIC_ADD_RLX(&sample->timers.not_useful_omp_out, microsample_duration);
            break;
    }

#ifdef PAPI_LIB
    if (papi) {
        /* Only read counters if we are updating this thread's sample */
        if (sample == talp_get_thread_sample(thread_spd)) {
            if (sample->state == useful) {
                /* Read */
                long long papi_values[2];
                int error = PAPI_read(EventSet, papi_values);
                if (error != PAPI_OK) {
                    verbose(VB_TALP, "stop return code: %d, %s", error, PAPI_strerror(error));
                }

                /* Atomically add papi_values to sample structure */
                DLB_ATOMIC_ADD_RLX(&sample->counters.cycles, papi_values[0]);
                DLB_ATOMIC_ADD_RLX(&sample->counters.instructions, papi_values[1]);

#ifdef INSTRUMENTATION_VERSION
                unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
                instrument_nevent(2, events, papi_values);
#endif

                /* Counters are reset here and each time the sample is set to useful */
                reset_papi_counters();
            }
            else {
#ifdef INSTRUMENTATION_VERSION
                /* Emit 0's to distinguish useful chunks in traces */
                unsigned events[] = {MONITOR_CYCLES, MONITOR_INSTR};
                long long papi_values[] = {0, 0};
                instrument_nevent(2, events, papi_values);
#endif
            }
        }
    }
#endif
}
843 | |||
844 | /* Flush and aggregate a single sample into a macrosample */ | ||
845 | 2903 | static inline void talp_aggregate_sample(talp_sample_t *sample, | |
846 | talp_macrosample_t *macrosample) { | ||
847 | /* Timers */ | ||
848 | 2903 | macrosample->timers.useful += | |
849 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->timers.useful, 0); | |
850 | 2903 | macrosample->timers.not_useful_mpi += | |
851 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_mpi, 0); | |
852 | 2903 | macrosample->timers.not_useful_omp_out += | |
853 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->timers.not_useful_omp_out, 0); | |
854 | /* timers.not_useful_omp_in is not flushed here, make sure struct is empty */ | ||
855 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2903 times.
|
2903 | ensure(DLB_ATOMIC_LD_RLX(&sample->timers.not_useful_omp_in) == 0, |
856 | "Inconsistency in TALP sample metric not_useful_omp_in." | ||
857 | " Please, report bug at " PACKAGE_BUGREPORT); | ||
858 | |||
859 | #ifdef PAPI_LIB | ||
860 | /* Counters */ | ||
861 | macrosample->counters.cycles += | ||
862 | DLB_ATOMIC_EXCH_RLX(&sample->counters.cycles, 0); | ||
863 | macrosample->counters.instructions += | ||
864 | DLB_ATOMIC_EXCH_RLX(&sample->counters.instructions, 0); | ||
865 | #endif | ||
866 | |||
867 | /* Stats */ | ||
868 | 2903 | macrosample->stats.num_mpi_calls += | |
869 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_mpi_calls, 0); | |
870 | 2903 | macrosample->stats.num_omp_parallels += | |
871 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_parallels, 0); | |
872 | 2903 | macrosample->stats.num_omp_tasks += | |
873 | 2903 | DLB_ATOMIC_EXCH_RLX(&sample->stats.num_omp_tasks, 0); | |
874 | 2903 | } | |
875 | |||
876 | /* Accumulate values from samples of all threads and update regions */ | ||
877 | 2907 | static void talp_aggregate_samples(const subprocess_descriptor_t *spd) { | |
878 | |||
879 | int num_cpus; | ||
880 | 2907 | talp_info_t *talp_info = spd->talp_info; | |
881 | |||
882 | /* Accumulate samples from all threads */ | ||
883 | 2907 | talp_macrosample_t macrosample = (const talp_macrosample_t) {}; | |
884 | 2907 | pthread_mutex_lock(&talp_info->samples_mutex); | |
885 | { | ||
886 | 2907 | num_cpus = talp_info->ncpus; | |
887 | |||
888 | /* Force-update and aggregate all samples */ | ||
889 | 2907 | int64_t timestamp = get_time_in_ns(); | |
890 |
2/2✓ Branch 0 taken 2900 times.
✓ Branch 1 taken 2907 times.
|
5807 | for (int i = 0; i < num_cpus; ++i) { |
891 | 2900 | talp_update_sample(talp_info->samples[i], talp_info->flags.papi, timestamp); | |
892 | 2900 | talp_aggregate_sample(talp_info->samples[i], ¯osample); | |
893 | } | ||
894 | } | ||
895 | 2907 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
896 | |||
897 | /* Update all started regions */ | ||
898 | 2907 | monitoring_regions_update_all(spd, ¯osample, num_cpus); | |
899 | 2907 | } | |
900 | |||
901 | /* Accumulate samples from only a subset of samples of a parallel region. | ||
902 | * Load Balance and Scheduling are computed here based on all samples. */ | ||
903 | 2 | static void talp_aggregate_samples_parallel(const subprocess_descriptor_t *spd, | |
904 | talp_sample_t **samples, unsigned int nelems) { | ||
905 | |||
906 | 2 | talp_info_t *talp_info = spd->talp_info; | |
907 | 2 | talp_macrosample_t macrosample = (const talp_macrosample_t) {}; | |
908 | 2 | pthread_mutex_lock(&talp_info->samples_mutex); | |
909 | { | ||
910 | /* Iterate first to force-update all samples and compute the minimum | ||
911 | * not-useful-omp-in among them */ | ||
912 | 2 | int64_t timestamp = get_time_in_ns(); | |
913 | 2 | int64_t min_not_useful_omp_in = INT64_MAX; | |
914 | unsigned int i; | ||
915 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (i=0; i<nelems; ++i) { |
916 | 3 | talp_update_sample(samples[i], talp_info->flags.papi, timestamp); | |
917 | 3 | min_not_useful_omp_in = min_int64(min_not_useful_omp_in, | |
918 | 3 | DLB_ATOMIC_LD_RLX(&samples[i]->timers.not_useful_omp_in)); | |
919 | } | ||
920 | |||
921 | /* Iterate again to accumulate Load Balance, and to aggregate sample */ | ||
922 | 2 | int64_t sched_timer = min_not_useful_omp_in * nelems; | |
923 | 2 | int64_t lb_timer = 0; | |
924 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | for (i=0; i<nelems; ++i) { |
925 | 3 | lb_timer += DLB_ATOMIC_EXCH_RLX(&samples[i]->timers.not_useful_omp_in, 0) | |
926 | 3 | - min_not_useful_omp_in; | |
927 | 3 | talp_aggregate_sample(samples[i], ¯osample); | |
928 | } | ||
929 | |||
930 | /* Update derived timers into macrosample */ | ||
931 | 2 | macrosample.timers.not_useful_omp_in_lb = lb_timer; | |
932 | 2 | macrosample.timers.not_useful_omp_in_sched = sched_timer; | |
933 | } | ||
934 | 2 | pthread_mutex_unlock(&talp_info->samples_mutex); | |
935 | |||
936 | /* Update all started regions */ | ||
937 | 2 | monitoring_regions_update_all(spd, ¯osample, nelems); | |
938 | 2 | } | |
939 | |||
940 | |||
941 | /*********************************************************************************/ | ||
942 | /* TALP Record in serial (non-MPI) mode */ | ||
943 | /*********************************************************************************/ | ||
944 | |||
945 | /* For any given monitor, record metrics considering only this (sub-)process */ | ||
946 | 13 | static void talp_record_monitor(const subprocess_descriptor_t *spd, | |
947 | const dlb_monitor_t *monitor) { | ||
948 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 11 times.
|
13 | if (spd->options.talp_summary & SUMMARY_PROCESS) { |
949 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | verbose(VB_TALP, "TALP process summary: recording region %s", monitor->name); |
950 | |||
951 | 2 | process_record_t process_record = { | |
952 | .rank = 0, | ||
953 | 2 | .pid = spd->id, | |
954 | .monitor = *monitor, | ||
955 | }; | ||
956 | |||
957 | /* Fill hostname and CPU mask strings in process_record */ | ||
958 | 2 | gethostname(process_record.hostname, HOST_NAME_MAX); | |
959 | 2 | snprintf(process_record.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s", | |
960 | mu_to_str(&spd->process_mask)); | ||
961 | 2 | mu_get_quoted_mask(&spd->process_mask, process_record.cpuset_quoted, | |
962 | TALP_OUTPUT_CPUSET_MAX); | ||
963 | |||
964 | /* Add record */ | ||
965 | 2 | talp_output_record_process(monitor->name, &process_record, 1); | |
966 | } | ||
967 | |||
968 |
1/2✓ Branch 0 taken 13 times.
✗ Branch 1 not taken.
|
13 | if (spd->options.talp_summary & SUMMARY_POP_METRICS) { |
969 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 5 times.
|
13 | if (monitor->elapsed_time > 0) { |
970 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 8 times.
|
8 | verbose(VB_TALP, "TALP summary: recording region %s", monitor->name); |
971 | |||
972 | 8 | talp_info_t *talp_info = spd->talp_info; | |
973 | 8 | const pop_base_metrics_t base_metrics = { | |
974 | 8 | .num_cpus = monitor->num_cpus, | |
975 | .num_mpi_ranks = 0, | ||
976 | .num_nodes = 1, | ||
977 | 8 | .avg_cpus = monitor->avg_cpus, | |
978 | 8 | .cycles = (double)monitor->cycles, | |
979 | 8 | .instructions = (double)monitor->instructions, | |
980 | 8 | .num_measurements = monitor->num_measurements, | |
981 | 8 | .num_mpi_calls = monitor->num_mpi_calls, | |
982 | 8 | .num_omp_parallels = monitor->num_omp_parallels, | |
983 | 8 | .num_omp_tasks = monitor->num_omp_tasks, | |
984 | 8 | .elapsed_time = monitor->elapsed_time, | |
985 | 8 | .useful_time = monitor->useful_time, | |
986 | 8 | .mpi_time = monitor->mpi_time, | |
987 | 8 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | |
988 | 8 | .omp_scheduling_time = monitor->omp_scheduling_time, | |
989 | 8 | .omp_serialization_time = monitor->omp_serialization_time, | |
990 | 8 | .useful_normd_app = (double)monitor->useful_time / monitor->num_cpus, | |
991 | 8 | .mpi_normd_app = (double)monitor->mpi_time / monitor->num_cpus, | |
992 | 8 | .max_useful_normd_proc = (double)monitor->useful_time / monitor->num_cpus, | |
993 | 8 | .max_useful_normd_node = (double)monitor->useful_time / monitor->num_cpus, | |
994 | 8 | .mpi_normd_of_max_useful = (double)monitor->mpi_time / monitor->num_cpus, | |
995 | }; | ||
996 | |||
997 | dlb_pop_metrics_t pop_metrics; | ||
998 | 8 | talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics); | |
999 | 8 | talp_output_record_pop_metrics(&pop_metrics); | |
1000 | |||
1001 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 5 times.
|
8 | if(monitor == talp_info->monitor) { |
1002 | 3 | talp_output_record_resources(monitor->num_cpus, | |
1003 | /* num_nodes */ 1, /* num_ranks */ 0); | ||
1004 | } | ||
1005 | |||
1006 | } else { | ||
1007 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name); |
1008 | 5 | dlb_pop_metrics_t pop_metrics = {0}; | |
1009 | 5 | snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name); | |
1010 | 5 | talp_output_record_pop_metrics(&pop_metrics); | |
1011 | } | ||
1012 | } | ||
1013 | 13 | } | |
1014 | |||
1015 | |||
1016 | /*********************************************************************************/ | ||
1017 | /* TALP Record in MPI mode */ | ||
1018 | /*********************************************************************************/ | ||
1019 | |||
#ifdef MPI_LIB
/* Compute Node summary of all Global Monitors and record data.
 * Node-local process 0 reduces the global region of every process in the
 * node through the TALP shared memory; node leaders then gather all node
 * summaries on MPI rank 0, which records them. */
static void talp_record_node_summary(const subprocess_descriptor_t *spd) {

    node_record_t *node_summary = NULL;
    size_t node_summary_size = 0;

    /* Perform a barrier so that all processes in the node have arrived at the
     * MPI_Finalize */
    node_barrier(spd, NULL);

    /* Node process 0 reduces all global regions from all processes in the node */
    if (_process_id == 0) {
        /* Obtain a list of regions associated with the Global Region Name, sorted by PID */
        int max_procs = mu_get_system_size();
        talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
        int nelems;
        shmem_talp__get_regionlist(region_list, &nelems, max_procs, global_region_name);

        /* Allocate and initialize node summary structure
         * (node_record_t carries a trailing array of nelems process records) */
        node_summary_size = sizeof(node_record_t) + sizeof(process_in_node_record_t) * nelems;
        node_summary = malloc(node_summary_size);
        *node_summary = (const node_record_t) {
            .node_id = _node_id,
            .nelems = nelems,
        };

        /* Iterate the PID list and gather times of every process */
        for (int i = 0; i < nelems; ++i) {
            int64_t mpi_time = region_list[i].mpi_time;
            int64_t useful_time = region_list[i].useful_time;

            /* Save times in local structure */
            node_summary->processes[i].pid = region_list[i].pid;
            node_summary->processes[i].mpi_time = mpi_time;
            node_summary->processes[i].useful_time = useful_time;

            /* Accumulate total and max values */
            node_summary->avg_useful_time += useful_time;
            node_summary->avg_mpi_time += mpi_time;
            node_summary->max_useful_time = max_int64(useful_time, node_summary->max_useful_time);
            node_summary->max_mpi_time = max_int64(mpi_time, node_summary->max_mpi_time);
        }
        free(region_list);

        /* Compute average values, guarding against an empty region list
         * (integer division by zero is undefined behavior) */
        if (node_summary->nelems > 0) {
            node_summary->avg_useful_time /= node_summary->nelems;
            node_summary->avg_mpi_time /= node_summary->nelems;
        }
    }

    /* Perform a final barrier so that all processes let the _process_id 0 to
     * gather all the data */
    node_barrier(spd, NULL);

    /* All main processes from each node send data to rank 0 */
    if (_process_id == 0) {
        verbose(VB_TALP, "Node summary: gathering data");

        /* MPI type: pid_t */
        MPI_Datatype mpi_pid_type;
        PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);

        /* MPI struct type: process_in_node_record_t */
        MPI_Datatype mpi_process_info_type;
        {
            int count = 3;
            int blocklengths[] = {1, 1, 1};
            MPI_Aint displacements[] = {
                offsetof(process_in_node_record_t, pid),
                offsetof(process_in_node_record_t, mpi_time),
                offsetof(process_in_node_record_t, useful_time)};
            MPI_Datatype types[] = {mpi_pid_type, mpi_int64_type, mpi_int64_type};
            MPI_Datatype tmp_type;
            PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
            PMPI_Type_create_resized(tmp_type, 0, sizeof(process_in_node_record_t),
                    &mpi_process_info_type);
            PMPI_Type_commit(&mpi_process_info_type);
        }

        /* MPI struct type: node_record_t (last member is the block of
         * nelems process records) */
        MPI_Datatype mpi_node_record_type;
        {
            int count = 7;
            int blocklengths[] = {1, 1, 1, 1, 1, 1, node_summary->nelems};
            MPI_Aint displacements[] = {
                offsetof(node_record_t, node_id),
                offsetof(node_record_t, nelems),
                offsetof(node_record_t, avg_useful_time),
                offsetof(node_record_t, avg_mpi_time),
                offsetof(node_record_t, max_useful_time),
                offsetof(node_record_t, max_mpi_time),
                offsetof(node_record_t, processes)};
            MPI_Datatype types[] = {MPI_INT, MPI_INT, mpi_int64_type, mpi_int64_type,
                mpi_int64_type, mpi_int64_type, mpi_process_info_type};
            MPI_Datatype tmp_type;
            PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
            PMPI_Type_create_resized(tmp_type, 0, node_summary_size, &mpi_node_record_type);
            PMPI_Type_commit(&mpi_node_record_type);
        }

        /* Gather data (inter-node communicator: one leader per node) */
        void *recvbuf = NULL;
        if (_mpi_rank == 0) {
            recvbuf = malloc(_num_nodes * node_summary_size);
        }
        PMPI_Gather(node_summary, 1, mpi_node_record_type,
                recvbuf, 1, mpi_node_record_type,
                0, getInterNodeComm());

        /* Free send buffer and MPI Datatypes */
        free(node_summary);
        PMPI_Type_free(&mpi_process_info_type);
        PMPI_Type_free(&mpi_node_record_type);

        /* Add records */
        if (_mpi_rank == 0) {
            for (int node_id = 0; node_id < _num_nodes; ++node_id) {
                verbose(VB_TALP, "Node summary: recording node %d", node_id);
                node_summary = recvbuf + node_summary_size*node_id;
                ensure( node_id == node_summary->node_id, "Node id error in %s", __func__ );
                talp_output_record_node(node_summary);
            }
            free(recvbuf);
        }
    }
}
#endif
1147 | |||
#ifdef MPI_LIB
/* Gather PROCESS data of a monitor among all ranks and record it in rank 0.
 * Every rank builds a process_record_t for the given monitor; rank 0 gathers
 * them over the world communicator and adds one record per rank.
 * Internal monitors are skipped. */
static void talp_record_process_summary(const subprocess_descriptor_t *spd,
        const dlb_monitor_t *monitor) {

    /* Internal monitors will not be recorded */
    if (((monitor_data_t*)monitor->_data)->flags.internal) {
        return;
    }

    if (_mpi_rank == 0) {
        verbose(VB_TALP, "Process summary: gathering region %s", monitor->name);
    }

    process_record_t process_record_send = {
        .rank = _mpi_rank,
        .pid = spd->id,
        .node_id = _node_id,
        .monitor = *monitor,
    };

    /* Invalidate pointers of the copied monitor
     * (they are only meaningful in the sending process) */
    process_record_send.monitor.name = NULL;
    process_record_send.monitor._data = NULL;

    /* Fill hostname and CPU mask strings in process_record_send */
    gethostname(process_record_send.hostname, HOST_NAME_MAX);
    snprintf(process_record_send.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
            mu_to_str(&spd->process_mask));
    mu_get_quoted_mask(&spd->process_mask, process_record_send.cpuset_quoted,
            TALP_OUTPUT_CPUSET_MAX);

    /* MPI type: pid_t */
    MPI_Datatype mpi_pid_type;
    PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);

    /* Note: obviously, it doesn't make sense to send addresses via MPI, but we
     * are sending the whole dlb_monitor_t, so... Addresses are discarded
     * either way. */

    /* MPI type: void* */
    MPI_Datatype address_type;
    PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(void*), &address_type);

    /* MPI struct type: dlb_monitor_t */
    MPI_Datatype mpi_dlb_monitor_type;
    {
        enum {count = 19};
        int blocklengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
        MPI_Aint displacements[] = {
            offsetof(dlb_monitor_t, name),
            offsetof(dlb_monitor_t, num_cpus),
            offsetof(dlb_monitor_t, avg_cpus),
            offsetof(dlb_monitor_t, cycles),
            offsetof(dlb_monitor_t, instructions),
            offsetof(dlb_monitor_t, num_measurements),
            offsetof(dlb_monitor_t, num_resets),
            offsetof(dlb_monitor_t, num_mpi_calls),
            offsetof(dlb_monitor_t, num_omp_parallels),
            offsetof(dlb_monitor_t, num_omp_tasks),
            offsetof(dlb_monitor_t, start_time),
            offsetof(dlb_monitor_t, stop_time),
            offsetof(dlb_monitor_t, elapsed_time),
            offsetof(dlb_monitor_t, useful_time),
            offsetof(dlb_monitor_t, mpi_time),
            offsetof(dlb_monitor_t, omp_load_imbalance_time),
            offsetof(dlb_monitor_t, omp_scheduling_time),
            offsetof(dlb_monitor_t, omp_serialization_time),
            offsetof(dlb_monitor_t, _data)};
        MPI_Datatype types[] = {address_type, MPI_INT, MPI_FLOAT,
            mpi_int64_type, mpi_int64_type, MPI_INT, MPI_INT, mpi_int64_type,
            mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type,
            mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type,
            mpi_int64_type, mpi_int64_type, address_type};
        MPI_Datatype tmp_type;
        PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
        PMPI_Type_create_resized(tmp_type, 0, sizeof(dlb_monitor_t), &mpi_dlb_monitor_type);
        PMPI_Type_commit(&mpi_dlb_monitor_type);
        /* NOTE(review): the intermediate tmp_type from PMPI_Type_create_struct
         * is never freed here nor below — confirm this is intentional */

        /* Compile-time guard: the three arrays must stay in sync with count */
        static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count);
        static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count);
        static_ensure(sizeof(types)/sizeof(types[0]) == count);
    }

    /* MPI struct type: process_record_t */
    MPI_Datatype mpi_process_record_type;
    {
        int count = 7;
        int blocklengths[] = {1, 1, 1, HOST_NAME_MAX,
            TALP_OUTPUT_CPUSET_MAX, TALP_OUTPUT_CPUSET_MAX, 1};
        MPI_Aint displacements[] = {
            offsetof(process_record_t, rank),
            offsetof(process_record_t, pid),
            offsetof(process_record_t, node_id),
            offsetof(process_record_t, hostname),
            offsetof(process_record_t, cpuset),
            offsetof(process_record_t, cpuset_quoted),
            offsetof(process_record_t, monitor)};
        MPI_Datatype types[] = {MPI_INT, mpi_pid_type, MPI_INT, MPI_CHAR, MPI_CHAR,
            MPI_CHAR, mpi_dlb_monitor_type};
        MPI_Datatype tmp_type;
        PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
        PMPI_Type_create_resized(tmp_type, 0, sizeof(process_record_t),
                &mpi_process_record_type);
        PMPI_Type_commit(&mpi_process_record_type);
    }

    /* Gather data */
    process_record_t *recvbuf = NULL;
    if (_mpi_rank == 0) {
        recvbuf = malloc(_mpi_size * sizeof(process_record_t));
    }
    PMPI_Gather(&process_record_send, 1, mpi_process_record_type,
            recvbuf, 1, mpi_process_record_type,
            0, getWorldComm());

    /* Add records */
    if (_mpi_rank == 0) {
        for (int rank = 0; rank < _mpi_size; ++rank) {
            verbose(VB_TALP, "Process summary: recording region %s on rank %d",
                    monitor->name, rank);
            talp_output_record_process(monitor->name, &recvbuf[rank], _mpi_size);
        }
        free(recvbuf);
    }

    /* Free MPI types */
    PMPI_Type_free(&mpi_dlb_monitor_type);
    PMPI_Type_free(&mpi_process_record_type);
}
#endif
1279 | |||
1280 | #ifdef MPI_LIB | ||
1281 | |||
1282 | /* The following node and app reductions are needed to compute POP metrics: */ | ||
1283 | |||
1284 | /*** Node reduction ***/ | ||
1285 | |||
/* Data type to reduce among processes in node */
typedef struct node_reduction {
    bool node_used;     /* true if the process has region measurements */
    int cpus_node;      /* accumulated number of CPUs of the node's processes */
    int64_t sum_useful; /* accumulated useful time of the node's processes */
} node_reduction_t;
1292 | |||
1293 | /* Function called in the MPI node reduction */ | ||
1294 | static void talp_mpi_node_reduction_fn(void *invec, void *inoutvec, int *len, | ||
1295 | MPI_Datatype *datatype) { | ||
1296 | node_reduction_t *in = invec; | ||
1297 | node_reduction_t *inout = inoutvec; | ||
1298 | |||
1299 | int _len = *len; | ||
1300 | for (int i = 0; i < _len; ++i) { | ||
1301 | if (in[i].node_used) { | ||
1302 | inout[i].node_used = true; | ||
1303 | inout[i].cpus_node += in[i].cpus_node; | ||
1304 | inout[i].sum_useful += in[i].sum_useful; | ||
1305 | } | ||
1306 | } | ||
1307 | } | ||
1308 | |||
/* Function to perform the reduction at node level.
 * Reduces one node_reduction_t contribution per process onto node rank 0
 * using the custom operation talp_mpi_node_reduction_fn. The result is
 * only valid in node rank 0. */
static void talp_reduce_pop_metrics_node_reduction(node_reduction_t *node_reduction,
        const dlb_monitor_t *monitor) {

    /* This process contributes only if the region was actually measured */
    const node_reduction_t node_reduction_send = {
        .node_used = monitor->num_measurements > 0,
        .cpus_node = monitor->num_cpus,
        .sum_useful = monitor->useful_time,
    };

    /* MPI struct type: node_reduction_t */
    MPI_Datatype mpi_node_reduction_type;
    {
        int count = 3;
        int blocklengths[] = {1, 1, 1};
        MPI_Aint displacements[] = {
            offsetof(node_reduction_t, node_used),
            offsetof(node_reduction_t, cpus_node),
            offsetof(node_reduction_t, sum_useful)};
        MPI_Datatype types[] = {MPI_C_BOOL, MPI_INT, mpi_int64_type};
        MPI_Datatype tmp_type;
        PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
        PMPI_Type_create_resized(tmp_type, 0, sizeof(node_reduction_t),
                &mpi_node_reduction_type);
        PMPI_Type_commit(&mpi_node_reduction_type);
    }

    /* Define MPI operation (commute = true: order of operands is irrelevant) */
    MPI_Op node_reduction_op;
    PMPI_Op_create(talp_mpi_node_reduction_fn, true, &node_reduction_op);

    /* MPI reduction over the node communicator */
    PMPI_Reduce(&node_reduction_send, node_reduction, 1,
            mpi_node_reduction_type, node_reduction_op,
            0, getNodeComm());

    /* Free MPI types */
    PMPI_Type_free(&mpi_node_reduction_type);
    PMPI_Op_free(&node_reduction_op);
}
1349 | |||
1350 | /*** App reduction ***/ | ||
1351 | |||
/* Data type to reduce among processes in application.
 * Most fields are summed across ranks by talp_mpi_reduction_fn;
 * elapsed_time and max_useful_normd_node are reduced with max. */
typedef struct app_reduction_t {
    int num_cpus;
    int num_nodes;
    float avg_cpus;
    double cycles;
    double instructions;
    int64_t num_measurements;
    int64_t num_mpi_calls;
    int64_t num_omp_parallels;
    int64_t num_omp_tasks;
    int64_t elapsed_time;            /* reduced with max, not sum */
    int64_t useful_time;
    int64_t mpi_time;
    int64_t omp_load_imbalance_time;
    int64_t omp_scheduling_time;
    int64_t omp_serialization_time;
    double max_useful_normd_proc;
    double max_useful_normd_node;    /* reduced with max */
    double mpi_normd_of_max_useful;
} app_reduction_t;
1373 | |||
1374 | /* Function called in the MPI app reduction */ | ||
1375 | static void talp_mpi_reduction_fn(void *invec, void *inoutvec, int *len, | ||
1376 | MPI_Datatype *datatype) { | ||
1377 | app_reduction_t *in = invec; | ||
1378 | app_reduction_t *inout = inoutvec; | ||
1379 | |||
1380 | int _len = *len; | ||
1381 | for (int i = 0; i < _len; ++i) { | ||
1382 | inout[i].num_cpus += in[i].num_cpus; | ||
1383 | inout[i].num_nodes += in[i].num_nodes; | ||
1384 | inout[i].avg_cpus += in[i].avg_cpus; | ||
1385 | inout[i].cycles += in[i].cycles; | ||
1386 | inout[i].instructions += in[i].instructions; | ||
1387 | inout[i].num_measurements += in[i].num_measurements; | ||
1388 | inout[i].num_mpi_calls += in[i].num_mpi_calls; | ||
1389 | inout[i].num_omp_parallels += in[i].num_omp_parallels; | ||
1390 | inout[i].num_omp_tasks += in[i].num_omp_tasks; | ||
1391 | inout[i].elapsed_time = max_int64(inout[i].elapsed_time, in[i].elapsed_time); | ||
1392 | inout[i].useful_time += in[i].useful_time; | ||
1393 | inout[i].mpi_time += in[i].mpi_time; | ||
1394 | inout[i].omp_load_imbalance_time += in[i].omp_load_imbalance_time; | ||
1395 | inout[i].omp_scheduling_time += in[i].omp_scheduling_time; | ||
1396 | inout[i].omp_serialization_time += in[i].omp_serialization_time; | ||
1397 | inout[i].max_useful_normd_node = | ||
1398 | max_int64(inout[i].max_useful_normd_node, in[i].max_useful_normd_node); | ||
1399 | if (in[i].max_useful_normd_proc > inout[i].max_useful_normd_proc) { | ||
1400 | inout[i].max_useful_normd_proc = in[i].max_useful_normd_proc; | ||
1401 | inout[i].mpi_normd_of_max_useful = in[i].mpi_normd_of_max_useful; | ||
1402 | } | ||
1403 | } | ||
1404 | } | ||
1405 | |||
1406 | /* Function to perform the reduction at application level */ | ||
1407 | static void talp_reduce_pop_metrics_app_reduction(app_reduction_t *app_reduction, | ||
1408 | const node_reduction_t *node_reduction, const dlb_monitor_t *monitor, | ||
1409 | bool all_to_all) { | ||
1410 | |||
1411 | double max_useful_normd_proc = monitor->num_cpus == 0 ? 0.0 | ||
1412 | : (double)monitor->useful_time / monitor->num_cpus; | ||
1413 | double max_useful_normd_node = _process_id != 0 ? 0.0 | ||
1414 | : node_reduction->cpus_node == 0 ? 0.0 | ||
1415 | : (double)node_reduction->sum_useful / node_reduction->cpus_node; | ||
1416 | double mpi_normd_of_max_useful = monitor->num_cpus == 0 ? 0.0 | ||
1417 | : (double)monitor->mpi_time / monitor->num_cpus; | ||
1418 | |||
1419 | const app_reduction_t app_reduction_send = { | ||
1420 | .num_cpus = monitor->num_cpus, | ||
1421 | .num_nodes = _process_id == 0 && node_reduction->node_used ? 1 : 0, | ||
1422 | .avg_cpus = monitor->avg_cpus, | ||
1423 | .cycles = (double)monitor->cycles, | ||
1424 | .instructions = (double)monitor->instructions, | ||
1425 | .num_measurements = monitor->num_measurements, | ||
1426 | .num_mpi_calls = monitor->num_mpi_calls, | ||
1427 | .num_omp_parallels = monitor->num_omp_parallels, | ||
1428 | .num_omp_tasks = monitor->num_omp_tasks, | ||
1429 | .elapsed_time = monitor->elapsed_time, | ||
1430 | .useful_time = monitor->useful_time, | ||
1431 | .mpi_time = monitor->mpi_time, | ||
1432 | .omp_load_imbalance_time = monitor->omp_load_imbalance_time, | ||
1433 | .omp_scheduling_time = monitor->omp_scheduling_time, | ||
1434 | .omp_serialization_time = monitor->omp_serialization_time, | ||
1435 | .max_useful_normd_proc = max_useful_normd_proc, | ||
1436 | .max_useful_normd_node = max_useful_normd_node, | ||
1437 | .mpi_normd_of_max_useful = mpi_normd_of_max_useful, | ||
1438 | }; | ||
1439 | |||
1440 | /* MPI struct type: app_reduction_t */ | ||
1441 | MPI_Datatype mpi_app_reduction_type; | ||
1442 | { | ||
1443 | enum {count = 18}; | ||
1444 | int blocklengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | ||
1445 | MPI_Aint displacements[] = { | ||
1446 | offsetof(app_reduction_t, num_cpus), | ||
1447 | offsetof(app_reduction_t, num_nodes), | ||
1448 | offsetof(app_reduction_t, avg_cpus), | ||
1449 | offsetof(app_reduction_t, cycles), | ||
1450 | offsetof(app_reduction_t, instructions), | ||
1451 | offsetof(app_reduction_t, num_measurements), | ||
1452 | offsetof(app_reduction_t, num_mpi_calls), | ||
1453 | offsetof(app_reduction_t, num_omp_parallels), | ||
1454 | offsetof(app_reduction_t, num_omp_tasks), | ||
1455 | offsetof(app_reduction_t, elapsed_time), | ||
1456 | offsetof(app_reduction_t, useful_time), | ||
1457 | offsetof(app_reduction_t, mpi_time), | ||
1458 | offsetof(app_reduction_t, omp_load_imbalance_time), | ||
1459 | offsetof(app_reduction_t, omp_scheduling_time), | ||
1460 | offsetof(app_reduction_t, omp_serialization_time), | ||
1461 | offsetof(app_reduction_t, max_useful_normd_proc), | ||
1462 | offsetof(app_reduction_t, max_useful_normd_node), | ||
1463 | offsetof(app_reduction_t, mpi_normd_of_max_useful)}; | ||
1464 | MPI_Datatype types[] = {MPI_INT, MPI_INT, MPI_FLOAT, MPI_DOUBLE, | ||
1465 | MPI_DOUBLE, mpi_int64_type, mpi_int64_type, mpi_int64_type, | ||
1466 | mpi_int64_type, mpi_int64_type, mpi_int64_type, mpi_int64_type, | ||
1467 | mpi_int64_type, mpi_int64_type, mpi_int64_type, MPI_DOUBLE, | ||
1468 | MPI_DOUBLE, MPI_DOUBLE}; | ||
1469 | MPI_Datatype tmp_type; | ||
1470 | PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type); | ||
1471 | PMPI_Type_create_resized(tmp_type, 0, sizeof(app_reduction_t), | ||
1472 | &mpi_app_reduction_type); | ||
1473 | PMPI_Type_commit(&mpi_app_reduction_type); | ||
1474 | |||
1475 | static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count); | ||
1476 | static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count); | ||
1477 | static_ensure(sizeof(types)/sizeof(types[0]) == count); | ||
1478 | } | ||
1479 | |||
1480 | /* Define MPI operation */ | ||
1481 | MPI_Op app_reduction_op; | ||
1482 | MPI_Op_create(talp_mpi_reduction_fn, true, &app_reduction_op); | ||
1483 | |||
1484 | /* MPI reduction */ | ||
1485 | if (!all_to_all) { | ||
1486 | PMPI_Reduce(&app_reduction_send, app_reduction, 1, | ||
1487 | mpi_app_reduction_type, app_reduction_op, | ||
1488 | 0, getWorldComm()); | ||
1489 | } else { | ||
1490 | PMPI_Allreduce(&app_reduction_send, app_reduction, 1, | ||
1491 | mpi_app_reduction_type, app_reduction_op, | ||
1492 | getWorldComm()); | ||
1493 | } | ||
1494 | |||
1495 | /* Free MPI types */ | ||
1496 | PMPI_Type_free(&mpi_app_reduction_type); | ||
1497 | PMPI_Op_free(&app_reduction_op); | ||
1498 | } | ||
1499 | |||
1500 | static void talp_reduce_pop_metrics(pop_base_metrics_t *base_metrics, | ||
1501 | const dlb_monitor_t *monitor, bool all_to_all) { | ||
1502 | |||
1503 | /* First, reduce some values among processes in the node, | ||
1504 | * needed to compute pop metrics */ | ||
1505 | node_reduction_t node_reduction = {0}; | ||
1506 | talp_reduce_pop_metrics_node_reduction(&node_reduction, monitor); | ||
1507 | |||
1508 | /* With the node reduction, reduce again among all process */ | ||
1509 | app_reduction_t app_reduction = {0}; | ||
1510 | talp_reduce_pop_metrics_app_reduction(&app_reduction, &node_reduction, | ||
1511 | monitor, all_to_all); | ||
1512 | |||
1513 | /* Finally, fill output base_metrics... */ | ||
1514 | |||
1515 | int num_mpi_ranks; | ||
1516 | PMPI_Comm_size(getWorldComm(), &num_mpi_ranks); | ||
1517 | |||
1518 | /* These values do not need a specific MPI reduction and can be deduced | ||
1519 | * from the already reduced number of CPUs */ | ||
1520 | double useful_normd_app = app_reduction.num_cpus == 0 ? 0.0 | ||
1521 | : (double)app_reduction.useful_time / app_reduction.num_cpus; | ||
1522 | double mpi_normd_app = app_reduction.num_cpus == 0 ? 0.0 | ||
1523 | : (double)app_reduction.mpi_time / app_reduction.num_cpus; | ||
1524 | |||
1525 | *base_metrics = (const pop_base_metrics_t) { | ||
1526 | .num_cpus = app_reduction.num_cpus, | ||
1527 | .num_mpi_ranks = num_mpi_ranks, | ||
1528 | .num_nodes = app_reduction.num_nodes, | ||
1529 | .avg_cpus = app_reduction.avg_cpus, | ||
1530 | .cycles = app_reduction.cycles, | ||
1531 | .instructions = app_reduction.instructions, | ||
1532 | .num_measurements = app_reduction.num_measurements, | ||
1533 | .num_mpi_calls = app_reduction.num_mpi_calls, | ||
1534 | .num_omp_parallels = app_reduction.num_omp_parallels, | ||
1535 | .num_omp_tasks = app_reduction.num_omp_tasks, | ||
1536 | .elapsed_time = app_reduction.elapsed_time, | ||
1537 | .useful_time = app_reduction.useful_time, | ||
1538 | .mpi_time = app_reduction.mpi_time, | ||
1539 | .omp_load_imbalance_time = app_reduction.omp_load_imbalance_time, | ||
1540 | .omp_scheduling_time = app_reduction.omp_scheduling_time, | ||
1541 | .omp_serialization_time = app_reduction.omp_serialization_time, | ||
1542 | .useful_normd_app = useful_normd_app, | ||
1543 | .mpi_normd_app = mpi_normd_app, | ||
1544 | .max_useful_normd_proc = app_reduction.max_useful_normd_proc, | ||
1545 | .max_useful_normd_node = app_reduction.max_useful_normd_node, | ||
1546 | .mpi_normd_of_max_useful = app_reduction.mpi_normd_of_max_useful, | ||
1547 | }; | ||
1548 | } | ||
1549 | #endif | ||
1550 | |||
1551 | #ifdef MPI_LIB | ||
1552 | /* Gather POP METRICS data of a monitor among all ranks and record it in rank 0 */ | ||
1553 | static void talp_record_pop_summary(const subprocess_descriptor_t *spd, | ||
1554 | const dlb_monitor_t *monitor) { | ||
1555 | |||
1556 | /* Internal monitors will not be recorded */ | ||
1557 | if (((monitor_data_t*)monitor->_data)->flags.internal) { | ||
1558 | return; | ||
1559 | } | ||
1560 | |||
1561 | if (_mpi_rank == 0) { | ||
1562 | verbose(VB_TALP, "TALP summary: gathering region %s", monitor->name); | ||
1563 | } | ||
1564 | |||
1565 | talp_info_t *talp_info = spd->talp_info; | ||
1566 | |||
1567 | /* Reduce monitor among all MPI ranks into MPI rank 0 */ | ||
1568 | pop_base_metrics_t base_metrics; | ||
1569 | talp_reduce_pop_metrics(&base_metrics, monitor, false); | ||
1570 | |||
1571 | if (_mpi_rank == 0) { | ||
1572 | if (base_metrics.elapsed_time > 0) { | ||
1573 | |||
1574 | /* Only the global region records the resources */ | ||
1575 | if (monitor == talp_info->monitor) { | ||
1576 | talp_output_record_resources(base_metrics.num_cpus, | ||
1577 | base_metrics.num_nodes, base_metrics.num_mpi_ranks); | ||
1578 | } | ||
1579 | |||
1580 | /* Construct pop_metrics out of base metrics */ | ||
1581 | dlb_pop_metrics_t pop_metrics; | ||
1582 | talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics); | ||
1583 | |||
1584 | /* Record */ | ||
1585 | verbose(VB_TALP, "TALP summary: recording region %s", monitor->name); | ||
1586 | talp_output_record_pop_metrics(&pop_metrics); | ||
1587 | |||
1588 | } else { | ||
1589 | /* Record empty */ | ||
1590 | verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name); | ||
1591 | dlb_pop_metrics_t pop_metrics = {0}; | ||
1592 | snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name); | ||
1593 | talp_output_record_pop_metrics(&pop_metrics); | ||
1594 | } | ||
1595 | } | ||
1596 | } | ||
1597 | #endif | ||
1598 | |||
1599 | #ifdef MPI_LIB | ||
/* Communicate among all MPI processes so that everyone has the same monitoring regions.
 * Needed before the per-region POP reductions: every rank must iterate the
 * same region set or the collective reductions would deadlock. */
static void talp_register_common_mpi_regions(const subprocess_descriptor_t *spd) {
    /* Note: there's a potential race condition if this function is called
     * (which happens on talp_mpi_finalize or talp_finalize) while another
     * thread creates a monitoring region. The solution would be to lock the
     * entire routine and call a specialized registering function that does not
     * lock, or use a recursive lock. The situation is strange enough to not
     * support it */

    talp_info_t *talp_info = spd->talp_info;

    /* Warn about open regions */
    for (GSList *node = talp_info->open_regions;
            node != NULL;
            node = node->next) {
        const dlb_monitor_t *monitor = node->data;
        warning("Region %s is still open during MPI_Finalize."
                " Collected data may be incomplete.",
                monitor->name);
    }

    /* Gather recvcounts for each process
     * (Each process may have different number of monitors) */
    int nregions = g_tree_nnodes(talp_info->regions);
    int chars_to_send = nregions * DLB_MONITOR_NAME_MAX;
    /* NOTE(review): malloc results in this function are unchecked — TODO
     * confirm whether the project policy is abort-on-OOM via a wrapper */
    int *recvcounts = malloc(_mpi_size * sizeof(int));
    PMPI_Allgather(&chars_to_send, 1, MPI_INT,
            recvcounts, 1, MPI_INT, getWorldComm());

    /* Compute total characters to gather via MPI */
    int i;
    int total_chars = 0;
    for (i=0; i<_mpi_size; ++i) {
        total_chars += recvcounts[i];
    }

    /* total_chars is identical on all ranks, so either every rank enters this
     * branch (and the Allgatherv below matches) or none does */
    if (total_chars > 0) {
        /* Prepare sendbuffer: fixed-width slots of DLB_MONITOR_NAME_MAX chars
         * per region, so no per-name lengths need to be exchanged */
        char *sendbuffer = malloc(nregions * DLB_MONITOR_NAME_MAX * sizeof(char));
        char *sendptr = sendbuffer;
        for (GTreeNode *node = g_tree_node_first(talp_info->regions);
                node != NULL;
                node = g_tree_node_next(node)) {
            const dlb_monitor_t *monitor = g_tree_node_value(node);
            strcpy(sendptr, monitor->name);
            sendptr += DLB_MONITOR_NAME_MAX;
        }

        /* Prepare recvbuffer */
        char *recvbuffer = malloc(total_chars * sizeof(char));

        /* Compute displacements */
        int *displs = malloc(_mpi_size * sizeof(int));
        int next_disp = 0;
        for (i=0; i<_mpi_size; ++i) {
            displs[i] = next_disp;
            next_disp += recvcounts[i];
        }

        /* Gather all regions */
        PMPI_Allgatherv(sendbuffer, nregions * DLB_MONITOR_NAME_MAX, MPI_CHAR,
                recvbuffer, recvcounts, displs, MPI_CHAR, getWorldComm());

        /* Register all regions. Existing ones will be skipped. */
        for (i=0; i<total_chars; i+=DLB_MONITOR_NAME_MAX) {
            monitoring_region_register(spd, &recvbuffer[i]);
        }

        free(sendbuffer);
        free(recvbuffer);
        free(displs);
    }

    free(recvcounts);
}
1675 | #endif | ||
1676 | |||
1677 | |||
1678 | /*********************************************************************************/ | ||
1679 | /* TALP MPI functions */ | ||
1680 | /*********************************************************************************/ | ||
1681 | |||
1682 | /* Start global monitoring region (if not already started) */ | ||
1683 | 6 | void talp_mpi_init(const subprocess_descriptor_t *spd) { | |
1684 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_init"); |
1685 | |||
1686 | /* Initialize MPI type */ | ||
1687 | #if MPI_LIB | ||
1688 | # if MPI_VERSION >= 3 | ||
1689 | mpi_int64_type = MPI_INT64_T; | ||
1690 | # else | ||
1691 | PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(int64_t), &mpi_int64_type); | ||
1692 | # endif | ||
1693 | #endif | ||
1694 | |||
1695 | 6 | talp_info_t *talp_info = spd->talp_info; | |
1696 |
1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
|
6 | if (talp_info) { |
1697 | 6 | talp_info->flags.have_mpi = true; | |
1698 | |||
1699 | /* Start global region (no-op if already started) */ | ||
1700 | 6 | monitoring_region_start(spd, talp_info->monitor); | |
1701 | |||
1702 | /* Add MPI_Init statistic and set useful state */ | ||
1703 | 6 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1704 | 6 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1); | |
1705 | 6 | set_sample_state(sample, useful, talp_info->flags.papi); | |
1706 | } | ||
1707 | 6 | } | |
1708 | |||
/* Stop global monitoring region and gather APP data if needed.
 * Called on MPI_Finalize: accounts the call, stops the global region, pushes
 * the final times to the node shared memory, and (MPI builds only) performs
 * the requested TALP summaries, which involve collective MPI operations. */
void talp_mpi_finalize(const subprocess_descriptor_t *spd) {
    ensure(!thread_is_observer, "An observer thread cannot call talp_mpi_finalize");
    talp_info_t *talp_info = spd->talp_info;
    if (talp_info) {
        /* Add MPI_Finalize */
        talp_sample_t *sample = talp_get_thread_sample(spd);
        DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1);

        /* Stop global region */
        monitoring_region_stop(spd, talp_info->monitor);

        monitor_data_t *monitor_data = talp_info->monitor->_data;

        /* Update shared memory values */
        if (talp_info->flags.have_shmem || talp_info->flags.have_minimal_shmem) {
            // TODO: is it needed? isn't it updated when stopped?
            shmem_talp__set_times(monitor_data->node_shared_id,
                    talp_info->monitor->mpi_time,
                    talp_info->monitor->useful_time);
        }

#ifdef MPI_LIB
        /* If performing any kind of TALP summary, check that the number of processes
         * registered in the shared memory matches with the number of MPI processes in the node.
         * This check is needed to avoid deadlocks on finalize. */
        if (spd->options.talp_summary) {
            verbose(VB_TALP, "Gathering TALP metrics");
            /* FIXME: use Named Barrier */
            /* if (shmem_barrier__get_num_participants(spd->options.barrier_id) == _mpis_per_node) { */

            /* Gather data among processes in the node if node summary is enabled */
            if (spd->options.talp_summary & SUMMARY_NODE) {
                talp_record_node_summary(spd);
            }

            /* Gather data among MPIs if any of these summaries is enabled */
            if (spd->options.talp_summary
                    & (SUMMARY_POP_METRICS | SUMMARY_PROCESS)) {
                /* Ensure everyone has the same monitoring regions
                 * (otherwise the per-region collective reductions deadlock) */
                talp_register_common_mpi_regions(spd);

                /* Finally, reduce data; same region iteration order on all
                 * ranks since the tree is sorted */
                for (GTreeNode *node = g_tree_node_first(talp_info->regions);
                        node != NULL;
                        node = g_tree_node_next(node)) {
                    const dlb_monitor_t *monitor = g_tree_node_value(node);
                    if (spd->options.talp_summary & SUMMARY_POP_METRICS) {
                        talp_record_pop_summary(spd, monitor);
                    }
                    if (spd->options.talp_summary & SUMMARY_PROCESS) {
                        talp_record_process_summary(spd, monitor);
                    }
                }
            }

            /* Synchronize all processes in node before continuing with DLB finalization */
            node_barrier(spd, NULL);
            /* } else { */
            /*     warning("The number of MPI processes and processes registered in DLB differ." */
            /*             " TALP will not print any summary."); */
            /* } */
        }
#endif
    }
}
1775 | |||
1776 | /* Decide whether to update the current sample (most likely and cheaper) | ||
1777 | * or to aggregate all samples. | ||
1778 | * We will only aggregate all samples if the external profiler is enabled | ||
1779 | * and this MPI call is a blocking collective call. */ | ||
1780 | 14 | static inline void update_sample_on_sync_call(const subprocess_descriptor_t *spd, | |
1781 | const talp_info_t *talp_info, talp_sample_t *sample, bool is_blocking_collective) { | ||
1782 | |||
1783 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
|
14 | if (!talp_info->flags.external_profiler || !is_blocking_collective) { |
1784 | /* Likely scenario, just update the sample */ | ||
1785 | 10 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
1786 | } else { | ||
1787 | /* If talp_info->flags.external_profiler && is_blocking_collective: | ||
1788 | * aggregate samples and update all monitoring regions */ | ||
1789 | 4 | talp_aggregate_samples(spd); | |
1790 | } | ||
1791 | 14 | } | |
1792 | |||
1793 | 8 | void talp_into_sync_call(const subprocess_descriptor_t *spd, bool is_blocking_collective) { | |
1794 | /* Observer threads may call MPI functions, but TALP must ignore them */ | ||
1795 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (unlikely(thread_is_observer)) return; |
1796 | |||
1797 | 7 | const talp_info_t *talp_info = spd->talp_info; | |
1798 |
1/2✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
|
7 | if (talp_info) { |
1799 | /* Update sample */ | ||
1800 | 7 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1801 | 7 | update_sample_on_sync_call(spd, talp_info, sample, is_blocking_collective); | |
1802 | |||
1803 | /* Into Sync call -> not_useful_mpi */ | ||
1804 | 7 | set_sample_state(sample, not_useful_mpi, talp_info->flags.papi); | |
1805 | } | ||
1806 | } | ||
1807 | |||
1808 | 8 | void talp_out_of_sync_call(const subprocess_descriptor_t *spd, bool is_blocking_collective) { | |
1809 | /* Observer threads may call MPI functions, but TALP must ignore them */ | ||
1810 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 7 times.
|
8 | if (unlikely(thread_is_observer)) return; |
1811 | |||
1812 | 7 | const talp_info_t *talp_info = spd->talp_info; | |
1813 |
1/2✓ Branch 0 taken 7 times.
✗ Branch 1 not taken.
|
7 | if (talp_info) { |
1814 | /* Update sample */ | ||
1815 | 7 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1816 | 7 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_mpi_calls, 1); | |
1817 | 7 | update_sample_on_sync_call(spd, talp_info, sample, is_blocking_collective); | |
1818 | |||
1819 | /* Out of Sync call -> useful */ | ||
1820 | 7 | set_sample_state(sample, useful, talp_info->flags.papi); | |
1821 | } | ||
1822 | } | ||
1823 | |||
1824 | |||
1825 | /*********************************************************************************/ | ||
1826 | /* TALP OpenMP functions */ | ||
1827 | /*********************************************************************************/ | ||
1828 | |||
/* samples involved in parallel level 1.
 * Grown on demand in talp_openmp_parallel_begin and reused across level-1
 * parallel regions; released in talp_openmp_finalize. */
static talp_sample_t** parallel_samples_l1 = NULL;
static unsigned int parallel_samples_l1_capacity = 0;
1832 | |||
1833 | 1 | void talp_openmp_init(pid_t pid, const options_t* options) { | |
1834 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | ensure(!thread_is_observer, "An observer thread cannot call talp_openmp_init"); |
1835 | |||
1836 | 1 | const subprocess_descriptor_t *spd = thread_spd; | |
1837 | 1 | talp_info_t *talp_info = spd->talp_info; | |
1838 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (talp_info) { |
1839 | 1 | monitor_data_t *monitor_data = talp_info->monitor->_data; | |
1840 | 1 | talp_info->flags.have_openmp = true; | |
1841 | |||
1842 | /* Fix up number of CPUs for the global region */ | ||
1843 | 1 | float cpus = CPU_COUNT(&spd->process_mask); | |
1844 | 1 | talp_info->monitor->avg_cpus = cpus; | |
1845 | 1 | shmem_talp__set_avg_cpus(monitor_data->node_shared_id, cpus); | |
1846 | |||
1847 | /* Start global region (no-op if already started) */ | ||
1848 | 1 | monitoring_region_start(spd, talp_info->monitor); | |
1849 | |||
1850 | /* Set useful state */ | ||
1851 | 1 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1852 | 1 | set_sample_state(sample, useful, talp_info->flags.papi); | |
1853 | } | ||
1854 | 1 | } | |
1855 | |||
1856 | 1 | void talp_openmp_finalize(void) { | |
1857 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (parallel_samples_l1 != NULL) { |
1858 | 1 | free(parallel_samples_l1); | |
1859 | 1 | parallel_samples_l1 = NULL; | |
1860 | 1 | parallel_samples_l1_capacity = 0; | |
1861 | } | ||
1862 | 1 | } | |
1863 | |||
1864 | 2 | void talp_openmp_thread_begin() { | |
1865 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
1866 | 2 | talp_info_t *talp_info = spd->talp_info; | |
1867 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
1868 | /* Initial thread is already in useful state, set omp_out for others */ | ||
1869 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1870 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (sample->state == disabled) { |
1871 | /* Not initial thread: */ | ||
1872 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (talp_info->flags.papi) { |
1873 | ✗ | init_papi_counters(); | |
1874 | } | ||
1875 | 1 | set_sample_state(sample, not_useful_omp_out, talp_info->flags.papi); | |
1876 | |||
1877 | /* The initial time of the sample is set to match the start time of | ||
1878 | * the innermost open region, but other nested open regions need to | ||
1879 | * be fixed */ | ||
1880 | 1 | monitoring_regions_update_nested(spd, sample); | |
1881 | } | ||
1882 | } | ||
1883 | 2 | } | |
1884 | |||
1885 | 2 | void talp_openmp_thread_end(void) { | |
1886 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
1887 | 2 | talp_info_t *talp_info = spd->talp_info; | |
1888 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (talp_info) { |
1889 | /* Update thread sample with the last microsample */ | ||
1890 | 1 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1891 | 1 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
1892 | |||
1893 | /* Update state */ | ||
1894 | 1 | set_sample_state(sample, disabled, talp_info->flags.papi); | |
1895 | } | ||
1896 | 2 | } | |
1897 | |||
1898 | 2 | void talp_openmp_parallel_begin(omptool_parallel_data_t *parallel_data) { | |
1899 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | fatal_cond(parallel_data->requested_parallelism < 1, |
1900 | "Requested parallel region of invalid size in %s. Please report bug at %s.", | ||
1901 | __func__, PACKAGE_BUGREPORT); | ||
1902 | |||
1903 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
1904 | 2 | talp_info_t *talp_info = spd->talp_info; | |
1905 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
1906 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (parallel_data->level == 1) { |
1907 | /* Resize samples of parallel 1 if needed */ | ||
1908 | 2 | unsigned int requested_parallelism = parallel_data->requested_parallelism; | |
1909 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (requested_parallelism > parallel_samples_l1_capacity) { |
1910 | 2 | void *ptr = realloc(parallel_samples_l1, | |
1911 | sizeof(talp_sample_t*)*requested_parallelism); | ||
1912 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | fatal_cond(!ptr, "realloc failed in %s", __func__); |
1913 | 2 | parallel_samples_l1 = ptr; | |
1914 | 2 | parallel_samples_l1_capacity = requested_parallelism; | |
1915 | } | ||
1916 | |||
1917 | /* Assign local data */ | ||
1918 | 2 | parallel_data->talp_parallel_data = parallel_samples_l1; | |
1919 | |||
1920 | ✗ | } else if (parallel_data->level > 1) { | |
1921 | /* Allocate parallel samples array */ | ||
1922 | ✗ | unsigned int requested_parallelism = parallel_data->requested_parallelism; | |
1923 | ✗ | void *ptr = malloc(sizeof(talp_sample_t*)*requested_parallelism); | |
1924 | ✗ | fatal_cond(!ptr, "malloc failed in %s", __func__); | |
1925 | |||
1926 | /* Assign local data */ | ||
1927 | ✗ | parallel_data->talp_parallel_data = ptr; | |
1928 | } | ||
1929 | |||
1930 | /* Update stats */ | ||
1931 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1932 | 2 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_parallels, 1); | |
1933 | } | ||
1934 | 2 | } | |
1935 | |||
1936 | 2 | void talp_openmp_parallel_end(omptool_parallel_data_t *parallel_data) { | |
1937 | 2 | const subprocess_descriptor_t *spd = thread_spd; | |
1938 | 2 | talp_info_t *talp_info = spd->talp_info; | |
1939 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (talp_info) { |
1940 | /* Update thread sample with the last microsample */ | ||
1941 | 2 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1942 | 2 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
1943 | |||
1944 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | if (parallel_data->level == 1) { |
1945 | /* Flush and aggregate all samples of the parallel region */ | ||
1946 | 2 | talp_aggregate_samples_parallel(spd, | |
1947 | 2 | parallel_data->talp_parallel_data, | |
1948 | parallel_data->actual_parallelism); | ||
1949 | |||
1950 | ✗ | } else if (parallel_data->level > 1) { | |
1951 | /* Flush and aggregate all samples of this parallel except this | ||
1952 | * thread's sample. The primary thread of a nested parallel region | ||
1953 | * will keep its samples until it finishes as non-primary | ||
1954 | * team-worker or reaches the level 1 parallel region */ | ||
1955 | ✗ | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
1956 | ✗ | talp_aggregate_samples_parallel(spd, | |
1957 | ¶llel_samples[1], | ||
1958 | ✗ | parallel_data->actual_parallelism-1); | |
1959 | |||
1960 | /* free local data */ | ||
1961 | ✗ | free(parallel_data->talp_parallel_data); | |
1962 | ✗ | parallel_data->talp_parallel_data = NULL; | |
1963 | } | ||
1964 | |||
1965 | /* Update current threads's state */ | ||
1966 | 2 | set_sample_state(sample, useful, talp_info->flags.papi); | |
1967 | |||
1968 | /* Update the state of the rest of team-worker threads | ||
1969 | * (note that set_sample_state cannot be used here because we are | ||
1970 | * impersonating a worker thread) */ | ||
1971 | 2 | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
1972 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | for (unsigned int i = 1; i < parallel_data->actual_parallelism; ++i) { |
1973 | 1 | talp_sample_t *worker_sample = parallel_samples[i]; | |
1974 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (worker_sample->state == not_useful_omp_in) { |
1975 | ✗ | worker_sample->state = not_useful_omp_out; | |
1976 | } | ||
1977 | } | ||
1978 | } | ||
1979 | 2 | } | |
1980 | |||
1981 | 3 | void talp_openmp_into_parallel_function( | |
1982 | omptool_parallel_data_t *parallel_data, unsigned int index) { | ||
1983 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
1984 | 3 | talp_info_t *talp_info = spd->talp_info; | |
1985 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
1986 | /* Assign thread sample as team-worker of this parallel */ | ||
1987 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
1988 | 3 | talp_sample_t **parallel_samples = parallel_data->talp_parallel_data; | |
1989 | /* Probably optimized, but try to avoid invalidating | ||
1990 | * the cache line on reused parallel data */ | ||
1991 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (parallel_samples[index] != sample) { |
1992 | 2 | parallel_samples[index] = sample; | |
1993 | } | ||
1994 | |||
1995 | /* Update thread sample with the last microsample */ | ||
1996 | 3 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
1997 | |||
1998 | /* Update state */ | ||
1999 | 3 | set_sample_state(sample, useful, talp_info->flags.papi); | |
2000 | } | ||
2001 | 3 | } | |
2002 | |||
2003 | 3 | void talp_openmp_into_parallel_implicit_barrier(omptool_parallel_data_t *parallel_data) { | |
2004 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
2005 | 3 | talp_info_t *talp_info = spd->talp_info; | |
2006 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
2007 | /* Update thread sample with the last microsample */ | ||
2008 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2009 | 3 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
2010 | |||
2011 | /* Update state */ | ||
2012 | 3 | set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
2013 | } | ||
2014 | 3 | } | |
2015 | |||
2016 | 3 | void talp_openmp_into_parallel_sync(omptool_parallel_data_t *parallel_data) { | |
2017 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
2018 | 3 | talp_info_t *talp_info = spd->talp_info; | |
2019 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
2020 | /* Update thread sample with the last microsample */ | ||
2021 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2022 | 3 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
2023 | |||
2024 | /* Update state */ | ||
2025 | 3 | set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
2026 | } | ||
2027 | 3 | } | |
2028 | |||
2029 | 3 | void talp_openmp_outof_parallel_sync(omptool_parallel_data_t *parallel_data) { | |
2030 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
2031 | 3 | talp_info_t *talp_info = spd->talp_info; | |
2032 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
2033 | /* Update thread sample with the last microsample */ | ||
2034 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2035 | 3 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
2036 | |||
2037 | /* Update state */ | ||
2038 | 3 | set_sample_state(sample, useful, talp_info->flags.papi); | |
2039 | } | ||
2040 | 3 | } | |
2041 | |||
2042 | 3 | void talp_openmp_task_create(void) { | |
2043 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
2044 | 3 | talp_info_t *talp_info = spd->talp_info; | |
2045 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
2046 | /* Just update stats */ | ||
2047 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2048 | 3 | DLB_ATOMIC_ADD_RLX(&sample->stats.num_omp_tasks, 1); | |
2049 | } | ||
2050 | 3 | } | |
2051 | |||
2052 | 3 | void talp_openmp_task_complete(void) { | |
2053 | 3 | const subprocess_descriptor_t *spd = thread_spd; | |
2054 | 3 | talp_info_t *talp_info = spd->talp_info; | |
2055 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (talp_info) { |
2056 | /* Update thread sample with the last microsample */ | ||
2057 | 3 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2058 | 3 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
2059 | |||
2060 | /* Update state (FIXME: tasks outside of parallels? */ | ||
2061 | 3 | set_sample_state(sample, not_useful_omp_in, talp_info->flags.papi); | |
2062 | } | ||
2063 | 3 | } | |
2064 | |||
2065 | 6 | void talp_openmp_task_switch(void) { | |
2066 | 6 | const subprocess_descriptor_t *spd = thread_spd; | |
2067 | 6 | talp_info_t *talp_info = spd->talp_info; | |
2068 |
1/2✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
|
6 | if (talp_info) { |
2069 | /* Update thread sample with the last microsample */ | ||
2070 | 6 | talp_sample_t *sample = talp_get_thread_sample(spd); | |
2071 | 6 | talp_update_sample(sample, talp_info->flags.papi, NO_TIMESTAMP); | |
2072 | |||
2073 | /* Update state */ | ||
2074 | 6 | set_sample_state(sample, useful, talp_info->flags.papi); | |
2075 | } | ||
2076 | 6 | } | |
2077 | |||
2078 | /*********************************************************************************/ | ||
2079 | /* TALP collect functions for 3rd party programs: */ | ||
2080 | /* - It's also safe to call it from a 1st party program */ | ||
2081 | /* - Requires --talp-external-profiler set up in application */ | ||
 *  - Does not need to synchronize with application                              */
2083 | /*********************************************************************************/ | ||
2084 | |||
2085 | /* Function that may be called from a third-party process to compute | ||
2086 | * node_metrics for a given region */ | ||
2087 | 5 | int talp_query_pop_node_metrics(const char *name, dlb_node_metrics_t *node_metrics) { | |
2088 | |||
2089 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
|
5 | if (name == NULL) { |
2090 | 1 | name = global_region_name; | |
2091 | } | ||
2092 | |||
2093 | 5 | int error = DLB_SUCCESS; | |
2094 | 5 | int64_t total_mpi_time = 0; | |
2095 | 5 | int64_t total_useful_time = 0; | |
2096 | 5 | int64_t max_mpi_time = 0; | |
2097 | 5 | int64_t max_useful_time = 0; | |
2098 | |||
2099 | /* Obtain a list of regions in the node associated with given region */ | ||
2100 | 5 | int max_procs = mu_get_system_size(); | |
2101 | 5 | talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t)); | |
2102 | int nelems; | ||
2103 | 5 | shmem_talp__get_regionlist(region_list, &nelems, max_procs, name); | |
2104 | |||
2105 | /* Count how many processes have started the region */ | ||
2106 | 5 | int processes_per_node = 0; | |
2107 | |||
2108 | /* Iterate the PID list and gather times of every process */ | ||
2109 | int i; | ||
2110 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 5 times.
|
11 | for (i = 0; i <nelems; ++i) { |
2111 | 6 | int64_t mpi_time = region_list[i].mpi_time; | |
2112 | 6 | int64_t useful_time = region_list[i].useful_time; | |
2113 | |||
2114 | /* Accumulate total and max values */ | ||
2115 |
3/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
|
6 | if (mpi_time > 0 || useful_time > 0) { |
2116 | 6 | ++processes_per_node; | |
2117 | 6 | total_mpi_time += mpi_time; | |
2118 | 6 | total_useful_time += useful_time; | |
2119 | 6 | max_mpi_time = max_int64(mpi_time, max_mpi_time); | |
2120 | 6 | max_useful_time = max_int64(useful_time, max_useful_time); | |
2121 | } | ||
2122 | } | ||
2123 | 5 | free(region_list); | |
2124 | |||
2125 | #if MPI_LIB | ||
2126 | int node_id = _node_id; | ||
2127 | #else | ||
2128 | 5 | int node_id = 0; | |
2129 | #endif | ||
2130 | |||
2131 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | if (processes_per_node > 0) { |
2132 | /* Compute POP metrics with some inferred values */ | ||
2133 | perf_metrics_mpi_t metrics; | ||
2134 | 5 | perf_metrics__infer_mpi_model( | |
2135 | &metrics, | ||
2136 | processes_per_node, | ||
2137 | total_useful_time, | ||
2138 | total_mpi_time, | ||
2139 | max_useful_time); | ||
2140 | |||
2141 | /* Initialize structure */ | ||
2142 | 5 | *node_metrics = (const dlb_node_metrics_t) { | |
2143 | .node_id = node_id, | ||
2144 | .processes_per_node = processes_per_node, | ||
2145 | .total_useful_time = total_useful_time, | ||
2146 | .total_mpi_time = total_mpi_time, | ||
2147 | .max_useful_time = max_useful_time, | ||
2148 | .max_mpi_time = max_mpi_time, | ||
2149 | 5 | .parallel_efficiency = metrics.parallel_efficiency, | |
2150 | 5 | .communication_efficiency = metrics.communication_efficiency, | |
2151 | 5 | .load_balance = metrics.load_balance, | |
2152 | }; | ||
2153 | 5 | snprintf(node_metrics->name, DLB_MONITOR_NAME_MAX, "%s", name); | |
2154 | } else { | ||
2155 | ✗ | error = DLB_ERR_NOENT; | |
2156 | } | ||
2157 | |||
2158 | 5 | return error; | |
2159 | } | ||
2160 | |||
2161 | |||
2162 | |||
2163 | /*********************************************************************************/ | ||
2164 | /* TALP collect functions for 1st party programs */ | ||
2165 | /* - Requires synchronization (MPI or node barrier) among all processes */ | ||
2166 | /*********************************************************************************/ | ||
2167 | |||
2168 | /* Perform MPI collective calls and compute the current POP metrics for the | ||
2169 | * specified monitor. If monitor is NULL, the global monitoring region is | ||
2170 | * assumed. | ||
2171 | * Pre-conditions: | ||
2172 | * - the given monitor must have been registered in all MPI ranks | ||
2173 | * - pop_metrics is an allocated structure | ||
2174 | */ | ||
2175 | 1 | int talp_collect_pop_metrics(const subprocess_descriptor_t *spd, | |
2176 | dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) { | ||
2177 | #ifdef MPI_LIB | ||
2178 | talp_info_t *talp_info = spd->talp_info; | ||
2179 | if (monitor == NULL) { | ||
2180 | monitor = talp_info->monitor; | ||
2181 | } | ||
2182 | |||
2183 | /* Stop monitor so that metrics are updated */ | ||
2184 | bool resume_region = monitoring_region_stop(spd, monitor) == DLB_SUCCESS; | ||
2185 | |||
2186 | /* Reduce monitor among all MPI ranks and everbody collects (all-to-all) */ | ||
2187 | pop_base_metrics_t base_metrics; | ||
2188 | talp_reduce_pop_metrics(&base_metrics, monitor, true); | ||
2189 | |||
2190 | /* Construct output pop_metrics out of base metrics */ | ||
2191 | talp_base_metrics_to_pop_metrics(monitor->name, &base_metrics, pop_metrics); | ||
2192 | |||
2193 | /* Resume monitor */ | ||
2194 | if (resume_region) { | ||
2195 | monitoring_region_start(spd, monitor); | ||
2196 | } | ||
2197 | |||
2198 | return DLB_SUCCESS; | ||
2199 | #else | ||
2200 | 1 | return DLB_ERR_NOCOMP; | |
2201 | #endif | ||
2202 | } | ||
2203 | |||
2204 | /* Node-collective function to compute node_metrics for a given region */ | ||
2205 | 4 | int talp_collect_pop_node_metrics(const subprocess_descriptor_t *spd, | |
2206 | dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) { | ||
2207 | |||
2208 | 4 | talp_info_t *talp_info = spd->talp_info; | |
2209 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | monitor = monitor ? monitor : talp_info->monitor; |
2210 | 4 | monitor_data_t *monitor_data = monitor->_data; | |
2211 | |||
2212 | /* Stop monitor so that metrics are updated */ | ||
2213 | 4 | bool resume_region = monitoring_region_stop(spd, monitor) == DLB_SUCCESS; | |
2214 | |||
2215 | /* This functionality needs a shared memory, create a temporary one if needed */ | ||
2216 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (!talp_info->flags.have_shmem) { |
2217 | 4 | shmem_talp__init(spd->options.shm_key, 1); | |
2218 | 4 | shmem_talp__register(spd->id, monitor->avg_cpus, monitor->name, | |
2219 | &monitor_data->node_shared_id); | ||
2220 | } | ||
2221 | |||
2222 | /* Update the shared memory with this process' metrics */ | ||
2223 | 4 | shmem_talp__set_times(monitor_data->node_shared_id, | |
2224 | monitor->mpi_time, | ||
2225 | monitor->useful_time); | ||
2226 | |||
2227 | /* Perform a node barrier to ensure everyone has updated their metrics */ | ||
2228 | 4 | node_barrier(spd, NULL); | |
2229 | |||
2230 | /* Compute node metrics for that region name */ | ||
2231 | 4 | talp_query_pop_node_metrics(monitor->name, node_metrics); | |
2232 | |||
2233 | /* Remove shared memory if it was a temporary one */ | ||
2234 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (!talp_info->flags.have_shmem) { |
2235 | 4 | shmem_talp__finalize(spd->id); | |
2236 | } | ||
2237 | |||
2238 | /* Resume monitor */ | ||
2239 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 3 times.
|
4 | if (resume_region) { |
2240 | 1 | monitoring_region_start(spd, monitor); | |
2241 | } | ||
2242 | |||
2243 | 4 | return DLB_SUCCESS; | |
2244 | } | ||
2245 |