GCC Code Coverage Report


Directory: src/
File: src/talp/talp_record.c
Date: 2025-11-21 10:34:40
            Exec   Total   Coverage
Lines:        23      23     100.0%
Functions:     1       1     100.0%
Branches:     11      14      78.6%

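Note on the 78.6% branch coverage: 11 of the 14 branch outcomes were exercised. The three missed outcomes are the "✗ Branch 0 not taken" entries on the verbose() lines below, presumably because verbose() expands to a runtime check of the verbosity level that these runs never enabled. A minimal, hypothetical sketch of why a logging macro contributes branch outcomes (the macro body and the `verbosity` variable are illustrative, not DLB's actual implementation):

    /* sketch: a logging macro with a hidden conditional */
    #include <stdio.h>

    static int verbosity = 0;   /* hypothetical global verbosity level */

    /* The hidden `if` compiles to two branch outcomes; a run that never
     * enables verbosity only ever exercises the "message suppressed" side. */
    #define verbose(...) do { if (verbosity > 0) printf(__VA_ARGS__); } while (0)

    int main(void) {
        verbose("never printed while verbosity == 0\n");
        return 0;
    }
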
Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/talp_record.h"
21
22 #include "LB_comm/shmem_talp.h"
23 #include "LB_core/node_barrier.h"
24 #include "LB_core/spd.h"
25 #include "apis/dlb_talp.h"
26 #include "support/debug.h"
27 #include "support/mask_utils.h"
28 #include "support/options.h"
29 #include "talp/perf_metrics.h"
30 #include "talp/regions.h"
31 #include "talp/talp_output.h"
32 #include "talp/talp_types.h"
33 #ifdef MPI_LIB
34 #include "mpi/mpi_core.h"
35 #endif
36
37 #include <stddef.h>
38 #include <stdio.h>
#include <stdlib.h>
39 #include <unistd.h>
40
41
42 /*********************************************************************************/
43 /* TALP Record in serial (non-MPI) mode */
44 /*********************************************************************************/
45
46 /* For any given monitor, record metrics considering only this (sub-)process */
47 4818 void talp_record_monitor(const subprocess_descriptor_t *spd,
48 const dlb_monitor_t *monitor) {
49 2/2 4818 if (spd->options.talp_summary & SUMMARY_PROCESS) {
    ✓ Branch 0 taken 2 times.
    ✓ Branch 1 taken 4816 times.
50 1/2 2 verbose(VB_TALP, "TALP process summary: recording region %s", monitor->name);
    ✗ Branch 0 not taken.
    ✓ Branch 1 taken 2 times.
51
52 2 process_record_t process_record = {
53 .rank = 0,
54 2 .pid = spd->id,
55 .monitor = *monitor,
56 };
57
58 /* Fill hostname and CPU mask strings in process_record */
59 2 gethostname(process_record.hostname, HOST_NAME_MAX);
60 2 snprintf(process_record.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
61 mu_to_str(&spd->process_mask));
62 2 mu_get_quoted_mask(&spd->process_mask, process_record.cpuset_quoted,
63 TALP_OUTPUT_CPUSET_MAX);
64
65 /* Add record */
66 2 talp_output_record_process(monitor->name, &process_record, 1);
67 }
68
69 2/2 4818 if (spd->options.talp_summary & SUMMARY_POP_METRICS) {
    ✓ Branch 0 taken 15 times.
    ✓ Branch 1 taken 4803 times.
70 2/2 15 if (monitor->elapsed_time > 0) {
    ✓ Branch 0 taken 14 times.
    ✓ Branch 1 taken 1 times.
71 1/2 14 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
    ✗ Branch 0 not taken.
    ✓ Branch 1 taken 14 times.
72
73 pop_base_metrics_t base_metrics;
74 14 perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);
75
76 dlb_pop_metrics_t pop_metrics;
77 14 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
78 14 talp_output_record_pop_metrics(&pop_metrics);
79
80 14 talp_info_t *talp_info = spd->talp_info;
81 2/2 14 if (monitor == talp_info->monitor) {
    ✓ Branch 0 taken 8 times.
    ✓ Branch 1 taken 6 times.
82 8 talp_output_record_resources(monitor->num_cpus,
83 /* num_nodes */ 1, /* num_ranks */ 0, base_metrics.num_gpus);
84 }
85
86 } else {
87 1/2 1 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
    ✗ Branch 0 not taken.
    ✓ Branch 1 taken 1 times.
88 1 dlb_pop_metrics_t pop_metrics = {0};
89 1 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
90 1 talp_output_record_pop_metrics(&pop_metrics);
91 }
92 }
93 4818 }
94
95
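The serial path above dispatches on bit flags in spd->options.talp_summary, so several summaries can be requested at once. A standalone sketch of that bitmask dispatch pattern; the enum values and record() function are hypothetical stand-ins for DLB's SUMMARY_* constants and recording calls:

    /* sketch: selecting summaries via a bitmask of options */
    #include <stdio.h>

    enum {                          /* hypothetical stand-ins for SUMMARY_* */
        SUMMARY_PROCESS     = 1 << 0,
        SUMMARY_POP_METRICS = 1 << 1,
    };

    static void record(int talp_summary) {
        if (talp_summary & SUMMARY_PROCESS)     puts("record process summary");
        if (talp_summary & SUMMARY_POP_METRICS) puts("record POP metrics");
    }

    int main(void) {
        record(SUMMARY_PROCESS | SUMMARY_POP_METRICS);  /* both flags set */
        return 0;
    }
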
96 /*********************************************************************************/
97 /* TALP Record in MPI mode */
98 /*********************************************************************************/
99
100 #ifdef MPI_LIB
101
102 /* Compute Node summary of all Global Monitors and record data */
103 void talp_record_node_summary(const subprocess_descriptor_t *spd) {
104 node_record_t *node_summary = NULL;
105 size_t node_summary_size = 0;
106
107 /* Perform a barrier so that all processes in the node have arrived at
108 * MPI_Finalize */
109 node_barrier(spd, NULL);
110
111 /* Node process 0 reduces all global regions from all processes in the node */
112 if (_process_id == 0) {
113 /* Obtain a list of regions associated with the Global Region Name, sorted by PID */
114 int max_procs = mu_get_system_size();
115 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
116 int nelems;
117 shmem_talp__get_regionlist(region_list, &nelems, max_procs, region_get_global_name());
118
119 /* Allocate and initialize node summary structure */
120 node_summary_size = sizeof(node_record_t) + sizeof(process_in_node_record_t) * nelems;
121 node_summary = malloc(node_summary_size);
122 *node_summary = (const node_record_t) {
123 .node_id = _node_id,
124 .nelems = nelems,
125 };
126
127 /* Iterate the PID list and gather times of every process */
128 for (int i = 0; i < nelems; ++i) {
129 int64_t mpi_time = region_list[i].mpi_time;
130 int64_t useful_time = region_list[i].useful_time;
131
132 /* Save times in local structure */
133 node_summary->processes[i].pid = region_list[i].pid;
134 node_summary->processes[i].mpi_time = mpi_time;
135 node_summary->processes[i].useful_time = useful_time;
136
137 /* Accumulate total and max values */
138 node_summary->avg_useful_time += useful_time;
139 node_summary->avg_mpi_time += mpi_time;
140 node_summary->max_useful_time = max_int64(useful_time, node_summary->max_useful_time);
141 node_summary->max_mpi_time = max_int64(mpi_time, node_summary->max_mpi_time);
142 }
143 free(region_list);
144
145 /* Compute average values */
146 node_summary->avg_useful_time /= node_summary->nelems;
147 node_summary->avg_mpi_time /= node_summary->nelems;
148 }
149
150 /* Perform a final barrier so that all processes wait until _process_id 0
151 * has gathered all the data */
152 node_barrier(spd, NULL);
153
154 /* The main process of each node sends its data to rank 0 */
155 if (_process_id == 0) {
156 verbose(VB_TALP, "Node summary: gathering data");
157
158 /* MPI type: int64_t */
159 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
160
161 /* MPI type: pid_t */
162 MPI_Datatype mpi_pid_type;
163 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
164
165 /* MPI struct type: process_in_node_record_t */
166 MPI_Datatype mpi_process_info_type;
167 {
168 int count = 3;
169 int blocklengths[] = {1, 1, 1};
170 MPI_Aint displacements[] = {
171 offsetof(process_in_node_record_t, pid),
172 offsetof(process_in_node_record_t, mpi_time),
173 offsetof(process_in_node_record_t, useful_time)};
174 MPI_Datatype types[] = {mpi_pid_type, mpi_int64_type, mpi_int64_type};
175 MPI_Datatype tmp_type;
176 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
177 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_in_node_record_t),
178 &mpi_process_info_type);
179 PMPI_Type_commit(&mpi_process_info_type);
180 }
181
182 /* MPI struct type: node_record_t */
183 MPI_Datatype mpi_node_record_type;
184 {
185 int count = 7;
186 int blocklengths[] = {1, 1, 1, 1, 1, 1, node_summary->nelems};
187 MPI_Aint displacements[] = {
188 offsetof(node_record_t, node_id),
189 offsetof(node_record_t, nelems),
190 offsetof(node_record_t, avg_useful_time),
191 offsetof(node_record_t, avg_mpi_time),
192 offsetof(node_record_t, max_useful_time),
193 offsetof(node_record_t, max_mpi_time),
194 offsetof(node_record_t, processes)};
195 MPI_Datatype types[] = {MPI_INT, MPI_INT, mpi_int64_type, mpi_int64_type,
196 mpi_int64_type, mpi_int64_type, mpi_process_info_type};
197 MPI_Datatype tmp_type;
198 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
199 PMPI_Type_create_resized(tmp_type, 0, node_summary_size, &mpi_node_record_type);
200 PMPI_Type_commit(&mpi_node_record_type);
201 }
202
203 /* Gather data */
204 void *recvbuf = NULL;
205 if (_mpi_rank == 0) {
206 recvbuf = malloc(_num_nodes * node_summary_size);
207 }
208 PMPI_Gather(node_summary, 1, mpi_node_record_type,
209 recvbuf, 1, mpi_node_record_type,
210 0, getInterNodeComm());
211
212 /* Free send buffer and MPI Datatypes */
213 free(node_summary);
214 PMPI_Type_free(&mpi_process_info_type);
215 PMPI_Type_free(&mpi_node_record_type);
216
217 /* Add records */
218 if (_mpi_rank == 0) {
219 for (int node_id = 0; node_id < _num_nodes; ++node_id) {
220 verbose(VB_TALP, "Node summary: recording node %d", node_id);
221 node_record_t *node_record = (node_record_t*)(
222 (unsigned char *)recvbuf + node_summary_size * node_id);
223 ensure( node_id == node_record->node_id, "Node id error in %s", __func__ );
224 talp_output_record_node(node_record);
225 }
226 free(recvbuf);
227 }
228 }
229 }
230
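talp_record_node_summary describes a C struct to MPI field by field with PMPI_Type_create_struct, resizes it to the struct's true extent so padding is respected, and then gathers one record per node onto rank 0. A self-contained sketch of that derived-datatype pattern, using a hypothetical record_t in place of node_record_t and the public MPI_ entry points instead of the PMPI_ profiling layer:

    /* sketch: describe a struct to MPI, then gather one record per rank */
    #include <mpi.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {                /* hypothetical record */
        int     id;
        int64_t useful_time;
        int64_t mpi_time;
    } record_t;

    int main(int argc, char *argv[]) {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* One block per field, each at its offsetof() displacement */
        int blocklengths[] = {1, 1, 1};
        MPI_Aint displacements[] = {
            offsetof(record_t, id),
            offsetof(record_t, useful_time),
            offsetof(record_t, mpi_time)};
        MPI_Datatype types[] = {MPI_INT, MPI_INT64_T, MPI_INT64_T};
        MPI_Datatype tmp_type, record_type;
        MPI_Type_create_struct(3, blocklengths, displacements, types, &tmp_type);
        /* Resize so consecutive records in a buffer line up with record_t[] */
        MPI_Type_create_resized(tmp_type, 0, sizeof(record_t), &record_type);
        MPI_Type_commit(&record_type);
        MPI_Type_free(&tmp_type);

        record_t send = {.id = rank, .useful_time = 100 + rank, .mpi_time = rank};
        record_t *recv = rank == 0 ? malloc(size * sizeof(record_t)) : NULL;
        MPI_Gather(&send, 1, record_type, recv, 1, record_type, 0, MPI_COMM_WORLD);

        if (rank == 0) {
            for (int i = 0; i < size; ++i)
                printf("rank %d: useful=%lld\n", recv[i].id,
                       (long long)recv[i].useful_time);
            free(recv);
        }
        MPI_Type_free(&record_type);
        MPI_Finalize();
        return 0;
    }
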
231 /* Gather PROCESS data of a monitor among all ranks and record it in rank 0 */
232 void talp_record_process_summary(const subprocess_descriptor_t *spd,
233 const dlb_monitor_t *monitor) {
234
235 /* Internal monitors will not be recorded */
236 if (((monitor_data_t*)monitor->_data)->flags.internal) {
237 return;
238 }
239
240 if (_mpi_rank == 0) {
241 verbose(VB_TALP, "Process summary: gathering region %s", monitor->name);
242 }
243
244 process_record_t process_record_send = {
245 .rank = _mpi_rank,
246 .pid = spd->id,
247 .node_id = _node_id,
248 .monitor = *monitor,
249 };
250
251 /* Invalidate pointers of the copied monitor */
252 process_record_send.monitor.name = NULL;
253 process_record_send.monitor._data = NULL;
254
255 /* Fill hostname and CPU mask strings in process_record_send */
256 gethostname(process_record_send.hostname, HOST_NAME_MAX);
257 snprintf(process_record_send.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
258 mu_to_str(&spd->process_mask));
259 mu_get_quoted_mask(&spd->process_mask, process_record_send.cpuset_quoted,
260 TALP_OUTPUT_CPUSET_MAX);
261
262 /* MPI type: int64_t */
263 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
264
265 /* MPI type: pid_t */
266 MPI_Datatype mpi_pid_type;
267 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
268
269 /* Note: it makes no sense to send addresses via MPI, but we send the
270 * whole dlb_monitor_t, and the address fields are discarded on the
271 * receiving side either way. */
272
273 /* MPI type: void* */
274 MPI_Datatype address_type;
275 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(void*), &address_type);
276
277 /* MPI struct type: dlb_monitor_t */
278 MPI_Datatype mpi_dlb_monitor_type;
279 {
280 int blocklengths[] = {
281 1, 1, 1, /* Name + Resources: num_cpus, avg_cpus */
282 1, 1, /* Hardware counters: cycles, instructions */
283 1, 1, 1, 1, 1, 1, /* Statistics: num_* */
284 1, 1, /* Monitor Start and Stop times */
285 1, 1, 1, 1, 1, 1, 1, /* Host Times */
286 1, 1, 1, /* Device Times */
287 1}; /* _data */
288
289 enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])};
290
291 MPI_Aint displacements[] = {
292 offsetof(dlb_monitor_t, name),
293 /* Resources */
294 offsetof(dlb_monitor_t, num_cpus),
295 offsetof(dlb_monitor_t, avg_cpus),
296 /* Hardware counters */
297 offsetof(dlb_monitor_t, cycles),
298 offsetof(dlb_monitor_t, instructions),
299 /* Statistics */
300 offsetof(dlb_monitor_t, num_measurements),
301 offsetof(dlb_monitor_t, num_resets),
302 offsetof(dlb_monitor_t, num_mpi_calls),
303 offsetof(dlb_monitor_t, num_omp_parallels),
304 offsetof(dlb_monitor_t, num_omp_tasks),
305 offsetof(dlb_monitor_t, num_gpu_runtime_calls),
306 /* Monitor Start and Stop times */
307 offsetof(dlb_monitor_t, start_time),
308 offsetof(dlb_monitor_t, stop_time),
309 /* Host Times */
310 offsetof(dlb_monitor_t, elapsed_time),
311 offsetof(dlb_monitor_t, useful_time),
312 offsetof(dlb_monitor_t, mpi_time),
313 offsetof(dlb_monitor_t, omp_load_imbalance_time),
314 offsetof(dlb_monitor_t, omp_scheduling_time),
315 offsetof(dlb_monitor_t, omp_serialization_time),
316 offsetof(dlb_monitor_t, gpu_runtime_time),
317 /* Device Times */
318 offsetof(dlb_monitor_t, gpu_useful_time),
319 offsetof(dlb_monitor_t, gpu_communication_time),
320 offsetof(dlb_monitor_t, gpu_inactive_time),
321 /* _data */
322 offsetof(dlb_monitor_t, _data)};
323
324 MPI_Datatype types[] = {
325 address_type, MPI_INT, MPI_FLOAT, /* Name + Resources: num_cpus, avg_cpus */
326 mpi_int64_type, mpi_int64_type, /* Hardware counters: cycles, instructions */
327 MPI_INT, MPI_INT,
328 mpi_int64_type, mpi_int64_type,
329 mpi_int64_type, mpi_int64_type, /* Statistics: num_* */
330 mpi_int64_type, mpi_int64_type, /* Monitor Start and Stop times */
331 mpi_int64_type, mpi_int64_type,
332 mpi_int64_type, mpi_int64_type,
333 mpi_int64_type, mpi_int64_type,
334 mpi_int64_type, /* Host Times */
335 mpi_int64_type, mpi_int64_type,
336 mpi_int64_type, /* Device Times */
337 address_type}; /* _data */
338
339 MPI_Datatype tmp_type;
340 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
341 PMPI_Type_create_resized(tmp_type, 0, sizeof(dlb_monitor_t), &mpi_dlb_monitor_type);
342 PMPI_Type_commit(&mpi_dlb_monitor_type);
343
344 static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count,
345 "blocklengths size mismatch");
346 static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count,
347 "displacements size mismatch");
348 static_ensure(sizeof(types)/sizeof(types[0]) == count,
349 "types size mismatch");
350 }
351
352 /* MPI struct type: process_record_t */
353 MPI_Datatype mpi_process_record_type;
354 {
355 int count = 7;
356 int blocklengths[] = {1, 1, 1, HOST_NAME_MAX,
357 TALP_OUTPUT_CPUSET_MAX, TALP_OUTPUT_CPUSET_MAX, 1};
358 MPI_Aint displacements[] = {
359 offsetof(process_record_t, rank),
360 offsetof(process_record_t, pid),
361 offsetof(process_record_t, node_id),
362 offsetof(process_record_t, hostname),
363 offsetof(process_record_t, cpuset),
364 offsetof(process_record_t, cpuset_quoted),
365 offsetof(process_record_t, monitor)};
366 MPI_Datatype types[] = {MPI_INT, mpi_pid_type, MPI_INT, MPI_CHAR, MPI_CHAR,
367 MPI_CHAR, mpi_dlb_monitor_type};
368 MPI_Datatype tmp_type;
369 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
370 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_record_t),
371 &mpi_process_record_type);
372 PMPI_Type_commit(&mpi_process_record_type);
373 }
374
375 /* Gather data */
376 process_record_t *recvbuf = NULL;
377 if (_mpi_rank == 0) {
378 recvbuf = malloc(_mpi_size * sizeof(process_record_t));
379 }
380 PMPI_Gather(&process_record_send, 1, mpi_process_record_type,
381 recvbuf, 1, mpi_process_record_type,
382 0, getWorldComm());
383
384 /* Add records */
385 if (_mpi_rank == 0) {
386 for (int rank = 0; rank < _mpi_size; ++rank) {
387 verbose(VB_TALP, "Process summary: recording region %s on rank %d",
388 monitor->name, rank);
389 talp_output_record_process(monitor->name, &recvbuf[rank], _mpi_size);
390 }
391 free(recvbuf);
392 }
393
394 /* Free MPI types */
395 PMPI_Type_free(&mpi_dlb_monitor_type);
396 PMPI_Type_free(&mpi_process_record_type);
397 }
398
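Because the width of pid_t is platform-dependent, the code above asks MPI at runtime for an integer datatype of matching size via PMPI_Type_match_size. A minimal standalone sketch of that lookup (public MPI_ entry point; the returned type is a predefined MPI datatype and must not be freed):

    /* sketch: map pid_t onto an MPI integer type of the same width */
    #include <mpi.h>
    #include <stdio.h>
    #include <sys/types.h>

    int main(int argc, char *argv[]) {
        MPI_Init(&argc, &argv);

        MPI_Datatype mpi_pid_type;
        MPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);

        char name[MPI_MAX_OBJECT_NAME];
        int len;
        MPI_Type_get_name(mpi_pid_type, name, &len);
        printf("pid_t (%zu bytes) maps to %s\n", sizeof(pid_t), name);

        MPI_Finalize();
        return 0;
    }
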
399 /* Gather POP METRICS data of a monitor among all ranks and record it in rank 0 */
400 void talp_record_pop_summary(const subprocess_descriptor_t *spd,
401 const dlb_monitor_t *monitor) {
402
403 /* Internal monitors will not be recorded */
404 if (((monitor_data_t*)monitor->_data)->flags.internal) {
405 return;
406 }
407
408 if (_mpi_rank == 0) {
409 verbose(VB_TALP, "TALP summary: gathering region %s", monitor->name);
410 }
411
412 talp_info_t *talp_info = spd->talp_info;
413
414 /* Reduce monitor among all MPI ranks into MPI rank 0 */
415 pop_base_metrics_t base_metrics;
416 perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, false);
417
418 if (_mpi_rank == 0) {
419 if (base_metrics.elapsed_time > 0) {
420
421 /* Only the global region records the resources */
422 if (monitor == talp_info->monitor) {
423 talp_output_record_resources(base_metrics.num_cpus,
424 base_metrics.num_nodes, base_metrics.num_mpi_ranks,
425 base_metrics.num_gpus);
426 }
427
428 /* Construct pop_metrics out of base metrics */
429 dlb_pop_metrics_t pop_metrics;
430 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
431
432 /* Record */
433 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
434 talp_output_record_pop_metrics(&pop_metrics);
435
436 } else {
437 /* Record empty */
438 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
439 dlb_pop_metrics_t pop_metrics = {0};
440 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
441 talp_output_record_pop_metrics(&pop_metrics);
442 }
443 }
444 }
445
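The empty-region path (here and in the serial summary above) zero-initializes the whole metrics struct and then copies only the region name with a bounded snprintf. A standalone sketch of that pattern; pop_metrics_t and NAME_MAX_LEN are hypothetical stand-ins for dlb_pop_metrics_t and DLB_MONITOR_NAME_MAX:

    /* sketch: zero-init a record, then set only its name, truncation-safe */
    #include <stdio.h>

    enum { NAME_MAX_LEN = 128 };    /* hypothetical DLB_MONITOR_NAME_MAX */

    typedef struct {
        char   name[NAME_MAX_LEN];
        double parallel_efficiency; /* stays 0.0 for an empty region */
    } pop_metrics_t;

    int main(void) {
        pop_metrics_t m = {0};      /* every field zeroed */
        snprintf(m.name, sizeof(m.name), "%s", "region_with_no_samples");
        printf("%s: efficiency=%.1f\n", m.name, m.parallel_efficiency);
        return 0;
    }
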
446 #endif /* MPI_LIB */
447