GCC Code Coverage Report


Directory: src/
File: src/talp/talp_record.c
Date: 2026-03-27 16:05:46
Exec Total Coverage
Lines: 25 25 100.0%
Functions: 1 1 100.0%
Branches: 13 16 81.2%

Line Branch Exec Source
1 /*********************************************************************************/
2 /* Copyright 2009-2025 Barcelona Supercomputing Center */
3 /* */
4 /* This file is part of the DLB library. */
5 /* */
6 /* DLB is free software: you can redistribute it and/or modify */
7 /* it under the terms of the GNU Lesser General Public License as published by */
8 /* the Free Software Foundation, either version 3 of the License, or */
9 /* (at your option) any later version. */
10 /* */
11 /* DLB is distributed in the hope that it will be useful, */
12 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14 /* GNU Lesser General Public License for more details. */
15 /* */
16 /* You should have received a copy of the GNU Lesser General Public License */
17 /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18 /*********************************************************************************/
19
20 #include "talp/talp_record.h"
21
22 #include "LB_comm/shmem_talp.h"
23 #include "LB_core/node_barrier.h"
24 #include "LB_core/spd.h"
25 #include "apis/dlb_talp.h"
26 #include "support/debug.h"
27 #include "support/mask_utils.h"
28 #include "support/options.h"
29 #include "talp/perf_metrics.h"
30 #include "talp/regions.h"
31 #include "talp/talp_output.h"
32 #include "talp/talp_types.h"
33 #ifdef MPI_LIB
34 #include "mpi/mpi_core.h"
35 #endif
36
37 #include <stddef.h>
38 #include <stdio.h>
39 #include <unistd.h>
40
41
42 /*********************************************************************************/
43 /* TALP Record in serial (non-MPI) mode */
44 /*********************************************************************************/
45
46 /* For any given monitor, record metrics considering only this (sub-)process */
47 1221 void talp_record_monitor(const subprocess_descriptor_t *spd,
48 const dlb_monitor_t *monitor) {
49
50
2/2
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 1203 times.
1221 if (spd->options.talp_summary != SUMMARY_NONE) {
51 18 talp_output_record_process_info();
52 }
53
54
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1219 times.
1221 if (spd->options.talp_summary & SUMMARY_PROCESS) {
55
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
2 verbose(VB_TALP, "TALP process summary: recording region %s", monitor->name);
56
57 2 process_record_t process_record = {
58 .rank = 0,
59 2 .pid = spd->id,
60 .monitor = *monitor,
61 };
62
63 /* Fill hostname and CPU mask strings in process_record */
64 2 gethostname(process_record.hostname, HOST_NAME_MAX);
65 2 snprintf(process_record.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
66 mu_to_str(&spd->process_mask));
67 2 mu_get_quoted_mask(&spd->process_mask, process_record.cpuset_quoted,
68 TALP_OUTPUT_CPUSET_MAX);
69
70 /* Add record */
71 2 talp_output_record_process(monitor->name, &process_record, 1);
72 }
73
74
2/2
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 1203 times.
1221 if (spd->options.talp_summary & SUMMARY_POP_METRICS) {
75
2/2
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 1 times.
18 if (monitor->elapsed_time > 0) {
76
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 17 times.
17 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
77
78 pop_base_metrics_t base_metrics;
79 17 perf_metrics__local_monitor_into_base_metrics(&base_metrics, monitor);
80
81 dlb_pop_metrics_t pop_metrics;
82 17 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
83 17 talp_output_record_pop_metrics(&pop_metrics);
84
85 17 talp_info_t *talp_info = spd->talp_info;
86
2/2
✓ Branch 0 taken 11 times.
✓ Branch 1 taken 6 times.
17 if(monitor == talp_info->monitor) {
87 11 talp_output_record_resources(monitor->num_cpus,
88 /* num_nodes */ 1, /* num_ranks */ 0, base_metrics.num_gpus);
89 }
90
91 } else {
92
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
93 1 dlb_pop_metrics_t pop_metrics = {0};
94 1 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
95 1 talp_output_record_pop_metrics(&pop_metrics);
96 }
97 }
98 1221 }
99
100
101 /*********************************************************************************/
102 /* TALP Record in MPI mode */
103 /*********************************************************************************/
104
105 #if MPI_LIB
106
107 /* Compute Node summary of all Global Monitors and record data */
108 void talp_record_node_summary(const subprocess_descriptor_t *spd) {
109 node_record_t *node_summary = NULL;
110 size_t node_summary_size = 0;
111
112 /* Perform a barrier so that all processes in the node have arrived at the
113 * MPI_Finalize */
114 node_barrier(spd, NULL);
115
116 /* Node process 0 reduces all global regions from all processes in the node */
117 if (_process_id == 0) {
118 /* Obtain a list of regions associated with the Global Region Name, sorted by PID */
119 int max_procs = mu_get_system_size();
120 talp_region_list_t *region_list = malloc(max_procs * sizeof(talp_region_list_t));
121 int nelems;
122 shmem_talp__get_regionlist(region_list, &nelems, max_procs, region_get_global_name());
123
124 /* Allocate and initialize node summary structure */
125 node_summary_size = sizeof(node_record_t) + sizeof(process_in_node_record_t) * nelems;
126 node_summary = malloc(node_summary_size);
127 *node_summary = (const node_record_t) {
128 .node_id = _node_id,
129 .nelems = nelems,
130 };
131
132 /* Iterate the PID list and gather times of every process */
133 for (int i = 0; i < nelems; ++i) {
134 int64_t mpi_time = region_list[i].mpi_time;
135 int64_t useful_time = region_list[i].useful_time;
136
137 /* Save times in local structure */
138 node_summary->processes[i].pid = region_list[i].pid;
139 node_summary->processes[i].mpi_time = mpi_time;
140 node_summary->processes[i].useful_time = useful_time;
141
142 /* Accumulate total and max values */
143 node_summary->avg_useful_time += useful_time;
144 node_summary->avg_mpi_time += mpi_time;
145 node_summary->max_useful_time = max_int64(useful_time, node_summary->max_useful_time);
146 node_summary->max_mpi_time = max_int64(mpi_time, node_summary->max_mpi_time);
147 }
148 free(region_list);
149
150 /* Compute average values */
151 node_summary->avg_useful_time /= node_summary->nelems;
152 node_summary->avg_mpi_time /= node_summary->nelems;
153 }
154
155 /* Perform a final barrier so that all processes let the _process_id 0 to
156 * gather all the data */
157 node_barrier(spd, NULL);
158
159 /* All main processes from each node send data to rank 0 */
160 if (_process_id == 0) {
161 verbose(VB_TALP, "Node summary: gathering data");
162
163 /* MPI type: int64_t */
164 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
165
166 /* MPI type: pid_t */
167 MPI_Datatype mpi_pid_type;
168 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
169
170 /* MPI struct type: process_in_node_record_t */
171 MPI_Datatype mpi_process_info_type;
172 {
173 int count = 3;
174 int blocklengths[] = {1, 1, 1};
175 MPI_Aint displacements[] = {
176 offsetof(process_in_node_record_t, pid),
177 offsetof(process_in_node_record_t, mpi_time),
178 offsetof(process_in_node_record_t, useful_time)};
179 MPI_Datatype types[] = {mpi_pid_type, mpi_int64_type, mpi_int64_type};
180 MPI_Datatype tmp_type;
181 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
182 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_in_node_record_t),
183 &mpi_process_info_type);
184 PMPI_Type_commit(&mpi_process_info_type);
185 }
186
187 /* MPI struct type: node_record_t */
188         MPI_Datatype mpi_node_record_type;
189 {
190 int count = 7;
191 int blocklengths[] = {1, 1, 1, 1, 1, 1, node_summary->nelems};
192 MPI_Aint displacements[] = {
193 offsetof(node_record_t, node_id),
194 offsetof(node_record_t, nelems),
195 offsetof(node_record_t, avg_useful_time),
196 offsetof(node_record_t, avg_mpi_time),
197 offsetof(node_record_t, max_useful_time),
198 offsetof(node_record_t, max_mpi_time),
199 offsetof(node_record_t, processes)};
200 MPI_Datatype types[] = {MPI_INT, MPI_INT, mpi_int64_type, mpi_int64_type,
201 mpi_int64_type, mpi_int64_type, mpi_process_info_type};
202 MPI_Datatype tmp_type;
203 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
204 PMPI_Type_create_resized(tmp_type, 0, node_summary_size, &mpi_node_record_type);
205 PMPI_Type_commit(&mpi_node_record_type);
206 }
207
208 /* Gather data */
209 void *recvbuf = NULL;
210 if (_mpi_rank == 0) {
211 recvbuf = malloc(_num_nodes * node_summary_size);
212 }
213 PMPI_Gather(node_summary, 1, mpi_node_record_type,
214 recvbuf, 1, mpi_node_record_type,
215 0, getInterNodeComm());
216
217 /* Free send buffer and MPI Datatypes */
218 free(node_summary);
219 PMPI_Type_free(&mpi_process_info_type);
220 PMPI_Type_free(&mpi_node_record_type);
221
222 /* Add records */
223 if (_mpi_rank == 0) {
224 for (int node_id = 0; node_id < _num_nodes; ++node_id) {
225 verbose(VB_TALP, "Node summary: recording node %d", node_id);
226 node_record_t *node_record = (node_record_t*)(
227 (unsigned char *)recvbuf + node_summary_size * node_id);
228 ensure( node_id == node_record->node_id, "Node id error in %s", __func__ );
229 talp_output_record_node(node_record);
230 }
231 free(recvbuf);
232 }
233 }
234 }
235
236 /* Gather PROCESS data of a monitor among all ranks and record it in rank 0 */
237 void talp_record_process_summary(const subprocess_descriptor_t *spd,
238 const dlb_monitor_t *monitor) {
239
240 /* Internal monitors will not be recorded */
241 if (((monitor_data_t*)monitor->_data)->flags.internal) {
242 return;
243 }
244
245 if (_mpi_rank == 0) {
246 verbose(VB_TALP, "Process summary: gathering region %s", monitor->name);
247 }
248
249 process_record_t process_record_send = {
250 .rank = _mpi_rank,
251 .pid = spd->id,
252 .node_id = _node_id,
253 .monitor = *monitor,
254 };
255
256 /* Invalidate pointers of the copied monitor */
257 process_record_send.monitor.name = NULL;
258 process_record_send.monitor._data = NULL;
259
260 /* Fill hostname and CPU mask strings in process_record_send */
261 gethostname(process_record_send.hostname, HOST_NAME_MAX);
262 snprintf(process_record_send.cpuset, TALP_OUTPUT_CPUSET_MAX, "%s",
263 mu_to_str(&spd->process_mask));
264 mu_get_quoted_mask(&spd->process_mask, process_record_send.cpuset_quoted,
265 TALP_OUTPUT_CPUSET_MAX);
266
267 /* MPI type: int64_t */
268 MPI_Datatype mpi_int64_type = get_mpi_int64_type();
269
270 /* MPI type: pid_t */
271 MPI_Datatype mpi_pid_type;
272 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(pid_t), &mpi_pid_type);
273
274 /* Note: obviously, it doesn't make sense to send addresses via MPI, but we
275 * are sending the whole dlb_monitor_t, so... Addresses are discarded
276 * either way. */
277
278 /* MPI type: void* */
279 MPI_Datatype address_type;
280 PMPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(void*), &address_type);
281
282 /* MPI struct type: dlb_monitor_t */
283 MPI_Datatype mpi_dlb_monitor_type;
284 {
285 int blocklengths[] = {
286 1, 1, 1, /* Name + Resources: num_cpus, avg_cpus */
287 1, 1, /* Hardware counters: cycles, instructions */
288 1, 1, 1, 1, 1, 1, /* Statistics: num_* */
289 1, 1, /* Monitor Start and Stop times */
290 1, 1, 1, 1, 1, 1, 1, /* Host Times */
291 1, 1, 1, /* Device Times */
292 1}; /* _data */
293
294 enum {count = sizeof(blocklengths) / sizeof(blocklengths[0])};
295
296 MPI_Aint displacements[] = {
297 offsetof(dlb_monitor_t, name),
298 /* Resources */
299 offsetof(dlb_monitor_t, num_cpus),
300 offsetof(dlb_monitor_t, avg_cpus),
301 /* Hardware counters */
302 offsetof(dlb_monitor_t, cycles),
303 offsetof(dlb_monitor_t, instructions),
304 /* Statistics */
305 offsetof(dlb_monitor_t, num_measurements),
306 offsetof(dlb_monitor_t, num_resets),
307 offsetof(dlb_monitor_t, num_mpi_calls),
308 offsetof(dlb_monitor_t, num_omp_parallels),
309 offsetof(dlb_monitor_t, num_omp_tasks),
310 offsetof(dlb_monitor_t, num_gpu_runtime_calls),
311 /* Monitor Start and Stop times */
312 offsetof(dlb_monitor_t, start_time),
313 offsetof(dlb_monitor_t, stop_time),
314 /* Host Times */
315 offsetof(dlb_monitor_t, elapsed_time),
316 offsetof(dlb_monitor_t, useful_time),
317 offsetof(dlb_monitor_t, mpi_time),
318 offsetof(dlb_monitor_t, omp_load_imbalance_time),
319 offsetof(dlb_monitor_t, omp_scheduling_time),
320 offsetof(dlb_monitor_t, omp_serialization_time),
321 offsetof(dlb_monitor_t, gpu_runtime_time),
322 /* Device Times */
323 offsetof(dlb_monitor_t, gpu_useful_time),
324 offsetof(dlb_monitor_t, gpu_communication_time),
325 offsetof(dlb_monitor_t, gpu_inactive_time),
326 /* _data */
327 offsetof(dlb_monitor_t, _data)};
328
329 MPI_Datatype types[] = {
330 address_type, MPI_INT, MPI_FLOAT, /* Name + Resources: num_cpus, avg_cpus */
331 mpi_int64_type, mpi_int64_type, /* Hardware counters: cycles, instructions */
332 MPI_INT, MPI_INT,
333 mpi_int64_type, mpi_int64_type,
334 mpi_int64_type, mpi_int64_type, /* Statistics: num_* */
335 mpi_int64_type, mpi_int64_type, /* Monitor Start and Stop times */
336 mpi_int64_type, mpi_int64_type,
337 mpi_int64_type, mpi_int64_type,
338 mpi_int64_type, mpi_int64_type,
339 mpi_int64_type, /* Host Times */
340 mpi_int64_type, mpi_int64_type,
341 mpi_int64_type, /* Device Times */
342 address_type}; /* _data */
343
344 MPI_Datatype tmp_type;
345 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
346 PMPI_Type_create_resized(tmp_type, 0, sizeof(dlb_monitor_t), &mpi_dlb_monitor_type);
347 PMPI_Type_commit(&mpi_dlb_monitor_type);
348
349 static_ensure(sizeof(blocklengths)/sizeof(blocklengths[0]) == count,
350 "blocklengths size mismatch");
351 static_ensure(sizeof(displacements)/sizeof(displacements[0]) == count,
352 "displacements size mismatch");
353 static_ensure(sizeof(types)/sizeof(types[0]) == count,
354 "types size mismatch");
355 }
356
357 /* MPI struct type: process_record_t */
358 MPI_Datatype mpi_process_record_type;
359 {
360 int count = 7;
361 int blocklengths[] = {1, 1, 1, HOST_NAME_MAX,
362 TALP_OUTPUT_CPUSET_MAX, TALP_OUTPUT_CPUSET_MAX, 1};
363 MPI_Aint displacements[] = {
364 offsetof(process_record_t, rank),
365 offsetof(process_record_t, pid),
366 offsetof(process_record_t, node_id),
367 offsetof(process_record_t, hostname),
368 offsetof(process_record_t, cpuset),
369 offsetof(process_record_t, cpuset_quoted),
370 offsetof(process_record_t, monitor)};
371 MPI_Datatype types[] = {MPI_INT, mpi_pid_type, MPI_INT, MPI_CHAR, MPI_CHAR,
372 MPI_CHAR, mpi_dlb_monitor_type};
373 MPI_Datatype tmp_type;
374 PMPI_Type_create_struct(count, blocklengths, displacements, types, &tmp_type);
375 PMPI_Type_create_resized(tmp_type, 0, sizeof(process_record_t),
376 &mpi_process_record_type);
377 PMPI_Type_commit(&mpi_process_record_type);
378 }
379
380 /* Gather data */
381 process_record_t *recvbuf = NULL;
382 if (_mpi_rank == 0) {
383 recvbuf = malloc(_mpi_size * sizeof(process_record_t));
384 }
385 PMPI_Gather(&process_record_send, 1, mpi_process_record_type,
386 recvbuf, 1, mpi_process_record_type,
387 0, getWorldComm());
388
389 /* Add records */
390 if (_mpi_rank == 0) {
391 for (int rank = 0; rank < _mpi_size; ++rank) {
392 verbose(VB_TALP, "Process summary: recording region %s on rank %d",
393 monitor->name, rank);
394 talp_output_record_process(monitor->name, &recvbuf[rank], _mpi_size);
395 }
396 free(recvbuf);
397 }
398
399 /* Free MPI types */
400 PMPI_Type_free(&mpi_dlb_monitor_type);
401 PMPI_Type_free(&mpi_process_record_type);
402 }
403
404 /* Gather POP METRICS data of a monitor among all ranks and record it in rank 0 */
405 void talp_record_pop_summary(const subprocess_descriptor_t *spd,
406 const dlb_monitor_t *monitor) {
407
408 /* Internal monitors will not be recorded */
409 if (((monitor_data_t*)monitor->_data)->flags.internal) {
410 return;
411 }
412
413 if (_mpi_rank == 0) {
414 verbose(VB_TALP, "TALP summary: gathering region %s", monitor->name);
415 }
416
417 talp_info_t *talp_info = spd->talp_info;
418
419 /* Reduce monitor among all MPI ranks into MPI rank 0 */
420 pop_base_metrics_t base_metrics;
421 perf_metrics__reduce_monitor_into_base_metrics(&base_metrics, monitor, false);
422
423 if (_mpi_rank == 0) {
424 if (base_metrics.elapsed_time > 0) {
425
426 /* Only the global region records the resources */
427 if (monitor == talp_info->monitor) {
428 talp_output_record_resources(base_metrics.num_cpus,
429 base_metrics.num_nodes, base_metrics.num_mpi_ranks,
430 base_metrics.num_gpus);
431 }
432
433 /* Construct pop_metrics out of base metrics */
434 dlb_pop_metrics_t pop_metrics;
435 perf_metrics__base_to_pop_metrics(monitor->name, &base_metrics, &pop_metrics);
436
437 /* Record */
438 verbose(VB_TALP, "TALP summary: recording region %s", monitor->name);
439 talp_output_record_pop_metrics(&pop_metrics);
440
441 } else {
442 /* Record empty */
443 verbose(VB_TALP, "TALP summary: recording empty region %s", monitor->name);
444 dlb_pop_metrics_t pop_metrics = {0};
445 snprintf(pop_metrics.name, DLB_MONITOR_NAME_MAX, "%s", monitor->name);
446 talp_output_record_pop_metrics(&pop_metrics);
447 }
448 }
449 }
450
451 #endif /* MPI_LIB */
452