Dynamic Load Balance 3.6.1+32-59d1
dlb_talp.h
Go to the documentation of this file.
1/*********************************************************************************/
2/* Copyright 2009-2025 Barcelona Supercomputing Center */
3/* */
4/* This file is part of the DLB library. */
5/* */
6/* DLB is free software: you can redistribute it and/or modify */
7/* it under the terms of the GNU Lesser General Public License as published by */
8/* the Free Software Foundation, either version 3 of the License, or */
9/* (at your option) any later version. */
10/* */
11/* DLB is distributed in the hope that it will be useful, */
12/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
13/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
14/* GNU Lesser General Public License for more details. */
15/* */
16/* You should have received a copy of the GNU Lesser General Public License */
17/* along with DLB. If not, see <https://www.gnu.org/licenses/>. */
18/*********************************************************************************/
19
20#ifndef DLB_API_TALP_H
21#define DLB_API_TALP_H
22
23#include <time.h>
24#include <stdint.h>
25
26#define DLB_GLOBAL_REGION_NAME "Global"
27#define DLB_GLOBAL_REGION NULL
28#define DLB_MPI_REGION NULL /* deprecated in favor of DLB_GLOBAL_REGION */
29#define DLB_IMPLICIT_REGION NULL /* deprecated in favor of DLB_GLOBAL_REGION */
30#define DLB_LAST_OPEN_REGION (void*)1
31
32enum { DLB_MONITOR_NAME_MAX = 128 };
33
35typedef struct dlb_monitor_t {
37 const char *name;
41 float avg_cpus;
43 int64_t cycles;
45 int64_t instructions;
59 int64_t start_time;
61 int64_t stop_time;
63 int64_t elapsed_time;
65 int64_t useful_time;
67 int64_t mpi_time;
87 void *_data;
89
91typedef struct dlb_pop_metrics_t {
101 float avg_cpus;
105 double cycles;
121 int64_t useful_time;
123 int64_t mpi_time;
139 /* TBD */
141 /* TBD */
143 /* TBD: remove? */
145 /* TBD */
147 /* TBD */
180
182typedef struct dlb_node_metrics_t {
208
210typedef struct dlb_node_times_t {
212 pid_t pid;
214 int64_t mpi_time;
216 int64_t useful_time;
218
219#ifdef __cplusplus
220extern "C"
221{
222#endif
223
224/*********************************************************************************/
225/* */
226/* The following functions may be called from 1st-party and 3rd-party */
227/* programs alike; that is, from DLB applications or from external */
228/* profilers, as long as they invoke DLB_TALP_Attach. */
229/* */
230/*********************************************************************************/
231
232/*********************************************************************************/
233/* TALP */
234/*********************************************************************************/
235
244int DLB_TALP_Attach(void);
245
253int DLB_TALP_Detach(void);
254
259int DLB_TALP_GetNumCPUs(int *ncpus);
260
268int DLB_TALP_GetPidList(int *pidlist, int *nelems, int max_len);
269
279int DLB_TALP_GetTimes(int pid, double *mpi_time, double *useful_time);
280
292int DLB_TALP_GetNodeTimes(const char *name, dlb_node_times_t *node_times_list,
293 int *nelems, int max_len);
294
305int DLB_TALP_QueryPOPNodeMetrics(const char *name, dlb_node_metrics_t *node_metrics);
306
307
308/*********************************************************************************/
309/* */
310/* The functions declared below are intended to be called only from 1st-party */
311/* programs, and they should return an error if they are called from external */
312/* profilers. */
313/* */
314/* DISCLAIMER: This header file may be split in two in the next major release. */
315/* */
316/*********************************************************************************/
317
318/*********************************************************************************/
319/* TALP Monitoring Regions */
320/*********************************************************************************/
321
326
328 __attribute__((deprecated("DLB_MonitoringRegionGetGlobal")));
329
331 __attribute__((deprecated("DLB_MonitoringRegionGetGlobal")));
332
344
354
369
379
386
396
405
418
420 __attribute__((deprecated("DLB_TALP_CollectPOPNodeMetrics")));
421
422#ifdef __cplusplus
423}
424#endif
425
426
427#endif /* DLB_API_TALP_H */
__attribute__((constructor))
Definition: DLB_interface.c:47
int DLB_MonitoringRegionReset(dlb_monitor_t *handle)
Reset monitoring region.
Definition: DLB_interface_talp.c:190
int DLB_MonitoringRegionsUpdate(void)
Update all monitoring regions.
Definition: DLB_interface_talp.c:226
@ DLB_MONITOR_NAME_MAX
Definition: dlb_talp.h:32
int DLB_TALP_GetPidList(int *pidlist, int *nelems, int max_len)
Get the list of running processes registered in the DLB system.
Definition: DLB_interface_talp.c:80
int DLB_TALP_CollectPOPNodeMetrics(dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics)
Perform a node collective communication to collect TALP node metrics.
Definition: DLB_interface_talp.c:244
int DLB_TALP_CollectNodeMetrics(dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) __attribute__((deprecated("DLB_TALP_CollectPOPNodeMetrics")))
int DLB_TALP_Attach(void)
Attach current process to DLB system as TALP administrator.
Definition: DLB_interface_talp.c:42
int DLB_TALP_Detach(void)
Detach current process from DLB system.
Definition: DLB_interface_talp.c:66
int DLB_TALP_QueryPOPNodeMetrics(const char *name, dlb_node_metrics_t *node_metrics)
Query node metrics for one region, from either a 1st-party or a 3rd-party program.
Definition: DLB_interface_talp.c:149
const dlb_monitor_t * DLB_MonitoringRegionGetMPIRegion(void) __attribute__((deprecated("DLB_MonitoringRegionGetGlobal")))
int DLB_MonitoringRegionReport(const dlb_monitor_t *handle)
Print a report to stderr of the monitoring region.
Definition: DLB_interface_talp.c:217
int DLB_TALP_GetNodeTimes(const char *name, dlb_node_times_t *node_times_list, int *nelems, int max_len)
Get the list of raw times for the specified region.
Definition: DLB_interface_talp.c:114
int DLB_TALP_GetNumCPUs(int *ncpus)
Get the number of CPUs in the node.
Definition: DLB_interface_talp.c:74
int DLB_TALP_GetTimes(int pid, double *mpi_time, double *useful_time)
Get the CPU time spent on MPI and useful computation for the given process.
Definition: DLB_interface_talp.c:85
dlb_monitor_t * DLB_MonitoringRegionGetGlobal(void)
Get a pointer to the global, application-wide Monitoring Region.
Definition: DLB_interface_talp.c:164
dlb_monitor_t * DLB_MonitoringRegionRegister(const char *name)
Register a new Monitoring Region, or obtain the associated pointer by name.
int DLB_MonitoringRegionStop(dlb_monitor_t *handle)
Stop (or pause) monitoring region.
Definition: DLB_interface_talp.c:208
int DLB_TALP_CollectPOPMetrics(dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics)
Perform an MPI collective communication to collect POP metrics.
Definition: DLB_interface_talp.c:235
int DLB_MonitoringRegionStart(dlb_monitor_t *handle)
Start (or unpause) monitoring region.
Definition: DLB_interface_talp.c:199
dlb_monitor_t * DLB_MonitoringRegionGetImplicit(void) __attribute__((deprecated("DLB_MonitoringRegionGetGlobal")))
Definition: dlb_talp.h:35
int64_t gpu_inactive_time
Definition: dlb_talp.h:85
int64_t elapsed_time
Definition: dlb_talp.h:63
int64_t num_omp_parallels
Definition: dlb_talp.h:53
int64_t instructions
Definition: dlb_talp.h:45
int64_t num_omp_tasks
Definition: dlb_talp.h:55
int num_measurements
Definition: dlb_talp.h:47
int64_t num_gpu_runtime_calls
Definition: dlb_talp.h:57
int64_t num_mpi_calls
Definition: dlb_talp.h:51
int64_t gpu_useful_time
Definition: dlb_talp.h:81
int64_t cycles
Definition: dlb_talp.h:43
int64_t gpu_communication_time
Definition: dlb_talp.h:83
int64_t omp_serialization_time
Definition: dlb_talp.h:76
const char * name
Definition: dlb_talp.h:37
void * _data
Definition: dlb_talp.h:87
int64_t useful_time
Definition: dlb_talp.h:65
int64_t gpu_runtime_time
Definition: dlb_talp.h:79
int64_t mpi_time
Definition: dlb_talp.h:67
int64_t stop_time
Definition: dlb_talp.h:61
int64_t omp_load_imbalance_time
Definition: dlb_talp.h:70
float avg_cpus
Definition: dlb_talp.h:41
int64_t omp_scheduling_time
Definition: dlb_talp.h:73
int num_cpus
Definition: dlb_talp.h:39
int num_resets
Definition: dlb_talp.h:49
int64_t start_time
Definition: dlb_talp.h:59
Definition: dlb_talp.h:182
int64_t max_useful_time
Definition: dlb_talp.h:197
int64_t total_mpi_time
Definition: dlb_talp.h:194
int64_t max_mpi_time
Definition: dlb_talp.h:200
int node_id
Definition: dlb_talp.h:186
float communication_efficiency
Definition: dlb_talp.h:204
int64_t total_useful_time
Definition: dlb_talp.h:191
float load_balance
Definition: dlb_talp.h:206
char name[DLB_MONITOR_NAME_MAX]
Definition: dlb_talp.h:184
int processes_per_node
Definition: dlb_talp.h:188
float parallel_efficiency
Definition: dlb_talp.h:202
Definition: dlb_talp.h:210
int64_t useful_time
Definition: dlb_talp.h:216
int64_t mpi_time
Definition: dlb_talp.h:214
pid_t pid
Definition: dlb_talp.h:212
Definition: dlb_talp.h:91
int64_t gpu_inactive_time
Definition: dlb_talp.h:144
int64_t elapsed_time
Definition: dlb_talp.h:119
float mpi_load_balance_out
Definition: dlb_talp.h:160
int64_t num_omp_parallels
Definition: dlb_talp.h:113
double min_mpi_normd_proc
Definition: dlb_talp.h:136
float gpu_load_balance
Definition: dlb_talp.h:174
double min_mpi_normd_node
Definition: dlb_talp.h:138
float mpi_parallel_efficiency
Definition: dlb_talp.h:152
int64_t num_omp_tasks
Definition: dlb_talp.h:115
double instructions
Definition: dlb_talp.h:107
float mpi_load_balance_in
Definition: dlb_talp.h:158
float mpi_communication_efficiency
Definition: dlb_talp.h:154
int64_t num_measurements
Definition: dlb_talp.h:109
int64_t num_gpu_runtime_calls
Definition: dlb_talp.h:117
int64_t num_mpi_calls
Definition: dlb_talp.h:111
int64_t gpu_useful_time
Definition: dlb_talp.h:140
int64_t gpu_communication_time
Definition: dlb_talp.h:142
int64_t omp_serialization_time
Definition: dlb_talp.h:132
float omp_parallel_efficiency
Definition: dlb_talp.h:162
int num_gpus
Definition: dlb_talp.h:103
float mpi_load_balance
Definition: dlb_talp.h:156
float omp_scheduling_efficiency
Definition: dlb_talp.h:166
int64_t useful_time
Definition: dlb_talp.h:121
float omp_load_balance
Definition: dlb_talp.h:164
int64_t gpu_runtime_time
Definition: dlb_talp.h:134
int64_t mpi_time
Definition: dlb_talp.h:123
float gpu_orchestration_efficiency
Definition: dlb_talp.h:178
float gpu_communication_efficiency
Definition: dlb_talp.h:176
int64_t omp_load_imbalance_time
Definition: dlb_talp.h:126
char name[DLB_MONITOR_NAME_MAX]
Definition: dlb_talp.h:93
int64_t max_gpu_useful_time
Definition: dlb_talp.h:146
float avg_cpus
Definition: dlb_talp.h:101
double cycles
Definition: dlb_talp.h:105
int64_t omp_scheduling_time
Definition: dlb_talp.h:129
float device_offload_efficiency
Definition: dlb_talp.h:170
int num_nodes
Definition: dlb_talp.h:99
int num_mpi_ranks
Definition: dlb_talp.h:97
int num_cpus
Definition: dlb_talp.h:95
int64_t max_gpu_active_time
Definition: dlb_talp.h:148
float parallel_efficiency
Definition: dlb_talp.h:150
float omp_serialization_efficiency
Definition: dlb_talp.h:168
float gpu_parallel_efficiency
Definition: dlb_talp.h:172