Dynamic Load Balance 3.6.1+32-59d1
Functions
DLB_interface_talp.c File Reference
#include "apis/dlb_talp.h"
#include "apis/dlb_errors.h"
#include "LB_core/spd.h"
#include "LB_core/DLB_kernel.h"
#include "LB_comm/shmem_cpuinfo.h"
#include "LB_comm/shmem_procinfo.h"
#include "LB_comm/shmem_talp.h"
#include "support/dlb_common.h"
#include "support/mask_utils.h"
#include "support/mytime.h"
#include "talp/regions.h"
#include "talp/talp.h"
Include dependency graph for DLB_interface_talp.c:

Functions

DLB_EXPORT_SYMBOL int DLB_TALP_Attach (void)
 Attach current process to DLB system as TALP administrator. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_Detach (void)
 Detach current process from DLB system. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_GetNumCPUs (int *ncpus)
 Get the number of CPUs in the node. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_GetPidList (int *pidlist, int *nelems, int max_len)
 Get the list of running processes registered in the DLB system. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_GetTimes (int pid, double *mpi_time, double *useful_time)
 Get the CPU time spent on MPI and useful computation for the given process. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_GetNodeTimes (const char *name, dlb_node_times_t *node_times_list, int *nelems, int max_len)
 Get the list of raw times for the specified region. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_QueryPOPNodeMetrics (const char *name, dlb_node_metrics_t *node_metrics)
 From either 1st or 3rd party, query node metrics for one region. More...
 
DLB_EXPORT_SYMBOL dlb_monitor_t * DLB_MonitoringRegionGetGlobal (void)
 Get the pointer of the global application-wide Monitoring Region. More...
 
DLB_EXPORT_SYMBOL DLB_ALIAS (dlb_monitor_t *, DLB_MonitoringRegionGetImplicit, (void), (), DLB_MonitoringRegionGetGlobal)
 
DLB_EXPORT_SYMBOL int DLB_MonitoringRegionReset (dlb_monitor_t *handle)
 Reset monitoring region. More...
 
DLB_EXPORT_SYMBOL int DLB_MonitoringRegionStart (dlb_monitor_t *handle)
 Start (or unpause) monitoring region. More...
 
DLB_EXPORT_SYMBOL int DLB_MonitoringRegionStop (dlb_monitor_t *handle)
 Stop (or pause) monitoring region. More...
 
DLB_EXPORT_SYMBOL int DLB_MonitoringRegionReport (const dlb_monitor_t *handle)
 Print a report to stderr of the monitoring region. More...
 
DLB_EXPORT_SYMBOL int DLB_MonitoringRegionsUpdate (void)
 Update all monitoring regions. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_CollectPOPMetrics (dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics)
 Perform an MPI collective communication to collect POP metrics. More...
 
DLB_EXPORT_SYMBOL int DLB_TALP_CollectPOPNodeMetrics (dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics)
 Perform a node collective communication to collect TALP node metrics. More...
 

Function Documentation

◆ DLB_TALP_Attach()

DLB_EXPORT_SYMBOL int DLB_TALP_Attach ( void  )

Attach current process to DLB system as TALP administrator.

Returns
DLB_SUCCESS on success

Once the process is attached to DLB as TALP administrator, it may perform the actions described below in this file. This way, the process is able to obtain some TALP values, such as the time spent in computation or MPI, for each of the DLB running processes.

◆ DLB_TALP_Detach()

DLB_EXPORT_SYMBOL int DLB_TALP_Detach ( void  )

Detach current process from DLB system.

Returns
DLB_SUCCESS on success
DLB_ERR_NOSHMEM if cannot find shared memory to detach from

If previously attached, a process must call this function to correctly close internal DLB file descriptors and clean data.

◆ DLB_TALP_GetNumCPUs()

DLB_EXPORT_SYMBOL int DLB_TALP_GetNumCPUs ( int *  ncpus)

Get the number of CPUs in the node.

Parameters
[out] ncpus: the number of CPUs
Returns
DLB_SUCCESS on success

◆ DLB_TALP_GetPidList()

DLB_EXPORT_SYMBOL int DLB_TALP_GetPidList ( int *  pidlist,
int *  nelems,
int  max_len 
)

Get the list of running processes registered in the DLB system.

Parameters
[out] pidlist: The output list
[out] nelems: Number of elements in the list
[in] max_len: Max capacity of the list
Returns
DLB_SUCCESS on success
DLB_ERR_NOSHMEM if cannot find shared memory

◆ DLB_TALP_GetTimes()

DLB_EXPORT_SYMBOL int DLB_TALP_GetTimes ( int  pid,
double *  mpi_time,
double *  useful_time 
)

Get the CPU time spent on MPI and useful computation for the given process.

Parameters
[in] pid: target Process ID, or 0 if own process
[out] mpi_time: CPU time spent on MPI in seconds
[out] useful_time: CPU time spent on useful computation in seconds
Returns
DLB_SUCCESS on success
DLB_ERR_NOPROC if target pid is not registered in the DLB system
DLB_ERR_NOSHMEM if cannot find shared memory
DLB_ERR_NOTALP if target is own process and TALP is not enabled

◆ DLB_TALP_GetNodeTimes()

DLB_EXPORT_SYMBOL int DLB_TALP_GetNodeTimes ( const char *  name,
dlb_node_times_t *  node_times_list,
int *  nelems,
int  max_len 
)

Get the list of raw times for the specified region.

Parameters
[in] name: Name to identify the region
[out] node_times_list: The output list
[out] nelems: Number of elements in the list
[in] max_len: Max capacity of the list
Returns
DLB_SUCCESS on success
DLB_ERR_NOSHMEM if cannot find shared memory

Note: This function requires DLB_ARGS+=" --talp-external-profiler" even if it's called from 1st-party programs.

◆ DLB_TALP_QueryPOPNodeMetrics()

DLB_EXPORT_SYMBOL int DLB_TALP_QueryPOPNodeMetrics ( const char *  name,
dlb_node_metrics_t *  node_metrics 
)

From either 1st or 3rd party, query node metrics for one region.

Parameters
[in] name: Name to identify the region
[out] node_metrics: Allocated structure where the collected metrics will be stored
Returns
DLB_SUCCESS on success
DLB_ERR_NOENT if no data for the given name
DLB_ERR_NOSHMEM if cannot find shared memory

Note: This function requires DLB_ARGS+=" --talp-external-profiler" even if it's called from 1st-party programs.

◆ DLB_MonitoringRegionGetGlobal()

DLB_EXPORT_SYMBOL dlb_monitor_t * DLB_MonitoringRegionGetGlobal ( void  )

Get the pointer of the global application-wide Monitoring Region.

Returns
monitor handle to be used on queries, or NULL if TALP is not enabled

◆ DLB_ALIAS()

DLB_EXPORT_SYMBOL DLB_ALIAS ( dlb_monitor_t * ,
DLB_MonitoringRegionGetImplicit  ,
(void)  ,
()  ,
DLB_MonitoringRegionGetGlobal   
)

◆ DLB_MonitoringRegionReset()

DLB_EXPORT_SYMBOL int DLB_MonitoringRegionReset ( dlb_monitor_t *  handle)

Reset monitoring region.

Parameters
[in] handle: Monitoring handle that identifies the region, or DLB_GLOBAL_REGION
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled

Reset all values of the monitoring region except num_resets, which is incremented. If the region is open, discard all intermediate values and close it.

◆ DLB_MonitoringRegionStart()

DLB_EXPORT_SYMBOL int DLB_MonitoringRegionStart ( dlb_monitor_t *  handle)

Start (or unpause) monitoring region.

Parameters
[in] handle: Monitoring handle that identifies the region, or DLB_GLOBAL_REGION
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled
DLB_ERR_PERM if this thread cannot start the monitoring region

Notes on multi-threading:

  • It is not safe to start or stop regions in OpenMP worksharing constructs.
  • If a region is started and stopped before the application has reached maximum parallelism (e.g., before a parallel construct), the unused resources will not be taken into account. This can result in higher OpenMP efficiencies than expected.

◆ DLB_MonitoringRegionStop()

DLB_EXPORT_SYMBOL int DLB_MonitoringRegionStop ( dlb_monitor_t *  handle)

Stop (or pause) monitoring region.

Parameters
[in] handle: Monitoring handle that identifies the region, DLB_GLOBAL_REGION, or DLB_LAST_OPEN_REGION
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled
DLB_ERR_NOENT if DLB_LAST_OPEN_REGION does not match any region
DLB_ERR_PERM if this thread cannot stop the monitoring region

◆ DLB_MonitoringRegionReport()

DLB_EXPORT_SYMBOL int DLB_MonitoringRegionReport ( const dlb_monitor_t *  handle)

Print a report to stderr of the monitoring region.

Parameters
[in] handle: Monitoring handle that identifies the region, or DLB_GLOBAL_REGION
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled

◆ DLB_MonitoringRegionsUpdate()

DLB_EXPORT_SYMBOL int DLB_MonitoringRegionsUpdate ( void  )

Update all monitoring regions.

Returns
DLB_SUCCESS on success
DLB_ERR_PERM if this thread cannot update the monitoring region

Monitoring regions are only updated in certain situations, like when starting/stopping a region, or finalizing MPI. This routine forces the update of all started monitoring regions.

◆ DLB_TALP_CollectPOPMetrics()

DLB_EXPORT_SYMBOL int DLB_TALP_CollectPOPMetrics ( dlb_monitor_t *  monitor,
dlb_pop_metrics_t *  pop_metrics 
)

Perform an MPI collective communication to collect POP metrics.

Parameters
[in] monitor: Monitoring handle that identifies the region, or DLB_GLOBAL_REGION macro (NULL) if global application-wide region
[out] pop_metrics: Allocated structure where the collected metrics will be stored
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled

◆ DLB_TALP_CollectPOPNodeMetrics()

DLB_EXPORT_SYMBOL int DLB_TALP_CollectPOPNodeMetrics ( dlb_monitor_t *  monitor,
dlb_node_metrics_t *  node_metrics 
)

Perform a node collective communication to collect TALP node metrics.

Parameters
[in] monitor: Monitoring handle that identifies the region, or DLB_GLOBAL_REGION macro (NULL) if global application-wide region
[out] node_metrics: Allocated structure where the collected metrics will be stored
Returns
DLB_SUCCESS on success
DLB_ERR_NOTALP if TALP is not enabled
DLB_ERR_NOCOMP if support for barrier is disabled, i.e., --no-barrier

This function performs a node barrier to collect the data. All processes that are running in the node must invoke this function.