Line | Branch | Exec | Source |
---|---|---|---|
1 | /*********************************************************************************/ | ||
2 | /* Copyright 2009-2024 Barcelona Supercomputing Center */ | ||
3 | /* */ | ||
4 | /* This file is part of the DLB library. */ | ||
5 | /* */ | ||
6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
9 | /* (at your option) any later version. */ | ||
10 | /* */ | ||
11 | /* DLB is distributed in the hope that it will be useful, */ | ||
12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
14 | /* GNU Lesser General Public License for more details. */ | ||
15 | /* */ | ||
16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
18 | /*********************************************************************************/ | ||
19 | |||
20 | /* Tracking Application Live Performance */ | ||
21 | |||
22 | #include "apis/dlb_talp.h" | ||
23 | |||
24 | #include "apis/dlb_errors.h" | ||
25 | #include "LB_core/spd.h" | ||
26 | #include "LB_core/DLB_talp.h" | ||
27 | #include "LB_core/DLB_kernel.h" | ||
28 | #include "LB_comm/shmem_cpuinfo.h" | ||
29 | #include "LB_comm/shmem_procinfo.h" | ||
30 | #include "LB_comm/shmem_talp.h" | ||
31 | #include "support/debug.h" | ||
32 | #include "support/mask_utils.h" | ||
33 | #include "support/mytime.h" | ||
34 | |||
35 | #pragma GCC visibility push(default) | ||
36 | |||
37 | /*********************************************************************************/ | ||
38 | /* TALP */ | ||
39 | /*********************************************************************************/ | ||
40 | |||
41 | 9 | int DLB_TALP_Attach(void) { | |
42 | int lewi_color; | ||
43 | char shm_key[MAX_OPTION_LENGTH]; | ||
44 | char *shm_key_ptr; | ||
45 | 9 | spd_enter_dlb(thread_spd); | |
46 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 3 times.
|
9 | if (!thread_spd->dlb_initialized) { |
47 | 6 | set_observer_role(true); | |
48 | 6 | options_parse_entry("--shm-key", shm_key); | |
49 | 6 | options_parse_entry("--lewi-color", &lewi_color); | |
50 | 6 | shm_key_ptr = shm_key; | |
51 | } else { | ||
52 | 3 | lewi_color = thread_spd->options.lewi_color; | |
53 | 3 | shm_key_ptr = thread_spd->options.shm_key; | |
54 | } | ||
55 | 9 | shmem_cpuinfo_ext__init(shm_key_ptr, lewi_color); | |
56 | 9 | shmem_procinfo_ext__init(shm_key_ptr); | |
57 | 9 | shmem_talp_ext__init(shm_key_ptr, 0); | |
58 | 9 | return DLB_SUCCESS; | |
59 | } | ||
60 | |||
/* Finalize the external interfaces of the cpuinfo, procinfo and talp shared
 * memory modules, in that order.
 *
 * The first non-zero result is returned as is, and the finalize calls that
 * would follow it are skipped. Returns the result of the last call when all
 * previous ones returned zero.
 */
int DLB_TALP_Detach(void) {
    int error = shmem_cpuinfo_ext__finalize();

    if (!error) {
        error = shmem_procinfo_ext__finalize();
    }
    if (!error) {
        error = shmem_talp_ext__finalize();
    }

    return error;
}
67 | |||
68 | 1 | int DLB_TALP_GetNumCPUs(int *ncpus) { | |
69 | 1 | *ncpus = mu_get_system_size(); | |
70 | 1 | return DLB_SUCCESS; | |
71 | } | ||
72 | |||
/* Public entry point that forwards to shmem_procinfo__getpidlist().
 *
 * pidlist, nelems and max_len are passed through untouched and the callee's
 * return value is returned unchanged.
 */
int DLB_TALP_GetPidList(int *pidlist, int *nelems, int max_len) {
    int status = shmem_procinfo__getpidlist(pidlist, nelems, max_len);

    return status;
}
76 | |||
77 | 5 | int DLB_TALP_GetTimes(int pid, double *mpi_time, double *useful_time) { | |
78 | |||
79 | int error; | ||
80 | |||
81 |
4/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 3 times.
|
7 | if (pid == 0 || (thread_spd && thread_spd->id == pid)) { |
82 | /* Same process */ | ||
83 | 2 | const dlb_monitor_t *monitor = monitoring_region_get_global_region(thread_spd); | |
84 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (monitor != NULL) { |
85 | 1 | *mpi_time = nsecs_to_secs(monitor->mpi_time); | |
86 | 1 | *useful_time = nsecs_to_secs(monitor->useful_time); | |
87 | 1 | error = DLB_SUCCESS; | |
88 | } else { | ||
89 | 1 | error = DLB_ERR_NOTALP; | |
90 | } | ||
91 | } else { | ||
92 | /* Different process, fetch from shared memory */ | ||
93 | talp_region_list_t region; | ||
94 | 3 | error = shmem_talp__get_region(®ion, pid, | |
95 | monitoring_region_get_global_region_name()); | ||
96 | |||
97 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (error == DLB_SUCCESS) { |
98 | 2 | *mpi_time = nsecs_to_secs(region.mpi_time); | |
99 | 2 | *useful_time = nsecs_to_secs(region.useful_time); | |
100 | } | ||
101 | } | ||
102 | |||
103 | 5 | return error; | |
104 | } | ||
105 | |||
106 | 4 | int DLB_TALP_GetNodeTimes(const char *name, dlb_node_times_t *node_times_list, | |
107 | int *nelems, int max_len) { | ||
108 | |||
109 | int error; | ||
110 | |||
111 |
2/2✓ Branch 1 taken 3 times.
✓ Branch 2 taken 1 times.
|
4 | if (shmem_talp__initialized()) { |
112 | /* Only if a worker process started with --talp-external-profiler */ | ||
113 | 3 | int shmem_max_regions = shmem_talp__get_max_regions(); | |
114 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | if (max_len > shmem_max_regions) { |
115 | 1 | max_len = shmem_max_regions; | |
116 | } | ||
117 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (name == DLB_GLOBAL_REGION) { |
118 | 3 | name = monitoring_region_get_global_region_name(); | |
119 | } | ||
120 | 3 | talp_region_list_t *region_list = malloc(sizeof(talp_region_list_t)*max_len); | |
121 | 3 | error = shmem_talp__get_regionlist(region_list, nelems, max_len, name); | |
122 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (error == DLB_SUCCESS) { |
123 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
|
8 | for (int i=0; i<*nelems; ++i) { |
124 | 5 | node_times_list[i] = (const dlb_node_times_t) { | |
125 | 5 | .pid = region_list[i].pid, | |
126 | 5 | .mpi_time = region_list[i].mpi_time, | |
127 | 5 | .useful_time = region_list[i].useful_time, | |
128 | }; | ||
129 | } | ||
130 | } | ||
131 | 3 | free(region_list); | |
132 | } else { | ||
133 | /* shmem does not exist */ | ||
134 | 1 | error = DLB_ERR_NOSHMEM; | |
135 | } | ||
136 | |||
137 | 4 | return error; | |
138 | } | ||
139 | |||
140 | 2 | int DLB_TALP_QueryPOPNodeMetrics(const char *name, dlb_node_metrics_t *node_metrics) { | |
141 |
2/2✓ Branch 1 taken 1 times.
✓ Branch 2 taken 1 times.
|
2 | if (shmem_talp__initialized()) { |
142 | /* Only if a worker process started with --talp-external-profiler */ | ||
143 | 1 | return talp_query_pop_node_metrics(name, node_metrics); | |
144 | } else { | ||
145 | 1 | return DLB_ERR_NOSHMEM; | |
146 | } | ||
147 | } | ||
148 | |||
149 | |||
150 | /*********************************************************************************/ | ||
151 | /* TALP Monitoring Regions */ | ||
152 | /*********************************************************************************/ | ||
153 | |||
154 | 2 | dlb_monitor_t* DLB_MonitoringRegionGetGlobal(void) { | |
155 | 2 | spd_enter_dlb(thread_spd); | |
156 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (unlikely(!thread_spd->talp_info)) { |
157 | 1 | return NULL; | |
158 | } | ||
159 | 1 | return monitoring_region_get_global_region(thread_spd); | |
160 | } | ||
161 | |||
162 | dlb_monitor_t* DLB_MonitoringRegionGetImplicit(void) | ||
163 | __attribute__((alias("DLB_MonitoringRegionGetGlobal"))); | ||
164 | |||
165 | const dlb_monitor_t* DLB_MonitoringRegionGetMPIRegion(void) | ||
166 | __attribute__((alias("DLB_MonitoringRegionGetGlobal"))); | ||
167 | |||
168 | 10 | dlb_monitor_t* DLB_MonitoringRegionRegister(const char *name){ | |
169 | 10 | spd_enter_dlb(thread_spd); | |
170 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 8 times.
|
10 | if (unlikely(!thread_spd->talp_info)) { |
171 | 2 | return NULL; | |
172 | } | ||
173 | 8 | return monitoring_region_register(thread_spd, name); | |
174 | } | ||
175 | |||
176 | 3 | int DLB_MonitoringRegionReset(dlb_monitor_t *handle){ | |
177 | 3 | spd_enter_dlb(thread_spd); | |
178 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1 times.
|
3 | if (unlikely(!thread_spd->talp_info)) { |
179 | 2 | return DLB_ERR_NOTALP; | |
180 | } | ||
181 | 1 | return monitoring_region_reset(thread_spd, handle); | |
182 | } | ||
183 | |||
184 | 307 | int DLB_MonitoringRegionStart(dlb_monitor_t *handle){ | |
185 | 307 | spd_enter_dlb(thread_spd); | |
186 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 305 times.
|
307 | if (unlikely(!thread_spd->talp_info)) { |
187 | 2 | return DLB_ERR_NOTALP; | |
188 | } | ||
189 | 305 | return monitoring_region_start(thread_spd, handle); | |
190 | } | ||
191 | |||
192 | 307 | int DLB_MonitoringRegionStop(dlb_monitor_t *handle){ | |
193 | 307 | spd_enter_dlb(thread_spd); | |
194 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 305 times.
|
307 | if (unlikely(!thread_spd->talp_info)) { |
195 | 2 | return DLB_ERR_NOTALP; | |
196 | } | ||
197 | 305 | return monitoring_region_stop(thread_spd, handle); | |
198 | } | ||
199 | |||
200 | 6 | int DLB_MonitoringRegionReport(const dlb_monitor_t *handle){ | |
201 | 6 | spd_enter_dlb(thread_spd); | |
202 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
|
6 | if (unlikely(!thread_spd->talp_info)) { |
203 | 2 | return DLB_ERR_NOTALP; | |
204 | } | ||
205 | 4 | return monitoring_region_report(thread_spd, handle); | |
206 | } | ||
207 | |||
208 | 1 | int DLB_MonitoringRegionsUpdate(void) { | |
209 | 1 | spd_enter_dlb(thread_spd); | |
210 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (unlikely(!thread_spd->talp_info)) { |
211 | 1 | return DLB_ERR_NOTALP; | |
212 | } | ||
213 | ✗ | return monitoring_regions_force_update(thread_spd); | |
214 | } | ||
215 | |||
216 | 1 | int DLB_TALP_CollectPOPMetrics(dlb_monitor_t *monitor, dlb_pop_metrics_t *pop_metrics) { | |
217 | 1 | spd_enter_dlb(thread_spd); | |
218 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (unlikely(!thread_spd->talp_info)) { |
219 | 1 | return DLB_ERR_NOTALP; | |
220 | } | ||
221 | ✗ | return talp_collect_pop_metrics(thread_spd, monitor, pop_metrics); | |
222 | } | ||
223 | |||
224 | 1 | int DLB_TALP_CollectPOPNodeMetrics(dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) { | |
225 | 1 | spd_enter_dlb(thread_spd); | |
226 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (unlikely(!thread_spd->talp_info)) { |
227 | 1 | return DLB_ERR_NOTALP; | |
228 | } | ||
229 | ✗ | if (unlikely(!thread_spd->options.barrier)) { | |
230 | ✗ | return DLB_ERR_NOCOMP; | |
231 | } | ||
232 | ✗ | return talp_collect_pop_node_metrics(thread_spd, monitor, node_metrics); | |
233 | } | ||
234 | |||
235 | int DLB_TALP_CollectNodeMetrics(dlb_monitor_t *monitor, dlb_node_metrics_t *node_metrics) | ||
236 | __attribute__((alias("DLB_TALP_CollectPOPNodeMetrics"))); | ||
237 |