Line | Branch | Exec | Source |
---|---|---|---|
1 | /*********************************************************************************/ | ||
2 | /* Copyright 2009-2024 Barcelona Supercomputing Center */ | ||
3 | /* */ | ||
4 | /* This file is part of the DLB library. */ | ||
5 | /* */ | ||
6 | /* DLB is free software: you can redistribute it and/or modify */ | ||
7 | /* it under the terms of the GNU Lesser General Public License as published by */ | ||
8 | /* the Free Software Foundation, either version 3 of the License, or */ | ||
9 | /* (at your option) any later version. */ | ||
10 | /* */ | ||
11 | /* DLB is distributed in the hope that it will be useful, */ | ||
12 | /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
13 | /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
14 | /* GNU Lesser General Public License for more details. */ | ||
15 | /* */ | ||
16 | /* You should have received a copy of the GNU Lesser General Public License */ | ||
17 | /* along with DLB. If not, see <https://www.gnu.org/licenses/>. */ | ||
18 | /*********************************************************************************/ | ||
19 | |||
20 | #ifdef HAVE_CONFIG_H | ||
21 | #include <config.h> | ||
22 | #endif | ||
23 | |||
24 | #include "LB_core/node_barrier.h" | ||
25 | |||
26 | #include "apis/dlb_errors.h" | ||
27 | #include "apis/dlb_types.h" | ||
28 | #include "LB_core/spd.h" | ||
29 | #include "LB_comm/shmem_barrier.h" | ||
30 | #include "support/debug.h" | ||
31 | #include "support/tracing.h" | ||
32 | |||
33 | #include <pthread.h> | ||
34 | #include <stdlib.h> | ||
35 | #include <string.h> | ||
36 | |||
/* Per process, private Barrier data.
 * One instance is allocated by node_barrier_init, stored in spd->barrier_info,
 * and released by node_barrier_finalize. */
typedef struct barrier_info {
    char default_barrier_name[BARRIER_NAME_MAX]; /* only to keep compatibility with --barrier-id */
    barrier_t *default_barrier;     /* default barrier handle; NULL while it is detached */
    barrier_t **barrier_list;       /* named barriers attached by this process: used slots
                                     * are kept packed at the front, free slots are NULL */
    int max_barriers;               /* number of slots allocated in barrier_list */
} barrier_info_t;

/* Base name of the default barrier, as matched against --lewi-barrier-select */
static const char *default_barrier_name = "default";
/* Taken around every update of barrier_info and its barrier_list */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
47 | |||
48 | /* Parse, for the specific barrier, whether it should do LeWI based on: | ||
49 | * - if barrier_name == default_barrier_name: | ||
50 | * if lewi_barrier and !lewi_barrier_select; | ||
51 | * or "default" in lewi_barrier_select | ||
52 | * - else: | ||
53 | * if barrier_flags has LEWI; | ||
54 | * or if barrier_flags has SELECTIVE and name in lewi_barrier_select | ||
55 | */ | ||
56 | 112 | static bool parse_lewi_barrier(const char *barrier_name, bool lewi_barrier, | |
57 | const char *lewi_barrier_select, int api_flags) { | ||
58 |
2/2✓ Branch 0 taken 82 times.
✓ Branch 1 taken 30 times.
|
112 | if (strncmp(barrier_name, default_barrier_name, BARRIER_NAME_MAX) == 0) { |
59 |
2/2✓ Branch 0 taken 70 times.
✓ Branch 1 taken 12 times.
|
82 | if (strlen(lewi_barrier_select) == 0) { |
60 | /* Default barrier: --lewi-barrier-select not set, --lewi-barrier dictates */ | ||
61 | 70 | return lewi_barrier; | |
62 | } | ||
63 | } else { | ||
64 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 25 times.
|
30 | if (api_flags == DLB_BARRIER_LEWI_ON) { |
65 | /* Named barrier: LeWI is forced by API */ | ||
66 | 5 | return true; | |
67 |
2/2✓ Branch 0 taken 19 times.
✓ Branch 1 taken 6 times.
|
25 | } else if (api_flags == DLB_BARRIER_LEWI_OFF) { |
68 | /* Named barrier: LeWI is disallowed by API */ | ||
69 | 19 | return false; | |
70 | } | ||
71 | } | ||
72 | |||
73 | /* Find barrier_name in --lewi-barrier-select */ | ||
74 | 18 | size_t len = strlen(lewi_barrier_select); | |
75 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | if (len > 0) { |
76 | 16 | bool found_in_select = false; | |
77 | 16 | char *barrier_select_copy = malloc(sizeof(char)*(len+1)); | |
78 | 16 | strcpy(barrier_select_copy, lewi_barrier_select); | |
79 | char *saveptr; | ||
80 | 16 | char *token = strtok_r(barrier_select_copy, ",", &saveptr); | |
81 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 8 times.
|
32 | while (token) { |
82 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 16 times.
|
24 | if (strcmp(token, barrier_name) == 0) { |
83 | 8 | found_in_select = true; | |
84 | 8 | break; | |
85 | } | ||
86 | /* next token */ | ||
87 | 16 | token = strtok_r(NULL, ",", &saveptr); | |
88 | } | ||
89 | 16 | free(barrier_select_copy); | |
90 | |||
91 | 16 | return found_in_select; | |
92 | } | ||
93 | |||
94 | 2 | return false; | |
95 | } | ||
96 | |||
97 | 81 | void node_barrier_init(subprocess_descriptor_t *spd) { | |
98 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 81 times.
|
81 | if (spd->barrier_info != NULL) { |
99 | ✗ | fatal("Cannot initialize Node Barrier, barrier_info not NULL\n" | |
100 | "Please, report bug at " PACKAGE_BUGREPORT); | ||
101 | } | ||
102 | |||
103 | /* Even though default_barrier_name may change, no harm to use it here | ||
104 | * because we parse the user's options. */ | ||
105 | 81 | bool lewi_barrier = parse_lewi_barrier(default_barrier_name, | |
106 | 81 | spd->options.lewi_barrier, spd->options.lewi_barrier_select, 0); | |
107 | |||
108 | barrier_info_t *barrier_info; | ||
109 | 81 | pthread_mutex_lock(&mutex); | |
110 | { | ||
111 | /* Initialize barrier_info */ | ||
112 | 81 | barrier_info = malloc(sizeof(barrier_info_t)); | |
113 | 81 | *barrier_info = (const barrier_info_t){}; | |
114 | 81 | spd->barrier_info = barrier_info; | |
115 | |||
116 | /* --barrier-id may be deprecated in the future, but for now we just modify | ||
117 | * the default barrier name so that processes with different barrier id's | ||
118 | * don't synchronize with each other. */ | ||
119 |
1/2✓ Branch 0 taken 81 times.
✗ Branch 1 not taken.
|
81 | if (spd->options.barrier_id == 0) { |
120 | 81 | sprintf(barrier_info->default_barrier_name, "%s", default_barrier_name); | |
121 | } else { | ||
122 | ✗ | snprintf(barrier_info->default_barrier_name, BARRIER_NAME_MAX, | |
123 | "default (id: %d)", spd->options.barrier_id); | ||
124 | } | ||
125 | |||
126 | /* Initialize default barrier */ | ||
127 | 162 | barrier_info->default_barrier = shmem_barrier__register( | |
128 | 81 | barrier_info->default_barrier_name, lewi_barrier); | |
129 | |||
130 | /* Initialize barrier_list */ | ||
131 | 81 | barrier_info->max_barriers = shmem_barrier__get_max_barriers(); | |
132 | 81 | barrier_info->barrier_list = calloc(barrier_info->max_barriers, sizeof(void*)); | |
133 | } | ||
134 | 81 | pthread_mutex_unlock(&mutex); | |
135 | |||
136 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 81 times.
|
81 | if (barrier_info->default_barrier == NULL) { |
137 | ✗ | warning("DLB system barrier could nout be initialized"); | |
138 | } | ||
139 | 81 | } | |
140 | |||
141 | 81 | void node_barrier_finalize(subprocess_descriptor_t *spd) { | |
142 | 81 | pthread_mutex_lock(&mutex); | |
143 | { | ||
144 |
1/2✓ Branch 0 taken 81 times.
✗ Branch 1 not taken.
|
81 | if (spd->barrier_info != NULL) { |
145 | /* Detach all, no need to check for non NULL values */ | ||
146 | 81 | barrier_info_t *barrier_info = spd->barrier_info; | |
147 | 81 | shmem_barrier__detach(barrier_info->default_barrier); | |
148 | int i; | ||
149 |
2/2✓ Branch 0 taken 1036 times.
✓ Branch 1 taken 81 times.
|
1117 | for (i=0; i<barrier_info->max_barriers; ++i) { |
150 | 1036 | shmem_barrier__detach(barrier_info->barrier_list[i]); | |
151 | } | ||
152 | 81 | free(barrier_info->barrier_list); | |
153 | 81 | *barrier_info = (const barrier_info_t){}; | |
154 | 81 | free(spd->barrier_info); | |
155 | 81 | spd->barrier_info = NULL; | |
156 | } | ||
157 | } | ||
158 | 81 | pthread_mutex_unlock(&mutex); | |
159 | 81 | } | |
160 | |||
161 | 38 | barrier_t* node_barrier_register(subprocess_descriptor_t *spd, | |
162 | const char *barrier_name, int flags) { | ||
163 | |||
164 | /* This function does not allow registering the default barrier */ | ||
165 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 38 times.
|
38 | if (barrier_name == NULL) return NULL; |
166 | |||
167 | 38 | barrier_t *barrier = NULL; | |
168 |
1/2✓ Branch 0 taken 38 times.
✗ Branch 1 not taken.
|
38 | if (spd->options.barrier) { |
169 | /* The register function cannot know whether the calling process is a new | ||
170 | * participant or just a query for the pointer. If we have at least one | ||
171 | * registered named barrier, we need to check the shared memory first. */ | ||
172 | 38 | barrier_info_t *barrier_info = spd->barrier_info; | |
173 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 21 times.
|
38 | if (barrier_info->barrier_list[0] != NULL) { |
174 | 17 | barrier = shmem_barrier__find(barrier_name); | |
175 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 8 times.
|
17 | if (barrier != NULL) { |
176 | /* Barrier is found in shmem, check if it's registered within the spd. */ | ||
177 | int i; | ||
178 | 9 | int max_barriers = barrier_info->max_barriers; | |
179 |
2/2✓ Branch 0 taken 37 times.
✓ Branch 1 taken 1 times.
|
38 | for (i=0; i<max_barriers; ++i) { |
180 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 29 times.
|
37 | if (barrier_info->barrier_list[i] == barrier) { |
181 | /* Barrier already registered in spd */ | ||
182 | 8 | return barrier; | |
183 | } | ||
184 | } | ||
185 | /* Barrier is not registered within this spd */ | ||
186 | 1 | barrier = NULL; | |
187 | } | ||
188 | } | ||
189 | |||
190 | /* Register if not found */ | ||
191 |
1/2✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
|
30 | if (barrier == NULL) { |
192 | 30 | bool lewi_barrier = parse_lewi_barrier(barrier_name, | |
193 | 30 | spd->options.lewi_barrier, | |
194 | 30 | spd->options.lewi_barrier_select, flags); | |
195 | 30 | barrier = shmem_barrier__register(barrier_name, lewi_barrier); | |
196 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | if (barrier == NULL) return NULL; |
197 | } | ||
198 | |||
199 | /* Update the barrier list, if needed */ | ||
200 | int i; | ||
201 | 30 | int max_barriers = barrier_info->max_barriers; | |
202 | 30 | pthread_mutex_lock(&mutex); | |
203 | { | ||
204 |
1/2✓ Branch 0 taken 54 times.
✗ Branch 1 not taken.
|
54 | for (i=0; i<max_barriers; ++i) { |
205 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 24 times.
|
54 | if (barrier_info->barrier_list[i] == NULL) { |
206 | 30 | barrier_info->barrier_list[i] = barrier; | |
207 | 30 | break; | |
208 | } | ||
209 | } | ||
210 | } | ||
211 | 30 | pthread_mutex_unlock(&mutex); | |
212 | |||
213 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | ensure(i < max_barriers, "Cannot register Node Barrier, no space left in" |
214 | " barrier_list.\nPlease, report bug at " PACKAGE_BUGREPORT); | ||
215 | } | ||
216 | |||
217 | 30 | return barrier; | |
218 | } | ||
219 | |||
220 | 49 | int node_barrier(const subprocess_descriptor_t *spd, barrier_t *barrier) { | |
221 | int error; | ||
222 |
2/2✓ Branch 0 taken 47 times.
✓ Branch 1 taken 2 times.
|
49 | if (spd->options.barrier) { |
223 | /* Check whether barrier is valid */ | ||
224 | 47 | barrier_info_t *barrier_info = spd->barrier_info; | |
225 |
2/2✓ Branch 0 taken 17 times.
✓ Branch 1 taken 30 times.
|
47 | if (barrier == NULL) { |
226 | /* If barrier is not provided we only need to check the reserved | ||
227 | * position in barrier_list */ | ||
228 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 1 times.
|
17 | if (barrier_info->default_barrier != NULL) { |
229 | 16 | barrier = barrier_info->default_barrier; | |
230 | } | ||
231 |
1/2✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
|
30 | } else if (unlikely(barrier == barrier_info->default_barrier)) { |
232 | /* barrier provided is the default barrier, nothing to do. | ||
233 | * (default_barrier pointer is never exposed, keep this one just in case) */ | ||
234 | } else { | ||
235 | /* Otherwise, we need to check whether the provided barrier has | ||
236 | * not been detached */ | ||
237 | 30 | int i = 0; | |
238 | 30 | int max_barriers = barrier_info->max_barriers; | |
239 | 30 | pthread_mutex_lock(&mutex); | |
240 | { | ||
241 | 30 | while (i<max_barriers | |
242 |
2/2✓ Branch 0 taken 50 times.
✓ Branch 1 taken 3 times.
|
53 | && barrier_info->barrier_list[i] != NULL |
243 |
3/4✓ Branch 0 taken 53 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 23 times.
✓ Branch 3 taken 27 times.
|
103 | && barrier_info->barrier_list[i] != barrier) { |
244 | 23 | ++i; | |
245 | } | ||
246 | |||
247 |
3/4✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 27 times.
|
30 | if (i == max_barriers || barrier_info->barrier_list[i] == NULL) { |
248 | /* Not found in barrier_list */ | ||
249 | 3 | barrier = NULL; | |
250 | } | ||
251 | } | ||
252 | 30 | pthread_mutex_unlock(&mutex); | |
253 | } | ||
254 | |||
255 | /* If barrier was found, perform the actual barrier */ | ||
256 |
2/2✓ Branch 0 taken 43 times.
✓ Branch 1 taken 4 times.
|
47 | if (barrier != NULL) { |
257 | instrument_event(RUNTIME_EVENT, EVENT_BARRIER, EVENT_BEGIN); | ||
258 | 43 | shmem_barrier__barrier(barrier); | |
259 | instrument_event(RUNTIME_EVENT, EVENT_BARRIER, EVENT_END); | ||
260 | 43 | error = DLB_SUCCESS; | |
261 | } else { | ||
262 | /* barrier not found in barrier_info, possibly a detached barrier */ | ||
263 | 4 | error = DLB_NOUPDT; | |
264 | } | ||
265 | } else { | ||
266 | 2 | error = DLB_ERR_NOCOMP; | |
267 | } | ||
268 | |||
269 | 49 | return error; | |
270 | } | ||
271 | |||
272 | 11 | int node_barrier_attach(subprocess_descriptor_t *spd, barrier_t *barrier) { | |
273 | int error; | ||
274 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 2 times.
|
11 | if (spd->options.barrier) { |
275 | 9 | barrier_info_t *barrier_info = spd->barrier_info; | |
276 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | if (barrier == NULL) { |
277 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 2 times.
|
3 | if (barrier_info->default_barrier == NULL) { |
278 | /* Register default barrier again */ | ||
279 | 1 | bool lewi_barrier = parse_lewi_barrier(default_barrier_name, | |
280 | 1 | spd->options.lewi_barrier, | |
281 | 1 | spd->options.lewi_barrier_select, 0); | |
282 | 2 | barrier_info->default_barrier = shmem_barrier__register( | |
283 | 1 | barrier_info->default_barrier_name, | |
284 | lewi_barrier); | ||
285 | // return number of participants | ||
286 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | error = barrier_info->default_barrier ? 1 : DLB_ERR_NOMEM; |
287 | } else { | ||
288 | /* Default barrier already attached */ | ||
289 | 2 | error = DLB_ERR_PERM; | |
290 | } | ||
291 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | } else if (unlikely(barrier == barrier_info->default_barrier)) { |
292 | /* barrier provided is the default barrier, already attached. | ||
293 | * (default_barrier pointer is never exposed, keep this one just in case) */ | ||
294 | ✗ | error = DLB_ERR_PERM; | |
295 | } else { | ||
296 | 6 | int i = 0; | |
297 | 6 | int max_barriers = shmem_barrier__get_max_barriers(); | |
298 | 6 | pthread_mutex_lock(&mutex); | |
299 | { | ||
300 | /* Find first NULL place or barrier in barrier_list */ | ||
301 | 6 | while (i<max_barriers | |
302 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
|
8 | && barrier_info->barrier_list[i] != NULL |
303 |
3/4✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 3 times.
|
13 | && barrier_info->barrier_list[i] != barrier) { |
304 | 2 | ++i; | |
305 | } | ||
306 | |||
307 |
3/4✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3 times.
|
6 | if (i < max_barriers && barrier_info->barrier_list[i] == barrier) { |
308 | /* Already in the barrier_list */ | ||
309 | 3 | error = DLB_ERR_PERM; | |
310 | } else { | ||
311 | /* Attach */ | ||
312 | 3 | error = shmem_barrier__attach(barrier); | |
313 | |||
314 | /* Add barrier to the barrier_list */ | ||
315 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (error >= 0) { |
316 | 3 | barrier_info->barrier_list[i] = barrier; | |
317 | } | ||
318 | } | ||
319 | } | ||
320 | 6 | pthread_mutex_unlock(&mutex); | |
321 | } | ||
322 | } else { | ||
323 | /* no --barrier */ | ||
324 | 2 | error = DLB_ERR_NOCOMP; | |
325 | } | ||
326 | |||
327 | 11 | return error; | |
328 | } | ||
329 | |||
330 | 17 | int node_barrier_detach(subprocess_descriptor_t *spd, barrier_t *barrier) { | |
331 | int error; | ||
332 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 2 times.
|
17 | if (spd->options.barrier) { |
333 | 15 | barrier_info_t *barrier_info = spd->barrier_info; | |
334 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 9 times.
|
15 | if (barrier == NULL) { |
335 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3 times.
|
6 | if (barrier_info->default_barrier != NULL) { |
336 | /* Detach default barrier */ | ||
337 | 3 | error = shmem_barrier__detach(barrier_info->default_barrier); | |
338 |
1/2✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
|
3 | if (error >= 0) { |
339 | 3 | barrier_info->default_barrier = NULL; | |
340 | } | ||
341 | } else { | ||
342 | /* Default barrier already detached */ | ||
343 | 3 | error = DLB_ERR_PERM; | |
344 | } | ||
345 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
|
9 | } else if (unlikely(barrier == barrier_info->default_barrier)) { |
346 | /* Detach default barrier. | ||
347 | * (default_barrier pointer is never exposed, keep this one just in case) */ | ||
348 | ✗ | error = shmem_barrier__detach(barrier_info->default_barrier); | |
349 | ✗ | if (error >= 0) { | |
350 | ✗ | barrier_info->default_barrier = NULL; | |
351 | } | ||
352 | } else { | ||
353 | 9 | int i = 0; | |
354 | 9 | int max_barriers = shmem_barrier__get_max_barriers(); | |
355 | 9 | pthread_mutex_lock(&mutex); | |
356 | { | ||
357 | /* Find first NULL place or barrier in barrier_list */ | ||
358 | 9 | while (i<max_barriers | |
359 |
2/2✓ Branch 0 taken 11 times.
✓ Branch 1 taken 1 times.
|
12 | && barrier_info->barrier_list[i] != NULL |
360 |
3/4✓ Branch 0 taken 12 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 8 times.
|
23 | && barrier_info->barrier_list[i] != barrier) { |
361 | 3 | ++i; | |
362 | } | ||
363 | |||
364 |
3/4✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 8 times.
|
9 | if (i == max_barriers || barrier_info->barrier_list[i] == NULL) { |
365 | /* Not found in barrier_list */ | ||
366 | 1 | error = DLB_ERR_PERM; | |
367 | } else { | ||
368 | /* Detach */ | ||
369 | 8 | error = shmem_barrier__detach(barrier); | |
370 | |||
371 | /* Remove barrier from the barrier_list */ | ||
372 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | if (error >= 0) { |
373 | 8 | memmove(&barrier_info->barrier_list[i], | |
374 | 8 | &barrier_info->barrier_list[i+1], | |
375 | 8 | sizeof(barrier_info->barrier_list[0]) * (max_barriers-1-i)); | |
376 | 8 | barrier_info->barrier_list[max_barriers-1] = NULL; | |
377 | } | ||
378 | } | ||
379 | } | ||
380 | 9 | pthread_mutex_unlock(&mutex); | |
381 | } | ||
382 | } else { | ||
383 | /* no --barrier */ | ||
384 | 2 | error = DLB_ERR_NOCOMP; | |
385 | } | ||
386 | |||
387 | 17 | return error; | |
388 | } | ||
389 |