#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying using non-globally-ordered stores
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration
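// ---------------------------- Barrier Algorithms ---------------------------
// Linear barrier: each worker signals its own b_arrived flag and the primary
// thread (tid 0) waits on every worker's flag in turn, folding reduce_data in
// as it goes; release is the mirror image, with the primary writing each
// worker's b_go flag. O(nproc) work on the primary, one flag per worker.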
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
69 if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
80 ANNOTATE_BARRIER_BEGIN(this_thr);
81 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
84 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85 int nproc = this_thr->th.th_team_nproc;
88 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 for (i = 1; i < nproc; ++i) {
95 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
      KA_TRACE(20,
               ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                team->t.t_id, i,
                &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for the worker thread to arrive; the cancellable variant bails
      // out if the wait observes a pending cancellation.
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY 118 if (__kmp_forkjoin_frames_mode == 2) {
119 this_thr->th.th_bar_min_time = KMP_MIN(
120 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
125 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
128 ANNOTATE_REDUCE_AFTER(reduce);
129 OMPT_REDUCTION_DECL(this_thr, gtid);
130 OMPT_REDUCTION_BEGIN;
131 (*reduce)(this_thr->th.th_local.reduce_data,
132 other_threads[i]->th.th_local.reduce_data);
134 ANNOTATE_REDUCE_BEFORE(reduce);
135 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
139 team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
160 if (KMP_MASTER_TID(tid)) {
162 kmp_uint32 nproc = this_thr->th.th_team_nproc;
163 kmp_info_t **other_threads;
165 team = __kmp_threads[gtid]->th.th_team;
166 KMP_DEBUG_ASSERT(team != NULL);
167 other_threads = team->t.t_threads;
169 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " 171 gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    {
      KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
      if (propagate_icvs) {
        ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
        for (i = 1; i < nproc; ++i) {
          __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team,
                                   i, FALSE);
          ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                         &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
        ngo_sync();
      }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release all of the worker threads
    for (i = 1; i < nproc; ++i) {
195 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
199 (
"__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " 200 "go(%p): %u => %u\n",
201 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203 other_threads[i]->th.th_bar[bt].bb.b_go,
204 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
      kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                         other_threads[i]);
      flag.release();
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
229 __kmp_itt_task_starting(itt_sync_obj);
231 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235 if (itt_sync_obj != NULL)
237 __kmp_itt_task_finished(itt_sync_obj);
241 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
245 tid = __kmp_tid_from_gtid(gtid);
246 team = __kmp_threads[gtid]->th.th_team;
248 KMP_DEBUG_ASSERT(team != NULL);
249 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
251 (
"__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
257 (
"__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258 gtid, team->t.t_id, tid, bt));
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
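// Tree barrier: threads are arranged in a tree with branch_factor =
// 1 << branch_bits children per node. In the gather phase each node waits on
// its children's b_arrived flags (reducing as it goes) and then signals its
// parent; the release phase fans the b_go signal back down the same tree.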
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
294 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
295 kmp_team_t *team = this_thr->th.th_team;
296 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
297 kmp_info_t **other_threads = team->t.t_threads;
298 kmp_uint32 nproc = this_thr->th.th_team_nproc;
299 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
300 kmp_uint32 branch_factor = 1 << branch_bits;
302 kmp_uint32 child_tid;
303 kmp_uint64 new_state = 0;
  KA_TRACE(
      20,
      ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
319 child_tid = (tid << branch_bits) + 1;
320 if (child_tid < nproc) {
322 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
325 kmp_info_t *child_thr = other_threads[child_tid];
326 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
329 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
331 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
334 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 335 "arrived(%p) == %llu\n",
336 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
337 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
339 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
340 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
341 ANNOTATE_BARRIER_END(child_thr);
342 #if USE_ITT_BUILD && USE_ITT_NOTIFY 345 if (__kmp_forkjoin_frames_mode == 2) {
346 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
347 child_thr->th.th_bar_min_time);
352 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
353 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
354 team->t.t_id, child_tid));
355 ANNOTATE_REDUCE_AFTER(reduce);
356 OMPT_REDUCTION_DECL(this_thr, gtid);
357 OMPT_REDUCTION_BEGIN;
358 (*reduce)(this_thr->th.th_local.reduce_data,
359 child_thr->th.th_local.reduce_data);
361 ANNOTATE_REDUCE_BEFORE(reduce);
362 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
369 if (!KMP_MASTER_TID(tid)) {
370 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
373 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 374 "arrived(%p): %llu => %llu\n",
375 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
376 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
377 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
383 ANNOTATE_BARRIER_BEGIN(this_thr);
384 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
389 team->t.t_bar[bt].b_arrived = new_state;
391 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
392 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " 393 "arrived(%p) = %llu\n",
394 gtid, team->t.t_id, tid, team->t.t_id,
395 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
398 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
399 gtid, team->t.t_id, tid, bt));
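// Tree release: the primary wakes its direct children, and each released
// thread in turn wakes its own subtree. When propagate_icvs is set, a parent
// initializes each child's implicit task and copies the team's ICVs into it
// before setting the child's b_go flag.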
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
405 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
407 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
409 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
410 kmp_uint32 branch_factor = 1 << branch_bits;
412 kmp_uint32 child_tid;
417 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
418 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
420 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
421 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
422 ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
427 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
429 __kmp_itt_task_starting(itt_sync_obj);
431 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
434 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
435 if (itt_sync_obj != NULL)
437 __kmp_itt_task_finished(itt_sync_obj);
441 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445 team = __kmp_threads[gtid]->th.th_team;
446 KMP_DEBUG_ASSERT(team != NULL);
447 tid = __kmp_tid_from_gtid(gtid);
449 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
451 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
452 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
455 team = __kmp_threads[gtid]->th.th_team;
456 KMP_DEBUG_ASSERT(team != NULL);
457 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " 459 gtid, team->t.t_id, tid, bt));
461 nproc = this_thr->th.th_team_nproc;
462 child_tid = (tid << branch_bits) + 1;
464 if (child_tid < nproc) {
465 kmp_info_t **other_threads = team->t.t_threads;
469 kmp_info_t *child_thr = other_threads[child_tid];
470 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
473 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
475 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
497 ANNOTATE_BARRIER_BEGIN(child_thr);
498 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
505 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
506 gtid, team->t.t_id, tid, bt));
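// Hypercube-embedded tree (hyper) barrier: on each pass, threads whose tid is
// nonzero in the current radix-(branch_factor) digit report to the parent
// obtained by clearing that digit, while the remaining threads gather their
// children for that level. The number of passes grows logarithmically in
// th_team_nproc with base branch_factor.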
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514 kmp_team_t *team = this_thr->th.th_team;
515 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516 kmp_info_t **other_threads = team->t.t_threads;
517 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520 kmp_uint32 branch_factor = 1 << branch_bits;
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
539 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
540 for (level = 0, offset = 1; offset < num_threads;
541 level += branch_bits, offset <<= branch_bits) {
543 kmp_uint32 child_tid;
545 if (((tid >> level) & (branch_factor - 1)) != 0) {
546 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
550 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 551 "arrived(%p): %llu => %llu\n",
552 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
553 team->t.t_id, parent_tid, &thr_bar->b_arrived,
555 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560 ANNOTATE_BARRIER_BEGIN(this_thr);
561 p_flag.set_waiter(other_threads[parent_tid]);
567 if (new_state == KMP_BARRIER_UNUSED_STATE)
568 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
569 for (child = 1, child_tid = tid + (1 << level);
570 child < branch_factor && child_tid < num_threads;
571 child++, child_tid += (1 << level)) {
572 kmp_info_t *child_thr = other_threads[child_tid];
573 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
575 kmp_uint32 next_child_tid = child_tid + (1 << level);
577 if (child + 1 < branch_factor && next_child_tid < num_threads)
579 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
582 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 583 "arrived(%p) == %llu\n",
584 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
585 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
587 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
588 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
589 ANNOTATE_BARRIER_END(child_thr);
591 #if USE_ITT_BUILD && USE_ITT_NOTIFY 594 if (__kmp_forkjoin_frames_mode == 2) {
595 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
596 child_thr->th.th_bar_min_time);
601 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
602 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
603 team->t.t_id, child_tid));
604 ANNOTATE_REDUCE_AFTER(reduce);
605 OMPT_REDUCTION_DECL(this_thr, gtid);
606 OMPT_REDUCTION_BEGIN;
607 (*reduce)(this_thr->th.th_local.reduce_data,
608 child_thr->th.th_local.reduce_data);
610 ANNOTATE_REDUCE_BEFORE(reduce);
611 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616 if (KMP_MASTER_TID(tid)) {
618 if (new_state == KMP_BARRIER_UNUSED_STATE)
619 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
621 team->t.t_bar[bt].b_arrived = new_state;
622 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 623 "arrived(%p) = %llu\n",
624 gtid, team->t.t_id, tid, team->t.t_id,
625 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
629 gtid, team->t.t_id, tid, bt));
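// Hyper barrier release. With KMP_REVERSE_HYPER_BAR defined, children are
// released in the reverse of the gather order: the thread descends from the
// highest level it owns down to level 0 and, within each level, walks child
// offsets from largest to smallest.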
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
637 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
639 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
640 kmp_info_t **other_threads;
641 kmp_uint32 num_threads;
642 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
643 kmp_uint32 branch_factor = 1 << branch_bits;
645 kmp_uint32 child_tid;
653 if (KMP_MASTER_TID(tid)) {
654 team = __kmp_threads[gtid]->th.th_team;
655 KMP_DEBUG_ASSERT(team != NULL);
656 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " 658 gtid, team->t.t_id, tid, bt));
659 #if KMP_BARRIER_ICV_PUSH 660 if (propagate_icvs) {
661 copy_icvs(&thr_bar->th_fixed_icvs,
662 &team->t.t_implicit_task_taskdata[tid].td_icvs);
666 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
667 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
669 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
670 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
671 ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
675 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
677 __kmp_itt_task_starting(itt_sync_obj);
679 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
683 if (itt_sync_obj != NULL)
685 __kmp_itt_task_finished(itt_sync_obj);
689 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
693 team = __kmp_threads[gtid]->th.th_team;
694 KMP_DEBUG_ASSERT(team != NULL);
695 tid = __kmp_tid_from_gtid(gtid);
697 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
699 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
700 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703 num_threads = this_thr->th.th_team_nproc;
704 other_threads = team->t.t_threads;
#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to the correct level for the parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;
  // Now go down from there, level by level
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
730 if (((tid >> level) & (branch_factor - 1)) != 0)
735 for (child = 1, child_tid = tid + (1 << level);
736 child < branch_factor && child_tid < num_threads;
737 child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
743 kmp_info_t *child_thr = other_threads[child_tid];
744 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
746 kmp_uint32 next_child_tid = child_tid - (1 << level);
#ifdef KMP_REVERSE_HYPER_BAR
      if (child - 1 >= 1 && next_child_tid < num_threads)
#else
      if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#if KMP_BARRIER_ICV_PUSH
      if (propagate_icvs) // push my fixed ICVs to my child
        copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(
          20,
          ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
           "go(%p): %u => %u\n",
           gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
           team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
           child_bar->b_go + KMP_BARRIER_STATE_BUMP));
770 ANNOTATE_BARRIER_BEGIN(child_thr);
771 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to the final destination
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                             tid, FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
787 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
788 gtid, team->t.t_id, tid, bt));
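// Hierarchical (machine-tree) barrier. __kmp_init_hierarchical_barrier_thread
// (re)computes this thread's position in the hierarchy produced by
// __kmp_get_hierarchy: its level, its parent tid, and the number of "leaf
// kids" that share its lowest-level subtree. Leaf children are tracked as
// individual bytes inside the 64-bit leaf_state word, so a parent can wait
// for all of them with a single flag wait.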
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
806 bool uninitialized = thr_bar->team == NULL;
807 bool team_changed = team != thr_bar->team;
808 bool team_sz_changed = nproc != thr_bar->nproc;
809 bool tid_changed = tid != thr_bar->old_tid;
812 if (uninitialized || team_sz_changed) {
813 __kmp_get_hierarchy(nproc, thr_bar);
816 if (uninitialized || team_sz_changed || tid_changed) {
817 thr_bar->my_level = thr_bar->depth - 1;
818 thr_bar->parent_tid = -1;
819 if (!KMP_MASTER_TID(tid)) {
822 while (d < thr_bar->depth) {
825 if (d == thr_bar->depth - 2) {
826 thr_bar->parent_tid = 0;
827 thr_bar->my_level = d;
829 }
else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
832 thr_bar->parent_tid = tid - rem;
833 thr_bar->my_level = d;
839 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
840 (thr_bar->skip_per_level[thr_bar->my_level])),
842 thr_bar->old_tid = tid;
843 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844 thr_bar->team = team;
845 thr_bar->parent_bar =
846 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
848 if (uninitialized || team_changed || tid_changed) {
849 thr_bar->team = team;
850 thr_bar->parent_bar =
851 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854 if (uninitialized || team_sz_changed || tid_changed) {
855 thr_bar->nproc = nproc;
856 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857 if (thr_bar->my_level == 0)
858 thr_bar->leaf_kids = 0;
859 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
861 thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
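// Gather for the hierarchical barrier. With the default (infinite) blocktime
// and the on-core optimization enabled, leaf children check in by setting
// their byte in the parent's b_arrived word, so the parent needs only one
// flag wait per leaf group; higher levels fall back to per-child flag waits.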
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872 kmp_team_t *team = this_thr->th.th_team;
873 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874 kmp_uint32 nproc = this_thr->th.th_team_nproc;
875 kmp_info_t **other_threads = team->t.t_threads;
876 kmp_uint64 new_state = 0;
  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0;
888 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " 890 gtid, team->t.t_id, tid, bt));
891 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY 895 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
900 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903 if (thr_bar->my_level) {
906 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908 thr_bar->use_oncore_barrier) {
909 if (thr_bar->leaf_kids) {
911 kmp_uint64 leaf_state =
913 ? thr_bar->b_arrived | thr_bar->leaf_state
914 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " 917 gtid, team->t.t_id, tid));
918 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
919 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
921 ANNOTATE_REDUCE_AFTER(reduce);
922 OMPT_REDUCTION_DECL(this_thr, gtid);
923 OMPT_REDUCTION_BEGIN;
924 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
926 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 928 gtid, team->t.t_id, tid,
929 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
931 ANNOTATE_BARRIER_END(other_threads[child_tid]);
932 (*reduce)(this_thr->th.th_local.reduce_data,
933 other_threads[child_tid]->th.th_local.reduce_data);
936 ANNOTATE_REDUCE_BEFORE(reduce);
937 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943 for (kmp_uint32 d = 1; d < thr_bar->my_level;
945 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946 skip = thr_bar->skip_per_level[d];
949 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950 kmp_info_t *child_thr = other_threads[child_tid];
951 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 954 "arrived(%p) == %llu\n",
955 gtid, team->t.t_id, tid,
956 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957 child_tid, &child_bar->b_arrived, new_state));
958 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
959 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960 ANNOTATE_BARRIER_END(child_thr);
962 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 964 gtid, team->t.t_id, tid,
965 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
967 ANNOTATE_REDUCE_AFTER(reduce);
968 (*reduce)(this_thr->th.th_local.reduce_data,
969 child_thr->th.th_local.reduce_data);
970 ANNOTATE_REDUCE_BEFORE(reduce);
971 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
976 for (kmp_uint32 d = 0; d < thr_bar->my_level;
978 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979 skip = thr_bar->skip_per_level[d];
982 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983 kmp_info_t *child_thr = other_threads[child_tid];
984 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 987 "arrived(%p) == %llu\n",
988 gtid, team->t.t_id, tid,
989 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990 child_tid, &child_bar->b_arrived, new_state));
991 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
992 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993 ANNOTATE_BARRIER_END(child_thr);
995 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 997 gtid, team->t.t_id, tid,
998 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1000 ANNOTATE_REDUCE_AFTER(reduce);
1001 (*reduce)(this_thr->th.th_local.reduce_data,
1002 child_thr->th.th_local.reduce_data);
1003 ANNOTATE_REDUCE_BEFORE(reduce);
1004 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1012 if (!KMP_MASTER_TID(tid)) {
1013 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" 1014 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015 gtid, team->t.t_id, tid,
1016 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1022 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023 !thr_bar->use_oncore_barrier) {
1025 ANNOTATE_BARRIER_BEGIN(this_thr);
1026 kmp_flag_64<> flag(&thr_bar->b_arrived,
1027 other_threads[thr_bar->parent_tid]);
1031 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1032 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1033 thr_bar->offset + 1);
1034 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1038 team->t.t_bar[bt].b_arrived = new_state;
1039 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " 1040 "arrived(%p) = %llu\n",
1041 gtid, team->t.t_id, tid, team->t.t_id,
1042 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1045 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1046 "barrier type %d\n",
1047 gtid, team->t.t_id, tid, bt));
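// Release for the hierarchical barrier. A non-primary thread may be woken
// either through its own b_go flag or, in on-core mode, through its byte in
// the parent's b_go word (kmp_flag_oncore); the wait_flag field records which
// of the two it is currently blocked on. ICVs are pushed down through the
// th_fixed_icvs staging area as the wake-up propagates.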
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1055 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1057 bool team_change =
false;
1059 if (KMP_MASTER_TID(tid)) {
1060 team = __kmp_threads[gtid]->th.th_team;
1061 KMP_DEBUG_ASSERT(team != NULL);
1062 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 1063 "entered barrier type %d\n",
1064 gtid, team->t.t_id, tid, bt));
1067 if (!thr_bar->use_oncore_barrier ||
1068 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069 thr_bar->team == NULL) {
1071 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074 ANNOTATE_BARRIER_END(this_thr);
1075 TCW_8(thr_bar->b_go,
1076 KMP_INIT_BARRIER_STATE);
1080 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082 thr_bar->offset + 1, bt,
1083 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084 flag.wait(this_thr, TRUE);
1085 if (thr_bar->wait_flag ==
1086 KMP_BARRIER_SWITCHING) {
1087 TCW_8(thr_bar->b_go,
1088 KMP_INIT_BARRIER_STATE);
1090 (RCAST(
volatile char *,
1091 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1094 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1096 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1099 team = __kmp_threads[gtid]->th.th_team;
1100 KMP_DEBUG_ASSERT(team != NULL);
1101 tid = __kmp_tid_from_gtid(gtid);
1105 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1110 nproc = this_thr->th.th_team_nproc;
1111 int level = team->t.t_level;
1112 if (team->t.t_threads[0]
1113 ->th.th_teams_microtask) {
1114 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115 this_thr->th.th_teams_level == level)
1117 if (this_thr->th.th_teams_size.nteams > 1)
1121 thr_bar->use_oncore_barrier = 1;
1123 thr_bar->use_oncore_barrier = 0;
1127 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1135 #if KMP_BARRIER_ICV_PUSH 1136 if (propagate_icvs) {
1137 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1141 copy_icvs(&thr_bar->th_fixed_icvs,
1142 &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) {
1145 if (!thr_bar->my_level)
1148 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149 &thr_bar->parent_bar->th_fixed_icvs);
1152 if (thr_bar->my_level)
1154 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1156 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157 &thr_bar->parent_bar->th_fixed_icvs);
1160 #endif // KMP_BARRIER_ICV_PUSH 1163 if (thr_bar->my_level) {
1164 kmp_int32 child_tid;
1166 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167 thr_bar->use_oncore_barrier) {
1168 if (KMP_MASTER_TID(tid)) {
1171 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1174 ngo_load(&thr_bar->th_fixed_icvs);
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
1179 kmp_bstate_t *child_bar =
1180 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1182 "releasing T#%d(%d:%d)" 1183 " go(%p): %u => %u\n",
1184 gtid, team->t.t_id, tid,
1185 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186 child_tid, &child_bar->b_go, child_bar->b_go,
1187 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1190 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1194 TCW_8(thr_bar->b_go,
1195 KMP_INIT_BARRIER_STATE);
1197 if (thr_bar->leaf_kids) {
1200 old_leaf_kids < thr_bar->leaf_kids) {
1201 if (old_leaf_kids) {
1202 thr_bar->b_go |= old_leaf_state;
1205 last = tid + thr_bar->skip_per_level[1];
1208 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1210 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1214 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1215 " T#%d(%d:%d) go(%p): %u => %u\n",
1216 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1220 ANNOTATE_BARRIER_BEGIN(child_thr);
1221 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1226 thr_bar->b_go |= thr_bar->leaf_state;
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release children going down the hierarchy
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
1236 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1240 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241 gtid, team->t.t_id, tid,
1242 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243 child_tid, &child_bar->b_go, child_bar->b_go,
1244 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1246 ANNOTATE_BARRIER_BEGIN(child_thr);
1247 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1252 #if KMP_BARRIER_ICV_PUSH 1253 if (propagate_icvs && !KMP_MASTER_TID(tid))
1255 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256 &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH 1259 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1260 "barrier type %d\n",
1261 gtid, team->t.t_id, tid, bt));
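// Helper type used by the barrier entry points below: is_cancellable<true>
// carries a real boolean so the cancellable code paths can report whether the
// barrier was cancelled, while is_cancellable<false> is an empty type whose
// conversion to bool is constexpr false, letting the compiler drop the
// cancellation branches from the non-cancellable instantiation.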
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
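// Internal barrier entry point. Runs the gather phase with the pattern
// selected by __kmp_barrier_gather_pattern[bt], lets the primary thread do the
// task-team bookkeeping and cancellation check, and then performs the release
// phase (always for a non-split barrier) with the pattern selected by
// __kmp_barrier_release_pattern[bt]. Returns nonzero if a cancellable barrier
// observed a pending cancellation.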
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
1297 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299 int tid = __kmp_tid_from_gtid(gtid);
1300 kmp_info_t *this_thr = __kmp_threads[gtid];
1301 kmp_team_t *team = this_thr->th.th_team;
1303 is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif
1311 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1314 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1316 if (ompt_enabled.enabled) {
1318 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322 if (ompt_enabled.ompt_callback_sync_region) {
1323 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1327 if (ompt_enabled.ompt_callback_sync_region_wait) {
1328 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1336 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1340 if (!team->t.t_serialized) {
1343 void *itt_sync_obj = NULL;
1345 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1349 if (__kmp_tasking_mode == tskm_extra_barrier) {
1350 __kmp_tasking_barrier(team, this_thr, gtid);
1352 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1360 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1362 this_thr->th.th_team_bt_intervals =
1363 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364 this_thr->th.th_team_bt_set =
1365 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1367 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1372 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1377 if (KMP_MASTER_TID(tid)) {
1378 team->t.t_bar[bt].b_master_arrived += 1;
1380 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1383 if (reduce != NULL) {
1385 this_thr->th.th_local.reduce_data = reduce_data;
1388 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1390 __kmp_task_team_setup(this_thr, team, 0);
1393 cancelled = __kmp_linear_barrier_gather_cancellable(
1394 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1396 switch (__kmp_barrier_gather_pattern[bt]) {
1397 case bp_hyper_bar: {
1399 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1404 case bp_hierarchical_bar: {
1405 __kmp_hierarchical_barrier_gather(
1406 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1411 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1417 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1425 if (KMP_MASTER_TID(tid)) {
1427 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1433 team->t.t_bar[bt].b_team_arrived += 1;
1436 if (__kmp_omp_cancellation) {
1437 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1439 if (cancel_request == cancel_loop ||
1440 cancel_request == cancel_sections) {
1441 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1449 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1454 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455 __kmp_forkjoin_frames_mode &&
1456 (this_thr->th.th_teams_microtask == NULL ||
1457 this_thr->th.th_teams_size.nteams == 1) &&
1458 team->t.t_active_level == 1) {
1459 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460 kmp_uint64 cur_time = __itt_get_timestamp();
1461 kmp_info_t **other_threads = team->t.t_threads;
1462 int nproc = this_thr->th.th_team_nproc;
1464 switch (__kmp_forkjoin_frames_mode) {
1466 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1468 this_thr->th.th_frame_time = cur_time;
1472 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1476 if (__itt_metadata_add_ptr) {
1478 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1481 this_thr->th.th_bar_arrive_time = 0;
1482 for (i = 1; i < nproc; ++i) {
1483 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484 other_threads[i]->th.th_bar_arrive_time = 0;
1486 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1488 (kmp_uint64)(reduce != NULL));
1490 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1492 this_thr->th.th_frame_time = cur_time;
1500 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1504 if ((status == 1 || !is_split) && !cancelled) {
1506 cancelled = __kmp_linear_barrier_release_cancellable(
1507 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1509 switch (__kmp_barrier_release_pattern[bt]) {
1510 case bp_hyper_bar: {
1511 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516 case bp_hierarchical_bar: {
1517 __kmp_hierarchical_barrier_release(
1518 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1522 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1528 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1533 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534 __kmp_task_team_sync(this_thr, team);
1542 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1547 if (__kmp_tasking_mode != tskm_immediate_exec) {
1548 if (this_thr->th.th_task_team != NULL) {
1550 void *itt_sync_obj = NULL;
1551 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1557 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1559 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560 __kmp_task_team_setup(this_thr, team, 0);
1563 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1569 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571 __kmp_tid_from_gtid(gtid), status));
1574 if (ompt_enabled.enabled) {
1576 if (ompt_enabled.ompt_callback_sync_region_wait) {
1577 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581 if (ompt_enabled.ompt_callback_sync_region) {
1582 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1587 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590 ANNOTATE_BARRIER_END(&team->t.t_bar);
  return (int)cancelled;
}
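// Public entry points. __kmp_barrier is the plain, non-cancellable barrier,
// invoked for example as __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL,
// NULL) (see __kmp_barrier_gomp_cancel below). __kmp_barrier_gomp_cancel wraps
// the cancellable instantiation for the GOMP compatibility layer and, when a
// cancellation is observed, undoes the primary thread's b_arrived bump so the
// team can pass the barrier again.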
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
#if defined(KMP_GOMP_COMPAT)
int __kmp_barrier_gomp_cancel(int gtid) {
1608 if (__kmp_omp_cancellation) {
1609 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1612 int tid = __kmp_tid_from_gtid(gtid);
1613 kmp_info_t *this_thr = __kmp_threads[gtid];
1614 if (KMP_MASTER_TID(tid)) {
1618 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619 KMP_BARRIER_STATE_BUMP;
1624 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
1633 int tid = __kmp_tid_from_gtid(gtid);
1634 kmp_info_t *this_thr = __kmp_threads[gtid];
1635 kmp_team_t *team = this_thr->th.th_team;
1637 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1638 if (!team->t.t_serialized) {
1639 if (KMP_MASTER_GTID(gtid)) {
1640 switch (__kmp_barrier_release_pattern[bt]) {
1641 case bp_hyper_bar: {
1642 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1643 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1644 FALSE USE_ITT_BUILD_ARG(NULL));
1647 case bp_hierarchical_bar: {
1648 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1649 FALSE USE_ITT_BUILD_ARG(NULL));
1653 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1654 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1655 FALSE USE_ITT_BUILD_ARG(NULL));
1659 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1660 FALSE USE_ITT_BUILD_ARG(NULL));
1663 if (__kmp_tasking_mode != tskm_immediate_exec) {
1664 __kmp_task_team_sync(this_thr, team);
1668 ANNOTATE_BARRIER_END(&team->t.t_bar);
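// __kmp_join_barrier: the barrier executed at the end of a parallel region.
// All team threads gather with the fork/join barrier pattern; only the gather
// phase appears here, because the matching release happens in
// __kmp_fork_barrier when the next parallel region starts (or when the
// threads are reaped).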
void __kmp_join_barrier(int gtid) {
1672 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1673 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1677 kmp_info_t *this_thr = __kmp_threads[gtid];
1680 kmp_info_t *master_thread;
1686 void *itt_sync_obj = NULL;
1688 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1690 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1696 team = this_thr->th.th_team;
1697 nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1699 tid = __kmp_tid_from_gtid(gtid);
1701 team_id = team->t.t_id;
1703 master_thread = this_thr->th.th_team_master;
1705 if (master_thread != team->t.t_threads[0]) {
1706 __kmp_print_structure();
1709 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1713 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1714 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1715 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1716 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1717 gtid, team_id, tid));
1719 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1721 if (ompt_enabled.enabled) {
1723 ompt_data_t *my_task_data;
1724 ompt_data_t *my_parallel_data;
1725 void *codeptr = NULL;
1726 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1727 if (KMP_MASTER_TID(ds_tid) &&
1728 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1729 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1730 codeptr = team->t.ompt_team_info.master_return_address;
1731 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1732 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1733 if (ompt_enabled.ompt_callback_sync_region) {
1734 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1735 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1736 my_task_data, codeptr);
1738 if (ompt_enabled.ompt_callback_sync_region_wait) {
1739 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1740 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1741 my_task_data, codeptr);
1743 if (!KMP_MASTER_TID(ds_tid))
1744 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1746 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1750 if (__kmp_tasking_mode == tskm_extra_barrier) {
1751 __kmp_tasking_barrier(team, this_thr, gtid);
1752 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1756 if (__kmp_tasking_mode != tskm_immediate_exec) {
1757 KA_TRACE(20, (
"__kmp_join_barrier: T#%d, old team = %d, old task_team = " 1758 "%p, th_task_team = %p\n",
1759 __kmp_gtid_from_thread(this_thr), team_id,
1760 team->t.t_task_team[this_thr->th.th_task_state],
1761 this_thr->th.th_task_team));
1762 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1763 team->t.t_task_team[this_thr->th.th_task_state]);
1772 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1774 this_thr->th.th_team_bt_intervals =
1775 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1776 this_thr->th.th_team_bt_set =
1777 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1779 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1784 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1785 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1788 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1789 case bp_hyper_bar: {
1790 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1791 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1792 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1795 case bp_hierarchical_bar: {
1796 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1801 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1802 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1803 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1807 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1808 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1816 if (KMP_MASTER_TID(tid)) {
1817 if (__kmp_tasking_mode != tskm_immediate_exec) {
1818 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1820 if (__kmp_display_affinity) {
1821 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
#if KMP_STATS_ENABLED
    // Mark the workers idle and wake any that are sleeping so they can record
    // the state change.
    for (int i = 0; i < team->t.t_nproc; ++i) {
1828 kmp_info_t *team_thread = team->t.t_threads[i];
1829 if (team_thread == this_thr)
1831 team_thread->th.th_stats->setIdleFlag();
1832 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1833 team_thread->th.th_sleep_loc != NULL)
1834 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1835 team_thread->th.th_sleep_loc);
1839 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1840 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1843 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1845 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1846 __kmp_forkjoin_frames_mode &&
1847 (this_thr->th.th_teams_microtask == NULL ||
1848 this_thr->th.th_teams_size.nteams == 1) &&
1849 team->t.t_active_level == 1) {
1850 kmp_uint64 cur_time = __itt_get_timestamp();
1851 ident_t *loc = team->t.t_ident;
1852 kmp_info_t **other_threads = team->t.t_threads;
1853 int nproc = this_thr->th.th_team_nproc;
1855 switch (__kmp_forkjoin_frames_mode) {
1857 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1861 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1865 if (__itt_metadata_add_ptr) {
1867 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1870 this_thr->th.th_bar_arrive_time = 0;
1871 for (i = 1; i < nproc; ++i) {
1872 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1873 other_threads[i]->th.th_bar_arrive_time = 0;
1875 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1876 cur_time, delta, 0);
1878 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1880 this_thr->th.th_frame_time = cur_time;
1888 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1894 if (KMP_MASTER_TID(tid)) {
1897 (
"__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1898 gtid, team_id, tid, nproc));
1905 (
"__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1907 ANNOTATE_BARRIER_END(&team->t.t_bar);
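// __kmp_fork_barrier: executed by the primary thread at the start of a
// parallel region and by workers waiting to be released into it. The primary
// sets up the task team and sanity-checks the fork/join b_go flags; workers
// sit in the release pattern, then (with KMP_BARRIER_ICV_PULL) pull their
// ICVs from the primary's th_fixed_icvs and apply affinity and display
// settings before entering the region.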
void __kmp_fork_barrier(int gtid, int tid) {
1913 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1914 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1915 kmp_info_t *this_thr = __kmp_threads[gtid];
1916 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1918 void *itt_sync_obj = NULL;
1921 ANNOTATE_BARRIER_END(&team->t.t_bar);
1923 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1924 (team != NULL) ? team->t.t_id : -1, tid));
1927 if (KMP_MASTER_TID(tid)) {
1928 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1929 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1931 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1932 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1937 KMP_DEBUG_ASSERT(team);
1938 kmp_info_t **other_threads = team->t.t_threads;
1944 for (i = 1; i < team->t.t_nproc; ++i) {
1946 (
"__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 1948 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1949 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1950 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1952 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1953 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1954 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1958 if (__kmp_tasking_mode != tskm_immediate_exec) {
1960 __kmp_task_team_setup(this_thr, team, 0);
1969 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1971 this_thr->th.th_team_bt_intervals =
1972 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1973 this_thr->th.th_team_bt_set =
1974 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1976 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1981 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1982 case bp_hyper_bar: {
1983 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1984 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1985 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1988 case bp_hierarchical_bar: {
1989 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1990 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1994 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1995 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1996 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2000 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2001 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2006 if (ompt_enabled.enabled &&
2007 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2008 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2009 ompt_data_t *task_data = (team)
2010 ? OMPT_CUR_TASK_DATA(this_thr)
2011 : &(this_thr->th.ompt_thread_info.task_data);
2012 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2014 void *codeptr = NULL;
2015 if (KMP_MASTER_TID(ds_tid) &&
2016 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2017 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2018 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2019 if (ompt_enabled.ompt_callback_sync_region_wait) {
2020 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2021 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2024 if (ompt_enabled.ompt_callback_sync_region) {
2025 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2026 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2030 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2031 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2032 ompt_scope_end, NULL, task_data, 0, ds_tid,
2033 ompt_task_implicit);
2039 if (TCR_4(__kmp_global.g.g_done)) {
2040 this_thr->th.th_task_team = NULL;
2042 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2043 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2044 if (!KMP_MASTER_TID(tid)) {
2045 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2047 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2051 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2059 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2060 KMP_DEBUG_ASSERT(team != NULL);
2061 tid = __kmp_tid_from_gtid(gtid);
2063 #if KMP_BARRIER_ICV_PULL 2071 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2072 if (!KMP_MASTER_TID(tid)) {
2076 (
"__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2077 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2079 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2080 &team->t.t_threads[0]
2081 ->th.th_bar[bs_forkjoin_barrier]
2085 #endif // KMP_BARRIER_ICV_PULL 2087 if (__kmp_tasking_mode != tskm_immediate_exec) {
2088 __kmp_task_team_sync(this_thr, team);
2091 #if KMP_AFFINITY_SUPPORTED 2092 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2093 if (proc_bind == proc_bind_intel) {
2095 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2096 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
2099 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2100 KA_TRACE(100, (
"__kmp_fork_barrier: T#%d already in correct place %d\n",
2101 __kmp_gtid_from_thread(this_thr),
2102 this_thr->th.th_current_place));
2104 __kmp_affinity_set_place(gtid);
#endif // KMP_AFFINITY_SUPPORTED

  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
2117 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2118 this_thr->th.th_prev_level = team->t.t_level;
2121 if (!KMP_MASTER_TID(tid))
2122 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2124 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2125 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2126 if (!KMP_MASTER_TID(tid)) {
2128 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2129 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2133 ANNOTATE_BARRIER_END(&team->t.t_bar);
2134 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2135 team->t.t_id, tid));
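// __kmp_setup_icv_copy: stages the new ICVs for a (re)formed team. What is
// copied here depends on the propagation scheme compiled in: with
// KMP_BARRIER_ICV_PULL the ICVs are parked in the primary's th_fixed_icvs for
// workers to pull in the fork barrier, with KMP_BARRIER_ICV_PUSH nothing is
// copied here because the fork barrier pushes them, and otherwise they are
// copied linearly into every thread's implicit task now.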
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
2140 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2142 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2143 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
#if KMP_BARRIER_ICV_PULL
  // Stage the ICVs in the primary thread's fixed ICV copy; workers will pull
  // them from there in the fork barrier.
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Default: copy the ICVs into every non-primary thread's implicit task,
  // which takes O(nthreads) time here.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2172 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2173 f, team->t.t_threads[f], team));
2174 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2175 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2176 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2177 f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}