LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47 void distributedBarrier::computeVarsForN(size_t n) {
48  int nsockets = 1;
49  if (__kmp_topology) {
50  int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52  int ncores_per_socket =
53  __kmp_topology->calculate_ratio(core_level, socket_level);
54  nsockets = __kmp_topology->get_count(socket_level);
55 
56  if (nsockets <= 0)
57  nsockets = 1;
58  if (ncores_per_socket <= 0)
59  ncores_per_socket = 1;
60 
61  threads_per_go = ncores_per_socket >> 1;
62  if (!fix_threads_per_go) {
63  // Minimize num_gos
64  if (threads_per_go > 4) {
65  if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66  threads_per_go = threads_per_go >> 1;
67  }
68  if (threads_per_go > 4 && nsockets == 1)
69  threads_per_go = threads_per_go >> 1;
70  }
71  }
72  if (threads_per_go == 0)
73  threads_per_go = 1;
74  fix_threads_per_go = true;
75  num_gos = n / threads_per_go;
76  if (n % threads_per_go)
77  num_gos++;
78  if (nsockets == 1 || num_gos == 1)
79  num_groups = 1;
80  else {
81  num_groups = num_gos / nsockets;
82  if (num_gos % nsockets)
83  num_groups++;
84  }
85  if (num_groups <= 0)
86  num_groups = 1;
87  gos_per_group = num_gos / num_groups;
88  if (num_gos % num_groups)
89  gos_per_group++;
90  threads_per_group = threads_per_go * gos_per_group;
91  } else {
92  num_gos = n / threads_per_go;
93  if (n % threads_per_go)
94  num_gos++;
95  if (num_gos == 1)
96  num_groups = 1;
97  else {
98  num_groups = num_gos / 2;
99  if (num_gos % 2)
100  num_groups++;
101  }
102  gos_per_group = num_gos / num_groups;
103  if (num_gos % num_groups)
104  gos_per_group++;
105  threads_per_group = threads_per_go * gos_per_group;
106  }
107 }
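
// Illustrative worked example for the sizing above (hypothetical topology:
// 2 sockets with 16 cores each, and assuming KMP_OPTIMIZE_FOR_REDUCTIONS
// evaluates to 0): threads_per_go starts at 16 >> 1 = 8 and is not halved
// again because nsockets != 1. For n = 64 this yields num_gos = 64 / 8 = 8,
// num_groups = 8 / 2 = 4, gos_per_group = 8 / 4 = 2, and
// threads_per_group = 8 * 2 = 16, so tids 0, 16, 32, and 48 act as group
// leaders.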
108 
109 void distributedBarrier::computeGo(size_t n) {
110  // Minimize num_gos
111  for (num_gos = 1;; num_gos++)
112  if (IDEAL_CONTENTION * num_gos >= n)
113  break;
114  threads_per_go = n / num_gos;
115  if (n % num_gos)
116  threads_per_go++;
117  while (num_gos > MAX_GOS) {
118  threads_per_go++;
119  num_gos = n / threads_per_go;
120  if (n % threads_per_go)
121  num_gos++;
122  }
123  computeVarsForN(n);
124 }
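
// Illustrative trace of computeGo (IDEAL_CONTENTION and MAX_GOS are constants
// from kmp_barrier.h; the value 16 below is only an assumption): if
// IDEAL_CONTENTION were 16, then for n = 100 the first loop stops at
// num_gos = 7 (16 * 7 = 112 >= 100), giving threads_per_go = 100 / 7 rounded
// up to 15; computeVarsForN(100) then re-derives num_gos, num_groups, and
// threads_per_group from that threads_per_go.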
125 
126 // This function resizes the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays.
128 void distributedBarrier::resize(size_t nthr) {
129  KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131  // expand to requested size * 2
132  max_threads = nthr * 2;
133 
134  // allocate arrays to new max threads
135  for (int i = 0; i < MAX_ITERS; ++i) {
136  if (flags[i])
137  flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138  max_threads * sizeof(flags_s));
139  else
140  flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141  }
142 
143  if (go)
144  go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145  else
146  go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148  if (iter)
149  iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150  else
151  iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153  if (sleep)
154  sleep =
155  (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156  else
157  sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
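
// Example of the growth policy above: a team that grows from max_threads = 8
// to nthr = 9 reallocates every per-thread array to hold 2 * 9 = 18 entries,
// so further small increases do not immediately trigger another reallocation.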
159 
160 // This function sets all the go flags that threads might be waiting on;
161 // when blocktime is not infinite, it should be followed by a wake-up call
162 // to each thread.
163 kmp_uint64 distributedBarrier::go_release() {
164  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165  for (size_t j = 0; j < num_gos; j++) {
166  go[j].go.store(next_go);
167  }
168  return next_go;
169 }
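
// Minimal usage sketch (hypothetical caller; bt, team, and tid stand for the
// caller's own context): with a finite blocktime, releasing every thread that
// might be parked on a go flag looks roughly like
//   kmp_uint64 next_go = team->t.b->go_release();
//   (void)next_go; // callers typically need only the side effect
//   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)
//     __kmp_dist_barrier_wakeup(bt, team, 1, team->t.b->num_threads, 1, tid);
// so that workers suspended on their sleep flag are woken and observe the new
// go value.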
170 
171 void distributedBarrier::go_reset() {
172  for (size_t j = 0; j < max_threads; ++j) {
173  for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174  flags[i][j].stillNeed = 1;
175  }
176  go[j].go.store(0);
177  iter[j].iter = 0;
178  }
179 }
180 
181 // This function initializes/re-initializes the distributed barrier for a given
182 // number of threads. If the arrays need to grow, it calls the resize function.
183 void distributedBarrier::init(size_t nthr) {
184  size_t old_max = max_threads;
185  if (nthr > max_threads) { // need more space in arrays
186  resize(nthr);
187  }
188 
189  for (size_t i = 0; i < max_threads; i++) {
190  for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191  flags[j][i].stillNeed = 1;
192  }
193  go[i].go.store(0);
194  iter[i].iter = 0;
195  if (i >= old_max)
196  sleep[i].sleep = false;
197  }
198 
199  // Recalculate num_gos, etc. based on new nthr
200  computeVarsForN(nthr);
201 
202  num_threads = nthr;
203 
204  if (team_icvs == NULL)
205  team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207 
208 // This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211  size_t start, size_t stop, size_t inc,
212  size_t tid) {
213  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215  return;
216 
217  kmp_info_t **other_threads = team->t.t_threads;
218  for (size_t thr = start; thr < stop; thr += inc) {
219  KMP_DEBUG_ASSERT(other_threads[thr]);
220  int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221  // Wake up the worker regardless of whether it appears to be sleeping
222  __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223  }
224 }
225 
226 static void __kmp_dist_barrier_gather(
227  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230  kmp_team_t *team;
231  distributedBarrier *b;
232  kmp_info_t **other_threads;
233  kmp_uint64 my_current_iter, my_next_iter;
234  kmp_uint32 nproc;
235  bool group_leader;
236 
237  team = this_thr->th.th_team;
238  nproc = this_thr->th.th_team_nproc;
239  other_threads = team->t.t_threads;
240  b = team->t.b;
241  my_current_iter = b->iter[tid].iter;
242  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243  group_leader = ((tid % b->threads_per_group) == 0);
244 
245  KA_TRACE(20,
246  ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247  gtid, team->t.t_id, tid, bt));
248 
249 #if USE_ITT_BUILD && USE_ITT_NOTIFY
250  // Barrier imbalance - save arrive time to the thread
251  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253  __itt_get_timestamp();
254  }
255 #endif
256 
257  if (group_leader) {
258  // Start from the thread after the group leader
259  size_t group_start = tid + 1;
260  size_t group_end = tid + b->threads_per_group;
261  size_t threads_pending = 0;
262 
263  if (group_end > nproc)
264  group_end = nproc;
265  do { // wait for threads in my group
266  threads_pending = 0;
267  // Check all the flags every time to avoid branch mispredicts
268  for (size_t thr = group_start; thr < group_end; thr++) {
269  // Each thread uses a different cache line
270  threads_pending += b->flags[my_current_iter][thr].stillNeed;
271  }
272  // Execute tasks here
273  if (__kmp_tasking_mode != tskm_immediate_exec) {
274  kmp_task_team_t *task_team = this_thr->th.th_task_team;
275  if (task_team != NULL) {
276  if (TCR_SYNC_4(task_team->tt.tt_active)) {
277  if (KMP_TASKING_ENABLED(task_team)) {
278  int tasks_completed = FALSE;
279  __kmp_atomic_execute_tasks_64(
280  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282  } else
283  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284  }
285  } else {
286  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287  } // if
288  }
289  if (TCR_4(__kmp_global.g.g_done)) {
290  if (__kmp_global.g.g_abort)
291  __kmp_abort_thread();
292  break;
293  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
294  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296  }
297  } while (threads_pending > 0);
298 
299  if (reduce) { // Perform reduction if needed
300  OMPT_REDUCTION_DECL(this_thr, gtid);
301  OMPT_REDUCTION_BEGIN;
302  // Group leader reduces all threads in group
303  for (size_t thr = group_start; thr < group_end; thr++) {
304  (*reduce)(this_thr->th.th_local.reduce_data,
305  other_threads[thr]->th.th_local.reduce_data);
306  }
307  OMPT_REDUCTION_END;
308  }
309 
310  // Set flag for next iteration
311  b->flags[my_next_iter][tid].stillNeed = 1;
312  // Each thread uses a different cache line; the thread resets stillNeed to 0
313  // to indicate that it has reached the barrier
314  b->flags[my_current_iter][tid].stillNeed = 0;
315 
316  do { // wait for all group leaders
317  threads_pending = 0;
318  for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319  threads_pending += b->flags[my_current_iter][thr].stillNeed;
320  }
321  // Execute tasks here
322  if (__kmp_tasking_mode != tskm_immediate_exec) {
323  kmp_task_team_t *task_team = this_thr->th.th_task_team;
324  if (task_team != NULL) {
325  if (TCR_SYNC_4(task_team->tt.tt_active)) {
326  if (KMP_TASKING_ENABLED(task_team)) {
327  int tasks_completed = FALSE;
328  __kmp_atomic_execute_tasks_64(
329  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331  } else
332  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333  }
334  } else {
335  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336  } // if
337  }
338  if (TCR_4(__kmp_global.g.g_done)) {
339  if (__kmp_global.g.g_abort)
340  __kmp_abort_thread();
341  break;
342  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
343  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345  }
346  } while (threads_pending > 0);
347 
348  if (reduce) { // Perform reduction if needed
349  if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350  OMPT_REDUCTION_DECL(this_thr, gtid);
351  OMPT_REDUCTION_BEGIN;
352  for (size_t thr = b->threads_per_group; thr < nproc;
353  thr += b->threads_per_group) {
354  (*reduce)(this_thr->th.th_local.reduce_data,
355  other_threads[thr]->th.th_local.reduce_data);
356  }
357  OMPT_REDUCTION_END;
358  }
359  }
360  } else {
361  // Set flag for next iteration
362  b->flags[my_next_iter][tid].stillNeed = 1;
363  // Each thread uses a different cache line; resets stillNeed to 0 to
364  // indicate it has reached the barrier
365  b->flags[my_current_iter][tid].stillNeed = 0;
366  }
367 
368  KMP_MFENCE();
369 
370  KA_TRACE(20,
371  ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372  gtid, team->t.t_id, tid, bt));
373 }
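
// Illustrative shape of the two-phase gather above (hypothetical sizing:
// threads_per_go = 4, gos_per_group = 4, hence threads_per_group = 16, and
// nproc = 64): tid 21 is not a group leader (21 % 16 != 0), so it only clears
// flags[iter][21].stillNeed; tid 16 is a group leader, so it first polls the
// flags of tids 17..31, then polls the other leaders' flags (tids 0, 16, 32,
// 48) until every group has checked in; tid 0 additionally reduces over the
// group leaders when a reduce callback is supplied.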
374 
375 static void __kmp_dist_barrier_release(
376  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379  kmp_team_t *team;
380  distributedBarrier *b;
381  kmp_bstate_t *thr_bar;
382  kmp_uint64 my_current_iter, next_go;
383  size_t my_go_index;
384  bool group_leader;
385 
386  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387  gtid, tid, bt));
388 
389  thr_bar = &this_thr->th.th_bar[bt].bb;
390 
391  if (!KMP_MASTER_TID(tid)) {
392  // workers and non-master group leaders need to check their presence in team
393  do {
394  if (this_thr->th.th_used_in_team.load() != 1 &&
395  this_thr->th.th_used_in_team.load() != 3) {
396  // Thread is not in use in a team. Wait on location in tid's thread
397  // struct. The 0 value tells anyone looking that this thread is spinning
398  // or sleeping until this location becomes 3 again; 3 is the transition
399  // state to get to 1 which is waiting on go and being in the team
400  kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401  if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402  0) ||
403  this_thr->th.th_used_in_team.load() == 0) {
404  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405  }
406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
407  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408  // In fork barrier where we could not get the object reliably
409  itt_sync_obj =
410  __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411  // Cancel wait on previous parallel region...
412  __kmp_itt_task_starting(itt_sync_obj);
413 
414  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415  return;
416 
417  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418  if (itt_sync_obj != NULL)
419  // Call prepare as early as possible for "new" barrier
420  __kmp_itt_task_finished(itt_sync_obj);
421  } else
422 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424  return;
425  }
426  if (this_thr->th.th_used_in_team.load() != 1 &&
427  this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428  continue;
429  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430  return;
431 
432  // At this point, the thread thinks it is in use in a team, or in
433  // transition to be used in a team, but it might have reached this barrier
434  // before it was marked unused by the team. Unused threads are awoken and
435  // shifted to wait on local thread struct elsewhere. It also might reach
436  // this point by being picked up for use by a different team. Either way,
437  // we need to update the tid.
438  tid = __kmp_tid_from_gtid(gtid);
439  team = this_thr->th.th_team;
440  KMP_DEBUG_ASSERT(tid >= 0);
441  KMP_DEBUG_ASSERT(team);
442  b = team->t.b;
443  my_current_iter = b->iter[tid].iter;
444  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445  my_go_index = tid / b->threads_per_go;
446  if (this_thr->th.th_used_in_team.load() == 3) {
447  (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
448  1);
449  }
450  // Check if go flag is set
451  if (b->go[my_go_index].go.load() != next_go) {
452  // Wait on go flag on team
453  kmp_atomic_flag_64<false, true> my_flag(
454  &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
455  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
456  KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
457  b->iter[tid].iter == 0);
458  KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
459  }
460 
461  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
462  return;
463  // At this point, the thread's go location was set. This means the primary
464  // thread is safely in the barrier, and so this thread's data is
465  // up-to-date, but we should check again that this thread is really in
466  // use in the team, as it could have been woken up for the purpose of
467  // changing team size, or reaping threads at shutdown.
468  if (this_thr->th.th_used_in_team.load() == 1)
469  break;
470  } while (1);
471 
472  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
473  return;
474 
475  group_leader = ((tid % b->threads_per_group) == 0);
476  if (group_leader) {
477  // Tell all the threads in my group they can go!
478  for (size_t go_idx = my_go_index + 1;
479  go_idx < my_go_index + b->gos_per_group; go_idx++) {
480  b->go[go_idx].go.store(next_go);
481  }
482  // Fence added so that workers can see changes to go. sfence inadequate.
483  KMP_MFENCE();
484  }
485 
486 #if KMP_BARRIER_ICV_PUSH
487  if (propagate_icvs) { // copy ICVs to final dest
488  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
489  tid, FALSE);
490  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
491  (kmp_internal_control_t *)team->t.b->team_icvs);
492  copy_icvs(&thr_bar->th_fixed_icvs,
493  &team->t.t_implicit_task_taskdata[tid].td_icvs);
494  }
495 #endif
496  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
497  // This thread is now awake and participating in the barrier;
498  // wake up the other threads in the group
499  size_t nproc = this_thr->th.th_team_nproc;
500  size_t group_end = tid + b->threads_per_group;
501  if (nproc < group_end)
502  group_end = nproc;
503  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
504  }
505  } else { // Primary thread
506  team = this_thr->th.th_team;
507  b = team->t.b;
508  my_current_iter = b->iter[tid].iter;
509  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
510 #if KMP_BARRIER_ICV_PUSH
511  if (propagate_icvs) {
512  // primary thread has ICVs in final destination; copy
513  copy_icvs(&thr_bar->th_fixed_icvs,
514  &team->t.t_implicit_task_taskdata[tid].td_icvs);
515  }
516 #endif
517  // Tell all the group leaders they can go!
518  for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
519  b->go[go_idx].go.store(next_go);
520  }
521 
522  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
523  // Wake-up the group leaders
524  size_t nproc = this_thr->th.th_team_nproc;
525  __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
526  b->threads_per_group, tid);
527  }
528 
529  // Tell all the threads in my group they can go!
530  for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
531  b->go[go_idx].go.store(next_go);
532  }
533 
534  // Fence added so that workers can see changes to go. sfence inadequate.
535  KMP_MFENCE();
536 
537  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
538  // Wake-up the other threads in my group
539  size_t nproc = this_thr->th.th_team_nproc;
540  size_t group_end = tid + b->threads_per_group;
541  if (nproc < group_end)
542  group_end = nproc;
543  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
544  }
545  }
546  // Update to next iteration
547  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
548  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
549 
550  KA_TRACE(
551  20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
552  gtid, team->t.t_id, tid, bt));
553 }
554 
555 // Linear Barrier
556 template <bool cancellable = false>
557 static bool __kmp_linear_barrier_gather_template(
558  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
559  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
560  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
561  kmp_team_t *team = this_thr->th.th_team;
562  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
563  kmp_info_t **other_threads = team->t.t_threads;
564 
565  KA_TRACE(
566  20,
567  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
568  gtid, team->t.t_id, tid, bt));
569  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
570 
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572  // Barrier imbalance - save arrive time to the thread
573  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
574  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
575  __itt_get_timestamp();
576  }
577 #endif
578  // We now perform a linear reduction to signal that all of the threads have
579  // arrived.
580  if (!KMP_MASTER_TID(tid)) {
581  KA_TRACE(20,
582  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
583  "arrived(%p): %llu => %llu\n",
584  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
585  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
586  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
587  // Mark arrival to primary thread
588  /* After performing this write, a worker thread may not assume that the team
589  is valid any more - it could be deallocated by the primary thread at any
590  time. */
591  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
592  flag.release();
593  } else {
594  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
595  int nproc = this_thr->th.th_team_nproc;
596  int i;
597  // Don't have to worry about sleep bit here or atomic since team setting
598  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
599 
600  // Collect all the worker team member threads.
601  for (i = 1; i < nproc; ++i) {
602 #if KMP_CACHE_MANAGE
603  // Prefetch next thread's arrived count
604  if (i + 1 < nproc)
605  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
606 #endif /* KMP_CACHE_MANAGE */
607  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
608  "arrived(%p) == %llu\n",
609  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
610  team->t.t_id, i,
611  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
612 
613  // Wait for worker thread to arrive
614  if (cancellable) {
615  kmp_flag_64<true, false> flag(
616  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
617  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
618  return true;
619  } else {
620  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
621  new_state);
622  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
623  }
624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
625  // Barrier imbalance - write min of the thread time and the other thread
626  // time to the thread.
627  if (__kmp_forkjoin_frames_mode == 2) {
628  this_thr->th.th_bar_min_time = KMP_MIN(
629  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
630  }
631 #endif
632  if (reduce) {
633  KA_TRACE(100,
634  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
635  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
636  team->t.t_id, i));
637  OMPT_REDUCTION_DECL(this_thr, gtid);
638  OMPT_REDUCTION_BEGIN;
639  (*reduce)(this_thr->th.th_local.reduce_data,
640  other_threads[i]->th.th_local.reduce_data);
641  OMPT_REDUCTION_END;
642  }
643  }
644  // Don't have to worry about sleep bit here or atomic since team setting
645  team_bar->b_arrived = new_state;
646  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
647  "arrived(%p) = %llu\n",
648  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
649  new_state));
650  }
651  KA_TRACE(
652  20,
653  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
654  gtid, team->t.t_id, tid, bt));
655  return false;
656 }
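
// Illustrative flow of the linear gather above (hypothetical nproc = 4):
// workers 1..3 each bump their own b_arrived by KMP_BARRIER_STATE_BUMP and may
// no longer touch the team; the primary thread waits on each worker's
// b_arrived in turn (folding in reduce_data if a reduce callback was given)
// and finally records the new state in the team's b_arrived so the next
// barrier round compares against the bumped value.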
657 
658 template <bool cancellable = false>
659 static bool __kmp_linear_barrier_release_template(
660  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
661  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
662  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
663  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
664  kmp_team_t *team;
665 
666  if (KMP_MASTER_TID(tid)) {
667  unsigned int i;
668  kmp_uint32 nproc = this_thr->th.th_team_nproc;
669  kmp_info_t **other_threads;
670 
671  team = __kmp_threads[gtid]->th.th_team;
672  KMP_DEBUG_ASSERT(team != NULL);
673  other_threads = team->t.t_threads;
674 
675  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
676  "barrier type %d\n",
677  gtid, team->t.t_id, tid, bt));
678 
679  if (nproc > 1) {
680 #if KMP_BARRIER_ICV_PUSH
681  {
682  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
683  if (propagate_icvs) {
684  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
685  for (i = 1; i < nproc; ++i) {
686  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
687  team, i, FALSE);
688  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
689  &team->t.t_implicit_task_taskdata[0].td_icvs);
690  }
691  ngo_sync();
692  }
693  }
694 #endif // KMP_BARRIER_ICV_PUSH
695 
696  // Now, release all of the worker threads
697  for (i = 1; i < nproc; ++i) {
698 #if KMP_CACHE_MANAGE
699  // Prefetch next thread's go flag
700  if (i + 1 < nproc)
701  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
702 #endif /* KMP_CACHE_MANAGE */
703  KA_TRACE(
704  20,
705  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
706  "go(%p): %u => %u\n",
707  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
708  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
709  other_threads[i]->th.th_bar[bt].bb.b_go,
710  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
711  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
712  other_threads[i]);
713  flag.release();
714  }
715  }
716  } else { // Wait for the PRIMARY thread to release us
717  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
718  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
719  if (cancellable) {
720  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
721  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
722  return true;
723  } else {
724  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
725  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
726  }
727 #if USE_ITT_BUILD && USE_ITT_NOTIFY
728  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
729  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
730  // disabled)
731  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
732  // Cancel wait on previous parallel region...
733  __kmp_itt_task_starting(itt_sync_obj);
734 
735  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
736  return false;
737 
738  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
739  if (itt_sync_obj != NULL)
740  // Call prepare as early as possible for "new" barrier
741  __kmp_itt_task_finished(itt_sync_obj);
742  } else
743 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
744  // Early exit for reaping threads releasing forkjoin barrier
745  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
746  return false;
747 // The worker thread may now assume that the team is valid.
748 #ifdef KMP_DEBUG
749  tid = __kmp_tid_from_gtid(gtid);
750  team = __kmp_threads[gtid]->th.th_team;
751 #endif
752  KMP_DEBUG_ASSERT(team != NULL);
753  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
754  KA_TRACE(20,
755  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
756  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
757  KMP_MB(); // Flush all pending memory write invalidates.
758  }
759  KA_TRACE(
760  20,
761  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
762  gtid, team->t.t_id, tid, bt));
763  return false;
764 }
765 
766 static void __kmp_linear_barrier_gather(
767  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
768  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
769  __kmp_linear_barrier_gather_template<false>(
770  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
771 }
772 
773 static bool __kmp_linear_barrier_gather_cancellable(
774  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
775  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
776  return __kmp_linear_barrier_gather_template<true>(
777  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
778 }
779 
780 static void __kmp_linear_barrier_release(
781  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
782  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
783  __kmp_linear_barrier_release_template<false>(
784  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
785 }
786 
787 static bool __kmp_linear_barrier_release_cancellable(
788  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
789  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
790  return __kmp_linear_barrier_release_template<true>(
791  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
792 }
793 
794 // Tree barrier
795 static void __kmp_tree_barrier_gather(
796  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
797  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
798  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
799  kmp_team_t *team = this_thr->th.th_team;
800  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
801  kmp_info_t **other_threads = team->t.t_threads;
802  kmp_uint32 nproc = this_thr->th.th_team_nproc;
803  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
804  kmp_uint32 branch_factor = 1 << branch_bits;
805  kmp_uint32 child;
806  kmp_uint32 child_tid;
807  kmp_uint64 new_state = 0;
808 
809  KA_TRACE(
810  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
811  gtid, team->t.t_id, tid, bt));
812  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
813 
814 #if USE_ITT_BUILD && USE_ITT_NOTIFY
815  // Barrier imbalance - save arrive time to the thread
816  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
817  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
818  __itt_get_timestamp();
819  }
820 #endif
821  // Perform tree gather to wait until all threads have arrived; reduce any
822  // required data as we go
823  child_tid = (tid << branch_bits) + 1;
824  if (child_tid < nproc) {
825  // Parent threads wait for all their children to arrive
826  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
827  child = 1;
828  do {
829  kmp_info_t *child_thr = other_threads[child_tid];
830  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
831 #if KMP_CACHE_MANAGE
832  // Prefetch next thread's arrived count
833  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
834  KMP_CACHE_PREFETCH(
835  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
836 #endif /* KMP_CACHE_MANAGE */
837  KA_TRACE(20,
838  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
839  "arrived(%p) == %llu\n",
840  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
841  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
842  // Wait for child to arrive
843  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
844  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
845 #if USE_ITT_BUILD && USE_ITT_NOTIFY
846  // Barrier imbalance - write min of the thread time and a child time to
847  // the thread.
848  if (__kmp_forkjoin_frames_mode == 2) {
849  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
850  child_thr->th.th_bar_min_time);
851  }
852 #endif
853  if (reduce) {
854  KA_TRACE(100,
855  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
856  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
857  team->t.t_id, child_tid));
858  OMPT_REDUCTION_DECL(this_thr, gtid);
859  OMPT_REDUCTION_BEGIN;
860  (*reduce)(this_thr->th.th_local.reduce_data,
861  child_thr->th.th_local.reduce_data);
862  OMPT_REDUCTION_END;
863  }
864  child++;
865  child_tid++;
866  } while (child <= branch_factor && child_tid < nproc);
867  }
868 
869  if (!KMP_MASTER_TID(tid)) { // Worker threads
870  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
871 
872  KA_TRACE(20,
873  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
874  "arrived(%p): %llu => %llu\n",
875  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
876  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
877  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
878 
879  // Mark arrival to parent thread
880  /* After performing this write, a worker thread may not assume that the team
881  is valid any more - it could be deallocated by the primary thread at any
882  time. */
883  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
884  flag.release();
885  } else {
886  // Need to update the team arrived pointer if we are the primary thread
887  if (nproc > 1) // New value was already computed above
888  team->t.t_bar[bt].b_arrived = new_state;
889  else
890  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
891  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
892  "arrived(%p) = %llu\n",
893  gtid, team->t.t_id, tid, team->t.t_id,
894  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
895  }
896  KA_TRACE(20,
897  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
898  gtid, team->t.t_id, tid, bt));
899 }
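
// Worked index example for the tree gather above (hypothetical branch_bits = 2,
// i.e. branch_factor = 4, nproc = 16): tid 0 waits on children 1..4, tid 1 on
// 5..8, tid 2 on 9..12, and tid 3 on 13..15; a worker computes its parent as
// (tid - 1) >> 2, so e.g. tid 7 signals parent tid 1, which in turn signals
// tid 0.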
900 
901 static void __kmp_tree_barrier_release(
902  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
903  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
904  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
905  kmp_team_t *team;
906  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
907  kmp_uint32 nproc;
908  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
909  kmp_uint32 branch_factor = 1 << branch_bits;
910  kmp_uint32 child;
911  kmp_uint32 child_tid;
912 
913  // Perform a tree release for all of the threads that have been gathered
914  if (!KMP_MASTER_TID(
915  tid)) { // Handle fork barrier workers who aren't part of a team yet
916  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
917  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
918  // Wait for parent thread to release us
919  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
920  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
921 #if USE_ITT_BUILD && USE_ITT_NOTIFY
922  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
923  // In fork barrier where we could not get the object reliably (or
924  // ITTNOTIFY is disabled)
925  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
926  // Cancel wait on previous parallel region...
927  __kmp_itt_task_starting(itt_sync_obj);
928 
929  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
930  return;
931 
932  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
933  if (itt_sync_obj != NULL)
934  // Call prepare as early as possible for "new" barrier
935  __kmp_itt_task_finished(itt_sync_obj);
936  } else
937 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
938  // Early exit for reaping threads releasing forkjoin barrier
939  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
940  return;
941 
942  // The worker thread may now assume that the team is valid.
943  team = __kmp_threads[gtid]->th.th_team;
944  KMP_DEBUG_ASSERT(team != NULL);
945  tid = __kmp_tid_from_gtid(gtid);
946 
947  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
948  KA_TRACE(20,
949  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
950  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
951  KMP_MB(); // Flush all pending memory write invalidates.
952  } else {
953  team = __kmp_threads[gtid]->th.th_team;
954  KMP_DEBUG_ASSERT(team != NULL);
955  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
956  "barrier type %d\n",
957  gtid, team->t.t_id, tid, bt));
958  }
959  nproc = this_thr->th.th_team_nproc;
960  child_tid = (tid << branch_bits) + 1;
961 
962  if (child_tid < nproc) {
963  kmp_info_t **other_threads = team->t.t_threads;
964  child = 1;
965  // Parent threads release all their children
966  do {
967  kmp_info_t *child_thr = other_threads[child_tid];
968  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
969 #if KMP_CACHE_MANAGE
970  // Prefetch next thread's go count
971  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
972  KMP_CACHE_PREFETCH(
973  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
974 #endif /* KMP_CACHE_MANAGE */
975 
976 #if KMP_BARRIER_ICV_PUSH
977  {
978  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
979  if (propagate_icvs) {
980  __kmp_init_implicit_task(team->t.t_ident,
981  team->t.t_threads[child_tid], team,
982  child_tid, FALSE);
983  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
984  &team->t.t_implicit_task_taskdata[0].td_icvs);
985  }
986  }
987 #endif // KMP_BARRIER_ICV_PUSH
988  KA_TRACE(20,
989  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
990  "go(%p): %u => %u\n",
991  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
992  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
993  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
994  // Release child from barrier
995  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
996  flag.release();
997  child++;
998  child_tid++;
999  } while (child <= branch_factor && child_tid < nproc);
1000  }
1001  KA_TRACE(
1002  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1003  gtid, team->t.t_id, tid, bt));
1004 }
1005 
1006 // Hyper Barrier
1007 static void __kmp_hyper_barrier_gather(
1008  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1009  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1010  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1011  kmp_team_t *team = this_thr->th.th_team;
1012  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1013  kmp_info_t **other_threads = team->t.t_threads;
1014  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1015  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1016  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1017  kmp_uint32 branch_factor = 1 << branch_bits;
1018  kmp_uint32 offset;
1019  kmp_uint32 level;
1020 
1021  KA_TRACE(
1022  20,
1023  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1024  gtid, team->t.t_id, tid, bt));
1025  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1026 
1027 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1028  // Barrier imbalance - save arrive time to the thread
1029  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1030  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1031  __itt_get_timestamp();
1032  }
1033 #endif
1034  /* Perform a hypercube-embedded tree gather to wait until all of the threads
1035  have arrived, and reduce any required data as we go. */
1036  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1037  for (level = 0, offset = 1; offset < num_threads;
1038  level += branch_bits, offset <<= branch_bits) {
1039  kmp_uint32 child;
1040  kmp_uint32 child_tid;
1041 
1042  if (((tid >> level) & (branch_factor - 1)) != 0) {
1043  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1044 
1045  KMP_MB(); // Synchronize parent and child threads.
1046  KA_TRACE(20,
1047  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1048  "arrived(%p): %llu => %llu\n",
1049  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1050  team->t.t_id, parent_tid, &thr_bar->b_arrived,
1051  thr_bar->b_arrived,
1052  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1053  // Mark arrival to parent thread
1054  /* After performing this write (in the last iteration of the enclosing for
1055  loop), a worker thread may not assume that the team is valid any more
1056  - it could be deallocated by the primary thread at any time. */
1057  p_flag.set_waiter(other_threads[parent_tid]);
1058  p_flag.release();
1059  break;
1060  }
1061 
1062  // Parent threads wait for children to arrive
1063  if (new_state == KMP_BARRIER_UNUSED_STATE)
1064  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1065  for (child = 1, child_tid = tid + (1 << level);
1066  child < branch_factor && child_tid < num_threads;
1067  child++, child_tid += (1 << level)) {
1068  kmp_info_t *child_thr = other_threads[child_tid];
1069  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1070 #if KMP_CACHE_MANAGE
1071  kmp_uint32 next_child_tid = child_tid + (1 << level);
1072  // Prefetch next thread's arrived count
1073  if (child + 1 < branch_factor && next_child_tid < num_threads)
1074  KMP_CACHE_PREFETCH(
1075  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1076 #endif /* KMP_CACHE_MANAGE */
1077  KA_TRACE(20,
1078  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1079  "arrived(%p) == %llu\n",
1080  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1081  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1082  // Wait for child to arrive
1083  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1084  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1085  KMP_MB(); // Synchronize parent and child threads.
1086 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1087  // Barrier imbalance - write min of the thread time and a child time to
1088  // the thread.
1089  if (__kmp_forkjoin_frames_mode == 2) {
1090  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1091  child_thr->th.th_bar_min_time);
1092  }
1093 #endif
1094  if (reduce) {
1095  KA_TRACE(100,
1096  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1097  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1098  team->t.t_id, child_tid));
1099  OMPT_REDUCTION_DECL(this_thr, gtid);
1100  OMPT_REDUCTION_BEGIN;
1101  (*reduce)(this_thr->th.th_local.reduce_data,
1102  child_thr->th.th_local.reduce_data);
1103  OMPT_REDUCTION_END;
1104  }
1105  }
1106  }
1107 
1108  if (KMP_MASTER_TID(tid)) {
1109  // Need to update the team arrived pointer if we are the primary thread
1110  if (new_state == KMP_BARRIER_UNUSED_STATE)
1111  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1112  else
1113  team->t.t_bar[bt].b_arrived = new_state;
1114  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1115  "arrived(%p) = %llu\n",
1116  gtid, team->t.t_id, tid, team->t.t_id,
1117  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1118  }
1119  KA_TRACE(
1120  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1121  gtid, team->t.t_id, tid, bt));
1122 }
1123 
1124 // The reverse versions seem to beat the forward versions overall
1125 #define KMP_REVERSE_HYPER_BAR
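
// Illustrative hypercube pattern (hypothetical branch_bits = 1, i.e.
// branch_factor = 2, with 8 threads): gather signals travel along hypercube
// edges 1->0, 3->2, 5->4, 7->6 at level 0, then 2->0 and 6->4 at level 1, and
// finally 4->0 at level 2. With KMP_REVERSE_HYPER_BAR defined, the release
// walks each subtree top-down in reverse child order: tid 0 releases 4, then
// 2, then 1; tid 4 releases 6, then 5; tid 2 releases 3; tid 6 releases 7.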
1126 static void __kmp_hyper_barrier_release(
1127  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1128  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1129  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1130  kmp_team_t *team;
1131  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1132  kmp_info_t **other_threads;
1133  kmp_uint32 num_threads;
1134  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1135  kmp_uint32 branch_factor = 1 << branch_bits;
1136  kmp_uint32 child;
1137  kmp_uint32 child_tid;
1138  kmp_uint32 offset;
1139  kmp_uint32 level;
1140 
1141  /* Perform a hypercube-embedded tree release for all of the threads that have
1142  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1143  are released in the reverse order of the corresponding gather, otherwise
1144  threads are released in the same order. */
1145  if (KMP_MASTER_TID(tid)) { // primary thread
1146  team = __kmp_threads[gtid]->th.th_team;
1147  KMP_DEBUG_ASSERT(team != NULL);
1148  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1149  "barrier type %d\n",
1150  gtid, team->t.t_id, tid, bt));
1151 #if KMP_BARRIER_ICV_PUSH
1152  if (propagate_icvs) { // primary already has ICVs in final destination; copy
1153  copy_icvs(&thr_bar->th_fixed_icvs,
1154  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1155  }
1156 #endif
1157  } else { // Handle fork barrier workers who aren't part of a team yet
1158  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1159  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1160  // Wait for parent thread to release us
1161  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1162  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1163 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1164  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1165  // In fork barrier where we could not get the object reliably
1166  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1167  // Cancel wait on previous parallel region...
1168  __kmp_itt_task_starting(itt_sync_obj);
1169 
1170  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1171  return;
1172 
1173  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1174  if (itt_sync_obj != NULL)
1175  // Call prepare as early as possible for "new" barrier
1176  __kmp_itt_task_finished(itt_sync_obj);
1177  } else
1178 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1179  // Early exit for reaping threads releasing forkjoin barrier
1180  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1181  return;
1182 
1183  // The worker thread may now assume that the team is valid.
1184  team = __kmp_threads[gtid]->th.th_team;
1185  KMP_DEBUG_ASSERT(team != NULL);
1186  tid = __kmp_tid_from_gtid(gtid);
1187 
1188  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1189  KA_TRACE(20,
1190  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1191  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1192  KMP_MB(); // Flush all pending memory write invalidates.
1193  }
1194  num_threads = this_thr->th.th_team_nproc;
1195  other_threads = team->t.t_threads;
1196 
1197 #ifdef KMP_REVERSE_HYPER_BAR
1198  // Count up to correct level for parent
1199  for (level = 0, offset = 1;
1200  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1201  level += branch_bits, offset <<= branch_bits)
1202  ;
1203 
1204  // Now go down from there
1205  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1206  level -= branch_bits, offset >>= branch_bits)
1207 #else
1208  // Go down the tree, level by level
1209  for (level = 0, offset = 1; offset < num_threads;
1210  level += branch_bits, offset <<= branch_bits)
1211 #endif // KMP_REVERSE_HYPER_BAR
1212  {
1213 #ifdef KMP_REVERSE_HYPER_BAR
1214  /* Now go in reverse order through the children, highest to lowest.
1215  Initial setting of child is conservative here. */
1216  child = num_threads >> ((level == 0) ? level : level - 1);
1217  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1218  child_tid = tid + (child << level);
1219  child >= 1; child--, child_tid -= (1 << level))
1220 #else
1221  if (((tid >> level) & (branch_factor - 1)) != 0)
1222  // No need to go lower than this, since this is the level parent would be
1223  // notified
1224  break;
1225  // Iterate through children on this level of the tree
1226  for (child = 1, child_tid = tid + (1 << level);
1227  child < branch_factor && child_tid < num_threads;
1228  child++, child_tid += (1 << level))
1229 #endif // KMP_REVERSE_HYPER_BAR
1230  {
1231  if (child_tid >= num_threads)
1232  continue; // Child doesn't exist so keep going
1233  else {
1234  kmp_info_t *child_thr = other_threads[child_tid];
1235  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1236 #if KMP_CACHE_MANAGE
1237  kmp_uint32 next_child_tid = child_tid - (1 << level);
1238 // Prefetch next thread's go count
1239 #ifdef KMP_REVERSE_HYPER_BAR
1240  if (child - 1 >= 1 && next_child_tid < num_threads)
1241 #else
1242  if (child + 1 < branch_factor && next_child_tid < num_threads)
1243 #endif // KMP_REVERSE_HYPER_BAR
1244  KMP_CACHE_PREFETCH(
1245  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1246 #endif /* KMP_CACHE_MANAGE */
1247 
1248 #if KMP_BARRIER_ICV_PUSH
1249  if (propagate_icvs) // push my fixed ICVs to my child
1250  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1251 #endif // KMP_BARRIER_ICV_PUSH
1252 
1253  KA_TRACE(
1254  20,
1255  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1256  "go(%p): %u => %u\n",
1257  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1258  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1259  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1260  // Release child from barrier
1261  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1262  flag.release();
1263  }
1264  }
1265  }
1266 #if KMP_BARRIER_ICV_PUSH
1267  if (propagate_icvs &&
1268  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1269  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1270  FALSE);
1271  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1272  &thr_bar->th_fixed_icvs);
1273  }
1274 #endif
1275  KA_TRACE(
1276  20,
1277  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1278  gtid, team->t.t_id, tid, bt));
1279 }
1280 
1281 // Hierarchical Barrier
1282 
1283 // Initialize thread barrier data
1284 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1285  Performs the minimum amount of initialization required based on how the team
1286  has changed. Returns true if leaf children will require both on-core and
1287  traditional wake-up mechanisms. For example, if the team size increases,
1288  threads already in the team will respond to on-core wakeup on their parent
1289  thread, but threads newly added to the team will only be listening on
1290  their local b_go. */
1291 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1292  kmp_bstate_t *thr_bar,
1293  kmp_uint32 nproc, int gtid,
1294  int tid, kmp_team_t *team) {
1295  // Checks to determine if (re-)initialization is needed
1296  bool uninitialized = thr_bar->team == NULL;
1297  bool team_changed = team != thr_bar->team;
1298  bool team_sz_changed = nproc != thr_bar->nproc;
1299  bool tid_changed = tid != thr_bar->old_tid;
1300  bool retval = false;
1301 
1302  if (uninitialized || team_sz_changed) {
1303  __kmp_get_hierarchy(nproc, thr_bar);
1304  }
1305 
1306  if (uninitialized || team_sz_changed || tid_changed) {
1307  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1308  thr_bar->parent_tid = -1; // default for primary thread
1309  if (!KMP_MASTER_TID(tid)) {
1310  // if not primary thread, find parent thread in hierarchy
1311  kmp_uint32 d = 0;
1312  while (d < thr_bar->depth) { // find parent based on level of thread in
1313  // hierarchy, and note level
1314  kmp_uint32 rem;
1315  if (d == thr_bar->depth - 2) { // reached level right below the primary
1316  thr_bar->parent_tid = 0;
1317  thr_bar->my_level = d;
1318  break;
1319  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1320  // TODO: can we make the above op faster?
1321  // thread is not a subtree root at next level, so this is max
1322  thr_bar->parent_tid = tid - rem;
1323  thr_bar->my_level = d;
1324  break;
1325  }
1326  ++d;
1327  }
1328  }
1329  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1330  (thr_bar->skip_per_level[thr_bar->my_level])),
1331  &(thr_bar->offset));
1332  thr_bar->old_tid = tid;
1333  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1334  thr_bar->team = team;
1335  thr_bar->parent_bar =
1336  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1337  }
1338  if (uninitialized || team_changed || tid_changed) {
1339  thr_bar->team = team;
1340  thr_bar->parent_bar =
1341  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1342  retval = true;
1343  }
1344  if (uninitialized || team_sz_changed || tid_changed) {
1345  thr_bar->nproc = nproc;
1346  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1347  if (thr_bar->my_level == 0)
1348  thr_bar->leaf_kids = 0;
1349  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1350  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1351  thr_bar->leaf_state = 0;
1352  for (int i = 0; i < thr_bar->leaf_kids; ++i)
1353  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1354  }
1355  return retval;
1356 }
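
// Worked example for the parent/offset computation above (hypothetical
// hierarchy of depth 3 with skip_per_level = {1, 4, 16}, i.e. leaves grouped
// in fours): tid 6 finds rem = 6 % 4 = 2 at d = 0, so parent_tid = 4,
// my_level = 0, and offset = 7 - (6 - 4) / 1 = 5; tid 4 reaches
// d = depth - 2 = 1 (the level just below the primary thread), so
// parent_tid = 0, my_level = 1, and offset = 7 - (4 - 0) / 4 = 6.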
1357 
1358 static void __kmp_hierarchical_barrier_gather(
1359  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1360  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1361  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1362  kmp_team_t *team = this_thr->th.th_team;
1363  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1364  kmp_uint32 nproc = this_thr->th.th_team_nproc;
1365  kmp_info_t **other_threads = team->t.t_threads;
1366  kmp_uint64 new_state = 0;
1367 
1368  int level = team->t.t_level;
1369  if (other_threads[0]
1370  ->th.th_teams_microtask) // are we inside the teams construct?
1371  if (this_thr->th.th_teams_size.nteams > 1)
1372  ++level; // level was not increased in teams construct for team_of_masters
1373  if (level == 1)
1374  thr_bar->use_oncore_barrier = 1;
1375  else
1376  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1377 
1378  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1379  "barrier type %d\n",
1380  gtid, team->t.t_id, tid, bt));
1381  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1382 
1383 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1384  // Barrier imbalance - save arrive time to the thread
1385  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1386  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1387  }
1388 #endif
1389 
1390  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1391  team);
1392 
1393  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1394  kmp_int32 child_tid;
1395  new_state =
1396  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1397  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1398  thr_bar->use_oncore_barrier) {
1399  if (thr_bar->leaf_kids) {
1400  // First, wait for leaf children to check-in on my b_arrived flag
1401  kmp_uint64 leaf_state =
1402  KMP_MASTER_TID(tid)
1403  ? thr_bar->b_arrived | thr_bar->leaf_state
1404  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1405  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1406  "for leaf kids\n",
1407  gtid, team->t.t_id, tid));
1408  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1409  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1410  if (reduce) {
1411  OMPT_REDUCTION_DECL(this_thr, gtid);
1412  OMPT_REDUCTION_BEGIN;
1413  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1414  ++child_tid) {
1415  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1416  "T#%d(%d:%d)\n",
1417  gtid, team->t.t_id, tid,
1418  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1419  child_tid));
1420  (*reduce)(this_thr->th.th_local.reduce_data,
1421  other_threads[child_tid]->th.th_local.reduce_data);
1422  }
1423  OMPT_REDUCTION_END;
1424  }
1425  // clear leaf_state bits
1426  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1427  }
1428  // Next, wait for higher level children on each child's b_arrived flag
1429  for (kmp_uint32 d = 1; d < thr_bar->my_level;
1430  ++d) { // gather lowest level threads first, but skip 0
1431  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1432  skip = thr_bar->skip_per_level[d];
1433  if (last > nproc)
1434  last = nproc;
1435  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1436  kmp_info_t *child_thr = other_threads[child_tid];
1437  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1438  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1439  "T#%d(%d:%d) "
1440  "arrived(%p) == %llu\n",
1441  gtid, team->t.t_id, tid,
1442  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1443  child_tid, &child_bar->b_arrived, new_state));
1444  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1445  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1446  if (reduce) {
1447  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1448  "T#%d(%d:%d)\n",
1449  gtid, team->t.t_id, tid,
1450  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1451  child_tid));
1452  (*reduce)(this_thr->th.th_local.reduce_data,
1453  child_thr->th.th_local.reduce_data);
1454  }
1455  }
1456  }
1457  } else { // Blocktime is not infinite
1458  for (kmp_uint32 d = 0; d < thr_bar->my_level;
1459  ++d) { // Gather lowest level threads first
1460  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1461  skip = thr_bar->skip_per_level[d];
1462  if (last > nproc)
1463  last = nproc;
1464  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1465  kmp_info_t *child_thr = other_threads[child_tid];
1466  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1467  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1468  "T#%d(%d:%d) "
1469  "arrived(%p) == %llu\n",
1470  gtid, team->t.t_id, tid,
1471  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1472  child_tid, &child_bar->b_arrived, new_state));
1473  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1474  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1475  if (reduce) {
1476  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1477  "T#%d(%d:%d)\n",
1478  gtid, team->t.t_id, tid,
1479  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1480  child_tid));
1481  (*reduce)(this_thr->th.th_local.reduce_data,
1482  child_thr->th.th_local.reduce_data);
1483  }
1484  }
1485  }
1486  }
1487  }
1488  // All subordinates are gathered; now release parent if not primary thread
1489 
1490  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1491  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1492  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1493  gtid, team->t.t_id, tid,
1494  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1495  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1496  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1497  /* Mark arrival to parent: After performing this write, a worker thread may
1498  not assume that the team is valid any more - it could be deallocated by
1499  the primary thread at any time. */
1500  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1501  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1502  // flag; release it
1503  kmp_flag_64<> flag(&thr_bar->b_arrived,
1504  other_threads[thr_bar->parent_tid]);
1505  flag.release();
1506  } else {
1507  // Leaf does special release on "offset" bits of parent's b_arrived flag
1508  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1509  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1510  thr_bar->offset + 1);
1511  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1512  flag.release();
1513  }
1514  } else { // Primary thread needs to update the team's b_arrived value
1515  team->t.t_bar[bt].b_arrived = new_state;
1516  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1517  "arrived(%p) = %llu\n",
1518  gtid, team->t.t_id, tid, team->t.t_id,
1519  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1520  }
1521  // Is the team access below unsafe or just technically invalid?
1522  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1523  "barrier type %d\n",
1524  gtid, team->t.t_id, tid, bt));
1525 }
1526 
1527 static void __kmp_hierarchical_barrier_release(
1528  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1529  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1530  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1531  kmp_team_t *team;
1532  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1533  kmp_uint32 nproc;
1534  bool team_change = false; // indicates on-core barrier shouldn't be used
1535 
1536  if (KMP_MASTER_TID(tid)) {
1537  team = __kmp_threads[gtid]->th.th_team;
1538  KMP_DEBUG_ASSERT(team != NULL);
1539  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1540  "entered barrier type %d\n",
1541  gtid, team->t.t_id, tid, bt));
1542  } else { // Worker threads
1543  // Wait for parent thread to release me
1544  if (!thr_bar->use_oncore_barrier ||
1545  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1546  thr_bar->team == NULL) {
1547  // Use traditional method of waiting on my own b_go flag
1548  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1549  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1550  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1551  TCW_8(thr_bar->b_go,
1552  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1553  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1554  // infinite, not nested
1555  // Wait on my "offset" bits on parent's b_go flag
1556  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1557  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1558  thr_bar->offset + 1, bt,
1559  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1560  flag.wait(this_thr, TRUE);
1561  if (thr_bar->wait_flag ==
1562  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1563  TCW_8(thr_bar->b_go,
1564  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1565  } else { // Reset my bits on parent's b_go flag
1566  (RCAST(volatile char *,
1567  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1568  }
1569  }
1570  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1571  // Early exit for reaping threads releasing forkjoin barrier
1572  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1573  return;
1574  // The worker thread may now assume that the team is valid.
1575  team = __kmp_threads[gtid]->th.th_team;
1576  KMP_DEBUG_ASSERT(team != NULL);
1577  tid = __kmp_tid_from_gtid(gtid);
1578 
1579  KA_TRACE(
1580  20,
1581  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1582  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1583  KMP_MB(); // Flush all pending memory write invalidates.
1584  }
1585 
1586  nproc = this_thr->th.th_team_nproc;
1587  int level = team->t.t_level;
1588  if (team->t.t_threads[0]
1589  ->th.th_teams_microtask) { // are we inside the teams construct?
1590  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1591  this_thr->th.th_teams_level == level)
1592  ++level; // level was not increased in teams construct for team_of_workers
1593  if (this_thr->th.th_teams_size.nteams > 1)
1594  ++level; // level was not increased in teams construct for team_of_masters
1595  }
1596  if (level == 1)
1597  thr_bar->use_oncore_barrier = 1;
1598  else
1599  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1600 
1601  // If the team size has increased, we still communicate with old leaves via
1602  // oncore barrier.
1603  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1604  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1605  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1606  tid, team);
1607  // But if the entire team changes, we won't use oncore barrier at all
1608  if (team_change)
1609  old_leaf_kids = 0;
1610 
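  // ICV push: the primary thread copies the team's ICVs into its
  // th_fixed_icvs; descendants either pull from their parent's th_fixed_icvs
  // here or, on the on-core path, receive them piggybacked on the b_go cache
  // line during the release below.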
1611 #if KMP_BARRIER_ICV_PUSH
1612  if (propagate_icvs) {
1613  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1614  FALSE);
1615  if (KMP_MASTER_TID(
1616  tid)) { // primary already has copy in final destination; copy
1617  copy_icvs(&thr_bar->th_fixed_icvs,
1618  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1619  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1620  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1621  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1622  // leaves (on-core children) pull parent's fixed ICVs directly to local
1623  // ICV store
1624  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1625  &thr_bar->parent_bar->th_fixed_icvs);
1626  // non-leaves will get ICVs piggybacked with b_go via NGO store
1627  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1628  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so
1629  // children can access them
1630  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1631  else // leaves copy parent's fixed ICVs directly to local ICV store
1632  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1633  &thr_bar->parent_bar->th_fixed_icvs);
1634  }
1635  }
1636 #endif // KMP_BARRIER_ICV_PUSH
1637 
1638  // Now, release my children
1639  if (thr_bar->my_level) { // not a leaf
1640  kmp_int32 child_tid;
1641  kmp_uint32 last;
1642  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1643  thr_bar->use_oncore_barrier) {
1644  if (KMP_MASTER_TID(tid)) { // do a flat release
1645  // Set local b_go to bump children via NGO store of the cache line
1646  // containing ICVs and b_go.
1647  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1648  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1649  // the cache line
1650  ngo_load(&thr_bar->th_fixed_icvs);
1651  // This loops over all the threads skipping only the leaf nodes in the
1652  // hierarchy
1653  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1654  child_tid += thr_bar->skip_per_level[1]) {
1655  kmp_bstate_t *child_bar =
1656  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1657  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1658  "releasing T#%d(%d:%d)"
1659  " go(%p): %u => %u\n",
1660  gtid, team->t.t_id, tid,
1661  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1662  child_tid, &child_bar->b_go, child_bar->b_go,
1663  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1664  // Use ngo store (if available) to both store ICVs and release child
1665  // via child's b_go
1666  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1667  }
1668  ngo_sync();
1669  }
1670  TCW_8(thr_bar->b_go,
1671  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1672  // Now, release leaf children
1673  if (thr_bar->leaf_kids) { // if there are any
1674  // We test team_change on the off-chance that the level 1 team changed.
1675  if (team_change ||
1676  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1677  if (old_leaf_kids) { // release old leaf kids
1678  thr_bar->b_go |= old_leaf_state;
1679  }
1680  // Release new leaf kids
1681  last = tid + thr_bar->skip_per_level[1];
1682  if (last > nproc)
1683  last = nproc;
1684  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1685  ++child_tid) { // skip_per_level[0]=1
1686  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1687  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1688  KA_TRACE(
1689  20,
1690  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1691  " T#%d(%d:%d) go(%p): %u => %u\n",
1692  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1693  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1694  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1695  // Release child using child's b_go flag
1696  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1697  flag.release();
1698  }
1699  } else { // Release all children at once with leaf_state bits on my own
1700  // b_go flag
1701  thr_bar->b_go |= thr_bar->leaf_state;
1702  }
1703  }
1704  } else { // Blocktime is not infinite; do a simple hierarchical release
1705  for (int d = thr_bar->my_level - 1; d >= 0;
1706  --d) { // Release highest level threads first
1707  last = tid + thr_bar->skip_per_level[d + 1];
1708  kmp_uint32 skip = thr_bar->skip_per_level[d];
1709  if (last > nproc)
1710  last = nproc;
1711  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1712  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1713  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1714  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1715  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1716  gtid, team->t.t_id, tid,
1717  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1718  child_tid, &child_bar->b_go, child_bar->b_go,
1719  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720  // Release child using child's b_go flag
1721  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722  flag.release();
1723  }
1724  }
1725  }
1726 #if KMP_BARRIER_ICV_PUSH
1727  if (propagate_icvs && !KMP_MASTER_TID(tid))
1728  // non-leaves copy ICVs from fixed ICVs to local dest
1729  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1730  &thr_bar->th_fixed_icvs);
1731 #endif // KMP_BARRIER_ICV_PUSH
1732  }
1733  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1734  "barrier type %d\n",
1735  gtid, team->t.t_id, tid, bt));
1736 }
1737 
1738 // End of Barrier Algorithms
1739 
1740 // type traits for cancellable value
1741 // if cancellable is true, then is_cancellable is a normal boolean variable
1742 // if cancellable is false, then is_cancellable is a compile time constant
1743 template <bool cancellable> struct is_cancellable {};
1744 template <> struct is_cancellable<true> {
1745  bool value;
1746  is_cancellable() : value(false) {}
1747  is_cancellable(bool b) : value(b) {}
1748  is_cancellable &operator=(bool b) {
1749  value = b;
1750  return *this;
1751  }
1752  operator bool() const { return value; }
1753 };
1754 template <> struct is_cancellable<false> {
1755  is_cancellable &operator=(bool b) { return *this; }
1756  constexpr operator bool() const { return false; }
1757 };
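// The <false> specialization is stateless and converts to a constant false, so
// in the non-cancellable instantiation of __kmp_barrier_template every
// "if (cancelled)" test folds away at compile time.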
1758 
1759 // Internal function to do a barrier.
1760 /* If is_split is true, do a split barrier, otherwise do a plain barrier.
1761  If reduce is non-NULL, also perform the reduction as part of the barrier
1762  (during the gather phase).
1763  When cancellable = false,
1764  returns 0 if primary thread, 1 if worker thread.
1765  When cancellable = true,
1766  returns 0 if not cancelled, 1 if cancelled. */
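// Note: when cancellable is true, the gather and release phases always use the
// linear algorithm via the *_cancellable variants, regardless of the pattern
// selected in __kmp_barrier_gather_pattern / __kmp_barrier_release_pattern.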
1767 template <bool cancellable = false>
1768 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1769  size_t reduce_size, void *reduce_data,
1770  void (*reduce)(void *, void *)) {
1771  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1772  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1773  int tid = __kmp_tid_from_gtid(gtid);
1774  kmp_info_t *this_thr = __kmp_threads[gtid];
1775  kmp_team_t *team = this_thr->th.th_team;
1776  int status = 0;
1777  is_cancellable<cancellable> cancelled;
1778 #if OMPT_SUPPORT && OMPT_OPTIONAL
1779  ompt_data_t *my_task_data;
1780  ompt_data_t *my_parallel_data;
1781  void *return_address;
1782  ompt_sync_region_t barrier_kind;
1783 #endif
1784 
1785  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1786  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1787 
1788 #if OMPT_SUPPORT
1789  if (ompt_enabled.enabled) {
1790 #if OMPT_OPTIONAL
1791  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1792  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1793  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1794  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1795  if (ompt_enabled.ompt_callback_sync_region) {
1796  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1797  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1798  return_address);
1799  }
1800  if (ompt_enabled.ompt_callback_sync_region_wait) {
1801  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1802  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1803  return_address);
1804  }
1805 #endif
1806  // It is OK to report the barrier state after the barrier begin callback.
1807  // According to the OMPT specification, a compliant implementation may
1808  // even delay reporting this state until the barrier begins to wait.
1809  auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1810  switch (barrier_kind) {
1811  case ompt_sync_region_barrier_explicit:
1812  ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1813  break;
1814  case ompt_sync_region_barrier_implicit_workshare:
1815  ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1816  break;
1817  case ompt_sync_region_barrier_implicit_parallel:
1818  ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1819  break;
1820  case ompt_sync_region_barrier_teams:
1821  ompt_thr_info->state = ompt_state_wait_barrier_teams;
1822  break;
1823  case ompt_sync_region_barrier_implementation:
1824  [[fallthrough]];
1825  default:
1826  ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1827  }
1828  }
1829 #endif
1830 
1831  if (!team->t.t_serialized) {
1832 #if USE_ITT_BUILD
1833  // This value will be used in itt notify events below.
1834  void *itt_sync_obj = NULL;
1835 #if USE_ITT_NOTIFY
1836  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1837  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1838 #endif
1839 #endif /* USE_ITT_BUILD */
1840  if (__kmp_tasking_mode == tskm_extra_barrier) {
1841  __kmp_tasking_barrier(team, this_thr, gtid);
1842  KA_TRACE(15,
1843  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1844  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1845  }
1846 
1847  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1848  access it when the team struct is not guaranteed to exist. */
1849  // See note about the corresponding code in __kmp_join_barrier() being
1850  // performance-critical.
1851  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1852 #if KMP_USE_MONITOR
1853  this_thr->th.th_team_bt_intervals =
1854  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1855  this_thr->th.th_team_bt_set =
1856  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1857 #else
1858  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1859 #endif
1860  }
1861 
1862 #if USE_ITT_BUILD
1863  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1864  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1865 #endif /* USE_ITT_BUILD */
1866 #if USE_DEBUGGER
1867  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1868  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1869  team->t.t_bar[bt].b_master_arrived += 1;
1870  } else {
1871  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1872  } // if
1873 #endif /* USE_DEBUGGER */
1874  if (reduce != NULL) {
1875  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1876  this_thr->th.th_local.reduce_data = reduce_data;
1877  }
1878 
1879  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1880  __kmp_task_team_setup(this_thr, team);
1881 
1882  if (cancellable) {
1883  cancelled = __kmp_linear_barrier_gather_cancellable(
1884  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1885  } else {
1886  switch (__kmp_barrier_gather_pattern[bt]) {
1887  case bp_dist_bar: {
1888  __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1889  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1890  break;
1891  }
1892  case bp_hyper_bar: {
1893  // don't set branch bits to 0; use linear
1894  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1895  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1896  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1897  break;
1898  }
1899  case bp_hierarchical_bar: {
1900  __kmp_hierarchical_barrier_gather(
1901  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1902  break;
1903  }
1904  case bp_tree_bar: {
1905  // don't set branch bits to 0; use linear
1906  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1907  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1908  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1909  break;
1910  }
1911  default: {
1912  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1913  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1914  }
1915  }
1916  }
1917 
1918  KMP_MB();
1919 
1920  if (KMP_MASTER_TID(tid)) {
1921  status = 0;
1922  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1923  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1924  }
1925 #if USE_DEBUGGER
1926  // Let the debugger know: all threads have arrived and are starting to
1927  // leave the barrier.
1928  team->t.t_bar[bt].b_team_arrived += 1;
1929 #endif
1930 
1931  if (__kmp_omp_cancellation) {
1932  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1933  // Reset cancellation flag for worksharing constructs
1934  if (cancel_request == cancel_loop ||
1935  cancel_request == cancel_sections) {
1936  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1937  }
1938  }
1939 #if USE_ITT_BUILD
1940  /* TODO: In case of split reduction barrier, primary thread may send
1941  acquired event early, before the final summation into the shared
1942  variable is done (final summation can be a long operation for array
1943  reductions). */
1944  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1945  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1946 #endif /* USE_ITT_BUILD */
1947 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1948  // Barrier - report frame end (only if active_level == 1)
1949  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1950  __kmp_forkjoin_frames_mode &&
1951  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1952  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1953  team->t.t_active_level == 1) {
1954  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1955  kmp_uint64 cur_time = __itt_get_timestamp();
1956  kmp_info_t **other_threads = team->t.t_threads;
1957  int nproc = this_thr->th.th_team_nproc;
1958  int i;
1959  switch (__kmp_forkjoin_frames_mode) {
1960  case 1:
1961  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1962  loc, nproc);
1963  this_thr->th.th_frame_time = cur_time;
1964  break;
1965  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1966  // be fixed)
1967  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1968  1, loc, nproc);
1969  break;
1970  case 3:
1971  if (__itt_metadata_add_ptr) {
1972  // Initialize with primary thread's wait time
1973  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1974  // Set arrive time to zero to be able to check it in
1975  // __kmp_invoke_task(); the same is done inside the loop below
1976  this_thr->th.th_bar_arrive_time = 0;
1977  for (i = 1; i < nproc; ++i) {
1978  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1979  other_threads[i]->th.th_bar_arrive_time = 0;
1980  }
1981  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1982  cur_time, delta,
1983  (kmp_uint64)(reduce != NULL));
1984  }
1985  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1986  loc, nproc);
1987  this_thr->th.th_frame_time = cur_time;
1988  break;
1989  }
1990  }
1991 #endif /* USE_ITT_BUILD */
1992  } else {
1993  status = 1;
1994 #if USE_ITT_BUILD
1995  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1996  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1997 #endif /* USE_ITT_BUILD */
1998  }
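  // Release phase: workers (status == 1) always enter it so they block until
  // released; the primary thread enters it only for a non-split barrier, since
  // for a split barrier the release is deferred to __kmp_end_split_barrier().
  // A cancelled barrier skips the release entirely.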
1999  if ((status == 1 || !is_split) && !cancelled) {
2000  if (cancellable) {
2001  cancelled = __kmp_linear_barrier_release_cancellable(
2002  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2003  } else {
2004  switch (__kmp_barrier_release_pattern[bt]) {
2005  case bp_dist_bar: {
2006  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2007  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2008  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2009  break;
2010  }
2011  case bp_hyper_bar: {
2012  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2013  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2014  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2015  break;
2016  }
2017  case bp_hierarchical_bar: {
2018  __kmp_hierarchical_barrier_release(
2019  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2020  break;
2021  }
2022  case bp_tree_bar: {
2023  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2024  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2025  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2026  break;
2027  }
2028  default: {
2029  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2030  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2031  }
2032  }
2033  }
2034  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2035  __kmp_task_team_sync(this_thr, team);
2036  }
2037  }
2038 
2039 #if USE_ITT_BUILD
2040  /* GEH: TODO: Move this under if-condition above and also include in
2041  __kmp_end_split_barrier(). This will more accurately represent the actual
2042  release time of the threads for split barriers. */
2043  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2044  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045 #endif /* USE_ITT_BUILD */
2046  } else { // Team is serialized.
2047  status = 0;
2048  if (__kmp_tasking_mode != tskm_immediate_exec) {
2049  if (this_thr->th.th_task_team != NULL) {
2050 #if USE_ITT_NOTIFY
2051  void *itt_sync_obj = NULL;
2052  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2053  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2054  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2055  }
2056 #endif
2057 
2058  KMP_DEBUG_ASSERT(
2059  this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2060  this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2061  TRUE);
2062  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2063  __kmp_task_team_setup(this_thr, team);
2064 
2065 #if USE_ITT_BUILD
2066  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2067  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2068 #endif /* USE_ITT_BUILD */
2069  }
2070  }
2071  }
2072  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2073  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2074  __kmp_tid_from_gtid(gtid), status));
2075 
2076 #if OMPT_SUPPORT
2077  if (ompt_enabled.enabled) {
2078 #if OMPT_OPTIONAL
2079  if (ompt_enabled.ompt_callback_sync_region_wait) {
2080  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2081  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2082  return_address);
2083  }
2084  if (ompt_enabled.ompt_callback_sync_region) {
2085  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2086  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2087  return_address);
2088  }
2089 #endif
2090  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2091  }
2092 #endif
2093 
2094  if (cancellable)
2095  return (int)cancelled;
2096  return status;
2097 }
2098 
2099 // Returns 0 if primary thread, 1 if worker thread.
2100 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2101  size_t reduce_size, void *reduce_data,
2102  void (*reduce)(void *, void *)) {
2103  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2104  reduce);
2105 }
2106 
2107 #if defined(KMP_GOMP_COMPAT)
2108 // Returns 1 if cancelled, 0 otherwise
2109 int __kmp_barrier_gomp_cancel(int gtid) {
2110  if (__kmp_omp_cancellation) {
2111  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2112  0, NULL, NULL);
2113  if (cancelled) {
2114  int tid = __kmp_tid_from_gtid(gtid);
2115  kmp_info_t *this_thr = __kmp_threads[gtid];
2116  if (KMP_MASTER_TID(tid)) {
2117  // Primary thread does not need to revert anything
2118  } else {
2119  // Workers bumped their private b_arrived flag when they checked in
  // during the gather phase; the cancelled barrier never completes, so
  // undo the bump to keep the counter in sync for the next barrier.
2120  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2121  KMP_BARRIER_STATE_BUMP;
2122  }
2123  }
2124  return cancelled;
2125  }
2126  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2127  return FALSE;
2128 }
2129 #endif
2130 
2131 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2132  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2133  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2134  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2135  int tid = __kmp_tid_from_gtid(gtid);
2136  kmp_info_t *this_thr = __kmp_threads[gtid];
2137  kmp_team_t *team = this_thr->th.th_team;
2138 
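  // Deferred release half of a split barrier: only the primary thread runs the
  // release pattern here, waking workers that are still waiting in the release
  // phase of __kmp_barrier().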
2139  if (!team->t.t_serialized) {
2140  if (KMP_MASTER_GTID(gtid)) {
2141  switch (__kmp_barrier_release_pattern[bt]) {
2142  case bp_dist_bar: {
2143  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2144  FALSE USE_ITT_BUILD_ARG(NULL));
2145  break;
2146  }
2147  case bp_hyper_bar: {
2148  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2149  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2150  FALSE USE_ITT_BUILD_ARG(NULL));
2151  break;
2152  }
2153  case bp_hierarchical_bar: {
2154  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2155  FALSE USE_ITT_BUILD_ARG(NULL));
2156  break;
2157  }
2158  case bp_tree_bar: {
2159  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2160  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2161  FALSE USE_ITT_BUILD_ARG(NULL));
2162  break;
2163  }
2164  default: {
2165  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2166  FALSE USE_ITT_BUILD_ARG(NULL));
2167  }
2168  }
2169  if (__kmp_tasking_mode != tskm_immediate_exec) {
2170  __kmp_task_team_sync(this_thr, team);
2171  } // if
2172  }
2173  }
2174 }
2175 
2176 void __kmp_join_barrier(int gtid) {
2177  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2178  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2179 
2180  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2181 
2182  kmp_info_t *this_thr = __kmp_threads[gtid];
2183  kmp_team_t *team;
2184  int tid;
2185 #ifdef KMP_DEBUG
2186  int team_id;
2187 #endif /* KMP_DEBUG */
2188 #if USE_ITT_BUILD
2189  void *itt_sync_obj = NULL;
2190 #if USE_ITT_NOTIFY
2191  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2192  // Get object created at fork_barrier
2193  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2194 #endif
2195 #endif /* USE_ITT_BUILD */
2196 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2197  int nproc = this_thr->th.th_team_nproc;
2198 #endif
2199  KMP_MB();
2200 
2201  // Get current info
2202  team = this_thr->th.th_team;
2203  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2204  tid = __kmp_tid_from_gtid(gtid);
2205 #ifdef KMP_DEBUG
2206  team_id = team->t.t_id;
2207  kmp_info_t *master_thread = this_thr->th.th_team_master;
2208  if (master_thread != team->t.t_threads[0]) {
2209  __kmp_print_structure();
2210  }
2211 #endif /* KMP_DEBUG */
2212  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2213  KMP_MB();
2214 
2215  // Verify state
2216  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2217  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2218  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2219  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2220  gtid, team_id, tid));
2221 
2222 #if OMPT_SUPPORT
2223  if (ompt_enabled.enabled) {
2224 #if OMPT_OPTIONAL
2225  ompt_data_t *my_task_data;
2226  ompt_data_t *my_parallel_data;
2227  void *codeptr = NULL;
2228  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2229  if (KMP_MASTER_TID(ds_tid) &&
2230  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2231  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2232  codeptr = team->t.ompt_team_info.master_return_address;
2233  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2234  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2235  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2236  ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2237  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2238  sync_kind = ompt_sync_region_barrier_teams;
2239  ompt_state = ompt_state_wait_barrier_teams;
2240  }
2241  if (ompt_enabled.ompt_callback_sync_region) {
2242  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2243  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2244  }
2245  if (ompt_enabled.ompt_callback_sync_region_wait) {
2246  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2247  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2248  }
2249  if (!KMP_MASTER_TID(ds_tid))
2250  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2251 #endif
2252  this_thr->th.ompt_thread_info.state = ompt_state;
2253  }
2254 #endif
2255 
2256  if (__kmp_tasking_mode == tskm_extra_barrier) {
2257  __kmp_tasking_barrier(team, this_thr, gtid);
2258  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2259  gtid, team_id, tid));
2260  }
2261 #ifdef KMP_DEBUG
2262  if (__kmp_tasking_mode != tskm_immediate_exec) {
2263  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2264  "%p, th_task_team = %p\n",
2265  __kmp_gtid_from_thread(this_thr), team_id,
2266  team->t.t_task_team[this_thr->th.th_task_state],
2267  this_thr->th.th_task_team));
2268  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2269  }
2270 #endif /* KMP_DEBUG */
2271 
2272  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2273  access it when the team struct is not guaranteed to exist. Doing these
2274  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
2275  we do not perform the copy if blocktime=infinite, since the values are not
2276  used by __kmp_wait_template() in that case. */
2277  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2278 #if KMP_USE_MONITOR
2279  this_thr->th.th_team_bt_intervals =
2280  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2281  this_thr->th.th_team_bt_set =
2282  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2283 #else
2284  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2285 #endif
2286  }
2287 
2288 #if USE_ITT_BUILD
2289  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2290  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2291 #endif /* USE_ITT_BUILD */
2292 
2293  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2294  case bp_dist_bar: {
2295  __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2296  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2297  break;
2298  }
2299  case bp_hyper_bar: {
2300  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2301  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2302  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2303  break;
2304  }
2305  case bp_hierarchical_bar: {
2306  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2307  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2308  break;
2309  }
2310  case bp_tree_bar: {
2311  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2312  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2313  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2314  break;
2315  }
2316  default: {
2317  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2318  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2319  }
2320  }
2321 
2322  /* From this point on, the team data structure may be deallocated at any time
2323  by the primary thread - it is unsafe to reference it in any of the worker
2324  threads. Any per-team data items that need to be referenced before the
2325  end of the barrier should be moved to the kmp_task_team_t structs. */
2326  if (KMP_MASTER_TID(tid)) {
2327  if (__kmp_tasking_mode != tskm_immediate_exec) {
2328  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2329  }
2330  if (__kmp_display_affinity) {
2331  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2332  }
2333 #if KMP_STATS_ENABLED
2334  // Have primary thread flag the workers to indicate they are now waiting for
2335  // the next parallel region. Also wake them up so they switch their timers to
2336  // idle.
2337  for (int i = 0; i < team->t.t_nproc; ++i) {
2338  kmp_info_t *team_thread = team->t.t_threads[i];
2339  if (team_thread == this_thr)
2340  continue;
2341  team_thread->th.th_stats->setIdleFlag();
2342  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2343  team_thread->th.th_sleep_loc != NULL)
2344  __kmp_null_resume_wrapper(team_thread);
2345  }
2346 #endif
2347 #if USE_ITT_BUILD
2348  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2349  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2350 #endif /* USE_ITT_BUILD */
2351 
2352 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2353  // Join barrier - report frame end
2354  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2355  __kmp_forkjoin_frames_mode &&
2356  (this_thr->th.th_teams_microtask == NULL || // either not in teams
2357  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2358  team->t.t_active_level == 1) {
2359  kmp_uint64 cur_time = __itt_get_timestamp();
2360  ident_t *loc = team->t.t_ident;
2361  kmp_info_t **other_threads = team->t.t_threads;
2362  switch (__kmp_forkjoin_frames_mode) {
2363  case 1:
2364  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2365  loc, nproc);
2366  break;
2367  case 2:
2368  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2369  loc, nproc);
2370  break;
2371  case 3:
2372  if (__itt_metadata_add_ptr) {
2373  // Initialize with primary thread's wait time
2374  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2375  // Set arrive time to zero to be able to check it in
2376  // __kmp_invoke_task(); the same is done inside the loop below
2377  this_thr->th.th_bar_arrive_time = 0;
2378  for (int i = 1; i < nproc; ++i) {
2379  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2380  other_threads[i]->th.th_bar_arrive_time = 0;
2381  }
2382  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2383  cur_time, delta, 0);
2384  }
2385  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2386  loc, nproc);
2387  this_thr->th.th_frame_time = cur_time;
2388  break;
2389  }
2390  }
2391 #endif /* USE_ITT_BUILD */
2392  }
2393 #if USE_ITT_BUILD
2394  else {
2395  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2396  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2397  }
2398 #endif /* USE_ITT_BUILD */
2399 
2400 #if KMP_DEBUG
2401  if (KMP_MASTER_TID(tid)) {
2402  KA_TRACE(
2403  15,
2404  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2405  gtid, team_id, tid, nproc));
2406  }
2407 #endif /* KMP_DEBUG */
2408 
2409  // TODO now, mark worker threads as done so they may be disbanded
2410  KMP_MB(); // Flush all pending memory write invalidates.
2411  KA_TRACE(10,
2412  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2413 
2414 }
2415 
2416 // TODO release worker threads' fork barriers as we are ready instead of all at
2417 // once
2418 void __kmp_fork_barrier(int gtid, int tid) {
2419  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2420  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2421  kmp_info_t *this_thr = __kmp_threads[gtid];
2422  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2423 #if USE_ITT_BUILD
2424  void *itt_sync_obj = NULL;
2425 #endif /* USE_ITT_BUILD */
2426 #ifdef KMP_DEBUG
2427  if (team)
2428  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2429  (team != NULL) ? team->t.t_id : -1, tid));
2430 #endif
2431  // th_team pointer only valid for primary thread here
2432  if (KMP_MASTER_TID(tid)) {
2433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2434  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2435  // Create itt barrier object
2436  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2437  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2438  }
2439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2440 
2441 #ifdef KMP_DEBUG
2442  KMP_DEBUG_ASSERT(team);
2443  kmp_info_t **other_threads = team->t.t_threads;
2444  int i;
2445 
2446  // Verify state
2447  KMP_MB();
2448 
2449  for (i = 1; i < team->t.t_nproc; ++i) {
2450  KA_TRACE(500,
2451  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2452  "== %u.\n",
2453  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2454  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2455  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2456  KMP_DEBUG_ASSERT(
2457  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2458  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2459  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2460  }
2461 #endif
2462 
2463  if (__kmp_tasking_mode != tskm_immediate_exec)
2464  __kmp_task_team_setup(this_thr, team);
2465 
2466  /* The primary thread may have changed its blocktime between join barrier
2467  and fork barrier. Copy the blocktime info to the thread, where
2468  __kmp_wait_template() can access it when the team struct is not
2469  guaranteed to exist. */
2470  // See note about the corresponding code in __kmp_join_barrier() being
2471  // performance-critical
2472  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2473 #if KMP_USE_MONITOR
2474  this_thr->th.th_team_bt_intervals =
2475  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2476  this_thr->th.th_team_bt_set =
2477  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2478 #else
2479  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2480 #endif
2481  }
2482  } // primary thread
2483 
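  // Workers block in this release until the primary thread starts the next
  // parallel region; the TRUE argument is propagate_icvs, so (with
  // KMP_BARRIER_ICV_PUSH) the team's ICVs are pushed to workers as part of
  // this release.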
2484  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2485  case bp_dist_bar: {
2486  __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2487  TRUE USE_ITT_BUILD_ARG(NULL));
2488  break;
2489  }
2490  case bp_hyper_bar: {
2491  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2492  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2493  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2494  break;
2495  }
2496  case bp_hierarchical_bar: {
2497  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2498  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2499  break;
2500  }
2501  case bp_tree_bar: {
2502  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2503  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2504  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2505  break;
2506  }
2507  default: {
2508  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2509  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2510  }
2511  }
2512 
2513 #if OMPT_SUPPORT
2514  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2515  if (ompt_enabled.enabled &&
2516  (ompt_state == ompt_state_wait_barrier_teams ||
2517  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2518  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2519  ompt_data_t *task_data = (team)
2520  ? OMPT_CUR_TASK_DATA(this_thr)
2521  : &(this_thr->th.ompt_thread_info.task_data);
2522  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2523 #if OMPT_OPTIONAL
2524  void *codeptr = NULL;
2525  if (KMP_MASTER_TID(ds_tid) &&
2526  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2527  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2528  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2529  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2530  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2531  sync_kind = ompt_sync_region_barrier_teams;
2532  if (ompt_enabled.ompt_callback_sync_region_wait) {
2533  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2534  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2535  }
2536  if (ompt_enabled.ompt_callback_sync_region) {
2537  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2538  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2539  }
2540 #endif
2541  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2542  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2543  ompt_scope_end, NULL, task_data, 0, ds_tid,
2544  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2545  }
2546  }
2547 #endif
2548 
2549  // Early exit for reaping threads releasing forkjoin barrier
2550  if (TCR_4(__kmp_global.g.g_done)) {
2551  this_thr->th.th_task_team = NULL;
2552 
2553 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2554  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2555  if (!KMP_MASTER_TID(tid)) {
2556  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2557  if (itt_sync_obj)
2558  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2559  }
2560  }
2561 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2562  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2563  return;
2564  }
2565 
2566  /* We can now assume that a valid team structure has been allocated by the
2567  primary thread and propagated to all worker threads. The current thread,
2568  however, may not be part of the team, so we can't blindly assume that the
2569  team pointer is non-null. */
2570  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2571  KMP_DEBUG_ASSERT(team != NULL);
2572  tid = __kmp_tid_from_gtid(gtid);
2573 
2574 #if KMP_BARRIER_ICV_PULL
2575  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2576  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2577  implicit task has this data before this function is called. We cannot
2578  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2579  thread struct, because it is not always the case that the threads arrays
2580  have been allocated when __kmp_fork_call() is executed. */
2581  {
2582  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2583  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2584  // Copy the initial ICVs from the primary thread's thread struct to the
2585  // implicit task for this tid.
2586  KA_TRACE(10,
2587  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2588  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2589  tid, FALSE);
2590  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2591  &team->t.t_threads[0]
2592  ->th.th_bar[bs_forkjoin_barrier]
2593  .bb.th_fixed_icvs);
2594  }
2595  }
2596 #endif // KMP_BARRIER_ICV_PULL
2597 
2598  if (__kmp_tasking_mode != tskm_immediate_exec) {
2599  __kmp_task_team_sync(this_thr, team);
2600  }
2601 
2602 #if KMP_AFFINITY_SUPPORTED
2603  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2604  if (proc_bind == proc_bind_intel) {
2605  // Call dynamic affinity settings
2606  if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2607  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2608  }
2609  } else if (proc_bind != proc_bind_false) {
2610  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2611  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2612  __kmp_gtid_from_thread(this_thr),
2613  this_thr->th.th_current_place));
2614  } else {
2615  __kmp_affinity_bind_place(gtid);
2616  }
2617  }
2618 #endif // KMP_AFFINITY_SUPPORTED
2619  // Perform the display affinity functionality
2620  if (__kmp_display_affinity) {
2621  if (team->t.t_display_affinity
2622 #if KMP_AFFINITY_SUPPORTED
2623  || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2624 #endif
2625  ) {
2626  // NULL means use the affinity-format-var ICV
2627  __kmp_aux_display_affinity(gtid, NULL);
2628  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2629  this_thr->th.th_prev_level = team->t.t_level;
2630  }
2631  }
2632  if (!KMP_MASTER_TID(tid))
2633  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2634 
2635 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2636  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2637  if (!KMP_MASTER_TID(tid)) {
2638  // Get correct barrier object
2639  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2640  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2641  } // (prepare called inside barrier_release)
2642  }
2643 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2644  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2645  team->t.t_id, tid));
2646 }
2647 
2648 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2649  kmp_internal_control_t *new_icvs, ident_t *loc) {
2650  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2651 
2652  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2653  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2654 
2655 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2656  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2657  implicit task has this data before this function is called. */
2658 #if KMP_BARRIER_ICV_PULL
2659  /* Copy the ICVs into th_fixed_icvs in the primary thread's thread structure
2660  (which remains untouched), where all of the worker threads can access them
2661  and make their own copies after the barrier. */
2662  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2663  // allocated at this point
2664  copy_icvs(
2665  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2666  new_icvs);
2667  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2668  team->t.t_threads[0], team));
2669 #elif KMP_BARRIER_ICV_PUSH
2670  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2671  // done here.
2672  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2673  team->t.t_threads[0], team));
2674 #else
2675  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2676  // time.
2677  ngo_load(new_icvs);
2678  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2679  // allocated at this point
2680  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2681  // TODO: GEH - pass in better source location info since usually NULL here
2682  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2683  f, team->t.t_threads[f], team));
2684  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2685  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2686  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2687  f, team->t.t_threads[f], team));
2688  }
2689  ngo_sync();
2690 #endif // KMP_BARRIER_ICV_PULL
2691 }