LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
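// When built for Intel Xeon Phi (KMP_MIC && USE_NGO_STORES), these macros
// copy a full cache line of ICVs using 512-bit non-globally-ordered stores
// (Vt is loaded once by ngo_load and reused by both store macros) and fence
// with a locked add. On every other target they fall back to plain
// copy_icvs()/KMP_MEMCPY(), and the load/sync macros become no-ops.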
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
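// Four gather/release algorithm families follow: linear, tree, hyper
// (hypercube-embedded tree), and hierarchical (topology-aware, with on-core
// optimizations). The pattern used for a given barrier type is selected at
// run time; see the switch on __kmp_barrier_gather_pattern[bt] in
// __kmp_barrier_template() below.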
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(
55  20,
56  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64  __itt_get_timestamp();
65  }
66 #endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20,
71  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72  "arrived(%p): %llu => %llu\n",
73  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76  // Mark arrival to primary thread
77  /* After performing this write, a worker thread may not assume that the team
78  is valid any more - it could be deallocated by the primary thread at any
79  time. */
80  ANNOTATE_BARRIER_BEGIN(this_thr);
81  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82  flag.release();
83  } else {
84  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85  int nproc = this_thr->th.th_team_nproc;
86  int i;
87  // Don't have to worry about sleep bit here or atomic since team setting
88  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90  // Collect all the worker team member threads.
91  for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93  // Prefetch next thread's arrived count
94  if (i + 1 < nproc)
95  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98  "arrived(%p) == %llu\n",
99  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100  team->t.t_id, i,
101  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103  // Wait for worker thread to arrive
104  if (cancellable) {
105  kmp_flag_64<true, false> flag(
106  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108  return true;
109  } else {
110  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111  new_state);
112  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113  }
114  ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116  // Barrier imbalance - write min of the thread time and the other thread
117  // time to the thread.
118  if (__kmp_forkjoin_frames_mode == 2) {
119  this_thr->th.th_bar_min_time = KMP_MIN(
120  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121  }
122 #endif
123  if (reduce) {
124  KA_TRACE(100,
125  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127  team->t.t_id, i));
128  ANNOTATE_REDUCE_AFTER(reduce);
129  OMPT_REDUCTION_DECL(this_thr, gtid);
130  OMPT_REDUCTION_BEGIN;
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  OMPT_REDUCTION_END;
134  ANNOTATE_REDUCE_BEFORE(reduce);
135  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136  }
137  }
138  // Don't have to worry about sleep bit here or atomic since team setting
139  team_bar->b_arrived = new_state;
140  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141  "arrived(%p) = %llu\n",
142  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143  new_state));
144  }
145  KA_TRACE(
146  20,
147  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148  gtid, team->t.t_id, tid, bt));
149  return false;
150 }
151 
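// Linear release: the primary thread optionally pushes ICVs to every worker
// and then bumps each worker's b_go flag in turn; workers spin (or sleep) on
// their own b_go. The cancellable instantiation returns true if the wait was
// interrupted by a cancellation request instead of a release.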
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159 
160  if (KMP_MASTER_TID(tid)) {
161  unsigned int i;
162  kmp_uint32 nproc = this_thr->th.th_team_nproc;
163  kmp_info_t **other_threads;
164 
165  team = __kmp_threads[gtid]->th.th_team;
166  KMP_DEBUG_ASSERT(team != NULL);
167  other_threads = team->t.t_threads;
168 
169  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
170  "barrier type %d\n",
171  gtid, team->t.t_id, tid, bt));
172 
173  if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175  {
176  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177  if (propagate_icvs) {
178  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179  for (i = 1; i < nproc; ++i) {
180  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181  team, i, FALSE);
182  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183  &team->t.t_implicit_task_taskdata[0].td_icvs);
184  }
185  ngo_sync();
186  }
187  }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190  // Now, release all of the worker threads
191  for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193  // Prefetch next thread's go flag
194  if (i + 1 < nproc)
195  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197  KA_TRACE(
198  20,
199  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200  "go(%p): %u => %u\n",
201  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go,
204  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207  other_threads[i]);
208  flag.release();
209  }
210  }
211  } else { // Wait for the PRIMARY thread to release us
212  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214  if (cancellable) {
215  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
217  return true;
218  } else {
219  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221  }
222  ANNOTATE_BARRIER_END(this_thr);
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
225  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
226  // disabled)
227  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
228  // Cancel wait on previous parallel region...
229  __kmp_itt_task_starting(itt_sync_obj);
230 
231  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
232  return false;
233 
234  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235  if (itt_sync_obj != NULL)
236  // Call prepare as early as possible for "new" barrier
237  __kmp_itt_task_finished(itt_sync_obj);
238  } else
239 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
240  // Early exit for reaping threads releasing forkjoin barrier
241  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242  return false;
243 // The worker thread may now assume that the team is valid.
244 #ifdef KMP_DEBUG
245  tid = __kmp_tid_from_gtid(gtid);
246  team = __kmp_threads[gtid]->th.th_team;
247 #endif
248  KMP_DEBUG_ASSERT(team != NULL);
249  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
250  KA_TRACE(20,
251  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
253  KMP_MB(); // Flush all pending memory write invalidates.
254  }
255  KA_TRACE(
256  20,
257  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258  gtid, team->t.t_id, tid, bt));
259  return false;
260 }
261 
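// Non-template entry points: the plain variants instantiate the templates
// with cancellable = false (the boolean result is discarded), while the
// *_cancellable variants instantiate cancellable = true and report whether
// the barrier was cancelled rather than completed.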
262 static void __kmp_linear_barrier_gather(
263  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265  __kmp_linear_barrier_gather_template<false>(
266  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272  return __kmp_linear_barrier_gather_template<true>(
273  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279  __kmp_linear_barrier_release_template<false>(
280  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286  return __kmp_linear_barrier_release_template<true>(
287  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289 
290 // Tree barrier
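// A node's children are tids (tid << branch_bits) + 1 .. + branch_factor and
// its parent is (tid - 1) >> branch_bits. Example with branch_bits = 2
// (branch_factor = 4) and nproc = 10: tid 1 gathers from tids 5, 6, 7, 8 and
// then reports to tid 0; tid 3 has no children (3*4 + 1 >= nproc) and reports
// to tid 0 directly.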
291 static void __kmp_tree_barrier_gather(
292  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
293  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
294  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
295  kmp_team_t *team = this_thr->th.th_team;
296  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
297  kmp_info_t **other_threads = team->t.t_threads;
298  kmp_uint32 nproc = this_thr->th.th_team_nproc;
299  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
300  kmp_uint32 branch_factor = 1 << branch_bits;
301  kmp_uint32 child;
302  kmp_uint32 child_tid;
303  kmp_uint64 new_state = 0;
304 
305  KA_TRACE(
306  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
307  gtid, team->t.t_id, tid, bt));
308  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
309 
310 #if USE_ITT_BUILD && USE_ITT_NOTIFY
311  // Barrier imbalance - save arrive time to the thread
312  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
313  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
314  __itt_get_timestamp();
315  }
316 #endif
317  // Perform tree gather to wait until all threads have arrived; reduce any
318  // required data as we go
319  child_tid = (tid << branch_bits) + 1;
320  if (child_tid < nproc) {
321  // Parent threads wait for all their children to arrive
322  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
323  child = 1;
324  do {
325  kmp_info_t *child_thr = other_threads[child_tid];
326  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
327 #if KMP_CACHE_MANAGE
328  // Prefetch next thread's arrived count
329  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
330  KMP_CACHE_PREFETCH(
331  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
332 #endif /* KMP_CACHE_MANAGE */
333  KA_TRACE(20,
334  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
335  "arrived(%p) == %llu\n",
336  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
337  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
338  // Wait for child to arrive
339  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
340  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
341  ANNOTATE_BARRIER_END(child_thr);
342 #if USE_ITT_BUILD && USE_ITT_NOTIFY
343  // Barrier imbalance - write min of the thread time and a child time to
344  // the thread.
345  if (__kmp_forkjoin_frames_mode == 2) {
346  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
347  child_thr->th.th_bar_min_time);
348  }
349 #endif
350  if (reduce) {
351  KA_TRACE(100,
352  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
353  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
354  team->t.t_id, child_tid));
355  ANNOTATE_REDUCE_AFTER(reduce);
356  OMPT_REDUCTION_DECL(this_thr, gtid);
357  OMPT_REDUCTION_BEGIN;
358  (*reduce)(this_thr->th.th_local.reduce_data,
359  child_thr->th.th_local.reduce_data);
360  OMPT_REDUCTION_END;
361  ANNOTATE_REDUCE_BEFORE(reduce);
362  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
363  }
364  child++;
365  child_tid++;
366  } while (child <= branch_factor && child_tid < nproc);
367  }
368 
369  if (!KMP_MASTER_TID(tid)) { // Worker threads
370  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
371 
372  KA_TRACE(20,
373  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
374  "arrived(%p): %llu => %llu\n",
375  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
376  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
377  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
378 
379  // Mark arrival to parent thread
380  /* After performing this write, a worker thread may not assume that the team
381  is valid any more - it could be deallocated by the primary thread at any
382  time. */
383  ANNOTATE_BARRIER_BEGIN(this_thr);
384  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
385  flag.release();
386  } else {
387  // Need to update the team arrived pointer if we are the primary thread
388  if (nproc > 1) // New value was already computed above
389  team->t.t_bar[bt].b_arrived = new_state;
390  else
391  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
392  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
393  "arrived(%p) = %llu\n",
394  gtid, team->t.t_id, tid, team->t.t_id,
395  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
396  }
397  KA_TRACE(20,
398  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
399  gtid, team->t.t_id, tid, bt));
400 }
401 
402 static void __kmp_tree_barrier_release(
403  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
404  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
405  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
406  kmp_team_t *team;
407  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
408  kmp_uint32 nproc;
409  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
410  kmp_uint32 branch_factor = 1 << branch_bits;
411  kmp_uint32 child;
412  kmp_uint32 child_tid;
413 
414  // Perform a tree release for all of the threads that have been gathered
415  if (!KMP_MASTER_TID(
416  tid)) { // Handle fork barrier workers who aren't part of a team yet
417  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
418  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
419  // Wait for parent thread to release us
420  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
421  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
422  ANNOTATE_BARRIER_END(this_thr);
423 #if USE_ITT_BUILD && USE_ITT_NOTIFY
424  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
425  // In fork barrier where we could not get the object reliably (or
426  // ITTNOTIFY is disabled)
427  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
428  // Cancel wait on previous parallel region...
429  __kmp_itt_task_starting(itt_sync_obj);
430 
431  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
432  return;
433 
434  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
435  if (itt_sync_obj != NULL)
436  // Call prepare as early as possible for "new" barrier
437  __kmp_itt_task_finished(itt_sync_obj);
438  } else
439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
440  // Early exit for reaping threads releasing forkjoin barrier
441  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
442  return;
443 
444  // The worker thread may now assume that the team is valid.
445  team = __kmp_threads[gtid]->th.th_team;
446  KMP_DEBUG_ASSERT(team != NULL);
447  tid = __kmp_tid_from_gtid(gtid);
448 
449  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
450  KA_TRACE(20,
451  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
452  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
453  KMP_MB(); // Flush all pending memory write invalidates.
454  } else {
455  team = __kmp_threads[gtid]->th.th_team;
456  KMP_DEBUG_ASSERT(team != NULL);
457  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
458  "barrier type %d\n",
459  gtid, team->t.t_id, tid, bt));
460  }
461  nproc = this_thr->th.th_team_nproc;
462  child_tid = (tid << branch_bits) + 1;
463 
464  if (child_tid < nproc) {
465  kmp_info_t **other_threads = team->t.t_threads;
466  child = 1;
467  // Parent threads release all their children
468  do {
469  kmp_info_t *child_thr = other_threads[child_tid];
470  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
471 #if KMP_CACHE_MANAGE
472  // Prefetch next thread's go count
473  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
474  KMP_CACHE_PREFETCH(
475  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
476 #endif /* KMP_CACHE_MANAGE */
477 
478 #if KMP_BARRIER_ICV_PUSH
479  {
480  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
481  if (propagate_icvs) {
482  __kmp_init_implicit_task(team->t.t_ident,
483  team->t.t_threads[child_tid], team,
484  child_tid, FALSE);
485  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
486  &team->t.t_implicit_task_taskdata[0].td_icvs);
487  }
488  }
489 #endif // KMP_BARRIER_ICV_PUSH
490  KA_TRACE(20,
491  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
492  "go(%p): %u => %u\n",
493  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
494  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
495  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
496  // Release child from barrier
497  ANNOTATE_BARRIER_BEGIN(child_thr);
498  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
499  flag.release();
500  child++;
501  child_tid++;
502  } while (child <= branch_factor && child_tid < nproc);
503  }
504  KA_TRACE(
505  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
506  gtid, team->t.t_id, tid, bt));
507 }
508 
509 // Hyper Barrier
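// In the hypercube gather, at each level a thread whose digit at that level
// (in base branch_factor) is nonzero signals the partner obtained by clearing
// its low (level + branch_bits) bits and then drops out; the remaining
// threads wait for those partners. Example with branch_bits = 1 and 8
// threads: level 0: 1->0, 3->2, 5->4, 7->6; level 1: 2->0, 6->4; level 2:
// 4->0.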
510 static void __kmp_hyper_barrier_gather(
511  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
512  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514  kmp_team_t *team = this_thr->th.th_team;
515  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516  kmp_info_t **other_threads = team->t.t_threads;
517  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520  kmp_uint32 branch_factor = 1 << branch_bits;
521  kmp_uint32 offset;
522  kmp_uint32 level;
523 
524  KA_TRACE(
525  20,
526  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527  gtid, team->t.t_id, tid, bt));
528  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
529 
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
531  // Barrier imbalance - save arrive time to the thread
532  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534  __itt_get_timestamp();
535  }
536 #endif
537  /* Perform a hypercube-embedded tree gather to wait until all of the threads
538  have arrived, and reduce any required data as we go. */
539  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
540  for (level = 0, offset = 1; offset < num_threads;
541  level += branch_bits, offset <<= branch_bits) {
542  kmp_uint32 child;
543  kmp_uint32 child_tid;
544 
545  if (((tid >> level) & (branch_factor - 1)) != 0) {
546  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
547 
548  KMP_MB(); // Synchronize parent and child threads.
549  KA_TRACE(20,
550  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
551  "arrived(%p): %llu => %llu\n",
552  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
553  team->t.t_id, parent_tid, &thr_bar->b_arrived,
554  thr_bar->b_arrived,
555  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
556  // Mark arrival to parent thread
557  /* After performing this write (in the last iteration of the enclosing for
558  loop), a worker thread may not assume that the team is valid any more
559  - it could be deallocated by the primary thread at any time. */
560  ANNOTATE_BARRIER_BEGIN(this_thr);
561  p_flag.set_waiter(other_threads[parent_tid]);
562  p_flag.release();
563  break;
564  }
565 
566  // Parent threads wait for children to arrive
567  if (new_state == KMP_BARRIER_UNUSED_STATE)
568  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
569  for (child = 1, child_tid = tid + (1 << level);
570  child < branch_factor && child_tid < num_threads;
571  child++, child_tid += (1 << level)) {
572  kmp_info_t *child_thr = other_threads[child_tid];
573  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
574 #if KMP_CACHE_MANAGE
575  kmp_uint32 next_child_tid = child_tid + (1 << level);
576  // Prefetch next thread's arrived count
577  if (child + 1 < branch_factor && next_child_tid < num_threads)
578  KMP_CACHE_PREFETCH(
579  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
580 #endif /* KMP_CACHE_MANAGE */
581  KA_TRACE(20,
582  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
583  "arrived(%p) == %llu\n",
584  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
585  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
586  // Wait for child to arrive
587  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
588  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
589  ANNOTATE_BARRIER_END(child_thr);
590  KMP_MB(); // Synchronize parent and child threads.
591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
592  // Barrier imbalance - write min of the thread time and a child time to
593  // the thread.
594  if (__kmp_forkjoin_frames_mode == 2) {
595  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
596  child_thr->th.th_bar_min_time);
597  }
598 #endif
599  if (reduce) {
600  KA_TRACE(100,
601  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
602  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
603  team->t.t_id, child_tid));
604  ANNOTATE_REDUCE_AFTER(reduce);
605  OMPT_REDUCTION_DECL(this_thr, gtid);
606  OMPT_REDUCTION_BEGIN;
607  (*reduce)(this_thr->th.th_local.reduce_data,
608  child_thr->th.th_local.reduce_data);
609  OMPT_REDUCTION_END;
610  ANNOTATE_REDUCE_BEFORE(reduce);
611  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
612  }
613  }
614  }
615 
616  if (KMP_MASTER_TID(tid)) {
617  // Need to update the team arrived pointer if we are the primary thread
618  if (new_state == KMP_BARRIER_UNUSED_STATE)
619  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
620  else
621  team->t.t_bar[bt].b_arrived = new_state;
622  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
623  "arrived(%p) = %llu\n",
624  gtid, team->t.t_id, tid, team->t.t_id,
625  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
626  }
627  KA_TRACE(
628  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
629  gtid, team->t.t_id, tid, bt));
630 }
631 
632 // The reverse versions seem to beat the forward versions overall
633 #define KMP_REVERSE_HYPER_BAR
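// With KMP_REVERSE_HYPER_BAR the release first counts up to the level at
// which this thread was signalled, then walks back down, releasing children
// from the highest tid and level to the lowest, i.e. the mirror image of the
// gather order above.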
634 static void __kmp_hyper_barrier_release(
635  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
636  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
637  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
638  kmp_team_t *team;
639  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
640  kmp_info_t **other_threads;
641  kmp_uint32 num_threads;
642  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
643  kmp_uint32 branch_factor = 1 << branch_bits;
644  kmp_uint32 child;
645  kmp_uint32 child_tid;
646  kmp_uint32 offset;
647  kmp_uint32 level;
648 
649  /* Perform a hypercube-embedded tree release for all of the threads that have
650  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
651  are released in the reverse order of the corresponding gather, otherwise
652  threads are released in the same order. */
653  if (KMP_MASTER_TID(tid)) { // primary thread
654  team = __kmp_threads[gtid]->th.th_team;
655  KMP_DEBUG_ASSERT(team != NULL);
656  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
657  "barrier type %d\n",
658  gtid, team->t.t_id, tid, bt));
659 #if KMP_BARRIER_ICV_PUSH
660  if (propagate_icvs) { // primary already has ICVs in final destination; copy
661  copy_icvs(&thr_bar->th_fixed_icvs,
662  &team->t.t_implicit_task_taskdata[tid].td_icvs);
663  }
664 #endif
665  } else { // Handle fork barrier workers who aren't part of a team yet
666  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
667  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
668  // Wait for parent thread to release us
669  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
670  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
671  ANNOTATE_BARRIER_END(this_thr);
672 #if USE_ITT_BUILD && USE_ITT_NOTIFY
673  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
674  // In fork barrier where we could not get the object reliably
675  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
676  // Cancel wait on previous parallel region...
677  __kmp_itt_task_starting(itt_sync_obj);
678 
679  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
680  return;
681 
682  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
683  if (itt_sync_obj != NULL)
684  // Call prepare as early as possible for "new" barrier
685  __kmp_itt_task_finished(itt_sync_obj);
686  } else
687 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
688  // Early exit for reaping threads releasing forkjoin barrier
689  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
690  return;
691 
692  // The worker thread may now assume that the team is valid.
693  team = __kmp_threads[gtid]->th.th_team;
694  KMP_DEBUG_ASSERT(team != NULL);
695  tid = __kmp_tid_from_gtid(gtid);
696 
697  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
698  KA_TRACE(20,
699  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
700  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
701  KMP_MB(); // Flush all pending memory write invalidates.
702  }
703  num_threads = this_thr->th.th_team_nproc;
704  other_threads = team->t.t_threads;
705 
706 #ifdef KMP_REVERSE_HYPER_BAR
707  // Count up to correct level for parent
708  for (level = 0, offset = 1;
709  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
710  level += branch_bits, offset <<= branch_bits)
711  ;
712 
713  // Now go down from there
714  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
715  level -= branch_bits, offset >>= branch_bits)
716 #else
717  // Go down the tree, level by level
718  for (level = 0, offset = 1; offset < num_threads;
719  level += branch_bits, offset <<= branch_bits)
720 #endif // KMP_REVERSE_HYPER_BAR
721  {
722 #ifdef KMP_REVERSE_HYPER_BAR
723  /* Now go in reverse order through the children, highest to lowest.
724  Initial setting of child is conservative here. */
725  child = num_threads >> ((level == 0) ? level : level - 1);
726  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
727  child_tid = tid + (child << level);
728  child >= 1; child--, child_tid -= (1 << level))
729 #else
730  if (((tid >> level) & (branch_factor - 1)) != 0)
731  // No need to go lower than this, since this is the level parent would be
732  // notified
733  break;
734  // Iterate through children on this level of the tree
735  for (child = 1, child_tid = tid + (1 << level);
736  child < branch_factor && child_tid < num_threads;
737  child++, child_tid += (1 << level))
738 #endif // KMP_REVERSE_HYPER_BAR
739  {
740  if (child_tid >= num_threads)
741  continue; // Child doesn't exist so keep going
742  else {
743  kmp_info_t *child_thr = other_threads[child_tid];
744  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
745 #if KMP_CACHE_MANAGE
746  kmp_uint32 next_child_tid = child_tid - (1 << level);
747 // Prefetch next thread's go count
748 #ifdef KMP_REVERSE_HYPER_BAR
749  if (child - 1 >= 1 && next_child_tid < num_threads)
750 #else
751  if (child + 1 < branch_factor && next_child_tid < num_threads)
752 #endif // KMP_REVERSE_HYPER_BAR
753  KMP_CACHE_PREFETCH(
754  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
755 #endif /* KMP_CACHE_MANAGE */
756 
757 #if KMP_BARRIER_ICV_PUSH
758  if (propagate_icvs) // push my fixed ICVs to my child
759  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
760 #endif // KMP_BARRIER_ICV_PUSH
761 
762  KA_TRACE(
763  20,
764  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
765  "go(%p): %u => %u\n",
766  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
767  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
768  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
769  // Release child from barrier
770  ANNOTATE_BARRIER_BEGIN(child_thr);
771  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
772  flag.release();
773  }
774  }
775  }
776 #if KMP_BARRIER_ICV_PUSH
777  if (propagate_icvs &&
778  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
779  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
780  FALSE);
781  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
782  &thr_bar->th_fixed_icvs);
783  }
784 #endif
785  KA_TRACE(
786  20,
787  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
788  gtid, team->t.t_id, tid, bt));
789 }
790 
791 // Hierarchical Barrier
792 
793 // Initialize thread barrier data
794 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
795  Performs the minimum amount of initialization required based on how the team
796  has changed. Returns true if leaf children will require both on-core and
797  traditional wake-up mechanisms. For example, if the team size increases,
798  threads already in the team will respond to on-core wakeup on their parent
799  thread, but threads newly added to the team will only be listening on
800  their local b_go. */
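// Leaf bookkeeping is packed into single 64-bit flags: each on-core leaf
// child is assigned one byte of its parent's flag word (recorded in
// thr_bar->offset), and leaf_state pre-sets exactly those bytes, so a parent
// can wait for all of its leaf children with one 64-bit flag wait and clear
// their check-ins with one atomic AND.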
801 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
802  kmp_bstate_t *thr_bar,
803  kmp_uint32 nproc, int gtid,
804  int tid, kmp_team_t *team) {
805  // Checks to determine if (re-)initialization is needed
806  bool uninitialized = thr_bar->team == NULL;
807  bool team_changed = team != thr_bar->team;
808  bool team_sz_changed = nproc != thr_bar->nproc;
809  bool tid_changed = tid != thr_bar->old_tid;
810  bool retval = false;
811 
812  if (uninitialized || team_sz_changed) {
813  __kmp_get_hierarchy(nproc, thr_bar);
814  }
815 
816  if (uninitialized || team_sz_changed || tid_changed) {
817  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
818  thr_bar->parent_tid = -1; // default for primary thread
819  if (!KMP_MASTER_TID(tid)) {
820  // if not primary thread, find parent thread in hierarchy
821  kmp_uint32 d = 0;
822  while (d < thr_bar->depth) { // find parent based on level of thread in
823  // hierarchy, and note level
824  kmp_uint32 rem;
825  if (d == thr_bar->depth - 2) { // reached level right below the primary
826  thr_bar->parent_tid = 0;
827  thr_bar->my_level = d;
828  break;
829  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
830  // TODO: can we make the above op faster?
831  // thread is not a subtree root at next level, so this is max
832  thr_bar->parent_tid = tid - rem;
833  thr_bar->my_level = d;
834  break;
835  }
836  ++d;
837  }
838  }
839  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
840  (thr_bar->skip_per_level[thr_bar->my_level])),
841  &(thr_bar->offset));
842  thr_bar->old_tid = tid;
843  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844  thr_bar->team = team;
845  thr_bar->parent_bar =
846  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847  }
848  if (uninitialized || team_changed || tid_changed) {
849  thr_bar->team = team;
850  thr_bar->parent_bar =
851  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852  retval = true;
853  }
854  if (uninitialized || team_sz_changed || tid_changed) {
855  thr_bar->nproc = nproc;
856  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857  if (thr_bar->my_level == 0)
858  thr_bar->leaf_kids = 0;
859  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
861  thr_bar->leaf_state = 0;
862  for (int i = 0; i < thr_bar->leaf_kids; ++i)
863  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864  }
865  return retval;
866 }
867 
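// Hierarchical gather: with infinite blocktime and the on-core barrier
// enabled, leaf children check in on bytes of this thread's b_arrived flag
// (see leaf_state above) and only higher-level children are gathered through
// individual 64-bit flags; otherwise every child, including leaves, is
// gathered through its own b_arrived flag.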
868 static void __kmp_hierarchical_barrier_gather(
869  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872  kmp_team_t *team = this_thr->th.th_team;
873  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874  kmp_uint32 nproc = this_thr->th.th_team_nproc;
875  kmp_info_t **other_threads = team->t.t_threads;
876  kmp_uint64 new_state = 0;
877 
878  int level = team->t.t_level;
879  if (other_threads[0]
880  ->th.th_teams_microtask) // are we inside the teams construct?
881  if (this_thr->th.th_teams_size.nteams > 1)
882  ++level; // level was not increased in teams construct for team_of_masters
883  if (level == 1)
884  thr_bar->use_oncore_barrier = 1;
885  else
886  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887 
888  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889  "barrier type %d\n",
890  gtid, team->t.t_id, tid, bt));
891  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892 
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894  // Barrier imbalance - save arrive time to the thread
895  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897  }
898 #endif
899 
900  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901  team);
902 
903  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904  kmp_int32 child_tid;
905  new_state =
906  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908  thr_bar->use_oncore_barrier) {
909  if (thr_bar->leaf_kids) {
910  // First, wait for leaf children to check-in on my b_arrived flag
911  kmp_uint64 leaf_state =
912  KMP_MASTER_TID(tid)
913  ? thr_bar->b_arrived | thr_bar->leaf_state
914  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916  "for leaf kids\n",
917  gtid, team->t.t_id, tid));
918  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
919  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920  if (reduce) {
921  ANNOTATE_REDUCE_AFTER(reduce);
922  OMPT_REDUCTION_DECL(this_thr, gtid);
923  OMPT_REDUCTION_BEGIN;
924  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925  ++child_tid) {
926  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927  "T#%d(%d:%d)\n",
928  gtid, team->t.t_id, tid,
929  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930  child_tid));
931  ANNOTATE_BARRIER_END(other_threads[child_tid]);
932  (*reduce)(this_thr->th.th_local.reduce_data,
933  other_threads[child_tid]->th.th_local.reduce_data);
934  }
935  OMPT_REDUCTION_END;
936  ANNOTATE_REDUCE_BEFORE(reduce);
937  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938  }
939  // clear leaf_state bits
940  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941  }
942  // Next, wait for higher level children on each child's b_arrived flag
943  for (kmp_uint32 d = 1; d < thr_bar->my_level;
944  ++d) { // gather lowest level threads first, but skip 0
945  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946  skip = thr_bar->skip_per_level[d];
947  if (last > nproc)
948  last = nproc;
949  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950  kmp_info_t *child_thr = other_threads[child_tid];
951  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953  "T#%d(%d:%d) "
954  "arrived(%p) == %llu\n",
955  gtid, team->t.t_id, tid,
956  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957  child_tid, &child_bar->b_arrived, new_state));
958  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
959  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960  ANNOTATE_BARRIER_END(child_thr);
961  if (reduce) {
962  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963  "T#%d(%d:%d)\n",
964  gtid, team->t.t_id, tid,
965  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966  child_tid));
967  ANNOTATE_REDUCE_AFTER(reduce);
968  (*reduce)(this_thr->th.th_local.reduce_data,
969  child_thr->th.th_local.reduce_data);
970  ANNOTATE_REDUCE_BEFORE(reduce);
971  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972  }
973  }
974  }
975  } else { // Blocktime is not infinite
976  for (kmp_uint32 d = 0; d < thr_bar->my_level;
977  ++d) { // Gather lowest level threads first
978  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979  skip = thr_bar->skip_per_level[d];
980  if (last > nproc)
981  last = nproc;
982  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983  kmp_info_t *child_thr = other_threads[child_tid];
984  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986  "T#%d(%d:%d) "
987  "arrived(%p) == %llu\n",
988  gtid, team->t.t_id, tid,
989  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990  child_tid, &child_bar->b_arrived, new_state));
991  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
992  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993  ANNOTATE_BARRIER_END(child_thr);
994  if (reduce) {
995  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996  "T#%d(%d:%d)\n",
997  gtid, team->t.t_id, tid,
998  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999  child_tid));
1000  ANNOTATE_REDUCE_AFTER(reduce);
1001  (*reduce)(this_thr->th.th_local.reduce_data,
1002  child_thr->th.th_local.reduce_data);
1003  ANNOTATE_REDUCE_BEFORE(reduce);
1004  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005  }
1006  }
1007  }
1008  }
1009  }
1010  // All subordinates are gathered; now release parent if not primary thread
1011 
1012  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015  gtid, team->t.t_id, tid,
1016  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019  /* Mark arrival to parent: After performing this write, a worker thread may
1020  not assume that the team is valid any more - it could be deallocated by
1021  the primary thread at any time. */
1022  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024  // flag; release it
1025  ANNOTATE_BARRIER_BEGIN(this_thr);
1026  kmp_flag_64<> flag(&thr_bar->b_arrived,
1027  other_threads[thr_bar->parent_tid]);
1028  flag.release();
1029  } else {
1030  // Leaf does special release on "offset" bits of parent's b_arrived flag
1031  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1032  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1033  thr_bar->offset + 1);
1034  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035  flag.release();
1036  }
1037  } else { // Primary thread needs to update the team's b_arrived value
1038  team->t.t_bar[bt].b_arrived = new_state;
1039  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040  "arrived(%p) = %llu\n",
1041  gtid, team->t.t_id, tid, team->t.t_id,
1042  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043  }
1044  // Is the team access below unsafe or just technically invalid?
1045  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046  "barrier type %d\n",
1047  gtid, team->t.t_id, tid, bt));
1048 }
1049 
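// Hierarchical release, infinite-blocktime/on-core path: the primary thread
// does a flat release of the non-leaf subtree roots, pushing its fixed ICVs
// together with a bumped b_go in one ngo store per child cache line; each
// non-leaf thread then wakes its leaf children, OR-ing leaf_state bits into
// its own b_go for leaves that were already present and using individual
// b_go flags for leaves added since the last barrier. With finite blocktime
// the release instead walks the hierarchy level by level via per-child b_go
// flags.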
1050 static void __kmp_hierarchical_barrier_release(
1051  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054  kmp_team_t *team;
1055  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056  kmp_uint32 nproc;
1057  bool team_change = false; // indicates on-core barrier shouldn't be used
1058 
1059  if (KMP_MASTER_TID(tid)) {
1060  team = __kmp_threads[gtid]->th.th_team;
1061  KMP_DEBUG_ASSERT(team != NULL);
1062  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1063  "entered barrier type %d\n",
1064  gtid, team->t.t_id, tid, bt));
1065  } else { // Worker threads
1066  // Wait for parent thread to release me
1067  if (!thr_bar->use_oncore_barrier ||
1068  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069  thr_bar->team == NULL) {
1070  // Use traditional method of waiting on my own b_go flag
1071  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074  ANNOTATE_BARRIER_END(this_thr);
1075  TCW_8(thr_bar->b_go,
1076  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078  // infinite, not nested
1079  // Wait on my "offset" bits on parent's b_go flag
1080  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082  thr_bar->offset + 1, bt,
1083  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084  flag.wait(this_thr, TRUE);
1085  if (thr_bar->wait_flag ==
1086  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087  TCW_8(thr_bar->b_go,
1088  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089  } else { // Reset my bits on parent's b_go flag
1090  (RCAST(volatile char *,
1091  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1092  }
1093  }
1094  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095  // Early exit for reaping threads releasing forkjoin barrier
1096  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097  return;
1098  // The worker thread may now assume that the team is valid.
1099  team = __kmp_threads[gtid]->th.th_team;
1100  KMP_DEBUG_ASSERT(team != NULL);
1101  tid = __kmp_tid_from_gtid(gtid);
1102 
1103  KA_TRACE(
1104  20,
1105  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107  KMP_MB(); // Flush all pending memory write invalidates.
1108  }
1109 
1110  nproc = this_thr->th.th_team_nproc;
1111  int level = team->t.t_level;
1112  if (team->t.t_threads[0]
1113  ->th.th_teams_microtask) { // are we inside the teams construct?
1114  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115  this_thr->th.th_teams_level == level)
1116  ++level; // level was not increased in teams construct for team_of_workers
1117  if (this_thr->th.th_teams_size.nteams > 1)
1118  ++level; // level was not increased in teams construct for team_of_masters
1119  }
1120  if (level == 1)
1121  thr_bar->use_oncore_barrier = 1;
1122  else
1123  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124 
1125  // If the team size has increased, we still communicate with old leaves via
1126  // oncore barrier.
1127  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130  tid, team);
1131  // But if the entire team changes, we won't use oncore barrier at all
1132  if (team_change)
1133  old_leaf_kids = 0;
1134 
1135 #if KMP_BARRIER_ICV_PUSH
1136  if (propagate_icvs) {
1137  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138  FALSE);
1139  if (KMP_MASTER_TID(
1140  tid)) { // primary already has copy in final destination; copy
1141  copy_icvs(&thr_bar->th_fixed_icvs,
1142  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146  // leaves (on-core children) pull parent's fixed ICVs directly to local
1147  // ICV store
1148  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149  &thr_bar->parent_bar->th_fixed_icvs);
1150  // non-leaves will get ICVs piggybacked with b_go via NGO store
1151  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153  // access
1154  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155  else // leaves copy parent's fixed ICVs directly to local ICV store
1156  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157  &thr_bar->parent_bar->th_fixed_icvs);
1158  }
1159  }
1160 #endif // KMP_BARRIER_ICV_PUSH
1161 
1162  // Now, release my children
1163  if (thr_bar->my_level) { // not a leaf
1164  kmp_int32 child_tid;
1165  kmp_uint32 last;
1166  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167  thr_bar->use_oncore_barrier) {
1168  if (KMP_MASTER_TID(tid)) { // do a flat release
1169  // Set local b_go to bump children via NGO store of the cache line
1170  // containing IVCs and b_go.
1171  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173  // the cache line
1174  ngo_load(&thr_bar->th_fixed_icvs);
1175  // This loops over all the threads skipping only the leaf nodes in the
1176  // hierarchy
1177  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178  child_tid += thr_bar->skip_per_level[1]) {
1179  kmp_bstate_t *child_bar =
1180  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182  "releasing T#%d(%d:%d)"
1183  " go(%p): %u => %u\n",
1184  gtid, team->t.t_id, tid,
1185  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186  child_tid, &child_bar->b_go, child_bar->b_go,
1187  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188  // Use ngo store (if available) to both store ICVs and release child
1189  // via child's b_go
1190  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191  }
1192  ngo_sync();
1193  }
1194  TCW_8(thr_bar->b_go,
1195  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196  // Now, release leaf children
1197  if (thr_bar->leaf_kids) { // if there are any
1198  // We test team_change on the off-chance that the level 1 team changed.
1199  if (team_change ||
1200  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201  if (old_leaf_kids) { // release old leaf kids
1202  thr_bar->b_go |= old_leaf_state;
1203  }
1204  // Release new leaf kids
1205  last = tid + thr_bar->skip_per_level[1];
1206  if (last > nproc)
1207  last = nproc;
1208  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209  ++child_tid) { // skip_per_level[0]=1
1210  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212  KA_TRACE(
1213  20,
1214  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215  " T#%d(%d:%d) go(%p): %u => %u\n",
1216  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219  // Release child using child's b_go flag
1220  ANNOTATE_BARRIER_BEGIN(child_thr);
1221  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1222  flag.release();
1223  }
1224  } else { // Release all children at once with leaf_state bits on my own
1225  // b_go flag
1226  thr_bar->b_go |= thr_bar->leaf_state;
1227  }
1228  }
1229  } else { // Blocktime is not infinite; do a simple hierarchical release
1230  for (int d = thr_bar->my_level - 1; d >= 0;
1231  --d) { // Release highest level threads first
1232  last = tid + thr_bar->skip_per_level[d + 1];
1233  kmp_uint32 skip = thr_bar->skip_per_level[d];
1234  if (last > nproc)
1235  last = nproc;
1236  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241  gtid, team->t.t_id, tid,
1242  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243  child_tid, &child_bar->b_go, child_bar->b_go,
1244  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245  // Release child using child's b_go flag
1246  ANNOTATE_BARRIER_BEGIN(child_thr);
1247  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1248  flag.release();
1249  }
1250  }
1251  }
1252 #if KMP_BARRIER_ICV_PUSH
1253  if (propagate_icvs && !KMP_MASTER_TID(tid))
1254  // non-leaves copy ICVs from fixed ICVs to local dest
1255  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256  &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH
1258  }
1259  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260  "barrier type %d\n",
1261  gtid, team->t.t_id, tid, bt));
1262 }
1263 
1264 // End of Barrier Algorithms
1265 
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
1269 template <bool cancellable> struct is_cancellable {};
1270 template <> struct is_cancellable<true> {
1271  bool value;
1272  is_cancellable() : value(false) {}
1273  is_cancellable(bool b) : value(b) {}
1274  is_cancellable &operator=(bool b) {
1275  value = b;
1276  return *this;
1277  }
1278  operator bool() const { return value; }
1279 };
1280 template <> struct is_cancellable<false> {
1281  is_cancellable &operator=(bool b) { return *this; }
1282  constexpr operator bool() const { return false; }
1283 };
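// These traits let __kmp_barrier_template() be written once: with
// cancellable = false, "cancelled" converts to a constexpr false, so every
// "if (cancelled)" test and the cancellation-only code behind it folds away
// at compile time; with cancellable = true it carries an ordinary bool.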
1284 
1285 // Internal function to do a barrier.
1286 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1287  If reduce is non-NULL, do a split reduction barrier; otherwise, do a
1288  barrier without a reduction.
1289  When cancellable = false,
1290  Returns 0 if primary thread, 1 if worker thread.
1291  When cancellable = true,
1292  Returns 0 if not cancelled, 1 if cancelled. */
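// Typically reached through the non-template __kmp_barrier() wrapper: a plain
// "#pragma omp barrier" arrives from __kmpc_barrier() as
// __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL), which
// instantiates cancellable = false.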
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295  size_t reduce_size, void *reduce_data,
1296  void (*reduce)(void *, void *)) {
1297  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299  int tid = __kmp_tid_from_gtid(gtid);
1300  kmp_info_t *this_thr = __kmp_threads[gtid];
1301  kmp_team_t *team = this_thr->th.th_team;
1302  int status = 0;
1303  is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305  ompt_data_t *my_task_data;
1306  ompt_data_t *my_parallel_data;
1307  void *return_address;
1308  ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316  if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322  if (ompt_enabled.ompt_callback_sync_region) {
1323  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325  return_address);
1326  }
1327  if (ompt_enabled.ompt_callback_sync_region_wait) {
1328  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330  return_address);
1331  }
1332 #endif
1333  // It is OK to report the barrier state after the barrier begin callback.
1334  // According to the OMPT specification, a compliant implementation may
1335  // even delay reporting this state until the barrier begins to wait.
1336  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337  }
1338 #endif
1339 
1340  if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342  // This value will be used in itt notify events below.
1343  void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349  if (__kmp_tasking_mode == tskm_extra_barrier) {
1350  __kmp_tasking_barrier(team, this_thr, gtid);
1351  KA_TRACE(15,
1352  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354  }
1355 
1356  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357  access it when the team struct is not guaranteed to exist. */
1358  // See note about the corresponding code in __kmp_join_barrier() being
1359  // performance-critical.
1360  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362  this_thr->th.th_team_bt_intervals =
1363  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364  this_thr->th.th_team_bt_set =
1365  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369  }
1370 
1371 #if USE_ITT_BUILD
1372  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1377  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1378  team->t.t_bar[bt].b_master_arrived += 1;
1379  } else {
1380  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381  } // if
1382 #endif /* USE_DEBUGGER */
1383  if (reduce != NULL) {
1384  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1385  this_thr->th.th_local.reduce_data = reduce_data;
1386  }
1387 
1388  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389  // Pass 0 to set up the current task team only if nthreads > 1
1390  __kmp_task_team_setup(this_thr, team, 0);
1391 
1392  if (cancellable) {
1393  cancelled = __kmp_linear_barrier_gather_cancellable(
1394  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395  } else {
1396  switch (__kmp_barrier_gather_pattern[bt]) {
1397  case bp_hyper_bar: {
1398  // don't set branch bits to 0; use the linear barrier for that instead
1399  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402  break;
1403  }
1404  case bp_hierarchical_bar: {
1405  __kmp_hierarchical_barrier_gather(
1406  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407  break;
1408  }
1409  case bp_tree_bar: {
1410  // don't set branch bits to 0; use the linear barrier for that instead
1411  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414  break;
1415  }
1416  default: {
1417  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419  }
1420  }
1421  }
1422 
1423  KMP_MB();
1424 
1425  if (KMP_MASTER_TID(tid)) {
1426  status = 0;
1427  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429  }
1430 #if USE_DEBUGGER
1431  // Let the debugger know: all threads have arrived and are starting to
1432  // leave the barrier.
1433  team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436  if (__kmp_omp_cancellation) {
1437  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438  // Reset cancellation flag for worksharing constructs
1439  if (cancel_request == cancel_loop ||
1440  cancel_request == cancel_sections) {
1441  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442  }
1443  }
1444 #if USE_ITT_BUILD
1445  /* TODO: In case of split reduction barrier, primary thread may send
1446  acquired event early, before the final summation into the shared
1447  variable is done (final summation can be a long operation for array
1448  reductions). */
1449  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453  // Barrier - report frame end (only if active_level == 1)
1454  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455  __kmp_forkjoin_frames_mode &&
1456  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1457  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1458  team->t.t_active_level == 1) {
1459  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460  kmp_uint64 cur_time = __itt_get_timestamp();
1461  kmp_info_t **other_threads = team->t.t_threads;
1462  int nproc = this_thr->th.th_team_nproc;
1463  int i;
1464  switch (__kmp_forkjoin_frames_mode) {
1465  case 1:
1466  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1467  loc, nproc);
1468  this_thr->th.th_frame_time = cur_time;
1469  break;
1470  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1471  // be fixed)
1472  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473  1, loc, nproc);
1474  break;
1475  case 3:
1476  if (__itt_metadata_add_ptr) {
1477  // Initialize with primary thread's wait time
1478  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1479  // Set arrive time to zero to be able to check it in
1480  // __kmp_invoke_task(); the same is done inside the loop below
1481  this_thr->th.th_bar_arrive_time = 0;
1482  for (i = 1; i < nproc; ++i) {
1483  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484  other_threads[i]->th.th_bar_arrive_time = 0;
1485  }
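  // delta now accumulates, for every team thread, the interval between that
  // thread's arrival at the barrier and cur_time; it is reported to ITT as
  // the barrier imbalance below, with the last argument recording whether a
  // reduction was attached to this barrier.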
1486  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1487  cur_time, delta,
1488  (kmp_uint64)(reduce != NULL));
1489  }
1490  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1491  loc, nproc);
1492  this_thr->th.th_frame_time = cur_time;
1493  break;
1494  }
1495  }
1496 #endif /* USE_ITT_BUILD */
1497  } else {
1498  status = 1;
1499 #if USE_ITT_BUILD
1500  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 #endif /* USE_ITT_BUILD */
1503  }
1504  if ((status == 1 || !is_split) && !cancelled) {
1505  if (cancellable) {
1506  cancelled = __kmp_linear_barrier_release_cancellable(
1507  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1508  } else {
1509  switch (__kmp_barrier_release_pattern[bt]) {
1510  case bp_hyper_bar: {
1511  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514  break;
1515  }
1516  case bp_hierarchical_bar: {
1517  __kmp_hierarchical_barrier_release(
1518  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519  break;
1520  }
1521  case bp_tree_bar: {
1522  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525  break;
1526  }
1527  default: {
1528  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530  }
1531  }
1532  }
1533  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534  __kmp_task_team_sync(this_thr, team);
1535  }
1536  }
1537 
1538 #if USE_ITT_BUILD
1539  /* GEH: TODO: Move this under if-condition above and also include in
1540  __kmp_end_split_barrier(). This will more accurately represent the actual
1541  release time of the threads for split barriers. */
1542  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 #endif /* USE_ITT_BUILD */
1545  } else { // Team is serialized.
1546  status = 0;
1547  if (__kmp_tasking_mode != tskm_immediate_exec) {
1548  if (this_thr->th.th_task_team != NULL) {
1549 #if USE_ITT_NOTIFY
1550  void *itt_sync_obj = NULL;
1551  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554  }
1555 #endif
1556 
1557  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1558  TRUE);
1559  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560  __kmp_task_team_setup(this_thr, team, 0);
1561 
1562 #if USE_ITT_BUILD
1563  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1565 #endif /* USE_ITT_BUILD */
1566  }
1567  }
1568  }
1569  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571  __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574  if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576  if (ompt_enabled.ompt_callback_sync_region_wait) {
1577  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1579  return_address);
1580  }
1581  if (ompt_enabled.ompt_callback_sync_region) {
1582  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584  return_address);
1585  }
1586 #endif
1587  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588  }
1589 #endif
1590  ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592  if (cancellable)
1593  return (int)cancelled;
1594  return status;
1595 }
1596 
1597 // Returns 0 if primary thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599  size_t reduce_size, void *reduce_data,
1600  void (*reduce)(void *, void *)) {
1601  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602  reduce);
1603 }
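
// Illustrative sketch (assumption about the call site, not part of this
// file): a plain, non-split barrier with no reduction is driven by passing
// is_split = FALSE and a NULL reduce callback; the return value merely
// distinguishes the primary thread from workers.
#if 0 // example only, not compiled into the library
static void example_plain_barrier(int gtid) {
  int status = __kmp_barrier(bs_plain_barrier, gtid, FALSE /* is_split */,
                             0 /* reduce_size */, NULL /* reduce_data */,
                             NULL /* reduce */);
  (void)status; // 0: primary thread, 1: worker thread
}
#endif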
1604 
1605 #if defined(KMP_GOMP_COMPAT)
1606 // Returns 1 if cancelled, 0 otherwise
1607 int __kmp_barrier_gomp_cancel(int gtid) {
1608  if (__kmp_omp_cancellation) {
1609  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1610  0, NULL, NULL);
1611  if (cancelled) {
1612  int tid = __kmp_tid_from_gtid(gtid);
1613  kmp_info_t *this_thr = __kmp_threads[gtid];
1614  if (KMP_MASTER_TID(tid)) {
1615  // Primary thread does not need to revert anything
1616  } else {
1617  // Workers need to revert their private b_arrived flag
1618  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619  KMP_BARRIER_STATE_BUMP;
1620  }
1621  }
1622  return cancelled;
1623  }
1624  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1625  return FALSE;
1626 }
1627 #endif
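
// Illustrative sketch (assumption, not part of the runtime): a GOMP
// compatibility entry point would forward the return value of
// __kmp_barrier_gomp_cancel() to the compiled code so that a cancelled
// thread can branch to the cancellation exit of its parallel region instead
// of continuing past the barrier.
#if 0 // example only, not compiled into the library (and only meaningful
      // when KMP_GOMP_COMPAT is defined)
static bool example_gomp_barrier_cancel(int gtid) {
  // true  -> the region was cancelled; the caller should take its cancel exit
  // false -> the barrier completed normally
  return __kmp_barrier_gomp_cancel(gtid) != 0;
}
#endif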
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
1633  int tid = __kmp_tid_from_gtid(gtid);
1634  kmp_info_t *this_thr = __kmp_threads[gtid];
1635  kmp_team_t *team = this_thr->th.th_team;
1636 
1637  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1638  if (!team->t.t_serialized) {
1639  if (KMP_MASTER_GTID(gtid)) {
1640  switch (__kmp_barrier_release_pattern[bt]) {
1641  case bp_hyper_bar: {
1642  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1643  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1644  FALSE USE_ITT_BUILD_ARG(NULL));
1645  break;
1646  }
1647  case bp_hierarchical_bar: {
1648  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1649  FALSE USE_ITT_BUILD_ARG(NULL));
1650  break;
1651  }
1652  case bp_tree_bar: {
1653  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1654  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1655  FALSE USE_ITT_BUILD_ARG(NULL));
1656  break;
1657  }
1658  default: {
1659  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1660  FALSE USE_ITT_BUILD_ARG(NULL));
1661  }
1662  }
1663  if (__kmp_tasking_mode != tskm_immediate_exec) {
1664  __kmp_task_team_sync(this_thr, team);
1665  } // if
1666  }
1667  }
1668  ANNOTATE_BARRIER_END(&team->t.t_bar);
1669 }
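
// Illustrative sketch (assumption, not part of the runtime): a split
// reduction barrier folds per-thread partial results together during the
// gather phase via the caller-supplied reduce callback, and the release is
// deferred until __kmp_end_split_barrier() runs. The helper names, the use
// of bs_reduction_barrier, and the int-sum payload are illustrative only.
#if 0 // example only, not compiled into the library
static void example_reduce_add(void *lhs, void *rhs) {
  // Fold the right-hand partial result into the left-hand one.
  *(int *)lhs += *(int *)rhs;
}

static void example_split_reduction(int gtid, int *my_partial_sum) {
  int status = __kmp_barrier(bs_reduction_barrier, gtid, TRUE /* is_split */,
                             sizeof(int), my_partial_sum, example_reduce_add);
  if (status == 0) {
    // Primary thread: with the linear/tree gather patterns the workers'
    // partial results have been folded into this thread's reduce_data.
    // Consume the combined value here, then release the workers, which are
    // still waiting inside the split barrier.
    __kmp_end_split_barrier(bs_reduction_barrier, gtid);
  }
  // Workers (status == 1) return from __kmp_barrier() only after the primary
  // thread has executed __kmp_end_split_barrier().
}
#endif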
1670 
1671 void __kmp_join_barrier(int gtid) {
1672  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1673  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1674 
1675  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1676 
1677  kmp_info_t *this_thr = __kmp_threads[gtid];
1678  kmp_team_t *team;
1679  kmp_uint nproc;
1680  kmp_info_t *master_thread;
1681  int tid;
1682 #ifdef KMP_DEBUG
1683  int team_id;
1684 #endif /* KMP_DEBUG */
1685 #if USE_ITT_BUILD
1686  void *itt_sync_obj = NULL;
1687 #if USE_ITT_NOTIFY
1688  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unnecessarily
1689  // Get object created at fork_barrier
1690  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1691 #endif
1692 #endif /* USE_ITT_BUILD */
1693  KMP_MB();
1694 
1695  // Get current info
1696  team = this_thr->th.th_team;
1697  nproc = this_thr->th.th_team_nproc;
1698  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1699  tid = __kmp_tid_from_gtid(gtid);
1700 #ifdef KMP_DEBUG
1701  team_id = team->t.t_id;
1702 #endif /* KMP_DEBUG */
1703  master_thread = this_thr->th.th_team_master;
1704 #ifdef KMP_DEBUG
1705  if (master_thread != team->t.t_threads[0]) {
1706  __kmp_print_structure();
1707  }
1708 #endif /* KMP_DEBUG */
1709  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1710  KMP_MB();
1711 
1712  // Verify state
1713  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1714  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1715  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1716  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1717  gtid, team_id, tid));
1718 
1719  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1720 #if OMPT_SUPPORT
1721  if (ompt_enabled.enabled) {
1722 #if OMPT_OPTIONAL
1723  ompt_data_t *my_task_data;
1724  ompt_data_t *my_parallel_data;
1725  void *codeptr = NULL;
1726  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1727  if (KMP_MASTER_TID(ds_tid) &&
1728  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1729  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1730  codeptr = team->t.ompt_team_info.master_return_address;
1731  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1732  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1733  if (ompt_enabled.ompt_callback_sync_region) {
1734  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1735  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1736  my_task_data, codeptr);
1737  }
1738  if (ompt_enabled.ompt_callback_sync_region_wait) {
1739  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1740  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1741  my_task_data, codeptr);
1742  }
1743  if (!KMP_MASTER_TID(ds_tid))
1744  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1745 #endif
1746  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1747  }
1748 #endif
1749 
1750  if (__kmp_tasking_mode == tskm_extra_barrier) {
1751  __kmp_tasking_barrier(team, this_thr, gtid);
1752  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1753  team_id, tid));
1754  }
1755 #ifdef KMP_DEBUG
1756  if (__kmp_tasking_mode != tskm_immediate_exec) {
1757  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1758  "%p, th_task_team = %p\n",
1759  __kmp_gtid_from_thread(this_thr), team_id,
1760  team->t.t_task_team[this_thr->th.th_task_state],
1761  this_thr->th.th_task_team));
1762  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1763  team->t.t_task_team[this_thr->th.th_task_state]);
1764  }
1765 #endif /* KMP_DEBUG */
1766 
1767  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1768  access it when the team struct is not guaranteed to exist. Doing these
1769  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1770  we do not perform the copy if blocktime=infinite, since the values are not
1771  used by __kmp_wait_template() in that case. */
1772  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1773 #if KMP_USE_MONITOR
1774  this_thr->th.th_team_bt_intervals =
1775  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1776  this_thr->th.th_team_bt_set =
1777  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1778 #else
1779  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1780 #endif
1781  }
1782 
1783 #if USE_ITT_BUILD
1784  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1785  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1786 #endif /* USE_ITT_BUILD */
1787 
1788  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1789  case bp_hyper_bar: {
1790  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1791  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1792  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1793  break;
1794  }
1795  case bp_hierarchical_bar: {
1796  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798  break;
1799  }
1800  case bp_tree_bar: {
1801  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1802  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1803  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1804  break;
1805  }
1806  default: {
1807  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1808  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1809  }
1810  }
1811 
1812  /* From this point on, the team data structure may be deallocated at any time
1813  by the primary thread - it is unsafe to reference it in any of the worker
1814  threads. Any per-team data items that need to be referenced before the
1815  end of the barrier should be moved to the kmp_task_team_t structs. */
1816  if (KMP_MASTER_TID(tid)) {
1817  if (__kmp_tasking_mode != tskm_immediate_exec) {
1818  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1819  }
1820  if (__kmp_display_affinity) {
1821  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1822  }
1823 #if KMP_STATS_ENABLED
1824  // Have the primary thread flag the workers to indicate they are now
1825  // waiting for the next parallel region. Also wake them up so they switch
1826  // their timers to idle.
1827  for (int i = 0; i < team->t.t_nproc; ++i) {
1828  kmp_info_t *team_thread = team->t.t_threads[i];
1829  if (team_thread == this_thr)
1830  continue;
1831  team_thread->th.th_stats->setIdleFlag();
1832  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1833  team_thread->th.th_sleep_loc != NULL)
1834  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1835  team_thread->th.th_sleep_loc);
1836  }
1837 #endif
1838 #if USE_ITT_BUILD
1839  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1840  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1841 #endif /* USE_ITT_BUILD */
1842 
1843 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1844  // Join barrier - report frame end
1845  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1846  __kmp_forkjoin_frames_mode &&
1847  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1848  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1849  team->t.t_active_level == 1) {
1850  kmp_uint64 cur_time = __itt_get_timestamp();
1851  ident_t *loc = team->t.t_ident;
1852  kmp_info_t **other_threads = team->t.t_threads;
1853  int nproc = this_thr->th.th_team_nproc;
1854  int i;
1855  switch (__kmp_forkjoin_frames_mode) {
1856  case 1:
1857  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1858  loc, nproc);
1859  break;
1860  case 2:
1861  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1862  loc, nproc);
1863  break;
1864  case 3:
1865  if (__itt_metadata_add_ptr) {
1866  // Initialize with primary thread's wait time
1867  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1868  // Set arrive time to zero to be able to check it in
1869  // __kmp_invoke_task(); the same is done inside the loop below
1870  this_thr->th.th_bar_arrive_time = 0;
1871  for (i = 1; i < nproc; ++i) {
1872  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1873  other_threads[i]->th.th_bar_arrive_time = 0;
1874  }
1875  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1876  cur_time, delta, 0);
1877  }
1878  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1879  loc, nproc);
1880  this_thr->th.th_frame_time = cur_time;
1881  break;
1882  }
1883  }
1884 #endif /* USE_ITT_BUILD */
1885  }
1886 #if USE_ITT_BUILD
1887  else {
1888  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1890  }
1891 #endif /* USE_ITT_BUILD */
1892 
1893 #if KMP_DEBUG
1894  if (KMP_MASTER_TID(tid)) {
1895  KA_TRACE(
1896  15,
1897  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1898  gtid, team_id, tid, nproc));
1899  }
1900 #endif /* KMP_DEBUG */
1901 
1902  // TODO now, mark worker threads as done so they may be disbanded
1903  KMP_MB(); // Flush all pending memory write invalidates.
1904  KA_TRACE(10,
1905  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1906 
1907  ANNOTATE_BARRIER_END(&team->t.t_bar);
1908 }
1909 
1910 // TODO release worker threads' fork barriers as we are ready instead of all at
1911 // once
1912 void __kmp_fork_barrier(int gtid, int tid) {
1913  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1914  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1915  kmp_info_t *this_thr = __kmp_threads[gtid];
1916  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1917 #if USE_ITT_BUILD
1918  void *itt_sync_obj = NULL;
1919 #endif /* USE_ITT_BUILD */
1920  if (team)
1921  ANNOTATE_BARRIER_END(&team->t.t_bar);
1922 
1923  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1924  (team != NULL) ? team->t.t_id : -1, tid));
1925 
1926  // th_team pointer only valid for primary thread here
1927  if (KMP_MASTER_TID(tid)) {
1928 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1929  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1930  // Create itt barrier object
1931  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1932  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1933  }
1934 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1935 
1936 #ifdef KMP_DEBUG
1937  KMP_DEBUG_ASSERT(team);
1938  kmp_info_t **other_threads = team->t.t_threads;
1939  int i;
1940 
1941  // Verify state
1942  KMP_MB();
1943 
1944  for (i = 1; i < team->t.t_nproc; ++i) {
1945  KA_TRACE(500,
1946  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1947  "== %u.\n",
1948  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1949  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1950  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1951  KMP_DEBUG_ASSERT(
1952  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1953  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1954  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1955  }
1956 #endif
1957 
1958  if (__kmp_tasking_mode != tskm_immediate_exec) {
1959  // Pass 0 to set up the current task team only if nthreads > 1
1960  __kmp_task_team_setup(this_thr, team, 0);
1961  }
1962 
1963  /* The primary thread may have changed its blocktime between join barrier
1964  and fork barrier. Copy the blocktime info to the thread, where
1965  __kmp_wait_template() can access it when the team struct is not
1966  guaranteed to exist. */
1967  // See note about the corresponding code in __kmp_join_barrier() being
1968  // performance-critical
1969  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1970 #if KMP_USE_MONITOR
1971  this_thr->th.th_team_bt_intervals =
1972  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1973  this_thr->th.th_team_bt_set =
1974  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1975 #else
1976  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1977 #endif
1978  }
1979  } // primary thread
1980 
1981  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1982  case bp_hyper_bar: {
1983  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1984  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1985  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1986  break;
1987  }
1988  case bp_hierarchical_bar: {
1989  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1990  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1991  break;
1992  }
1993  case bp_tree_bar: {
1994  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1995  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1996  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1997  break;
1998  }
1999  default: {
2000  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2001  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2002  }
2003  }
2004 
2005 #if OMPT_SUPPORT
2006  if (ompt_enabled.enabled &&
2007  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2008  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2009  ompt_data_t *task_data = (team)
2010  ? OMPT_CUR_TASK_DATA(this_thr)
2011  : &(this_thr->th.ompt_thread_info.task_data);
2012  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2013 #if OMPT_OPTIONAL
2014  void *codeptr = NULL;
2015  if (KMP_MASTER_TID(ds_tid) &&
2016  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2017  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2018  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2019  if (ompt_enabled.ompt_callback_sync_region_wait) {
2020  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2021  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2022  codeptr);
2023  }
2024  if (ompt_enabled.ompt_callback_sync_region) {
2025  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2026  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2027  codeptr);
2028  }
2029 #endif
2030  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2031  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2032  ompt_scope_end, NULL, task_data, 0, ds_tid,
2033  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2034  }
2035  }
2036 #endif
2037 
2038  // Early exit for reaping threads releasing forkjoin barrier
2039  if (TCR_4(__kmp_global.g.g_done)) {
2040  this_thr->th.th_task_team = NULL;
2041 
2042 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2043  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2044  if (!KMP_MASTER_TID(tid)) {
2045  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2046  if (itt_sync_obj)
2047  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2048  }
2049  }
2050 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2051  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2052  return;
2053  }
2054 
2055  /* We can now assume that a valid team structure has been allocated by the
2056  primary thread and propagated to all worker threads. The current thread,
2057  however, may not be part of the team, so we can't blindly assume that the
2058  team pointer is non-null. */
2059  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2060  KMP_DEBUG_ASSERT(team != NULL);
2061  tid = __kmp_tid_from_gtid(gtid);
2062 
2063 #if KMP_BARRIER_ICV_PULL
2064  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2065  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2066  implicit task has this data before this function is called. We cannot
2067  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2068  thread struct, because it is not always the case that the threads arrays
2069  have been allocated when __kmp_fork_call() is executed. */
2070  {
2071  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2072  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2073  // Copy the initial ICVs from the primary thread's thread struct to the
2074  // implicit task for this tid.
2075  KA_TRACE(10,
2076  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2077  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2078  tid, FALSE);
2079  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2080  &team->t.t_threads[0]
2081  ->th.th_bar[bs_forkjoin_barrier]
2082  .bb.th_fixed_icvs);
2083  }
2084  }
2085 #endif // KMP_BARRIER_ICV_PULL
2086 
2087  if (__kmp_tasking_mode != tskm_immediate_exec) {
2088  __kmp_task_team_sync(this_thr, team);
2089  }
2090 
2091 #if KMP_AFFINITY_SUPPORTED
2092  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2093  if (proc_bind == proc_bind_intel) {
2094  // Call dynamic affinity settings
2095  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2096  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2097  }
2098  } else if (proc_bind != proc_bind_false) {
2099  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2100  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2101  __kmp_gtid_from_thread(this_thr),
2102  this_thr->th.th_current_place));
2103  } else {
2104  __kmp_affinity_set_place(gtid);
2105  }
2106  }
2107 #endif // KMP_AFFINITY_SUPPORTED
2108  // Perform the display affinity functionality
2109  if (__kmp_display_affinity) {
2110  if (team->t.t_display_affinity
2111 #if KMP_AFFINITY_SUPPORTED
2112  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2113 #endif
2114  ) {
2115  // NULL means use the affinity-format-var ICV
2116  __kmp_aux_display_affinity(gtid, NULL);
2117  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2118  this_thr->th.th_prev_level = team->t.t_level;
2119  }
2120  }
2121  if (!KMP_MASTER_TID(tid))
2122  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2123 
2124 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2125  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2126  if (!KMP_MASTER_TID(tid)) {
2127  // Get correct barrier object
2128  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2129  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2130  } // (prepare called inside barrier_release)
2131  }
2132 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2133  ANNOTATE_BARRIER_END(&team->t.t_bar);
2134  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2135  team->t.t_id, tid));
2136 }
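
// Illustrative sketch (assumption about the caller, not part of this file):
// a worker thread's main loop pairs the two barriers above - it parks in
// __kmp_fork_barrier() until the primary thread releases the team, runs its
// share of the parallel region, then enters __kmp_join_barrier() so the
// primary thread can tear the team down. The loop structure below is
// schematic only.
#if 0 // example only, not compiled into the library
static void example_worker_loop(int gtid, int tid) {
  while (!TCR_4(__kmp_global.g.g_done)) {
    // Park here until the primary thread starts a new parallel region (or
    // until shutdown, in which case __kmp_fork_barrier() returns early).
    __kmp_fork_barrier(gtid, tid);
    if (TCR_4(__kmp_global.g.g_done))
      break;
    // ... execute this thread's implicit task for the parallel region ...
    // Signal completion and wait for the whole team at the join barrier.
    __kmp_join_barrier(gtid);
  }
}
#endif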
2137 
2138 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2139  kmp_internal_control_t *new_icvs, ident_t *loc) {
2140  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2141 
2142  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2143  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2144 
2145 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2146  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2147  implicit task has this data before this function is called. */
2148 #if KMP_BARRIER_ICV_PULL
2149  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2150  remains untouched), where all of the worker threads can access them and
2151  make their own copies after the barrier. */
2152  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2153  // allocated at this point
2154  copy_icvs(
2155  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2156  new_icvs);
2157  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2158  team->t.t_threads[0], team));
2159 #elif KMP_BARRIER_ICV_PUSH
2160  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2161  // done here.
2162  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2163  team->t.t_threads[0], team));
2164 #else
2165  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2166  // time.
2167  ngo_load(new_icvs);
2168  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2169  // allocated at this point
2170  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2171  // TODO: GEH - pass in better source location info since usually NULL here
2172  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2173  f, team->t.t_threads[f], team));
2174  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2175  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2176  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2177  f, team->t.t_threads[f], team));
2178  }
2179  ngo_sync();
2180 #endif // KMP_BARRIER_ICV_PULL
2181 }