LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 /* forward declarations */
25 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
26  kmp_info_t *this_thr);
27 static void __kmp_alloc_task_deque(kmp_info_t *thread,
28  kmp_thread_data_t *thread_data);
29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
30  kmp_task_team_t *task_team);
31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
36 // from top to bottom
37 //
38 // gtid: global thread identifier for thread containing stack
39 // thread_data: thread data for task team thread containing stack
40 // threshold: value above which the trace statement triggers
41 // location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43  kmp_thread_data_t *thread_data,
44  int threshold, char *location) {
45  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46  kmp_taskdata_t **stack_top = task_stack->ts_top;
47  kmp_int32 entries = task_stack->ts_entries;
48  kmp_taskdata_t *tied_task;
49 
50  KA_TRACE(
51  threshold,
52  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53  "first_block = %p, stack_top = %p \n",
54  location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56  KMP_DEBUG_ASSERT(stack_top != NULL);
57  KMP_DEBUG_ASSERT(entries > 0);
58 
59  while (entries != 0) {
60  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61  // fix up ts_top if we need to pop from previous block
62  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
63  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65  stack_block = stack_block->sb_prev;
66  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67  }
68 
69  // finish bookkeeping
70  stack_top--;
71  entries--;
72 
73  tied_task = *stack_top;
74 
75  KMP_DEBUG_ASSERT(tied_task != NULL);
76  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78  KA_TRACE(threshold,
79  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
80  "stack_top=%p, tied_task=%p\n",
81  location, gtid, entries, stack_top, tied_task));
82  }
83  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85  KA_TRACE(threshold,
86  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87  location, gtid));
88 }
89 
90 // __kmp_init_task_stack: initialize the task stack for the first time
91 // after a thread_data structure is created.
92 // It should not be necessary to do this again (assuming the stack works).
93 //
94 // gtid: global thread identifier of calling thread
95 // thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97  kmp_thread_data_t *thread_data) {
98  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99  kmp_stack_block_t *first_block;
100 
101  // set up the first block of the stack
102  first_block = &task_stack->ts_first_block;
103  task_stack->ts_top = (kmp_taskdata_t **)first_block;
104  memset((void *)first_block, '\0',
105  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107  // initialize the stack to be empty
108  task_stack->ts_entries = TASK_STACK_EMPTY;
109  first_block->sb_next = NULL;
110  first_block->sb_prev = NULL;
111 }
112 
113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 // gtid: global thread identifier for calling thread
116 // thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118  kmp_thread_data_t *thread_data) {
119  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123  // free from the second block of the stack
124  while (stack_block != NULL) {
125  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127  stack_block->sb_next = NULL;
128  stack_block->sb_prev = NULL;
129  if (stack_block != &task_stack->ts_first_block) {
130  __kmp_thread_free(__kmp_threads[gtid],
131  stack_block); // free the block, if not the first
132  }
133  stack_block = next_block;
134  }
135  // initialize the stack to be empty
136  task_stack->ts_entries = 0;
137  task_stack->ts_top = NULL;
138 }
139 
140 // __kmp_push_task_stack: Push the tied task onto the task stack.
141 // Grow the stack if necessary by allocating another block.
142 //
143 // gtid: global thread identifier for calling thread
144 // thread: thread info for thread containing stack
145 // tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147  kmp_taskdata_t *tied_task) {
148  // GEH - need to consider what to do if tt_threads_data not allocated yet
149  kmp_thread_data_t *thread_data =
150  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154  return; // Don't push anything on stack if team or team tasks are serialized
155  }
156 
157  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160  KA_TRACE(20,
161  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162  gtid, thread, tied_task));
163  // Store entry
164  *(task_stack->ts_top) = tied_task;
165 
166  // Do bookkeeping for next push
167  task_stack->ts_top++;
168  task_stack->ts_entries++;
169 
170  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
171  // Find beginning of this task block
172  kmp_stack_block_t *stack_block =
173  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175  // Check if we already have a block
176  if (stack_block->sb_next !=
177  NULL) { // reset ts_top to beginning of next block
178  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179  } else { // Alloc new block and link it up
180  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181  thread, sizeof(kmp_stack_block_t));
182 
183  task_stack->ts_top = &new_block->sb_block[0];
184  stack_block->sb_next = new_block;
185  new_block->sb_prev = stack_block;
186  new_block->sb_next = NULL;
187 
188  KA_TRACE(
189  30,
190  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191  gtid, tied_task, new_block));
192  }
193  }
194  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195  tied_task));
196 }
197 
198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
199 // the task, just check to make sure it matches the ending task passed in.
200 //
201 // gtid: global thread identifier for the calling thread
202 // thread: thread info structure containing stack
203 // tied_task: the task popped off the stack
204 // ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206  kmp_taskdata_t *ending_task) {
207  // GEH - need to consider what to do if tt_threads_data not allocated yet
208  kmp_thread_data_t *thread_data =
209  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
210  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211  kmp_taskdata_t *tied_task;
212 
213  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214  // Don't pop anything from stack if team or team tasks are serialized
215  return;
216  }
217 
218  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222  thread));
223 
224  // fix up ts_top if we need to pop from previous block
225  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
226  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228  stack_block = stack_block->sb_prev;
229  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230  }
231 
232  // finish bookkeeping
233  task_stack->ts_top--;
234  task_stack->ts_entries--;
235 
236  tied_task = *(task_stack->ts_top);
237 
238  KMP_DEBUG_ASSERT(tied_task != NULL);
239  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243  tied_task));
244  return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252  const kmp_taskdata_t *tasknew,
253  const kmp_taskdata_t *taskcurr) {
254  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256  // only descendant of all deferred tied tasks can be scheduled, checking
257  // the last one is enough, as it in turn is the descendant of all others
258  kmp_taskdata_t *current = taskcurr->td_last_tied;
259  KMP_DEBUG_ASSERT(current != NULL);
260  // check if the task is not suspended on barrier
261  if (current->td_flags.tasktype == TASK_EXPLICIT ||
262  current->td_taskwait_thread > 0) { // <= 0 on barrier
263  kmp_int32 level = current->td_level;
264  kmp_taskdata_t *parent = tasknew->td_parent;
265  while (parent != current && parent->td_level > level) {
266  // check generation up to the level of the current task
267  parent = parent->td_parent;
268  KMP_DEBUG_ASSERT(parent != NULL);
269  }
270  if (parent != current)
271  return false;
272  }
273  }
274  // Check mutexinoutset dependencies, acquire locks
275  kmp_depnode_t *node = tasknew->td_depnode;
276  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280  continue;
281  // could not get the lock, release previous locks
282  for (int j = i - 1; j >= 0; --j)
283  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284  return false;
285  }
286  // negative num_locks means all locks acquired successfully
287  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288  }
289  return true;
290 }
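// Editorial note (illustrative sketch, not part of the runtime): callers in
// this file use __kmp_task_is_allowed to vet a candidate task against the
// currently executing task before running or enqueueing it, roughly:
//
//   if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint,
//                             KMP_TASK_TO_TASKDATA(candidate_task),
//                             thread->th.th_current_task)) {
//     // OK to execute candidate_task; any mutexinoutset locks are now held
//   } else {
//     // leave candidate_task queued; TSC or mutexinoutset locks forbid it now
//   }
//
// candidate_task is a hypothetical kmp_task_t * used only for this sketch.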
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297  kmp_thread_data_t *thread_data) {
298  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300  kmp_int32 new_size = 2 * size;
301 
302  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303  "%d] for thread_data %p\n",
304  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306  kmp_taskdata_t **new_deque =
307  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309  int i, j;
310  for (i = thread_data->td.td_deque_head, j = 0; j < size;
311  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312  new_deque[j] = thread_data->td.td_deque[i];
313 
314  __kmp_free(thread_data->td.td_deque);
315 
316  thread_data->td.td_deque_head = 0;
317  thread_data->td.td_deque_tail = size;
318  thread_data->td.td_deque = new_deque;
319  thread_data->td.td_deque_size = new_size;
320 }
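// Editorial worked example (illustrative only): suppose the old deque has
// size 4 (mask 0x3), is full (ntasks == 4), and head == tail == 2, with the
// logical order A, B, C, D starting at the head:
//
//   old slots: [C][D][A][B]                  (indices 0..3, head = 2)
//   new slots: [A][B][C][D][ ][ ][ ][ ]      (indices 0..7)
//
// The copy loop above linearizes the ring starting from td_deque_head, so the
// new head becomes 0, the new tail becomes the old size (4), and the mask
// becomes 0x7 because the size doubles to 8.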
321 
322 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
323  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
324  kmp_thread_data_t *thread_data = &l->td;
325  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
326  thread_data->td.td_deque_last_stolen = -1;
327  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
328  "for thread_data %p\n",
329  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
330  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
331  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
332  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
333  return l;
334 }
335 
336 // The function finds the deque of priority tasks with given priority, or
337 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
338 // Deques of non-default priority tasks are shared between all threads in team,
339 // as opposed to per-thread deques of tasks with default priority.
340 // The function is called under the lock task_team->tt.tt_task_pri_lock.
341 static kmp_thread_data_t *
342 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
343  kmp_thread_data_t *thread_data;
344  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
345  if (lst->priority == pri) {
346  // Found queue of tasks with given priority.
347  thread_data = &lst->td;
348  } else if (lst->priority < pri) {
349  // All current priority queues contain tasks with lower priority.
350  // Allocate new one for given priority tasks.
351  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
352  thread_data = &list->td;
353  list->priority = pri;
354  list->next = lst;
355  task_team->tt.tt_task_pri_list = list;
356  } else { // task_team->tt.tt_task_pri_list->priority > pri
357  kmp_task_pri_t *next_queue = lst->next;
358  while (next_queue && next_queue->priority > pri) {
359  lst = next_queue;
360  next_queue = lst->next;
361  }
362  // lst->priority > pri && (next == NULL || pri >= next->priority)
363  if (next_queue == NULL) {
364  // No queue with pri priority, need to allocate new one.
365  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
366  thread_data = &list->td;
367  list->priority = pri;
368  list->next = NULL;
369  lst->next = list;
370  } else if (next_queue->priority == pri) {
371  // Found queue of tasks with given priority.
372  thread_data = &next_queue->td;
373  } else { // lst->priority > pri > next->priority
374  // insert the newly allocated queue between existing queues
375  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
376  thread_data = &list->td;
377  list->priority = pri;
378  list->next = next_queue;
379  lst->next = list;
380  }
381  }
382  return thread_data;
383 }
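// Editorial example (illustrative only): with an existing list of priority
// deques 30 -> 10, a request for priority 20 walks past 30 (30 > 20), finds
// next_queue->priority == 10 < 20, and links a fresh node in between, giving
// 30 -> 20 -> 10. A request for priority 40 takes the "lst->priority < pri"
// branch instead and becomes the new list head: 40 -> 30 -> 20 -> 10.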
384 
385 // __kmp_push_priority_task: Add a task to the team's priority task deque
386 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
387  kmp_taskdata_t *taskdata,
388  kmp_task_team_t *task_team,
389  kmp_int32 pri) {
390  kmp_thread_data_t *thread_data = NULL;
391  KA_TRACE(20,
392  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
393  gtid, taskdata, pri));
394 
395  // Find task queue specific to priority value
396  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
397  if (UNLIKELY(lst == NULL)) {
398  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
399  if (task_team->tt.tt_task_pri_list == NULL) {
400  // List of queues is still empty, allocate one.
401  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
402  thread_data = &list->td;
403  list->priority = pri;
404  list->next = NULL;
405  task_team->tt.tt_task_pri_list = list;
406  } else {
407  // Another thread initialized a queue. Check if it fits and get thread_data.
408  thread_data = __kmp_get_priority_deque_data(task_team, pri);
409  }
410  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
411  } else {
412  if (lst->priority == pri) {
413  // Found queue of tasks with given priority.
414  thread_data = &lst->td;
415  } else {
416  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
417  thread_data = __kmp_get_priority_deque_data(task_team, pri);
418  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
419  }
420  }
421  KMP_DEBUG_ASSERT(thread_data);
422 
423  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
424  // Check if deque is full
425  if (TCR_4(thread_data->td.td_deque_ntasks) >=
426  TASK_DEQUE_SIZE(thread_data->td)) {
427  if (__kmp_enable_task_throttling &&
428  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
429  thread->th.th_current_task)) {
430  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
431  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
432  "TASK_NOT_PUSHED for task %p\n",
433  gtid, taskdata));
434  return TASK_NOT_PUSHED;
435  } else {
436  // expand deque to push the task which is not allowed to execute
437  __kmp_realloc_task_deque(thread, thread_data);
438  }
439  }
440  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
441  TASK_DEQUE_SIZE(thread_data->td));
442  // Push taskdata.
443  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
444  // Wrap index.
445  thread_data->td.td_deque_tail =
446  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
447  TCW_4(thread_data->td.td_deque_ntasks,
448  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
449  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
450  KMP_FSYNC_RELEASING(taskdata); // releasing child
451  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
452  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
453  gtid, taskdata, thread_data->td.td_deque_ntasks,
454  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
455  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
456  task_team->tt.tt_num_task_pri++; // atomic inc
457  return TASK_SUCCESSFULLY_PUSHED;
458 }
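// Editorial note: the deque lookup in __kmp_push_priority_task above follows a
// double-checked locking pattern (sketch only):
//
//   if (tt_task_pri_list == NULL) {        // unsynchronized fast check
//     acquire(tt_task_pri_lock);
//     if (tt_task_pri_list == NULL)        // re-check under the lock
//       tt_task_pri_list = new list;       // only one thread allocates
//     release(tt_task_pri_lock);
//   }
//
// so the common case, where the head of the list already matches the requested
// priority, pushes without taking tt_task_pri_lock at all.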
459 
460 // __kmp_push_task: Add a task to the thread's deque
461 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
462  kmp_info_t *thread = __kmp_threads[gtid];
463  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
464 
465  // If we encounter a hidden helper task, and the current thread is not a
466  // hidden helper thread, we have to give the task to any hidden helper thread
467  // starting from its shadow one.
468  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
469  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
470  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
471  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
472  // Signal the hidden helper threads.
473  __kmp_hidden_helper_worker_thread_signal();
474  return TASK_SUCCESSFULLY_PUSHED;
475  }
476 
477  kmp_task_team_t *task_team = thread->th.th_task_team;
478  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
479  kmp_thread_data_t *thread_data;
480 
481  KA_TRACE(20,
482  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
483 
484  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
485  // untied task needs to increment counter so that the task structure is not
486  // freed prematurely
487  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
488  KMP_DEBUG_USE_VAR(counter);
489  KA_TRACE(
490  20,
491  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
492  gtid, counter, taskdata));
493  }
494 
495  // The first check avoids building task_team thread data if serialized
496  if (UNLIKELY(taskdata->td_flags.task_serial)) {
497  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
498  "TASK_NOT_PUSHED for task %p\n",
499  gtid, taskdata));
500  return TASK_NOT_PUSHED;
501  }
502 
503  // Now that serialized tasks have returned, we can assume that we are not in
504  // immediate exec mode
505  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
506  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
507  __kmp_enable_tasking(task_team, thread);
508  }
509  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
510  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
511 
512  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
513  __kmp_max_task_priority > 0) {
514  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
515  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
516  }
517 
518  // Find tasking deque specific to encountering thread
519  thread_data = &task_team->tt.tt_threads_data[tid];
520 
521  // No lock needed since only owner can allocate. If the task is hidden_helper,
522 // we don't need it either because we have initialized the deque for hidden
523  // helper thread data.
524  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
525  __kmp_alloc_task_deque(thread, thread_data);
526  }
527 
528  int locked = 0;
529  // Check if deque is full
530  if (TCR_4(thread_data->td.td_deque_ntasks) >=
531  TASK_DEQUE_SIZE(thread_data->td)) {
532  if (__kmp_enable_task_throttling &&
533  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
534  thread->th.th_current_task)) {
535  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
536  "TASK_NOT_PUSHED for task %p\n",
537  gtid, taskdata));
538  return TASK_NOT_PUSHED;
539  } else {
540  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
541  locked = 1;
542  if (TCR_4(thread_data->td.td_deque_ntasks) >=
543  TASK_DEQUE_SIZE(thread_data->td)) {
544  // expand deque to push the task which is not allowed to execute
545  __kmp_realloc_task_deque(thread, thread_data);
546  }
547  }
548  }
549  // Lock the deque for the task push operation
550  if (!locked) {
551  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
552  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
553  if (TCR_4(thread_data->td.td_deque_ntasks) >=
554  TASK_DEQUE_SIZE(thread_data->td)) {
555  if (__kmp_enable_task_throttling &&
556  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
557  thread->th.th_current_task)) {
558  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
559  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
560  "returning TASK_NOT_PUSHED for task %p\n",
561  gtid, taskdata));
562  return TASK_NOT_PUSHED;
563  } else {
564  // expand deque to push the task which is not allowed to execute
565  __kmp_realloc_task_deque(thread, thread_data);
566  }
567  }
568  }
569  // Must have room since no thread can add tasks but calling thread
570  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
571  TASK_DEQUE_SIZE(thread_data->td));
572 
573  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
574  taskdata; // Push taskdata
575  // Wrap index.
576  thread_data->td.td_deque_tail =
577  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
578  TCW_4(thread_data->td.td_deque_ntasks,
579  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
580  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
581  KMP_FSYNC_RELEASING(taskdata); // releasing child
582  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
583  "task=%p ntasks=%d head=%u tail=%u\n",
584  gtid, taskdata, thread_data->td.td_deque_ntasks,
585  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
586 
587  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
588 
589  return TASK_SUCCESSFULLY_PUSHED;
590 }
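// Editorial sketch (assumed compiler lowering, for illustration only): a
// deferred "#pragma omp task" typically reaches __kmp_push_task through the
// public entry points, roughly:
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1 /* tied */,
//                                         sizeof_kmp_task_t, sizeof_shareds,
//                                         &outlined_task_entry);
//   // ... fill in t->shareds and the private data area ...
//   __kmpc_omp_task(&loc, gtid, t);   // defers via __kmp_push_task, or runs
//                                     // the task immediately if not pushable
//
// outlined_task_entry is a placeholder name for the compiler-generated thunk.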
591 
592 // __kmp_pop_current_task_from_thread: set up current task from called thread
593 // when team ends
594 //
595 // this_thr: thread structure to set current_task in.
596 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
597  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
598  "this_thread=%p, curtask=%p, "
599  "curtask_parent=%p\n",
600  0, this_thr, this_thr->th.th_current_task,
601  this_thr->th.th_current_task->td_parent));
602 
603  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
604 
605  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
606  "this_thread=%p, curtask=%p, "
607  "curtask_parent=%p\n",
608  0, this_thr, this_thr->th.th_current_task,
609  this_thr->th.th_current_task->td_parent));
610 }
611 
612 // __kmp_push_current_task_to_thread: set up current task in called thread for a
613 // new team
614 //
615 // this_thr: thread structure to set up
616 // team: team for implicit task data
617 // tid: thread within team to set up
618 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
619  int tid) {
620  // the current task of the thread is the parent of the newly created implicit
621  // tasks of the new team
622  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
623  "curtask=%p "
624  "parent_task=%p\n",
625  tid, this_thr, this_thr->th.th_current_task,
626  team->t.t_implicit_task_taskdata[tid].td_parent));
627 
628  KMP_DEBUG_ASSERT(this_thr != NULL);
629 
630  if (tid == 0) {
631  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
632  team->t.t_implicit_task_taskdata[0].td_parent =
633  this_thr->th.th_current_task;
634  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
635  }
636  } else {
637  team->t.t_implicit_task_taskdata[tid].td_parent =
638  team->t.t_implicit_task_taskdata[0].td_parent;
639  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
640  }
641 
642  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
643  "curtask=%p "
644  "parent_task=%p\n",
645  tid, this_thr, this_thr->th.th_current_task,
646  team->t.t_implicit_task_taskdata[tid].td_parent));
647 }
648 
649 // __kmp_task_start: bookkeeping for a task starting execution
650 //
651 // GTID: global thread id of calling thread
652 // task: task starting execution
653 // current_task: task suspending
654 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
655  kmp_taskdata_t *current_task) {
656  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
657  kmp_info_t *thread = __kmp_threads[gtid];
658 
659  KA_TRACE(10,
660  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
661  gtid, taskdata, current_task));
662 
663  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
664 
665  // mark currently executing task as suspended
666  // TODO: GEH - make sure root team implicit task is initialized properly.
667  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
668  current_task->td_flags.executing = 0;
669 
670 // Add task to stack if tied
671 #ifdef BUILD_TIED_TASK_STACK
672  if (taskdata->td_flags.tiedness == TASK_TIED) {
673  __kmp_push_task_stack(gtid, thread, taskdata);
674  }
675 #endif /* BUILD_TIED_TASK_STACK */
676 
677  // mark starting task as executing and as current task
678  thread->th.th_current_task = taskdata;
679 
680  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
681  taskdata->td_flags.tiedness == TASK_UNTIED);
682  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
683  taskdata->td_flags.tiedness == TASK_UNTIED);
684  taskdata->td_flags.started = 1;
685  taskdata->td_flags.executing = 1;
686  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
687  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
688 
689  // GEH TODO: shouldn't we pass some sort of location identifier here?
690  // APT: yes, we will pass location here.
691  // need to store current thread state (in a thread or taskdata structure)
692  // before setting work_state, otherwise wrong state is set after end of task
693 
694  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
695 
696  return;
697 }
698 
699 #if OMPT_SUPPORT
700 //------------------------------------------------------------------------------
701 // __ompt_task_init:
702 // Initialize OMPT fields maintained by a task. This will only be called after
703 // ompt_start_tool, so we already know whether ompt is enabled or not.
704 
705 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
706  // The calls to __ompt_task_init already have the ompt_enabled condition.
707  task->ompt_task_info.task_data.value = 0;
708  task->ompt_task_info.frame.exit_frame = ompt_data_none;
709  task->ompt_task_info.frame.enter_frame = ompt_data_none;
710  task->ompt_task_info.frame.exit_frame_flags =
711  ompt_frame_runtime | ompt_frame_framepointer;
712  task->ompt_task_info.frame.enter_frame_flags =
713  ompt_frame_runtime | ompt_frame_framepointer;
714  task->ompt_task_info.dispatch_chunk.start = 0;
715  task->ompt_task_info.dispatch_chunk.iterations = 0;
716 }
717 
718 // __ompt_task_start:
719 // Build and trigger task-begin event
720 static inline void __ompt_task_start(kmp_task_t *task,
721  kmp_taskdata_t *current_task,
722  kmp_int32 gtid) {
723  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724  ompt_task_status_t status = ompt_task_switch;
725  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726  status = ompt_task_yield;
727  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
728  }
729  /* let OMPT know that we're about to run this task */
730  if (ompt_enabled.ompt_callback_task_schedule) {
731  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732  &(current_task->ompt_task_info.task_data), status,
733  &(taskdata->ompt_task_info.task_data));
734  }
735  taskdata->ompt_task_info.scheduling_parent = current_task;
736 }
737 
738 // __ompt_task_finish:
739 // Build and trigger final task-schedule event
740 static inline void __ompt_task_finish(kmp_task_t *task,
741  kmp_taskdata_t *resumed_task,
742  ompt_task_status_t status) {
743  if (ompt_enabled.ompt_callback_task_schedule) {
744  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747  status = ompt_task_cancel;
748  }
749 
750  /* let OMPT know that we're returning to the callee task */
751  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752  &(taskdata->ompt_task_info.task_data), status,
753  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754  }
755 }
756 #endif
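// Editorial sketch (tool side, illustrative only): a first-party OMPT tool
// that registers ompt_callback_task_schedule observes the transitions reported
// by __ompt_task_start/__ompt_task_finish above, e.g.:
//
//   static void on_task_schedule(ompt_data_t *prior_task_data,
//                                ompt_task_status_t prior_task_status,
//                                ompt_data_t *next_task_data) {
//     // prior_task_status is ompt_task_switch, ompt_task_yield,
//     // ompt_task_complete, ompt_task_cancel or ompt_task_detach
//   }
//
// registered from the tool's ompt_initialize via the looked-up entry point:
//   ompt_set_callback(ompt_callback_task_schedule,
//                     (ompt_callback_t)on_task_schedule);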
757 
758 template <bool ompt>
759 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760  kmp_task_t *task,
761  void *frame_address,
762  void *return_address) {
763  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765 
766  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767  "current_task=%p\n",
768  gtid, loc_ref, taskdata, current_task));
769 
770  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771  // untied task needs to increment counter so that the task structure is not
772  // freed prematurely
773  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774  KMP_DEBUG_USE_VAR(counter);
775  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776  "incremented for task %p\n",
777  gtid, counter, taskdata));
778  }
779 
780  taskdata->td_flags.task_serial =
781  1; // Execute this task immediately, not deferred.
782  __kmp_task_start(gtid, task, current_task);
783 
784 #if OMPT_SUPPORT
785  if (ompt) {
786  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787  current_task->ompt_task_info.frame.enter_frame.ptr =
788  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789  current_task->ompt_task_info.frame.enter_frame_flags =
790  taskdata->ompt_task_info.frame.exit_frame_flags =
791  ompt_frame_application | ompt_frame_framepointer;
792  }
793  if (ompt_enabled.ompt_callback_task_create) {
794  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796  &(parent_info->task_data), &(parent_info->frame),
797  &(taskdata->ompt_task_info.task_data),
798  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
799  return_address);
800  }
801  __ompt_task_start(task, current_task, gtid);
802  }
803 #endif // OMPT_SUPPORT
804 
805  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
806  loc_ref, taskdata));
807 }
808 
809 #if OMPT_SUPPORT
810 OMPT_NOINLINE
811 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
812  kmp_task_t *task,
813  void *frame_address,
814  void *return_address) {
815  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
816  return_address);
817 }
818 #endif // OMPT_SUPPORT
819 
820 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
821 // execution
822 //
823 // loc_ref: source location information; points to beginning of task block.
824 // gtid: global thread number.
825 // task: task thunk for the started task.
826 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
827  kmp_task_t *task) {
828 #if OMPT_SUPPORT
829  if (UNLIKELY(ompt_enabled.enabled)) {
830  OMPT_STORE_RETURN_ADDRESS(gtid);
831  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
832  OMPT_GET_FRAME_ADDRESS(1),
833  OMPT_LOAD_RETURN_ADDRESS(gtid));
834  return;
835  }
836 #endif
837  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
838 }
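// Editorial sketch (assumed lowering, for illustration only): for an
// undeferred task such as "#pragma omp task if(0)", the compiler is expected
// to bracket the inlined body with the if0 entry points, roughly:
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, flags, sz, shsz, &entry);
//   __kmpc_omp_task_begin_if0(&loc, gtid, t);
//   entry(gtid, t);                        // run the task body immediately
//   __kmpc_omp_task_complete_if0(&loc, gtid, t);
//
// which is why __kmpc_omp_task_begin_if0 forces td_flags.task_serial = 1.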
839 
840 #ifdef TASK_UNUSED
841 // __kmpc_omp_task_begin: report that a given task has started execution
842 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
843 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
844  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
845 
846  KA_TRACE(
847  10,
848  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
849  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
850 
851  __kmp_task_start(gtid, task, current_task);
852 
853  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
854  loc_ref, KMP_TASK_TO_TASKDATA(task)));
855  return;
856 }
857 #endif // TASK_UNUSED
858 
859 // __kmp_free_task: free the current task space and the space for shareds
860 //
861 // gtid: Global thread ID of calling thread
862 // taskdata: task to free
863 // thread: thread data structure of caller
864 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
865  kmp_info_t *thread) {
866  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
867  taskdata));
868 
869  // Check to make sure all flags and counters have the correct values
870  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
871  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
872  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
873  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
874  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
875  taskdata->td_flags.task_serial == 1);
876  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
877  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
878  // Clear data to not be re-used later by mistake.
879  task->data1.destructors = NULL;
880  task->data2.priority = 0;
881 
882  taskdata->td_flags.freed = 1;
883 // deallocate the taskdata and shared variable blocks associated with this task
884 #if USE_FAST_MEMORY
885  __kmp_fast_free(thread, taskdata);
886 #else /* ! USE_FAST_MEMORY */
887  __kmp_thread_free(thread, taskdata);
888 #endif
889  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
890 }
891 
892 // __kmp_free_task_and_ancestors: free the current task and ancestors without
893 // children
894 //
895 // gtid: Global thread ID of calling thread
896 // taskdata: task to free
897 // thread: thread data structure of caller
898 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
899  kmp_taskdata_t *taskdata,
900  kmp_info_t *thread) {
901  // Proxy tasks must always be allowed to free their parents
902  // because they can be run in background even in serial mode.
903  kmp_int32 team_serial =
904  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
905  !taskdata->td_flags.proxy;
906  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
907 
908  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
909  KMP_DEBUG_ASSERT(children >= 0);
910 
911  // Now, go up the ancestor tree to see if any ancestors can now be freed.
912  while (children == 0) {
913  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
914 
915  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
916  "and freeing itself\n",
917  gtid, taskdata));
918 
919  // --- Deallocate my ancestor task ---
920  __kmp_free_task(gtid, taskdata, thread);
921 
922  taskdata = parent_taskdata;
923 
924  if (team_serial)
925  return;
926  // Stop checking ancestors at implicit task instead of walking up ancestor
927  // tree to avoid premature deallocation of ancestors.
928  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
929  if (taskdata->td_dephash) { // do we need to cleanup dephash?
930  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
931  kmp_tasking_flags_t flags_old = taskdata->td_flags;
932  if (children == 0 && flags_old.complete == 1) {
933  kmp_tasking_flags_t flags_new = flags_old;
934  flags_new.complete = 0;
935  if (KMP_COMPARE_AND_STORE_ACQ32(
936  RCAST(kmp_int32 *, &taskdata->td_flags),
937  *RCAST(kmp_int32 *, &flags_old),
938  *RCAST(kmp_int32 *, &flags_new))) {
939  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
940  "dephash of implicit task %p\n",
941  gtid, taskdata));
942  // cleanup dephash of finished implicit task
943  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
944  }
945  }
946  }
947  return;
948  }
949  // Predecrement simulated by "- 1" calculation
950  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
951  KMP_DEBUG_ASSERT(children >= 0);
952  }
953 
954  KA_TRACE(
955  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
956  "not freeing it yet\n",
957  gtid, taskdata, children));
958 }
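// Editorial example (illustrative only): consider explicit tasks P -> C where
// C is P's last allocated child and both have completed. Freeing C drops P's
// td_allocated_child_tasks to 0, so the loop above walks up and frees P as
// well, then stops at the enclosing implicit task, where it only cleans the
// dependence hash instead of deallocating the implicit task itself.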
959 
960 // Only need to keep track of child task counts if any of the following:
961 // 1. team parallel and tasking not serialized;
962 // 2. it is a proxy or detachable or hidden helper task
963 // 3. the children counter of its parent task is greater than 0.
964 // The reason for the 3rd one is a serialized team that encountered a detached
965 // or hidden helper task T. In this case, the execution of T is still deferred,
966 // and a regular task may depend on T. If we do not track the children here,
967 // task synchronization would be broken.
968 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
969  kmp_tasking_flags_t flags = taskdata->td_flags;
970  bool ret = !(flags.team_serial || flags.tasking_ser);
971  ret = ret || flags.proxy == TASK_PROXY ||
972  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
973  ret = ret ||
974  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
975  return ret;
976 }
977 
978 // __kmp_task_finish: bookkeeping to do when a task finishes execution
979 //
980 // gtid: global thread ID for calling thread
981 // task: task to be finished
982 // resumed_task: task to be resumed. (may be NULL if task is serialized)
983 //
984 // template<ompt>: effectively ompt_enabled.enabled!=0
985 // the version with ompt=false is inlined, allowing all OMPT code to be
986 // optimized away in this case
987 template <bool ompt>
988 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
989  kmp_taskdata_t *resumed_task) {
990  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
991  kmp_info_t *thread = __kmp_threads[gtid];
992  kmp_task_team_t *task_team =
993  thread->th.th_task_team; // might be NULL for serial teams...
994 #if KMP_DEBUG
995  kmp_int32 children = 0;
996 #endif
997  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
998  "task %p\n",
999  gtid, taskdata, resumed_task));
1000 
1001  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1002 
1003 // Pop task from stack if tied
1004 #ifdef BUILD_TIED_TASK_STACK
1005  if (taskdata->td_flags.tiedness == TASK_TIED) {
1006  __kmp_pop_task_stack(gtid, thread, taskdata);
1007  }
1008 #endif /* BUILD_TIED_TASK_STACK */
1009 
1010  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1011  // untied task needs to check the counter so that the task structure is not
1012  // freed prematurely
1013  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1014  KA_TRACE(
1015  20,
1016  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1017  gtid, counter, taskdata));
1018  if (counter > 0) {
1019  // untied task is not done, to be continued possibly by other thread, do
1020  // not free it now
1021  if (resumed_task == NULL) {
1022  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1023  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1024  // task is the parent
1025  }
1026  thread->th.th_current_task = resumed_task; // restore current_task
1027  resumed_task->td_flags.executing = 1; // resume previous task
1028  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1029  "resuming task %p\n",
1030  gtid, taskdata, resumed_task));
1031  return;
1032  }
1033  }
1034 
1035  // bookkeeping for resuming task:
1036  // GEH - note tasking_ser => task_serial
1037  KMP_DEBUG_ASSERT(
1038  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1039  taskdata->td_flags.task_serial);
1040  if (taskdata->td_flags.task_serial) {
1041  if (resumed_task == NULL) {
1042  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1043  // task is the parent
1044  }
1045  } else {
1046  KMP_DEBUG_ASSERT(resumed_task !=
1047  NULL); // verify that resumed task is passed as argument
1048  }
1049 
1050  /* If the task's destructor thunk flag has been set, we need to invoke the
1051  destructor thunk that has been generated by the compiler. The code is
1052  placed here, since at this point other tasks might have been released
1053  hence overlapping the destructor invocations with some other work in the
1054  released tasks. The OpenMP spec is not specific on when the destructors
1055  are invoked, so we should be free to choose. */
1056  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1057  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1058  KMP_ASSERT(destr_thunk);
1059  destr_thunk(gtid, task);
1060  }
1061 
1062  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1063  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1064  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1065 
1066  bool detach = false;
1067  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1068  if (taskdata->td_allow_completion_event.type ==
1069  KMP_EVENT_ALLOW_COMPLETION) {
1070  // event hasn't been fulfilled yet. Try to detach task.
1071  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1072  if (taskdata->td_allow_completion_event.type ==
1073  KMP_EVENT_ALLOW_COMPLETION) {
1074  // task finished execution
1075  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1076  taskdata->td_flags.executing = 0; // suspend the finishing task
1077 
1078 #if OMPT_SUPPORT
1079  // For a detached task that is not yet completed, we report the switch back
1080  // here; the later omp_fulfill_event signals completion
1081  // locking is necessary to avoid a race with ompt_task_late_fulfill
1082  if (ompt)
1083  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1084 #endif
1085 
1086  // no access to taskdata after this point!
1087  // __kmp_fulfill_event might free taskdata at any time from now
1088 
1089  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1090  detach = true;
1091  }
1092  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1093  }
1094  }
1095 
1096  if (!detach) {
1097  taskdata->td_flags.complete = 1; // mark the task as completed
1098 
1099 #if OMPT_SUPPORT
1100  // This is not a detached task, we are done here
1101  if (ompt)
1102  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1103 #endif
1104  // TODO: What would be the balance between the conditions in the function
1105  // and an atomic operation?
1106  if (__kmp_track_children_task(taskdata)) {
1107  __kmp_release_deps(gtid, taskdata);
1108  // Predecrement simulated by "- 1" calculation
1109 #if KMP_DEBUG
1110  children = -1 +
1111 #endif
1112  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1113  KMP_DEBUG_ASSERT(children >= 0);
1114  if (taskdata->td_taskgroup)
1115  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1116  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1117  task_team->tt.tt_hidden_helper_task_encountered)) {
1118  // if we found proxy or hidden helper tasks there could exist a dependency
1119  // chain with the proxy task as origin
1120  __kmp_release_deps(gtid, taskdata);
1121  }
1122  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1123  // called. Otherwise, if a task is executed immediately from the
1124  // release_deps code, the flag will be reset to 1 again by this same
1125  // function
1126  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1127  taskdata->td_flags.executing = 0; // suspend the finishing task
1128  }
1129 
1130  KA_TRACE(
1131  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1132  gtid, taskdata, children));
1133 
1134  // Free this task and then ancestor tasks if they have no children.
1135  // Restore th_current_task first as suggested by John:
1136  // johnmc: if an asynchronous inquiry peers into the runtime system
1137  // it doesn't see the freed task as the current task.
1138  thread->th.th_current_task = resumed_task;
1139  if (!detach)
1140  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1141 
1142  // TODO: GEH - make sure root team implicit task is initialized properly.
1143  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1144  resumed_task->td_flags.executing = 1; // resume previous task
1145 
1146  KA_TRACE(
1147  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1148  gtid, taskdata, resumed_task));
1149 
1150  return;
1151 }
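// Editorial sketch (user-code view, illustrative only): the detach path in
// __kmp_task_finish above corresponds to OpenMP 5.x code such as
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { start_async_work(ev); }   // body returns without completing the task
//   ...
//   omp_fulfill_event(ev);      // later, from anywhere: completes the task
//
// When the body returns before the event is fulfilled, the task is only
// proxified here; the completion bookkeeping runs once omp_fulfill_event
// fires. start_async_work is a placeholder name used only for this sketch.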
1152 
1153 template <bool ompt>
1154 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1155  kmp_int32 gtid,
1156  kmp_task_t *task) {
1157  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1158  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1159  KMP_DEBUG_ASSERT(gtid >= 0);
1160  // this routine will provide the task to resume
1161  __kmp_task_finish<ompt>(gtid, task, NULL);
1162 
1163  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1164  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1165 
1166 #if OMPT_SUPPORT
1167  if (ompt) {
1168  ompt_frame_t *ompt_frame;
1169  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1170  ompt_frame->enter_frame = ompt_data_none;
1171  ompt_frame->enter_frame_flags =
1172  ompt_frame_runtime | ompt_frame_framepointer;
1173  }
1174 #endif
1175 
1176  return;
1177 }
1178 
1179 #if OMPT_SUPPORT
1180 OMPT_NOINLINE
1181 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1182  kmp_task_t *task) {
1183  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1184 }
1185 #endif // OMPT_SUPPORT
1186 
1187 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1188 //
1189 // loc_ref: source location information; points to end of task block.
1190 // gtid: global thread number.
1191 // task: task thunk for the completed task.
1192 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1193  kmp_task_t *task) {
1194 #if OMPT_SUPPORT
1195  if (UNLIKELY(ompt_enabled.enabled)) {
1196  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1197  return;
1198  }
1199 #endif
1200  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1201 }
1202 
1203 #ifdef TASK_UNUSED
1204 // __kmpc_omp_task_complete: report that a task has completed execution
1205 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1206 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1207  kmp_task_t *task) {
1208  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1209  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1210 
1211  __kmp_task_finish<false>(gtid, task,
1212  NULL); // Not sure how to find task to resume
1213 
1214  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1215  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1216  return;
1217 }
1218 #endif // TASK_UNUSED
1219 
1220 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1221 // task for a given thread
1222 //
1223 // loc_ref: reference to source location of parallel region
1224 // this_thr: thread data structure corresponding to implicit task
1225 // team: team for this_thr
1226 // tid: thread id of given thread within team
1227 // set_curr_task: TRUE if need to push current task to thread
1228 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1229 // have already been done elsewhere.
1230 // TODO: Get better loc_ref. Value passed in may be NULL
1231 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1232  kmp_team_t *team, int tid, int set_curr_task) {
1233  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1234 
1235  KF_TRACE(
1236  10,
1237  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1238  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1239 
1240  task->td_task_id = KMP_GEN_TASK_ID();
1241  task->td_team = team;
1242  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1243  // in debugger)
1244  task->td_ident = loc_ref;
1245  task->td_taskwait_ident = NULL;
1246  task->td_taskwait_counter = 0;
1247  task->td_taskwait_thread = 0;
1248 
1249  task->td_flags.tiedness = TASK_TIED;
1250  task->td_flags.tasktype = TASK_IMPLICIT;
1251  task->td_flags.proxy = TASK_FULL;
1252 
1253  // All implicit tasks are executed immediately, not deferred
1254  task->td_flags.task_serial = 1;
1255  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1256  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1257 
1258  task->td_flags.started = 1;
1259  task->td_flags.executing = 1;
1260  task->td_flags.complete = 0;
1261  task->td_flags.freed = 0;
1262 
1263  task->td_depnode = NULL;
1264  task->td_last_tied = task;
1265  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1266 
1267  if (set_curr_task) { // only do this init first time thread is created
1268  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1269  // Not used: don't need to deallocate implicit task
1270  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1271  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1272  task->td_dephash = NULL;
1273  __kmp_push_current_task_to_thread(this_thr, team, tid);
1274  } else {
1275  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1276  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1277  }
1278 
1279 #if OMPT_SUPPORT
1280  if (UNLIKELY(ompt_enabled.enabled))
1281  __ompt_task_init(task, tid);
1282 #endif
1283 
1284  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1285  team, task));
1286 }
1287 
1288 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1289 // at the end of parallel regions. Some resources are kept for reuse in the next
1290 // parallel region.
1291 //
1292 // thread: thread data structure corresponding to implicit task
1293 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1294  kmp_taskdata_t *task = thread->th.th_current_task;
1295  if (task->td_dephash) {
1296  int children;
1297  task->td_flags.complete = 1;
1298  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1299  kmp_tasking_flags_t flags_old = task->td_flags;
1300  if (children == 0 && flags_old.complete == 1) {
1301  kmp_tasking_flags_t flags_new = flags_old;
1302  flags_new.complete = 0;
1303  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1304  *RCAST(kmp_int32 *, &flags_old),
1305  *RCAST(kmp_int32 *, &flags_new))) {
1306  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1307  "dephash of implicit task %p\n",
1308  thread->th.th_info.ds.ds_gtid, task));
1309  __kmp_dephash_free_entries(thread, task->td_dephash);
1310  }
1311  }
1312  }
1313 }
1314 
1315 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1316 // when these tasks are destroyed
1317 //
1318 // thread: thread data structure corresponding to implicit task
1319 void __kmp_free_implicit_task(kmp_info_t *thread) {
1320  kmp_taskdata_t *task = thread->th.th_current_task;
1321  if (task && task->td_dephash) {
1322  __kmp_dephash_free(thread, task->td_dephash);
1323  task->td_dephash = NULL;
1324  }
1325 }
1326 
1327 // Round up a size to a power of two specified by val: Used to insert padding
1328 // between structures co-allocated using a single malloc() call
1329 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1330  if (size & (val - 1)) {
1331  size &= ~(val - 1);
1332  if (size <= KMP_SIZE_T_MAX - val) {
1333  size += val; // Round up if there is no overflow.
1334  }
1335  }
1336  return size;
1337 } // __kmp_round_up_to_val
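// Editorial worked example: __kmp_round_up_to_val(52, 8) sees nonzero low bits
// (52 & 7 == 4), so it clears them (52 & ~7 == 48) and adds 8, returning 56;
// a size that is already a multiple of val, e.g. 64, is returned unchanged.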
1338 
1339 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1340 //
1341 // loc_ref: source location information
1342 // gtid: global thread number.
1343 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1344 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1345 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1346 // private vars accessed in task.
1347 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1348 // in task.
1349 // task_entry: Pointer to task code entry point generated by compiler.
1350 // returns: a pointer to the allocated kmp_task_t structure (task).
1351 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1352  kmp_tasking_flags_t *flags,
1353  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1354  kmp_routine_entry_t task_entry) {
1355  kmp_task_t *task;
1356  kmp_taskdata_t *taskdata;
1357  kmp_info_t *thread = __kmp_threads[gtid];
1358  kmp_team_t *team = thread->th.th_team;
1359  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1360  size_t shareds_offset;
1361 
1362  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1363  __kmp_middle_initialize();
1364 
1365  if (flags->hidden_helper) {
1366  if (__kmp_enable_hidden_helper) {
1367  if (!TCR_4(__kmp_init_hidden_helper))
1368  __kmp_hidden_helper_initialize();
1369  } else {
1370  // If the hidden helper task is not enabled, reset the flag to FALSE.
1371  flags->hidden_helper = FALSE;
1372  }
1373  }
1374 
1375  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1376  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1377  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1378  sizeof_shareds, task_entry));
1379 
1380  KMP_DEBUG_ASSERT(parent_task);
1381  if (parent_task->td_flags.final) {
1382  if (flags->merged_if0) {
1383  }
1384  flags->final = 1;
1385  }
1386 
1387  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1388  // Untied task encountered causes the TSC algorithm to check entire deque of
1389  // the victim thread. If no untied task encountered, then checking the head
1390  // of the deque should be enough.
1391  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1392  }
1393 
1394  // Detachable tasks are not proxy tasks yet but could become proxies in the
1395  // future. Doing the tasking setup only when that happens would be too late,
1396  // so it is done up front here.
1397  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1398  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1399  if (flags->proxy == TASK_PROXY) {
1400  flags->tiedness = TASK_UNTIED;
1401  flags->merged_if0 = 1;
1402  }
1403  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1404  tasking support enabled */
1405  if ((thread->th.th_task_team) == NULL) {
1406  /* This should only happen if the team is serialized
1407  setup a task team and propagate it to the thread */
1408  KMP_DEBUG_ASSERT(team->t.t_serialized);
1409  KA_TRACE(30,
1410  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1411  gtid));
1412  // 1 indicates setting up the current team regardless of nthreads
1413  __kmp_task_team_setup(thread, team, 1);
1414  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1415  }
1416  kmp_task_team_t *task_team = thread->th.th_task_team;
1417 
1418  /* tasking must be enabled now as the task might not be pushed */
1419  if (!KMP_TASKING_ENABLED(task_team)) {
1420  KA_TRACE(
1421  30,
1422  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1423  __kmp_enable_tasking(task_team, thread);
1424  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1425  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1426  // No lock needed since only owner can allocate
1427  if (thread_data->td.td_deque == NULL) {
1428  __kmp_alloc_task_deque(thread, thread_data);
1429  }
1430  }
1431 
1432  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1433  task_team->tt.tt_found_proxy_tasks == FALSE)
1434  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1435  if (flags->hidden_helper &&
1436  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1437  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1438  }
1439 
1440  // Calculate shared structure offset including padding after kmp_task_t struct
1441  // to align pointers in shared struct
1442  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1443  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1444 
1445  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1446  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1447  shareds_offset));
1448  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1449  sizeof_shareds));
1450 
1451  // Avoid double allocation here by combining shareds with taskdata
1452 #if USE_FAST_MEMORY
1453  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1454  sizeof_shareds);
1455 #else /* ! USE_FAST_MEMORY */
1456  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1457  sizeof_shareds);
1458 #endif /* USE_FAST_MEMORY */
1459 
1460  task = KMP_TASKDATA_TO_TASK(taskdata);
1461 
1462 // Make sure task & taskdata are aligned appropriately
1463 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1464  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1465  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1466 #else
1467  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1468  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1469 #endif
1470  if (sizeof_shareds > 0) {
1471  // Avoid double allocation here by combining shareds with taskdata
1472  task->shareds = &((char *)taskdata)[shareds_offset];
1473  // Make sure shareds struct is aligned to pointer size
1474  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1475  0);
1476  } else {
1477  task->shareds = NULL;
1478  }
1479  task->routine = task_entry;
1480  task->part_id = 0; // AC: Always start with 0 part id
1481 
1482  taskdata->td_task_id = KMP_GEN_TASK_ID();
1483  taskdata->td_team = thread->th.th_team;
1484  taskdata->td_alloc_thread = thread;
1485  taskdata->td_parent = parent_task;
1486  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1487  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1488  taskdata->td_ident = loc_ref;
1489  taskdata->td_taskwait_ident = NULL;
1490  taskdata->td_taskwait_counter = 0;
1491  taskdata->td_taskwait_thread = 0;
1492  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1493  // avoid copying icvs for proxy tasks
1494  if (flags->proxy == TASK_FULL)
1495  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1496 
1497  taskdata->td_flags = *flags;
1498  taskdata->td_task_team = thread->th.th_task_team;
1499  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1500  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1501  // If it is hidden helper task, we need to set the team and task team
1502  // correspondingly.
1503  if (flags->hidden_helper) {
1504  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1505  taskdata->td_team = shadow_thread->th.th_team;
1506  taskdata->td_task_team = shadow_thread->th.th_task_team;
1507  }
1508 
1509  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1510  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1511 
1512  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1513  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1514 
1515  // GEH - Note we serialize the task if the team is serialized to make sure
1516  // implicit parallel region tasks are not left until program termination to
1517  // execute. Also, it helps locality to execute immediately.
1518 
1519  taskdata->td_flags.task_serial =
1520  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1521  taskdata->td_flags.tasking_ser || flags->merged_if0);
1522 
1523  taskdata->td_flags.started = 0;
1524  taskdata->td_flags.executing = 0;
1525  taskdata->td_flags.complete = 0;
1526  taskdata->td_flags.freed = 0;
1527 
1528  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1529  // start at one because it counts the current task and its children
1530  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1531  taskdata->td_taskgroup =
1532  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1533  taskdata->td_dephash = NULL;
1534  taskdata->td_depnode = NULL;
1535  if (flags->tiedness == TASK_UNTIED)
1536  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1537  else
1538  taskdata->td_last_tied = taskdata;
1539  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1540 #if OMPT_SUPPORT
1541  if (UNLIKELY(ompt_enabled.enabled))
1542  __ompt_task_init(taskdata, gtid);
1543 #endif
1544  // TODO: What would be the balance between the conditions in the function and
1545  // an atomic operation?
1546  if (__kmp_track_children_task(taskdata)) {
1547  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1548  if (parent_task->td_taskgroup)
1549  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1550  // Only need to keep track of allocated child tasks for explicit tasks since
1551  // implicit tasks are not deallocated
1552  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1553  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1554  }
1555  if (flags->hidden_helper) {
1556  taskdata->td_flags.task_serial = FALSE;
1557  // Increment the number of hidden helper tasks to be executed
1558  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1559  }
1560  }
1561 
1562  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1563  gtid, taskdata, taskdata->td_parent));
1564 
1565  return task;
1566 }
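
// Illustrative sketch (guarded out, not part of the runtime) of the single
// memory block produced by __kmp_task_alloc above:
//
//   [ kmp_taskdata_t | kmp_task_t + private data | pad | shareds ]
//   ^ taskdata        ^ task = KMP_TASKDATA_TO_TASK(taskdata)
//   <-------------- shareds_offset -------------->
//
#if 0
static void *__kmp_example_shareds_ptr(kmp_taskdata_t *taskdata,
                                       size_t sizeof_kmp_task_t) {
  // Recompute the pointer-aligned offset exactly as the allocator does; when
  // sizeof_shareds > 0 this equals KMP_TASKDATA_TO_TASK(taskdata)->shareds.
  size_t shareds_offset = __kmp_round_up_to_val(
      sizeof(kmp_taskdata_t) + sizeof_kmp_task_t, sizeof(void *));
  return &((char *)taskdata)[shareds_offset];
}
#endif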
1567 
1568 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1569  kmp_int32 flags, size_t sizeof_kmp_task_t,
1570  size_t sizeof_shareds,
1571  kmp_routine_entry_t task_entry) {
1572  kmp_task_t *retval;
1573  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1574  __kmp_assert_valid_gtid(gtid);
1575  input_flags->native = FALSE;
1576  // __kmp_task_alloc() sets up all other runtime flags
1577  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1578  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1579  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1580  input_flags->proxy ? "proxy" : "",
1581  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1582  sizeof_shareds, task_entry));
1583 
1584  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1585  sizeof_shareds, task_entry);
1586 
1587  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1588 
1589  return retval;
1590 }
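
// Illustrative sketch (guarded out) of the call sequence a compiler might emit
// for a plain '#pragma omp task'. Only the __kmpc_* entry points and the
// kmp_task_t fields are taken from this file; 'my_shareds', 'my_task_entry'
// and the flag value are assumptions for the sketch (flags = 1 encodes a tied
// task in kmp_tasking_flags_t).
#if 0
struct my_shareds {
  int *counter; // one shared variable captured by the task
};

static kmp_int32 my_task_entry(kmp_int32 gtid, void *task_ptr) {
  kmp_task_t *task = (kmp_task_t *)task_ptr;
  struct my_shareds *sh = (struct my_shareds *)task->shareds;
  ++(*sh->counter); // body of the task region
  return 0;
}

static void my_spawn_one_task(ident_t *loc, kmp_int32 gtid, int *counter) {
  kmp_task_t *task =
      __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1, sizeof(kmp_task_t),
                            sizeof(struct my_shareds), &my_task_entry);
  ((struct my_shareds *)task->shareds)->counter = counter;
  __kmpc_omp_task(loc, gtid, task); // defined later in this file
}
#endif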
1591 
1592 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1593  kmp_int32 flags,
1594  size_t sizeof_kmp_task_t,
1595  size_t sizeof_shareds,
1596  kmp_routine_entry_t task_entry,
1597  kmp_int64 device_id) {
1598  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1599  // target tasks are untied, as defined in the specification
1600  input_flags.tiedness = TASK_UNTIED;
1601 
1602  if (__kmp_enable_hidden_helper)
1603  input_flags.hidden_helper = TRUE;
1604 
1605  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1606  sizeof_shareds, task_entry);
1607 }
1608 
1622 kmp_int32
1623 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1624  kmp_task_t *new_task, kmp_int32 naffins,
1625  kmp_task_affinity_info_t *affin_list) {
1626  return 0;
1627 }
1628 
1629 // __kmp_invoke_task: invoke the specified task
1630 //
1631 // gtid: global thread ID of caller
1632 // task: the task to invoke
1633 // current_task: the task to resume after task invocation
1634 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1635  kmp_taskdata_t *current_task) {
1636  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1637  kmp_info_t *thread;
1638  int discard = 0 /* false */;
1639  KA_TRACE(
1640  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1641  gtid, taskdata, current_task));
1642  KMP_DEBUG_ASSERT(task);
1643  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1644  taskdata->td_flags.complete == 1)) {
1645  // This is a proxy task that was already completed but it needs to run
1646  // its bottom-half finish
1647  KA_TRACE(
1648  30,
1649  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1650  gtid, taskdata));
1651 
1652  __kmp_bottom_half_finish_proxy(gtid, task);
1653 
1654  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1655  "proxy task %p, resuming task %p\n",
1656  gtid, taskdata, current_task));
1657 
1658  return;
1659  }
1660 
1661 #if OMPT_SUPPORT
1662  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1663  // does not execute code.
1664  ompt_thread_info_t oldInfo;
1665  if (UNLIKELY(ompt_enabled.enabled)) {
1666  // Store the thread's state and restore it after the task
1667  thread = __kmp_threads[gtid];
1668  oldInfo = thread->th.ompt_thread_info;
1669  thread->th.ompt_thread_info.wait_id = 0;
1670  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1671  ? ompt_state_work_serial
1672  : ompt_state_work_parallel;
1673  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1674  }
1675 #endif
1676 
1677  // Decrement the counter of hidden helper tasks to be executed
1678  if (taskdata->td_flags.hidden_helper) {
1679  // Hidden helper tasks can only be executed by hidden helper threads
1680  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1681  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1682  }
1683 
1684  // Proxy tasks are not handled by the runtime
1685  if (taskdata->td_flags.proxy != TASK_PROXY) {
1686  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1687  }
1688 
1689  // TODO: cancel tasks if the parallel region has also been cancelled
1690  // TODO: check if this sequence can be hoisted above __kmp_task_start
1691  // if cancellation has been enabled for this run ...
1692  if (UNLIKELY(__kmp_omp_cancellation)) {
1693  thread = __kmp_threads[gtid];
1694  kmp_team_t *this_team = thread->th.th_team;
1695  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1696  if ((taskgroup && taskgroup->cancel_request) ||
1697  (this_team->t.t_cancel_request == cancel_parallel)) {
1698 #if OMPT_SUPPORT && OMPT_OPTIONAL
1699  ompt_data_t *task_data;
1700  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1701  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1702  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1703  task_data,
1704  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1705  : ompt_cancel_parallel) |
1706  ompt_cancel_discarded_task,
1707  NULL);
1708  }
1709 #endif
1710  KMP_COUNT_BLOCK(TASK_cancelled);
1711  // this task belongs to a task group and we need to cancel it
1712  discard = 1 /* true */;
1713  }
1714  }
1715 
1716  // Invoke the task routine and pass in relevant data.
1717  // Thunks generated by gcc take a different argument list.
1718  if (!discard) {
1719  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1720  taskdata->td_last_tied = current_task->td_last_tied;
1721  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1722  }
1723 #if KMP_STATS_ENABLED
1724  KMP_COUNT_BLOCK(TASK_executed);
1725  switch (KMP_GET_THREAD_STATE()) {
1726  case FORK_JOIN_BARRIER:
1727  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1728  break;
1729  case PLAIN_BARRIER:
1730  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1731  break;
1732  case TASKYIELD:
1733  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1734  break;
1735  case TASKWAIT:
1736  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1737  break;
1738  case TASKGROUP:
1739  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1740  break;
1741  default:
1742  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1743  break;
1744  }
1745 #endif // KMP_STATS_ENABLED
1746 
1747 // OMPT task begin
1748 #if OMPT_SUPPORT
1749  if (UNLIKELY(ompt_enabled.enabled))
1750  __ompt_task_start(task, current_task, gtid);
1751 #endif
1752 #if OMPT_SUPPORT && OMPT_OPTIONAL
1753  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1754  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1755  ompt_data_t instance = ompt_data_none;
1756  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1757  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1758  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1759  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1760  ompt_dispatch_taskloop_chunk, instance);
1761  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1762  }
1763 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1764 
1765 #if OMPD_SUPPORT
1766  if (ompd_state & OMPD_ENABLE_BP)
1767  ompd_bp_task_begin();
1768 #endif
1769 
1770 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1771  kmp_uint64 cur_time;
1772  kmp_int32 kmp_itt_count_task =
1773  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1774  current_task->td_flags.tasktype == TASK_IMPLICIT;
1775  if (kmp_itt_count_task) {
1776  thread = __kmp_threads[gtid];
1777  // Time outer level explicit task on barrier for adjusting imbalance time
1778  if (thread->th.th_bar_arrive_time)
1779  cur_time = __itt_get_timestamp();
1780  else
1781  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1782  }
1783  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1784 #endif
1785 
1786  if (task->routine != NULL) {
1787 #ifdef KMP_GOMP_COMPAT
1788  if (taskdata->td_flags.native) {
1789  ((void (*)(void *))(*(task->routine)))(task->shareds);
1790  } else
1791 #endif /* KMP_GOMP_COMPAT */
1792  {
1793  (*(task->routine))(gtid, task);
1794  }
1795  }
1796  KMP_POP_PARTITIONED_TIMER();
1797 
1798 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1799  if (kmp_itt_count_task) {
1800  // Barrier imbalance - adjust arrive time with the task duration
1801  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1802  }
1803  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1804  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1805 #endif
1806  }
1807 
1808 #if OMPD_SUPPORT
1809  if (ompd_state & OMPD_ENABLE_BP)
1810  ompd_bp_task_end();
1811 #endif
1812 
1813  // Proxy tasks are not handled by the runtime
1814  if (taskdata->td_flags.proxy != TASK_PROXY) {
1815 #if OMPT_SUPPORT
1816  if (UNLIKELY(ompt_enabled.enabled)) {
1817  thread->th.ompt_thread_info = oldInfo;
1818  if (taskdata->td_flags.tiedness == TASK_TIED) {
1819  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1820  }
1821  __kmp_task_finish<true>(gtid, task, current_task);
1822  } else
1823 #endif
1824  __kmp_task_finish<false>(gtid, task, current_task);
1825  }
1826 
1827  KA_TRACE(
1828  30,
1829  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1830  gtid, taskdata, current_task));
1831  return;
1832 }
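
// Illustrative sketch (guarded out) of the two thunk shapes dispatched by
// __kmp_invoke_task above; both entry names are hypothetical, only the calling
// conventions are taken from the dispatch code.
#if 0
// Default convention: routine(gtid, task); shareds are reached via the task.
static kmp_int32 my_intel_style_entry(kmp_int32 gtid, void *task_ptr) {
  kmp_task_t *task = (kmp_task_t *)task_ptr;
  /* ... task body, operating on task->shareds ... */
  return 0;
}

// GOMP-compat convention (td_flags.native set): the runtime calls
// ((void (*)(void *))routine)(task->shareds), so the thunk only sees shareds.
static void my_gomp_style_entry(void *shareds) {
  /* ... task body, operating directly on the shared-variable block ... */
}
#endif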
1833 
1834 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1835 //
1836 // loc_ref: location of original task pragma (ignored)
1837 // gtid: Global Thread ID of encountering thread
1838 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1839 // Returns:
1840 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1841 // be resumed later.
1842 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1843 // resumed later.
1844 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1845  kmp_task_t *new_task) {
1846  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1847 
1848  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1849  loc_ref, new_taskdata));
1850 
1851 #if OMPT_SUPPORT
1852  kmp_taskdata_t *parent;
1853  if (UNLIKELY(ompt_enabled.enabled)) {
1854  parent = new_taskdata->td_parent;
1855  if (ompt_enabled.ompt_callback_task_create) {
1856  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1857  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1858  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1859  OMPT_GET_RETURN_ADDRESS(0));
1860  }
1861  }
1862 #endif
1863 
1864  /* Should we execute the new task or queue it? For now, let's just always try
1865  to queue it. If the queue fills up, then we'll execute it. */
1866 
1867  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1868  { // Execute this task immediately
1869  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1870  new_taskdata->td_flags.task_serial = 1;
1871  __kmp_invoke_task(gtid, new_task, current_task);
1872  }
1873 
1874  KA_TRACE(
1875  10,
1876  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1877  "loc=%p task=%p\n",
1878  gtid, loc_ref, new_taskdata));
1879 
1880 #if OMPT_SUPPORT
1881  if (UNLIKELY(ompt_enabled.enabled)) {
1882  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1883  }
1884 #endif
1885  return TASK_CURRENT_NOT_QUEUED;
1886 }
1887 
1888 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1889 //
1890 // gtid: Global Thread ID of encountering thread
1891 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1892 // serialize_immediate: if TRUE then if the task is executed immediately its
1893 // execution will be serialized
1894 // Returns:
1895 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1896 // be resumed later.
1897 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1898 // resumed later.
1899 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1900  bool serialize_immediate) {
1901  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1902 
1903  /* Should we execute the new task or queue it? For now, let's just always try
1904  to queue it. If the queue fills up, then we'll execute it. */
1905  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1906  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1907  { // Execute this task immediately
1908  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1909  if (serialize_immediate)
1910  new_taskdata->td_flags.task_serial = 1;
1911  __kmp_invoke_task(gtid, new_task, current_task);
1912  }
1913 
1914  return TASK_CURRENT_NOT_QUEUED;
1915 }
1916 
1917 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1918 // non-thread-switchable task from the parent thread only!
1919 //
1920 // loc_ref: location of original task pragma (ignored)
1921 // gtid: Global Thread ID of encountering thread
1922 // new_task: non-thread-switchable task thunk allocated by
1923 // __kmp_omp_task_alloc()
1924 // Returns:
1925 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1926 // be resumed later.
1927 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1928 // resumed later.
1929 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1930  kmp_task_t *new_task) {
1931  kmp_int32 res;
1932  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1933 
1934 #if KMP_DEBUG || OMPT_SUPPORT
1935  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1936 #endif
1937  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1938  new_taskdata));
1939  __kmp_assert_valid_gtid(gtid);
1940 
1941 #if OMPT_SUPPORT
1942  kmp_taskdata_t *parent = NULL;
1943  if (UNLIKELY(ompt_enabled.enabled)) {
1944  if (!new_taskdata->td_flags.started) {
1945  OMPT_STORE_RETURN_ADDRESS(gtid);
1946  parent = new_taskdata->td_parent;
1947  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1948  parent->ompt_task_info.frame.enter_frame.ptr =
1949  OMPT_GET_FRAME_ADDRESS(0);
1950  }
1951  if (ompt_enabled.ompt_callback_task_create) {
1952  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1953  &(parent->ompt_task_info.task_data),
1954  &(parent->ompt_task_info.frame),
1955  &(new_taskdata->ompt_task_info.task_data),
1956  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1957  OMPT_LOAD_RETURN_ADDRESS(gtid));
1958  }
1959  } else {
1960  // We are scheduling the continuation of an UNTIED task.
1961  // Scheduling back to the parent task.
1962  __ompt_task_finish(new_task,
1963  new_taskdata->ompt_task_info.scheduling_parent,
1964  ompt_task_switch);
1965  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1966  }
1967  }
1968 #endif
1969 
1970  res = __kmp_omp_task(gtid, new_task, true);
1971 
1972  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1973  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1974  gtid, loc_ref, new_taskdata));
1975 #if OMPT_SUPPORT
1976  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1977  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1978  }
1979 #endif
1980  return res;
1981 }
1982 
1983 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1984 // a taskloop task with the correct OMPT return address
1985 //
1986 // loc_ref: location of original task pragma (ignored)
1987 // gtid: Global Thread ID of encountering thread
1988 // new_task: non-thread-switchable task thunk allocated by
1989 // __kmp_omp_task_alloc()
1990 // codeptr_ra: return address for OMPT callback
1991 // Returns:
1992 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1993 // be resumed later.
1994 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1995 // resumed later.
1996 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1997  kmp_task_t *new_task, void *codeptr_ra) {
1998  kmp_int32 res;
1999  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2000 
2001 #if KMP_DEBUG || OMPT_SUPPORT
2002  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2003 #endif
2004  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2005  new_taskdata));
2006 
2007 #if OMPT_SUPPORT
2008  kmp_taskdata_t *parent = NULL;
2009  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2010  parent = new_taskdata->td_parent;
2011  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2012  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2013  if (ompt_enabled.ompt_callback_task_create) {
2014  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2015  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2016  &(new_taskdata->ompt_task_info.task_data),
2017  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2018  codeptr_ra);
2019  }
2020  }
2021 #endif
2022 
2023  res = __kmp_omp_task(gtid, new_task, true);
2024 
2025  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2026  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2027  gtid, loc_ref, new_taskdata));
2028 #if OMPT_SUPPORT
2029  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2030  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2031  }
2032 #endif
2033  return res;
2034 }
2035 
2036 template <bool ompt>
2037 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2038  void *frame_address,
2039  void *return_address) {
2040  kmp_taskdata_t *taskdata = nullptr;
2041  kmp_info_t *thread;
2042  int thread_finished = FALSE;
2043  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2044 
2045  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2046  KMP_DEBUG_ASSERT(gtid >= 0);
2047 
2048  if (__kmp_tasking_mode != tskm_immediate_exec) {
2049  thread = __kmp_threads[gtid];
2050  taskdata = thread->th.th_current_task;
2051 
2052 #if OMPT_SUPPORT && OMPT_OPTIONAL
2053  ompt_data_t *my_task_data;
2054  ompt_data_t *my_parallel_data;
2055 
2056  if (ompt) {
2057  my_task_data = &(taskdata->ompt_task_info.task_data);
2058  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2059 
2060  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2061 
2062  if (ompt_enabled.ompt_callback_sync_region) {
2063  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2064  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2065  my_task_data, return_address);
2066  }
2067 
2068  if (ompt_enabled.ompt_callback_sync_region_wait) {
2069  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2070  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2071  my_task_data, return_address);
2072  }
2073  }
2074 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2075 
2076 // Debugger: The taskwait is active. Store the location and the thread that
2077 // encountered the taskwait.
2078 #if USE_ITT_BUILD
2079 // Note: These values are used by ITT events as well.
2080 #endif /* USE_ITT_BUILD */
2081  taskdata->td_taskwait_counter += 1;
2082  taskdata->td_taskwait_ident = loc_ref;
2083  taskdata->td_taskwait_thread = gtid + 1;
2084 
2085 #if USE_ITT_BUILD
2086  void *itt_sync_obj = NULL;
2087 #if USE_ITT_NOTIFY
2088  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2089 #endif /* USE_ITT_NOTIFY */
2090 #endif /* USE_ITT_BUILD */
2091 
2092  bool must_wait =
2093  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2094 
2095  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2096  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2097  // If hidden helper thread is encountered, we must enable wait here.
2098  must_wait =
2099  must_wait ||
2100  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2101  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2102 
2103  if (must_wait) {
2104  kmp_flag_32<false, false> flag(
2105  RCAST(std::atomic<kmp_uint32> *,
2106  &(taskdata->td_incomplete_child_tasks)),
2107  0U);
2108  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2109  flag.execute_tasks(thread, gtid, FALSE,
2110  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2111  __kmp_task_stealing_constraint);
2112  }
2113  }
2114 #if USE_ITT_BUILD
2115  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2116  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2117 #endif /* USE_ITT_BUILD */
2118 
2119  // Debugger: The taskwait is completed. Location remains, but thread is
2120  // negated.
2121  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2122 
2123 #if OMPT_SUPPORT && OMPT_OPTIONAL
2124  if (ompt) {
2125  if (ompt_enabled.ompt_callback_sync_region_wait) {
2126  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2127  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2128  my_task_data, return_address);
2129  }
2130  if (ompt_enabled.ompt_callback_sync_region) {
2131  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2132  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2133  my_task_data, return_address);
2134  }
2135  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2136  }
2137 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2138 
2139  }
2140 
2141  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2142  "returning TASK_CURRENT_NOT_QUEUED\n",
2143  gtid, taskdata));
2144 
2145  return TASK_CURRENT_NOT_QUEUED;
2146 }
2147 
2148 #if OMPT_SUPPORT && OMPT_OPTIONAL
2149 OMPT_NOINLINE
2150 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2151  void *frame_address,
2152  void *return_address) {
2153  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2154  return_address);
2155 }
2156 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2157 
2158 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2159 // complete
2160 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2161 #if OMPT_SUPPORT && OMPT_OPTIONAL
2162  if (UNLIKELY(ompt_enabled.enabled)) {
2163  OMPT_STORE_RETURN_ADDRESS(gtid);
2164  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2165  OMPT_LOAD_RETURN_ADDRESS(gtid));
2166  }
2167 #endif
2168  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2169 }
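
// Illustrative user-level sketch (guarded out) of the construct served by
// __kmpc_omp_taskwait above; the lowering note paraphrases the template, it is
// not compiler-verified output.
#if 0
static int my_taskwait_example(int a, int b) {
  int x = 0, y = 0; // typically reached from inside a parallel region
#pragma omp task shared(x)
  x = a + 1;
#pragma omp task shared(y)
  y = b + 1;
  // Lowers (roughly) to __kmpc_omp_taskwait(&loc, gtid): the encountering task
  // executes other tasks until its td_incomplete_child_tasks drops to zero.
#pragma omp taskwait
  return x + y;
}
#endif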
2170 
2171 // __kmpc_omp_taskyield: switch to a different task
2172 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2173  kmp_taskdata_t *taskdata = NULL;
2174  kmp_info_t *thread;
2175  int thread_finished = FALSE;
2176 
2177  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2178  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2179 
2180  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2181  gtid, loc_ref, end_part));
2182  __kmp_assert_valid_gtid(gtid);
2183 
2184  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2185  thread = __kmp_threads[gtid];
2186  taskdata = thread->th.th_current_task;
2187 // Should we model this as a task wait or not?
2188 // Debugger: The taskwait is active. Store the location and the thread that
2189 // encountered the taskwait.
2190 #if USE_ITT_BUILD
2191 // Note: These values are used by ITT events as well.
2192 #endif /* USE_ITT_BUILD */
2193  taskdata->td_taskwait_counter += 1;
2194  taskdata->td_taskwait_ident = loc_ref;
2195  taskdata->td_taskwait_thread = gtid + 1;
2196 
2197 #if USE_ITT_BUILD
2198  void *itt_sync_obj = NULL;
2199 #if USE_ITT_NOTIFY
2200  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2201 #endif /* USE_ITT_NOTIFY */
2202 #endif /* USE_ITT_BUILD */
2203  if (!taskdata->td_flags.team_serial) {
2204  kmp_task_team_t *task_team = thread->th.th_task_team;
2205  if (task_team != NULL) {
2206  if (KMP_TASKING_ENABLED(task_team)) {
2207 #if OMPT_SUPPORT
2208  if (UNLIKELY(ompt_enabled.enabled))
2209  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2210 #endif
2211  __kmp_execute_tasks_32(
2212  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2213  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2214  __kmp_task_stealing_constraint);
2215 #if OMPT_SUPPORT
2216  if (UNLIKELY(ompt_enabled.enabled))
2217  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2218 #endif
2219  }
2220  }
2221  }
2222 #if USE_ITT_BUILD
2223  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2224 #endif /* USE_ITT_BUILD */
2225 
2226  // Debugger: The taskwait is completed. Location remains, but thread is
2227  // negated.
2228  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2229  }
2230 
2231  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2232  "returning TASK_CURRENT_NOT_QUEUED\n",
2233  gtid, taskdata));
2234 
2235  return TASK_CURRENT_NOT_QUEUED;
2236 }
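
// Illustrative user-level sketch (guarded out) of the construct served by
// __kmpc_omp_taskyield above; passing end_part = 0 is an assumption about
// typical compiler-generated calls.
#if 0
static void my_taskyield_example(int n) {
#pragma omp task firstprivate(n)
  for (int i = 0; i < n; ++i) {
    /* ... a chunk of work ... */
    // Roughly __kmpc_omp_taskyield(&loc, gtid, 0): let the runtime schedule a
    // different task on this thread before continuing.
#pragma omp taskyield
  }
}
#endif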
2237 
2238 // Task Reduction implementation
2239 //
2240 // Note: the initial implementation did not account for the possibility of
2241 // specifying omp_orig for the initializer of a UDR (user defined reduction).
2242 // The corrected implementation takes the omp_orig object into account.
2243 // The compiler is free to use the old implementation if omp_orig is not specified.
2244 
2253 typedef struct kmp_taskred_flags {
2255  unsigned lazy_priv : 1;
2256  unsigned reserved31 : 31;
2257 } kmp_taskred_flags_t;
2258 
2262 typedef struct kmp_task_red_input {
2263  void *reduce_shar;
2264  size_t reduce_size;
2265  // three compiler-generated routines (init, fini are optional):
2266  void *reduce_init;
2267  void *reduce_fini;
2268  void *reduce_comb;
2269  kmp_taskred_flags_t flags;
2270 } kmp_task_red_input_t;
2271 
2275 typedef struct kmp_taskred_data {
2276  void *reduce_shar;
2277  size_t reduce_size;
2278  kmp_taskred_flags_t flags;
2279  void *reduce_priv;
2280  void *reduce_pend;
2281  // three compiler-generated routines (init, fini are optional):
2282  void *reduce_comb;
2283  void *reduce_init;
2284  void *reduce_fini;
2285  void *reduce_orig;
2286 } kmp_taskred_data_t;
2287 
2293 typedef struct kmp_taskred_input {
2294  void *reduce_shar;
2295  void *reduce_orig;
2296  size_t reduce_size;
2297  // three compiler-generated routines (init, fini are optional):
2298  void *reduce_init;
2299  void *reduce_fini;
2300  void *reduce_comb;
2301  kmp_taskred_flags_t flags;
2302 } kmp_taskred_input_t;
2307 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2308 template <>
2309 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2310  kmp_task_red_input_t &src) {
2311  item.reduce_orig = NULL;
2312 }
2313 template <>
2314 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2315  kmp_taskred_input_t &src) {
2316  if (src.reduce_orig != NULL) {
2317  item.reduce_orig = src.reduce_orig;
2318  } else {
2319  item.reduce_orig = src.reduce_shar;
2320  } // non-NULL reduce_orig means new interface used
2321 }
2322 
2323 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2324 template <>
2325 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2326  size_t offset) {
2327  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2328 }
2329 template <>
2330 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2331  size_t offset) {
2332  ((void (*)(void *, void *))item.reduce_init)(
2333  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2334 }
2335 
2336 template <typename T>
2337 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2338  __kmp_assert_valid_gtid(gtid);
2339  kmp_info_t *thread = __kmp_threads[gtid];
2340  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2341  kmp_uint32 nth = thread->th.th_team_nproc;
2342  kmp_taskred_data_t *arr;
2343 
2344  // check input data just in case
2345  KMP_ASSERT(tg != NULL);
2346  KMP_ASSERT(data != NULL);
2347  KMP_ASSERT(num > 0);
2348  if (nth == 1) {
2349  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2350  gtid, tg));
2351  return (void *)tg;
2352  }
2353  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2354  gtid, tg, num));
2355  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2356  thread, num * sizeof(kmp_taskred_data_t));
2357  for (int i = 0; i < num; ++i) {
2358  size_t size = data[i].reduce_size - 1;
2359  // round the size up to cache line per thread-specific item
2360  size += CACHE_LINE - size % CACHE_LINE;
2361  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2362  arr[i].reduce_shar = data[i].reduce_shar;
2363  arr[i].reduce_size = size;
2364  arr[i].flags = data[i].flags;
2365  arr[i].reduce_comb = data[i].reduce_comb;
2366  arr[i].reduce_init = data[i].reduce_init;
2367  arr[i].reduce_fini = data[i].reduce_fini;
2368  __kmp_assign_orig<T>(arr[i], data[i]);
2369  if (!arr[i].flags.lazy_priv) {
2370  // allocate cache-line aligned block and fill it with zeros
2371  arr[i].reduce_priv = __kmp_allocate(nth * size);
2372  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2373  if (arr[i].reduce_init != NULL) {
2374  // initialize all thread-specific items
2375  for (size_t j = 0; j < nth; ++j) {
2376  __kmp_call_init<T>(arr[i], j * size);
2377  }
2378  }
2379  } else {
2380  // only allocate space for pointers now,
2381  // objects will be lazily allocated/initialized if/when requested
2382  // note that __kmp_allocate zeroes the allocated memory
2383  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2384  }
2385  }
2386  tg->reduce_data = (void *)arr;
2387  tg->reduce_num_data = num;
2388  return (void *)tg;
2389 }
2390 
2405 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2406  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2407 }
2408 
2421 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2422  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2423 }
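
// Illustrative user-level sketch (guarded out) of the construct these entry
// points implement; the mapping comments summarize this file's routines, not
// verified compiler output.
#if 0
static long my_task_reduction_example(const int *v, int n) {
  long sum = 0;
  // taskgroup entry: __kmpc_taskgroup() followed by __kmpc_taskred_init()
#pragma omp taskgroup task_reduction(+ : sum)
  {
    for (int i = 0; i < n; ++i) {
#pragma omp task in_reduction(+ : sum) firstprivate(i)
      {
        // Inside the task the thread-specific copy is obtained via
        // __kmpc_task_reduction_get_th_data(gtid, tg, &sum).
        sum += v[i];
      }
    }
  } // taskgroup exit: children complete, then __kmp_task_reduction_fini()
  return sum;
}
#endif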
2424 
2425 // Copy task reduction data (except for shared pointers).
2426 template <typename T>
2427 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2428  kmp_taskgroup_t *tg, void *reduce_data) {
2429  kmp_taskred_data_t *arr;
2430  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2431  " from data %p\n",
2432  thr, tg, reduce_data));
2433  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2434  thr, num * sizeof(kmp_taskred_data_t));
2435  // threads will share private copies, thunk routines, sizes, flags, etc.:
2436  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2437  for (int i = 0; i < num; ++i) {
2438  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2439  }
2440  tg->reduce_data = (void *)arr;
2441  tg->reduce_num_data = num;
2442 }
2443 
2453 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2454  __kmp_assert_valid_gtid(gtid);
2455  kmp_info_t *thread = __kmp_threads[gtid];
2456  kmp_int32 nth = thread->th.th_team_nproc;
2457  if (nth == 1)
2458  return data; // nothing to do
2459 
2460  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2461  if (tg == NULL)
2462  tg = thread->th.th_current_task->td_taskgroup;
2463  KMP_ASSERT(tg != NULL);
2464  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2465  kmp_int32 num = tg->reduce_num_data;
2466  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2467 
2468  KMP_ASSERT(data != NULL);
2469  while (tg != NULL) {
2470  for (int i = 0; i < num; ++i) {
2471  if (!arr[i].flags.lazy_priv) {
2472  if (data == arr[i].reduce_shar ||
2473  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2474  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2475  } else {
2476  // check shared location first
2477  void **p_priv = (void **)(arr[i].reduce_priv);
2478  if (data == arr[i].reduce_shar)
2479  goto found;
2480  // check if we get some thread specific location as parameter
2481  for (int j = 0; j < nth; ++j)
2482  if (data == p_priv[j])
2483  goto found;
2484  continue; // not found, continue search
2485  found:
2486  if (p_priv[tid] == NULL) {
2487  // allocate thread specific object lazily
2488  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2489  if (arr[i].reduce_init != NULL) {
2490  if (arr[i].reduce_orig != NULL) { // new interface
2491  ((void (*)(void *, void *))arr[i].reduce_init)(
2492  p_priv[tid], arr[i].reduce_orig);
2493  } else { // old interface (single parameter)
2494  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2495  }
2496  }
2497  }
2498  return p_priv[tid];
2499  }
2500  }
2501  tg = tg->parent;
2502  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2503  num = tg->reduce_num_data;
2504  }
2505  KMP_ASSERT2(0, "Unknown task reduction item");
2506  return NULL; // ERROR, this line never executed
2507 }
2508 
2509 // Finalize task reduction.
2510 // Called from __kmpc_end_taskgroup()
2511 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2512  kmp_int32 nth = th->th.th_team_nproc;
2513  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2514  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2515  kmp_int32 num = tg->reduce_num_data;
2516  for (int i = 0; i < num; ++i) {
2517  void *sh_data = arr[i].reduce_shar;
2518  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2519  void (*f_comb)(void *, void *) =
2520  (void (*)(void *, void *))(arr[i].reduce_comb);
2521  if (!arr[i].flags.lazy_priv) {
2522  void *pr_data = arr[i].reduce_priv;
2523  size_t size = arr[i].reduce_size;
2524  for (int j = 0; j < nth; ++j) {
2525  void *priv_data = (char *)pr_data + j * size;
2526  f_comb(sh_data, priv_data); // combine results
2527  if (f_fini)
2528  f_fini(priv_data); // finalize if needed
2529  }
2530  } else {
2531  void **pr_data = (void **)(arr[i].reduce_priv);
2532  for (int j = 0; j < nth; ++j) {
2533  if (pr_data[j] != NULL) {
2534  f_comb(sh_data, pr_data[j]); // combine results
2535  if (f_fini)
2536  f_fini(pr_data[j]); // finalize if needed
2537  __kmp_free(pr_data[j]);
2538  }
2539  }
2540  }
2541  __kmp_free(arr[i].reduce_priv);
2542  }
2543  __kmp_thread_free(th, arr);
2544  tg->reduce_data = NULL;
2545  tg->reduce_num_data = 0;
2546 }
2547 
2548 // Clean up task reduction data for a parallel or worksharing construct;
2549 // do not touch task-private data that other threads are still working with.
2550 // Called from __kmpc_end_taskgroup()
2551 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2552  __kmp_thread_free(th, tg->reduce_data);
2553  tg->reduce_data = NULL;
2554  tg->reduce_num_data = 0;
2555 }
2556 
2557 template <typename T>
2558 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2559  int num, T *data) {
2560  __kmp_assert_valid_gtid(gtid);
2561  kmp_info_t *thr = __kmp_threads[gtid];
2562  kmp_int32 nth = thr->th.th_team_nproc;
2563  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2564  if (nth == 1) {
2565  KA_TRACE(10,
2566  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2567  gtid, thr->th.th_current_task->td_taskgroup));
2568  return (void *)thr->th.th_current_task->td_taskgroup;
2569  }
2570  kmp_team_t *team = thr->th.th_team;
2571  void *reduce_data;
2572  kmp_taskgroup_t *tg;
2573  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2574  if (reduce_data == NULL &&
2575  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2576  (void *)1)) {
2577  // single thread enters this block to initialize common reduction data
2578  KMP_DEBUG_ASSERT(reduce_data == NULL);
2579  // first initialize own data, then make a copy other threads can use
2580  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2581  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2582  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2583  // fini counters should be 0 at this point
2584  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2585  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2586  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2587  } else {
2588  while (
2589  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2590  (void *)1) { // wait for task reduction initialization
2591  KMP_CPU_PAUSE();
2592  }
2593  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2594  tg = thr->th.th_current_task->td_taskgroup;
2595  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2596  }
2597  return tg;
2598 }
2599 
2616 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2617  int num, void *data) {
2618  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2619  (kmp_task_red_input_t *)data);
2620 }
2621 
2636 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2637  void *data) {
2638  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2639  (kmp_taskred_input_t *)data);
2640 }
2641 
2650 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2651  __kmpc_end_taskgroup(loc, gtid);
2652 }
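
// Illustrative user-level sketch (guarded out) of the 'task' reduction
// modifier served by the two wrappers above (is_ws selects worksharing vs.
// parallel); the lowering note is a summary, not verified compiler output.
#if 0
static long my_task_modifier_example(const int *v, int n) {
  long sum = 0;
  // Brackets the region with __kmpc_task_reduction_modifier_init(loc, gtid,
  // /*is_ws=*/0, 1, input) and __kmpc_task_reduction_modifier_fini(loc, gtid, 0).
#pragma omp parallel reduction(task, + : sum)
#pragma omp single
  for (int i = 0; i < n; ++i) {
#pragma omp task in_reduction(+ : sum) firstprivate(i)
    sum += v[i];
  }
  return sum;
}
#endif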
2653 
2654 // __kmpc_taskgroup: Start a new taskgroup
2655 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2656  __kmp_assert_valid_gtid(gtid);
2657  kmp_info_t *thread = __kmp_threads[gtid];
2658  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2659  kmp_taskgroup_t *tg_new =
2660  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2661  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2662  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2663  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2664  tg_new->parent = taskdata->td_taskgroup;
2665  tg_new->reduce_data = NULL;
2666  tg_new->reduce_num_data = 0;
2667  tg_new->gomp_data = NULL;
2668  taskdata->td_taskgroup = tg_new;
2669 
2670 #if OMPT_SUPPORT && OMPT_OPTIONAL
2671  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2672  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2673  if (!codeptr)
2674  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2675  kmp_team_t *team = thread->th.th_team;
2676  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2677  // FIXME: I think this is wrong for lwt!
2678  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2679 
2680  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2681  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2682  &(my_task_data), codeptr);
2683  }
2684 #endif
2685 }
2686 
2687 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2688 // and its descendants are complete
2689 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2690  __kmp_assert_valid_gtid(gtid);
2691  kmp_info_t *thread = __kmp_threads[gtid];
2692  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2693  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2694  int thread_finished = FALSE;
2695 
2696 #if OMPT_SUPPORT && OMPT_OPTIONAL
2697  kmp_team_t *team;
2698  ompt_data_t my_task_data;
2699  ompt_data_t my_parallel_data;
2700  void *codeptr = nullptr;
2701  if (UNLIKELY(ompt_enabled.enabled)) {
2702  team = thread->th.th_team;
2703  my_task_data = taskdata->ompt_task_info.task_data;
2704  // FIXME: I think this is wrong for lwt!
2705  my_parallel_data = team->t.ompt_team_info.parallel_data;
2706  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2707  if (!codeptr)
2708  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2709  }
2710 #endif
2711 
2712  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2713  KMP_DEBUG_ASSERT(taskgroup != NULL);
2714  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2715 
2716  if (__kmp_tasking_mode != tskm_immediate_exec) {
2717  // mark the task as waiting (not on a barrier)
2718  taskdata->td_taskwait_counter += 1;
2719  taskdata->td_taskwait_ident = loc;
2720  taskdata->td_taskwait_thread = gtid + 1;
2721 #if USE_ITT_BUILD
2722  // For ITT the taskgroup wait is similar to taskwait until we need to
2723  // distinguish them
2724  void *itt_sync_obj = NULL;
2725 #if USE_ITT_NOTIFY
2726  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2727 #endif /* USE_ITT_NOTIFY */
2728 #endif /* USE_ITT_BUILD */
2729 
2730 #if OMPT_SUPPORT && OMPT_OPTIONAL
2731  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2732  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2733  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2734  &(my_task_data), codeptr);
2735  }
2736 #endif
2737 
2738  if (!taskdata->td_flags.team_serial ||
2739  (thread->th.th_task_team != NULL &&
2740  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2741  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2742  kmp_flag_32<false, false> flag(
2743  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2744  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2745  flag.execute_tasks(thread, gtid, FALSE,
2746  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2747  __kmp_task_stealing_constraint);
2748  }
2749  }
2750  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2751 
2752 #if OMPT_SUPPORT && OMPT_OPTIONAL
2753  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2754  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2755  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2756  &(my_task_data), codeptr);
2757  }
2758 #endif
2759 
2760 #if USE_ITT_BUILD
2761  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2762  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2763 #endif /* USE_ITT_BUILD */
2764  }
2765  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2766 
2767  if (taskgroup->reduce_data != NULL &&
2768  !taskgroup->gomp_data) { // need to reduce?
2769  int cnt;
2770  void *reduce_data;
2771  kmp_team_t *t = thread->th.th_team;
2772  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2773  // check whether <priv> data of the first reduction variable is shared for the team
2774  void *priv0 = arr[0].reduce_priv;
2775  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2776  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2777  // finishing task reduction on parallel
2778  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2779  if (cnt == thread->th.th_team_nproc - 1) {
2780  // we are the last thread passing __kmpc_reduction_modifier_fini()
2781  // finalize task reduction:
2782  __kmp_task_reduction_fini(thread, taskgroup);
2783  // cleanup fields in the team structure:
2784  // TODO: is relaxed store enough here (whole barrier should follow)?
2785  __kmp_thread_free(thread, reduce_data);
2786  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2787  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2788  } else {
2789  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2790  // so do not finalize reduction, just clean own copy of the data
2791  __kmp_task_reduction_clean(thread, taskgroup);
2792  }
2793  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2794  NULL &&
2795  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2796  // finishing task reduction on worksharing
2797  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2798  if (cnt == thread->th.th_team_nproc - 1) {
2799  // we are the last thread passing __kmpc_reduction_modifier_fini()
2800  __kmp_task_reduction_fini(thread, taskgroup);
2801  // cleanup fields in team structure:
2802  // TODO: is relaxed store enough here (whole barrier should follow)?
2803  __kmp_thread_free(thread, reduce_data);
2804  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2805  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2806  } else {
2807  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2808  // so do not finalize reduction, just clean own copy of the data
2809  __kmp_task_reduction_clean(thread, taskgroup);
2810  }
2811  } else {
2812  // finishing task reduction on taskgroup
2813  __kmp_task_reduction_fini(thread, taskgroup);
2814  }
2815  }
2816  // Restore parent taskgroup for the current task
2817  taskdata->td_taskgroup = taskgroup->parent;
2818  __kmp_thread_free(thread, taskgroup);
2819 
2820  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2821  gtid, taskdata));
2822 
2823 #if OMPT_SUPPORT && OMPT_OPTIONAL
2824  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2825  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2826  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2827  &(my_task_data), codeptr);
2828  }
2829 #endif
2830 }
2831 
2832 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2833  kmp_task_team_t *task_team,
2834  kmp_int32 is_constrained) {
2835  kmp_task_t *task = NULL;
2836  kmp_taskdata_t *taskdata;
2837  kmp_taskdata_t *current;
2838  kmp_thread_data_t *thread_data;
2839  int ntasks = task_team->tt.tt_num_task_pri;
2840  if (ntasks == 0) {
2841  KA_TRACE(
2842  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2843  return NULL;
2844  }
2845  do {
2846  // decrement num_tasks to "reserve" one task for execution
2847  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2848  ntasks - 1))
2849  break;
2850  } while (ntasks > 0);
2851  if (ntasks == 0) {
2852  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2853  __kmp_get_gtid()));
2854  return NULL;
2855  }
2856  // We got a "ticket" to get a "reserved" priority task
2857  int deque_ntasks;
2858  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2859  do {
2860  KMP_ASSERT(list != NULL);
2861  thread_data = &list->td;
2862  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2863  deque_ntasks = thread_data->td.td_deque_ntasks;
2864  if (deque_ntasks == 0) {
2865  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2866  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2867  __kmp_get_gtid(), thread_data));
2868  list = list->next;
2869  }
2870  } while (deque_ntasks == 0);
2871  KMP_DEBUG_ASSERT(deque_ntasks);
2872  int target = thread_data->td.td_deque_head;
2873  current = __kmp_threads[gtid]->th.th_current_task;
2874  taskdata = thread_data->td.td_deque[target];
2875  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2876  // Bump head pointer and Wrap.
2877  thread_data->td.td_deque_head =
2878  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2879  } else {
2880  if (!task_team->tt.tt_untied_task_encountered) {
2881  // The TSC does not allow stealing the victim task
2882  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2883  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2884  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2885  gtid, thread_data, task_team, deque_ntasks, target,
2886  thread_data->td.td_deque_tail));
2887  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2888  return NULL;
2889  }
2890  int i;
2891  // walk through the deque trying to steal any task
2892  taskdata = NULL;
2893  for (i = 1; i < deque_ntasks; ++i) {
2894  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2895  taskdata = thread_data->td.td_deque[target];
2896  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2897  break; // found task to execute
2898  } else {
2899  taskdata = NULL;
2900  }
2901  }
2902  if (taskdata == NULL) {
2903  // No appropriate candidate found to execute
2904  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2905  KA_TRACE(
2906  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2907  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2908  gtid, thread_data, task_team, deque_ntasks,
2909  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2910  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2911  return NULL;
2912  }
2913  int prev = target;
2914  for (i = i + 1; i < deque_ntasks; ++i) {
2915  // shift remaining tasks in the deque left by 1
2916  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2917  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2918  prev = target;
2919  }
2920  KMP_DEBUG_ASSERT(
2921  thread_data->td.td_deque_tail ==
2922  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2923  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2924  }
2925  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2926  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2927  task = KMP_TASKDATA_TO_TASK(taskdata);
2928  return task;
2929 }
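
// Generic sketch (guarded out) of the "reserve one by CAS" idea used above on
// tt_num_task_pri; std::atomic (already used elsewhere in this file) stands in
// for the runtime's atomics, so this illustrates the pattern rather than
// reproducing the runtime code.
#if 0
static bool my_reserve_one(std::atomic<kmp_int32> &available) {
  kmp_int32 n = available.load(std::memory_order_relaxed);
  while (n > 0) {
    // On success we own one reserved slot; on failure 'n' is reloaded with the
    // current value and the loop retries.
    if (available.compare_exchange_weak(n, n - 1, std::memory_order_acq_rel))
      return true;
  }
  return false; // nothing left to reserve
}
#endif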
2930 
2931 // __kmp_remove_my_task: remove a task from my own deque
2932 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2933  kmp_task_team_t *task_team,
2934  kmp_int32 is_constrained) {
2935  kmp_task_t *task;
2936  kmp_taskdata_t *taskdata;
2937  kmp_thread_data_t *thread_data;
2938  kmp_uint32 tail;
2939 
2940  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2941  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2942  NULL); // Caller should check this condition
2943 
2944  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2945 
2946  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2947  gtid, thread_data->td.td_deque_ntasks,
2948  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2949 
2950  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2951  KA_TRACE(10,
2952  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2953  "ntasks=%d head=%u tail=%u\n",
2954  gtid, thread_data->td.td_deque_ntasks,
2955  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2956  return NULL;
2957  }
2958 
2959  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2960 
2961  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2962  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2963  KA_TRACE(10,
2964  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2965  "ntasks=%d head=%u tail=%u\n",
2966  gtid, thread_data->td.td_deque_ntasks,
2967  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2968  return NULL;
2969  }
2970 
2971  tail = (thread_data->td.td_deque_tail - 1) &
2972  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2973  taskdata = thread_data->td.td_deque[tail];
2974 
2975  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2976  thread->th.th_current_task)) {
2977  // The TSC does not allow taking the tail task
2978  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2979  KA_TRACE(10,
2980  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2981  "ntasks=%d head=%u tail=%u\n",
2982  gtid, thread_data->td.td_deque_ntasks,
2983  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2984  return NULL;
2985  }
2986 
2987  thread_data->td.td_deque_tail = tail;
2988  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2989 
2990  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2991 
2992  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2993  "ntasks=%d head=%u tail=%u\n",
2994  gtid, taskdata, thread_data->td.td_deque_ntasks,
2995  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2996 
2997  task = KMP_TASKDATA_TO_TASK(taskdata);
2998  return task;
2999 }
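// Illustrative sketch (not part of the runtime, guarded out of the build):
// the per-thread deque above is a power-of-two ring buffer, so head/tail
// wrap-around uses a bit mask instead of a modulo. A minimal standalone
// model; the toy_* names are hypothetical and assume a fixed capacity of 8.
#if 0
#include <cassert>
#include <cstdint>

struct toy_deque {
  enum { kSize = 8, kMask = kSize - 1 }; // kMask plays the role of TASK_DEQUE_MASK
  void *slots[kSize];
  uint32_t head = 0, tail = 0, ntasks = 0;

  void push_tail(void *t) { // owner pushes at the tail (cf. __kmp_push_task)
    assert(ntasks < kSize);
    slots[tail] = t;
    tail = (tail + 1) & kMask; // wrap with the mask
    ++ntasks;
  }
  void *pop_tail() { // owner pops its newest task (cf. __kmp_remove_my_task)
    assert(ntasks > 0);
    tail = (tail - 1) & kMask; // unsigned decrement, then mask
    --ntasks;
    return slots[tail];
  }
  void *pop_head() { // a thief takes the oldest task (cf. __kmp_steal_task)
    assert(ntasks > 0);
    void *t = slots[head];
    head = (head + 1) & kMask;
    --ntasks;
    return t;
  }
};

int main() {
  toy_deque d;
  int a = 1, b = 2;
  d.push_tail(&a);
  d.push_tail(&b);
  assert(d.pop_tail() == &b); // LIFO for the owner
  assert(d.pop_head() == &a); // FIFO for thieves
  return 0;
}
#endif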
3000 
3001 // __kmp_steal_task: remove a task from another thread's deque
3002 // Assumes that the calling thread has already checked that the victim's
3003 // task_team thread_data exists before calling this routine.
3004 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
3005  kmp_task_team_t *task_team,
3006  std::atomic<kmp_int32> *unfinished_threads,
3007  int *thread_finished,
3008  kmp_int32 is_constrained) {
3009  kmp_task_t *task;
3010  kmp_taskdata_t *taskdata;
3011  kmp_taskdata_t *current;
3012  kmp_thread_data_t *victim_td, *threads_data;
3013  kmp_int32 target;
3014  kmp_int32 victim_tid;
3015 
3016  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3017 
3018  threads_data = task_team->tt.tt_threads_data;
3019  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3020 
3021  victim_tid = victim_thr->th.th_info.ds.ds_tid;
3022  victim_td = &threads_data[victim_tid];
3023 
3024  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3025  "task_team=%p ntasks=%d head=%u tail=%u\n",
3026  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3027  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3028  victim_td->td.td_deque_tail));
3029 
3030  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3031  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3032  "task_team=%p ntasks=%d head=%u tail=%u\n",
3033  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3034  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3035  victim_td->td.td_deque_tail));
3036  return NULL;
3037  }
3038 
3039  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3040 
3041  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3042  // Check again after we acquire the lock
3043  if (ntasks == 0) {
3044  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3045  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3046  "task_team=%p ntasks=%d head=%u tail=%u\n",
3047  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3048  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3049  return NULL;
3050  }
3051 
3052  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3053  current = __kmp_threads[gtid]->th.th_current_task;
3054  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3055  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3056  // Bump head pointer and Wrap.
3057  victim_td->td.td_deque_head =
3058  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3059  } else {
3060  if (!task_team->tt.tt_untied_task_encountered) {
3061  // The TSC does not allow stealing the victim task
3062  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3063  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3064  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3065  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3066  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3067  return NULL;
3068  }
3069  int i;
3070  // walk through victim's deque trying to steal any task
3071  target = victim_td->td.td_deque_head;
3072  taskdata = NULL;
3073  for (i = 1; i < ntasks; ++i) {
3074  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3075  taskdata = victim_td->td.td_deque[target];
3076  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3077  break; // found victim task
3078  } else {
3079  taskdata = NULL;
3080  }
3081  }
3082  if (taskdata == NULL) {
3083  // No appropriate candidate to steal found
3084  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3085  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3086  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3087  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3088  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3089  return NULL;
3090  }
3091  int prev = target;
3092  for (i = i + 1; i < ntasks; ++i) {
3093  // shift remaining tasks in the deque left by 1
3094  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3095  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3096  prev = target;
3097  }
3098  KMP_DEBUG_ASSERT(
3099  victim_td->td.td_deque_tail ==
3100  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3101  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3102  }
3103  if (*thread_finished) {
3104  // We need to un-mark this victim as a finished victim. This must be done
3105  // before releasing the lock, or else other threads (starting with the
3106  // primary thread victim) might be prematurely released from the barrier!!!
3107 #if KMP_DEBUG
3108  kmp_int32 count =
3109 #endif
3110  KMP_ATOMIC_INC(unfinished_threads);
3111  KA_TRACE(
3112  20,
3113  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3114  gtid, count + 1, task_team));
3115  *thread_finished = FALSE;
3116  }
3117  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3118 
3119  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3120 
3121  KMP_COUNT_BLOCK(TASK_stolen);
3122  KA_TRACE(10,
3123  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3124  "task_team=%p ntasks=%d head=%u tail=%u\n",
3125  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3126  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3127 
3128  task = KMP_TASKDATA_TO_TASK(taskdata);
3129  return task;
3130 }
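// Illustrative sketch (not part of the runtime, guarded out of the build):
// when the TSC forces the thief to take an entry from the middle of the
// victim's ring buffer, the entries between that slot and the tail are
// shifted down by one and the tail retreats by one. A simplified standalone
// model; remove_and_compact is a hypothetical stand-in for the loop above.
#if 0
#include <cassert>

// Remove the entry at ring index 'target' and close the gap, mirroring the
// shift loop in __kmp_steal_task. 'i' is the scan position one past the
// removed entry; returns the new tail index.
static unsigned remove_and_compact(int *deque, unsigned mask, unsigned target,
                                   unsigned tail, unsigned ntasks, unsigned i) {
  unsigned prev = target;
  for (; i < ntasks; ++i) {
    target = (target + 1) & mask;
    deque[prev] = deque[target]; // slide the next entry into the hole
    prev = target;
  }
  assert(tail == ((target + 1) & mask));
  return target; // tail -= 1 (wrapped)
}

int main() {
  int deque[8] = {10, 20, 30, 40}; // head = 0, tail = 4, ntasks = 4
  // Take the entry at ring index 1 (value 20), found at scan position 1.
  unsigned new_tail = remove_and_compact(deque, 7u, 1u, 4u, 4u, 2u);
  assert(new_tail == 3u);
  assert(deque[0] == 10 && deque[1] == 30 && deque[2] == 40);
  return 0;
}
#endif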
3131 
3132 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3133 // condition is satisfied (return true) or there are none left (return false).
3134 //
3135 // final_spin is TRUE if this is the spin at the release barrier.
3136 // thread_finished indicates whether the thread is finished executing all
3137 // the tasks it has on its deque, and is at the release barrier.
3138 // spinner is the location on which to spin.
3139 // spinner == NULL means only execute a single task and return.
3140 // checker is the value to check to terminate the spin.
3141 template <class C>
3142 static inline int __kmp_execute_tasks_template(
3143  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3144  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3145  kmp_int32 is_constrained) {
3146  kmp_task_team_t *task_team = thread->th.th_task_team;
3147  kmp_thread_data_t *threads_data;
3148  kmp_task_t *task;
3149  kmp_info_t *other_thread;
3150  kmp_taskdata_t *current_task = thread->th.th_current_task;
3151  std::atomic<kmp_int32> *unfinished_threads;
3152  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3153  tid = thread->th.th_info.ds.ds_tid;
3154 
3155  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3156  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3157 
3158  if (task_team == NULL || current_task == NULL)
3159  return FALSE;
3160 
3161  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3162  "*thread_finished=%d\n",
3163  gtid, final_spin, *thread_finished));
3164 
3165  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3166  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3167 
3168  KMP_DEBUG_ASSERT(threads_data != NULL);
3169 
3170  nthreads = task_team->tt.tt_nproc;
3171  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3172  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
3173  task_team->tt.tt_hidden_helper_task_encountered);
3174  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3175 
3176  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3177  // getting tasks from target constructs
3178  while (1) { // Inner loop to find a task and execute it
3179  task = NULL;
3180  if (task_team->tt.tt_num_task_pri) { // get priority task first
3181  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3182  }
3183  if (task == NULL && use_own_tasks) { // check own queue next
3184  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3185  }
3186  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3187  int asleep = 1;
3188  use_own_tasks = 0;
3189  // Try to steal from the last place I stole from successfully.
3190  if (victim_tid == -2) { // haven't stolen anything yet
3191  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3192  if (victim_tid !=
3193  -1) // if we have a last stolen from victim, get the thread
3194  other_thread = threads_data[victim_tid].td.td_thr;
3195  }
3196  if (victim_tid != -1) { // found last victim
3197  asleep = 0;
3198  } else if (!new_victim) { // no recent steals and we haven't already
3199  // used a new victim; select a random thread
3200  do { // Find a different thread to steal work from.
3201  // Pick a random thread. Initial plan was to cycle through all the
3202  // threads, and only return if we tried to steal from every thread,
3203  // and failed. Arch says that's not such a great idea.
3204  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3205  if (victim_tid >= tid) {
3206  ++victim_tid; // Adjusts random distribution to exclude self
3207  }
3208  // Found a potential victim
3209  other_thread = threads_data[victim_tid].td.td_thr;
3210  // There is a slight chance that __kmp_enable_tasking() did not wake
3211  // up all threads waiting at the barrier. If victim is sleeping,
3212  // then wake it up. Since we were going to pay the cache miss
3213  // penalty for referencing another thread's kmp_info_t struct
3214  // anyway, the check shouldn't cost too much performance at this
3215  // point. In
3216  // extra barrier mode, threads do not sleep at the separate tasking
3217  // barrier, so this isn't a problem.
3218  asleep = 0;
3219  if ((__kmp_tasking_mode == tskm_task_teams) &&
3220  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3221  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3222  NULL)) {
3223  asleep = 1;
3224  __kmp_null_resume_wrapper(other_thread);
3225  // A sleeping thread should not have any tasks on its queue.
3226  // There is a slight possibility that it resumes, steals a task
3227  // from another thread, which spawns more tasks, all in the time
3228  // that it takes this thread to check => don't write an assertion
3229  // that the victim's queue is empty. Try stealing from a
3230  // different thread.
3231  }
3232  } while (asleep);
3233  }
3234 
3235  if (!asleep) {
3236  // We have a victim to try to steal from
3237  task = __kmp_steal_task(other_thread, gtid, task_team,
3238  unfinished_threads, thread_finished,
3239  is_constrained);
3240  }
3241  if (task != NULL) { // set last stolen to victim
3242  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3243  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3244  // The pre-refactored code did not try more than 1 successful new
3245  // victim, unless the last one generated more local tasks;
3246  // new_victim keeps track of this
3247  new_victim = 1;
3248  }
3249  } else { // No tasks found; unset last_stolen
3250  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3251  victim_tid = -2; // no successful victim found
3252  }
3253  }
3254 
3255  if (task == NULL)
3256  break; // break out of tasking loop
3257 
3258 // Found a task; execute it
3259 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3260  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3261  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3262  // get the object reliably
3263  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3264  }
3265  __kmp_itt_task_starting(itt_sync_obj);
3266  }
3267 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3268  __kmp_invoke_task(gtid, task, current_task);
3269 #if USE_ITT_BUILD
3270  if (itt_sync_obj != NULL)
3271  __kmp_itt_task_finished(itt_sync_obj);
3272 #endif /* USE_ITT_BUILD */
3273  // If this thread is only partway through the barrier and the condition is
3274  // met, then return now, so that the barrier gather/release pattern can
3275  // proceed. If this thread is in the last spin loop in the barrier,
3276  // waiting to be released, we know that the termination condition will not
3277  // be satisfied, so don't waste any cycles checking it.
3278  if (flag == NULL || (!final_spin && flag->done_check())) {
3279  KA_TRACE(
3280  15,
3281  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3282  gtid));
3283  return TRUE;
3284  }
3285  if (thread->th.th_task_team == NULL) {
3286  break;
3287  }
3288  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3289  // If execution of a stolen task results in more tasks being placed on our
3290  // run queue, reset use_own_tasks
3291  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3292  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3293  "other tasks, restart\n",
3294  gtid));
3295  use_own_tasks = 1;
3296  new_victim = 0;
3297  }
3298  }
3299 
3300  // The task source has been exhausted. If in final spin loop of barrier,
3301  // check if termination condition is satisfied. The work queue may be empty
3302  // but there might be proxy tasks still executing.
3303  if (final_spin &&
3304  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3305  // First, decrement the #unfinished threads, if that has not already been
3306  // done. This decrement might be to the spin location, and result in the
3307  // termination condition being satisfied.
3308  if (!*thread_finished) {
3309 #if KMP_DEBUG
3310  kmp_int32 count = -1 +
3311 #endif
3312  KMP_ATOMIC_DEC(unfinished_threads);
3313  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3314  "unfinished_threads to %d task_team=%p\n",
3315  gtid, count, task_team));
3316  *thread_finished = TRUE;
3317  }
3318 
3319  // It is now unsafe to reference thread->th.th_team !!!
3320  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3321  // thread to pass through the barrier, where it might reset each thread's
3322  // th.th_team field for the next parallel region. If we can steal more
3323  // work, we know that this has not happened yet.
3324  if (flag != NULL && flag->done_check()) {
3325  KA_TRACE(
3326  15,
3327  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3328  gtid));
3329  return TRUE;
3330  }
3331  }
3332 
3333  // If this thread's task team is NULL, primary thread has recognized that
3334  // there are no more tasks; bail out
3335  if (thread->th.th_task_team == NULL) {
3336  KA_TRACE(15,
3337  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3338  return FALSE;
3339  }
3340 
3341  // Check the flag again to see if it has already been satisfied, to avoid
3342  // being trapped in an infinite loop when an if0 task depends on a hidden
3343  // helper task outside any parallel region. Detached tasks are not impacted
3344  // in this case because the only thread executing this function has to
3345  // execute the proxy task, so it is on another code path with the same check.
3346  if (flag == NULL || (!final_spin && flag->done_check())) {
3347  KA_TRACE(15,
3348  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3349  gtid));
3350  return TRUE;
3351  }
3352 
3353  // We could be getting tasks from target constructs; if this is the only
3354  // thread, keep trying to execute tasks from own queue
3355  if (nthreads == 1 &&
3356  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3357  use_own_tasks = 1;
3358  else {
3359  KA_TRACE(15,
3360  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3361  return FALSE;
3362  }
3363  }
3364 }
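// Illustrative sketch (not part of the runtime, guarded out of the build):
// the victim selection above draws a value in [0, nthreads-2] and bumps it by
// one when it lands at or above the caller's tid, giving a uniform pick over
// the other nthreads-1 threads and never self. Standalone model assuming
// std::mt19937 in place of __kmp_get_random; pick_victim is hypothetical.
#if 0
#include <cassert>
#include <random>
#include <vector>

// Pick a victim tid uniformly from {0, ..., nthreads-1} excluding self_tid.
static int pick_victim(std::mt19937 &rng, int self_tid, int nthreads) {
  int victim = (int)(rng() % (unsigned)(nthreads - 1));
  if (victim >= self_tid)
    ++victim; // shift the upper half past self to exclude it
  return victim;
}

int main() {
  std::mt19937 rng(42);
  const int nthreads = 4, self = 2;
  std::vector<int> hits(nthreads, 0);
  for (int n = 0; n < 100000; ++n) {
    int v = pick_victim(rng, self, nthreads);
    assert(v != self && v >= 0 && v < nthreads);
    ++hits[v];
  }
  assert(hits[self] == 0); // self is never selected
  return 0;
}
#endif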
3365 
3366 template <bool C, bool S>
3367 int __kmp_execute_tasks_32(
3368  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3369  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3370  kmp_int32 is_constrained) {
3371  return __kmp_execute_tasks_template(
3372  thread, gtid, flag, final_spin,
3373  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3374 }
3375 
3376 template <bool C, bool S>
3377 int __kmp_execute_tasks_64(
3378  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3379  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3380  kmp_int32 is_constrained) {
3381  return __kmp_execute_tasks_template(
3382  thread, gtid, flag, final_spin,
3383  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3384 }
3385 
3386 template <bool C, bool S>
3387 int __kmp_atomic_execute_tasks_64(
3388  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3389  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3390  kmp_int32 is_constrained) {
3391  return __kmp_execute_tasks_template(
3392  thread, gtid, flag, final_spin,
3393  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3394 }
3395 
3396 int __kmp_execute_tasks_oncore(
3397  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3398  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3399  kmp_int32 is_constrained) {
3400  return __kmp_execute_tasks_template(
3401  thread, gtid, flag, final_spin,
3402  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3403 }
3404 
3405 template int
3406 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3407  kmp_flag_32<false, false> *, int,
3408  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3409 
3410 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3411  kmp_flag_64<false, true> *,
3412  int,
3413  int *USE_ITT_BUILD_ARG(void *),
3414  kmp_int32);
3415 
3416 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3417  kmp_flag_64<true, false> *,
3418  int,
3419  int *USE_ITT_BUILD_ARG(void *),
3420  kmp_int32);
3421 
3422 template int __kmp_atomic_execute_tasks_64<false, true>(
3423  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3424  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3425 
3426 template int __kmp_atomic_execute_tasks_64<true, false>(
3427  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3428  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3429 
3430 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3431 // next barrier so they can assist in executing enqueued tasks.
3432 // The first thread in allocates the task team data atomically.
3433 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3434  kmp_info_t *this_thr) {
3435  kmp_thread_data_t *threads_data;
3436  int nthreads, i, is_init_thread;
3437 
3438  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3439  __kmp_gtid_from_thread(this_thr)));
3440 
3441  KMP_DEBUG_ASSERT(task_team != NULL);
3442  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3443 
3444  nthreads = task_team->tt.tt_nproc;
3445  KMP_DEBUG_ASSERT(nthreads > 0);
3446  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3447 
3448  // Allocate or increase the size of threads_data if necessary
3449  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3450 
3451  if (!is_init_thread) {
3452  // Some other thread already set up the array.
3453  KA_TRACE(
3454  20,
3455  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3456  __kmp_gtid_from_thread(this_thr)));
3457  return;
3458  }
3459  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3460  KMP_DEBUG_ASSERT(threads_data != NULL);
3461 
3462  if (__kmp_tasking_mode == tskm_task_teams &&
3463  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3464  // Release any threads sleeping at the barrier, so that they can steal
3465  // tasks and execute them. In extra barrier mode, threads do not sleep
3466  // at the separate tasking barrier, so this isn't a problem.
3467  for (i = 0; i < nthreads; i++) {
3468  void *sleep_loc;
3469  kmp_info_t *thread = threads_data[i].td.td_thr;
3470 
3471  if (i == this_thr->th.th_info.ds.ds_tid) {
3472  continue;
3473  }
3474  // Since we haven't locked the thread's suspend mutex at this
3475  // point, there is a small window where a thread might be putting
3476  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3477  // To work around this, __kmp_execute_tasks_template() periodically checks
3478  // to see if other threads are sleeping (using the same random mechanism that
3479  // is used for task stealing) and awakens them if they are.
3480  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3481  NULL) {
3482  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3483  __kmp_gtid_from_thread(this_thr),
3484  __kmp_gtid_from_thread(thread)));
3485  __kmp_null_resume_wrapper(thread);
3486  } else {
3487  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3488  __kmp_gtid_from_thread(this_thr),
3489  __kmp_gtid_from_thread(thread)));
3490  }
3491  }
3492  }
3493 
3494  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3495  __kmp_gtid_from_thread(this_thr)));
3496 }
3497 
3498 /* // TODO: Check the comment consistency
3499  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3500  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3501  * After a child thread checks into a barrier and calls __kmp_release() from
3502  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3503  * longer assume that the kmp_team_t structure is intact (at any moment, the
3504  * primary thread may exit the barrier code and free the team data structure,
3505  * and return the threads to the thread pool).
3506  *
3507  * This does not work with the tasking code, as the thread is still
3508  * expected to participate in the execution of any tasks that may have been
3509  * spawned by a member of the team, and the thread still needs access to
3510  * each thread in the team so that it can steal work from it.
3511  *
3512  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3513  * counting mechanism, and is allocated by the primary thread before calling
3514  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3515  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3516  * of the kmp_task_team_t structs for consecutive barriers can overlap
3517  * (and will, unless the primary thread is the last thread to exit the barrier
3518  * release phase, which is not typical). The existence of such a struct
3519  * could be useful outside the context of tasking.
3520  *
3521  * We currently use the existence of the threads array as an indicator that
3522  * tasks were spawned since the last barrier. If the structure is to be
3523  * useful outside the context of tasking, then this will have to change, but
3524  * not setting the field minimizes the performance impact of tasking on
3525  * barriers, when no explicit tasks were spawned (pushed, actually).
3526  */
3527 
3528 static kmp_task_team_t *__kmp_free_task_teams =
3529  NULL; // Free list for task_team data structures
3530 // Lock for task team data structures
3531 kmp_bootstrap_lock_t __kmp_task_team_lock =
3532  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3533 
3534 // __kmp_alloc_task_deque:
3535 // Allocates a task deque for a particular thread, and initializes the necessary
3536 // data structures relating to the deque. This only happens once per thread
3537 // per task team since task teams are recycled. No lock is needed during
3538 // allocation since each thread allocates its own deque.
3539 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3540  kmp_thread_data_t *thread_data) {
3541  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3542  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3543 
3544  // Initialize last stolen task field to "none"
3545  thread_data->td.td_deque_last_stolen = -1;
3546 
3547  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3548  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3549  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3550 
3551  KE_TRACE(
3552  10,
3553  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3554  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3555  // Allocate space for task deque, and zero the deque
3556  // Cannot use __kmp_thread_calloc() because threads not around for
3557  // kmp_reap_task_team( ).
3558  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3559  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3560  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3561 }
3562 
3563 // __kmp_free_task_deque:
3564 // Deallocates a task deque for a particular thread. Happens at library
3565 // deallocation, so there is no need to reset all thread data fields.
3566 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3567  if (thread_data->td.td_deque != NULL) {
3568  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3569  TCW_4(thread_data->td.td_deque_ntasks, 0);
3570  __kmp_free(thread_data->td.td_deque);
3571  thread_data->td.td_deque = NULL;
3572  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3573  }
3574 
3575 #ifdef BUILD_TIED_TASK_STACK
3576  // GEH: Figure out what to do here for td_susp_tied_tasks
3577  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3578  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3579  }
3580 #endif // BUILD_TIED_TASK_STACK
3581 }
3582 
3583 // __kmp_realloc_task_threads_data:
3584 // Allocates a threads_data array for a task team, either by allocating an
3585 // initial array or enlarging an existing array. Only the first thread to get
3586 // the lock allocs or enlarges the array and re-initializes the array elements.
3587 // That thread returns "TRUE", the rest return "FALSE".
3588 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3589 // The current size is given by task_team -> tt.tt_max_threads.
3590 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3591  kmp_task_team_t *task_team) {
3592  kmp_thread_data_t **threads_data_p;
3593  kmp_int32 nthreads, maxthreads;
3594  int is_init_thread = FALSE;
3595 
3596  if (TCR_4(task_team->tt.tt_found_tasks)) {
3597  // Already reallocated and initialized.
3598  return FALSE;
3599  }
3600 
3601  threads_data_p = &task_team->tt.tt_threads_data;
3602  nthreads = task_team->tt.tt_nproc;
3603  maxthreads = task_team->tt.tt_max_threads;
3604 
3605  // All threads must lock when they encounter the first task of the implicit
3606  // task region to make sure threads_data fields are (re)initialized before
3607  // used.
3608  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3609 
3610  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3611  // first thread to enable tasking
3612  kmp_team_t *team = thread->th.th_team;
3613  int i;
3614 
3615  is_init_thread = TRUE;
3616  if (maxthreads < nthreads) {
3617 
3618  if (*threads_data_p != NULL) {
3619  kmp_thread_data_t *old_data = *threads_data_p;
3620  kmp_thread_data_t *new_data = NULL;
3621 
3622  KE_TRACE(
3623  10,
3624  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3625  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3626  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3627  // Reallocate threads_data to have more elements than current array
3628  // Cannot use __kmp_thread_realloc() because threads not around for
3629  // kmp_reap_task_team( ). Note all new array entries are initialized
3630  // to zero by __kmp_allocate().
3631  new_data = (kmp_thread_data_t *)__kmp_allocate(
3632  nthreads * sizeof(kmp_thread_data_t));
3633  // copy old data to new data
3634  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3635  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3636 
3637 #ifdef BUILD_TIED_TASK_STACK
3638  // GEH: Figure out if this is the right thing to do
3639  for (i = maxthreads; i < nthreads; i++) {
3640  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3641  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3642  }
3643 #endif // BUILD_TIED_TASK_STACK
3644  // Install the new data and free the old data
3645  (*threads_data_p) = new_data;
3646  __kmp_free(old_data);
3647  } else {
3648  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3649  "threads data for task_team %p, size = %d\n",
3650  __kmp_gtid_from_thread(thread), task_team, nthreads));
3651  // Make the initial allocate for threads_data array, and zero entries
3652  // Cannot use __kmp_thread_calloc() because threads not around for
3653  // kmp_reap_task_team( ).
3654  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3655  nthreads * sizeof(kmp_thread_data_t));
3656 #ifdef BUILD_TIED_TASK_STACK
3657  // GEH: Figure out if this is the right thing to do
3658  for (i = 0; i < nthreads; i++) {
3659  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3660  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3661  }
3662 #endif // BUILD_TIED_TASK_STACK
3663  }
3664  task_team->tt.tt_max_threads = nthreads;
3665  } else {
3666  // If array has (more than) enough elements, go ahead and use it
3667  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3668  }
3669 
3670  // initialize threads_data pointers back to thread_info structures
3671  for (i = 0; i < nthreads; i++) {
3672  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3673  thread_data->td.td_thr = team->t.t_threads[i];
3674 
3675  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3676  // The last stolen field survives across teams / barrier, and the number
3677  // of threads may have changed. It's possible (likely?) that a new
3678  // parallel region will exhibit the same behavior as the previous region.
3679  thread_data->td.td_deque_last_stolen = -1;
3680  }
3681  }
3682 
3683  KMP_MB();
3684  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3685  }
3686 
3687  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3688  return is_init_thread;
3689 }
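// Illustrative sketch (not part of the runtime, guarded out of the build):
// the reallocation above is the usual allocate-larger / copy-old /
// publish-new / free-old pattern, with the new tail entries starting out
// zeroed. Minimal standalone version; the toy_* names are hypothetical and
// calloc stands in for the zeroing done by __kmp_allocate.
#if 0
#include <cassert>
#include <cstdlib>
#include <cstring>

struct toy_thread_data {
  int last_stolen;
};

// Grow 'data' from 'old_n' to 'new_n' elements, preserving existing entries
// and zero-filling the rest, mirroring __kmp_realloc_task_threads_data.
static toy_thread_data *grow_threads_data(toy_thread_data *data, int old_n,
                                          int new_n) {
  toy_thread_data *bigger =
      (toy_thread_data *)calloc((size_t)new_n, sizeof(toy_thread_data));
  if (data != NULL) {
    memcpy(bigger, data, (size_t)old_n * sizeof(toy_thread_data));
    free(data); // install the new array, drop the old one
  }
  return bigger;
}

int main() {
  toy_thread_data *d = grow_threads_data(NULL, 0, 2);
  d[0].last_stolen = 1;
  d = grow_threads_data(d, 2, 4);
  assert(d[0].last_stolen == 1); // old entries survive the copy
  assert(d[3].last_stolen == 0); // new entries start zeroed
  free(d);
  return 0;
}
#endif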
3690 
3691 // __kmp_free_task_threads_data:
3692 // Deallocates a threads_data array for a task team, including any attached
3693 // tasking deques. Only occurs at library shutdown.
3694 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3695  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3696  if (task_team->tt.tt_threads_data != NULL) {
3697  int i;
3698  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3699  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3700  }
3701  __kmp_free(task_team->tt.tt_threads_data);
3702  task_team->tt.tt_threads_data = NULL;
3703  }
3704  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3705 }
3706 
3707 // __kmp_free_task_pri_list:
3708 // Deallocates tasking deques used for priority tasks.
3709 // Only occurs at library shutdown.
3710 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3711  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3712  if (task_team->tt.tt_task_pri_list != NULL) {
3713  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3714  while (list != NULL) {
3715  kmp_task_pri_t *next = list->next;
3716  __kmp_free_task_deque(&list->td);
3717  __kmp_free(list);
3718  list = next;
3719  }
3720  task_team->tt.tt_task_pri_list = NULL;
3721  }
3722  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3723 }
3724 
3725 // __kmp_allocate_task_team:
3726 // Allocates a task team associated with a specific team, taking it from
3727 // the global task team free list if possible. Also initializes data
3728 // structures.
3729 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3730  kmp_team_t *team) {
3731  kmp_task_team_t *task_team = NULL;
3732  int nthreads;
3733 
3734  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3735  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3736 
3737  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3738  // Take a task team from the task team pool
3739  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3740  if (__kmp_free_task_teams != NULL) {
3741  task_team = __kmp_free_task_teams;
3742  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3743  task_team->tt.tt_next = NULL;
3744  }
3745  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3746  }
3747 
3748  if (task_team == NULL) {
3749  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3750  "task team for team %p\n",
3751  __kmp_gtid_from_thread(thread), team));
3752  // Allocate a new task team if one is not available. Cannot use
3753  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3754  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3755  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3756  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3757 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3758  // suppress race conditions detection on synchronization flags in debug mode
3759  // this helps to analyze library internals eliminating false positives
3760  __itt_suppress_mark_range(
3761  __itt_suppress_range, __itt_suppress_threading_errors,
3762  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3763  __itt_suppress_mark_range(__itt_suppress_range,
3764  __itt_suppress_threading_errors,
3765  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3766  sizeof(task_team->tt.tt_active));
3767 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3768  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3769  // task_team->tt.tt_threads_data = NULL;
3770  // task_team->tt.tt_max_threads = 0;
3771  // task_team->tt.tt_next = NULL;
3772  }
3773 
3774  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3775  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3776  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3777  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3778 
3779  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3780  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3781  TCW_4(task_team->tt.tt_active, TRUE);
3782 
3783  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3784  "unfinished_threads init'd to %d\n",
3785  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3786  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3787  return task_team;
3788 }
3789 
3790 // __kmp_free_task_team:
3791 // Frees the task team associated with a specific thread, and adds it
3792 // to the global task team free list.
3793 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3794  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3795  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3796 
3797  // Put task team back on free list
3798  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3799 
3800  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3801  task_team->tt.tt_next = __kmp_free_task_teams;
3802  TCW_PTR(__kmp_free_task_teams, task_team);
3803 
3804  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3805 }
3806 
3807 // __kmp_reap_task_teams:
3808 // Free all the task teams on the task team free list.
3809 // Should only be done during library shutdown.
3810 // Cannot do anything that needs a thread structure or gtid since they are
3811 // already gone.
3812 void __kmp_reap_task_teams(void) {
3813  kmp_task_team_t *task_team;
3814 
3815  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3816  // Free all task_teams on the free list
3817  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3818  while ((task_team = __kmp_free_task_teams) != NULL) {
3819  __kmp_free_task_teams = task_team->tt.tt_next;
3820  task_team->tt.tt_next = NULL;
3821 
3822  // Free threads_data if necessary
3823  if (task_team->tt.tt_threads_data != NULL) {
3824  __kmp_free_task_threads_data(task_team);
3825  }
3826  if (task_team->tt.tt_task_pri_list != NULL) {
3827  __kmp_free_task_pri_list(task_team);
3828  }
3829  __kmp_free(task_team);
3830  }
3831  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3832  }
3833 }
3834 
3835 // __kmp_wait_to_unref_task_teams:
3836 // Some threads could still be in the fork barrier release code, possibly
3837 // trying to steal tasks. Wait for each thread to unreference its task team.
3838 void __kmp_wait_to_unref_task_teams(void) {
3839  kmp_info_t *thread;
3840  kmp_uint32 spins;
3841  kmp_uint64 time;
3842  int done;
3843 
3844  KMP_INIT_YIELD(spins);
3845  KMP_INIT_BACKOFF(time);
3846 
3847  for (;;) {
3848  done = TRUE;
3849 
3850  // TODO: GEH - this may be wrong because some sync would be necessary
3851  // in case threads are added to the pool during the traversal. Need to
3852  // verify that lock for thread pool is held when calling this routine.
3853  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3854  thread = thread->th.th_next_pool) {
3855 #if KMP_OS_WINDOWS
3856  DWORD exit_val;
3857 #endif
3858  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3859  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3860  __kmp_gtid_from_thread(thread)));
3861  continue;
3862  }
3863 #if KMP_OS_WINDOWS
3864  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3865  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3866  thread->th.th_task_team = NULL;
3867  continue;
3868  }
3869 #endif
3870 
3871  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3872 
3873  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3874  "unreference task_team\n",
3875  __kmp_gtid_from_thread(thread)));
3876 
3877  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3878  void *sleep_loc;
3879  // If the thread is sleeping, awaken it.
3880  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3881  NULL) {
3882  KA_TRACE(
3883  10,
3884  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3885  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3886  __kmp_null_resume_wrapper(thread);
3887  }
3888  }
3889  }
3890  if (done) {
3891  break;
3892  }
3893 
3894  // If oversubscribed or have waited a bit, yield.
3895  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3896  }
3897 }
3898 
3899 // __kmp_task_team_setup: Create a task_team for the current team, but use
3900 // an already created, unused one if it already exists.
3901 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3902  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3903 
3904  // If this task_team hasn't been created yet, allocate it. It will be used in
3905  // the region after the next.
3906  // If it exists, it is the current task team and shouldn't be touched yet as
3907  // it may still be in use.
3908  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3909  (always || team->t.t_nproc > 1)) {
3910  team->t.t_task_team[this_thr->th.th_task_state] =
3911  __kmp_allocate_task_team(this_thr, team);
3912  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3913  " for team %d at parity=%d\n",
3914  __kmp_gtid_from_thread(this_thr),
3915  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3916  this_thr->th.th_task_state));
3917  }
3918 
3919  // After threads exit the release, they will call sync, and then point to this
3920  // other task_team; make sure it is allocated and properly initialized. As
3921  // threads spin in the barrier release phase, they will continue to use the
3922  // previous task_team struct(above), until they receive the signal to stop
3923  // checking for tasks (they can't safely reference the kmp_team_t struct,
3924  // which could be reallocated by the primary thread). No task teams are formed
3925  // for serialized teams.
3926  if (team->t.t_nproc > 1) {
3927  int other_team = 1 - this_thr->th.th_task_state;
3928  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3929  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3930  team->t.t_task_team[other_team] =
3931  __kmp_allocate_task_team(this_thr, team);
3932  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3933  "task_team %p for team %d at parity=%d\n",
3934  __kmp_gtid_from_thread(this_thr),
3935  team->t.t_task_team[other_team], team->t.t_id, other_team));
3936  } else { // Leave the old task team struct in place for the upcoming region;
3937  // adjust as needed
3938  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3939  if (!task_team->tt.tt_active ||
3940  team->t.t_nproc != task_team->tt.tt_nproc) {
3941  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3942  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3943  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3944  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3945  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3946  team->t.t_nproc);
3947  TCW_4(task_team->tt.tt_active, TRUE);
3948  }
3949  // if team size has changed, the first thread to enable tasking will
3950  // realloc threads_data if necessary
3951  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3952  "%p for team %d at parity=%d\n",
3953  __kmp_gtid_from_thread(this_thr),
3954  team->t.t_task_team[other_team], team->t.t_id, other_team));
3955  }
3956  }
3957 
3958  // For a regular thread, task enabling should be called when the task is
3959  // going to be pushed to a deque. However, for the hidden helper thread, we
3960  // need it ahead of time so that some operations can be performed without
3961  // race conditions.
3962  if (this_thr == __kmp_hidden_helper_main_thread) {
3963  for (int i = 0; i < 2; ++i) {
3964  kmp_task_team_t *task_team = team->t.t_task_team[i];
3965  if (KMP_TASKING_ENABLED(task_team)) {
3966  continue;
3967  }
3968  __kmp_enable_tasking(task_team, this_thr);
3969  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3970  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3971  if (thread_data->td.td_deque == NULL) {
3972  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3973  }
3974  }
3975  }
3976  }
3977 }
3978 
3979 // __kmp_task_team_sync: Propagation of task team data from team to threads
3980 // which happens just after the release phase of a team barrier. This may be
3981 // called by any thread, but only for teams with # threads > 1.
3982 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3983  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3984 
3985  // Toggle the th_task_state field, to switch which task_team this thread
3986  // refers to
3987  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3988 
3989  // It is now safe to propagate the task team pointer from the team struct to
3990  // the current thread.
3991  TCW_PTR(this_thr->th.th_task_team,
3992  team->t.t_task_team[this_thr->th.th_task_state]);
3993  KA_TRACE(20,
3994  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3995  "%p from Team #%d (parity=%d)\n",
3996  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3997  team->t.t_id, this_thr->th.th_task_state));
3998 }
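// Illustrative sketch (not part of the runtime, guarded out of the build):
// th_task_state is a 0/1 parity bit selecting one of the two task teams kept
// per team; flipping it at each barrier lets one task team drain while the
// other is prepared. Tiny standalone model; the toy_* names are hypothetical.
#if 0
#include <cassert>

struct toy_team {
  const char *task_team[2]; // two task teams alternating across barriers
};

struct toy_thread {
  unsigned char task_state; // parity bit: which slot this thread uses now
  const char *task_team;    // cached pointer, refreshed after each barrier
};

// Mirror of __kmp_task_team_sync: flip the parity, then re-read the pointer.
static void sync_after_barrier(toy_thread *thr, const toy_team *team) {
  thr->task_state = (unsigned char)(1 - thr->task_state);
  thr->task_team = team->task_team[thr->task_state];
}

int main() {
  toy_team team = {{"task_team_A", "task_team_B"}};
  toy_thread thr = {0, team.task_team[0]};
  sync_after_barrier(&thr, &team);
  assert(thr.task_state == 1 && thr.task_team == team.task_team[1]);
  sync_after_barrier(&thr, &team);
  assert(thr.task_state == 0 && thr.task_team == team.task_team[0]);
  return 0;
}
#endif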
3999 
4000 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4001 // barrier gather phase. Only called by primary thread if #threads in team > 1
4002 // or if proxy tasks were created.
4003 //
4004 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4005 // by passing in 0 optionally as the last argument. When wait is zero, primary
4006 // thread does not wait for unfinished_threads to reach 0.
4007 void __kmp_task_team_wait(
4008  kmp_info_t *this_thr,
4009  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4010  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4011 
4012  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4013  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4014 
4015  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4016  if (wait) {
4017  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4018  "(for unfinished_threads to reach 0) on task_team = %p\n",
4019  __kmp_gtid_from_thread(this_thr), task_team));
4020  // Worker threads may have dropped through to release phase, but could
4021  // still be executing tasks. Wait here for tasks to complete. To avoid
4022  // memory contention, only primary thread checks termination condition.
4023  kmp_flag_32<false, false> flag(
4024  RCAST(std::atomic<kmp_uint32> *,
4025  &task_team->tt.tt_unfinished_threads),
4026  0U);
4027  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4028  }
4029  // Deactivate the old task team, so that the worker threads will stop
4030  // referencing it while spinning.
4031  KA_TRACE(
4032  20,
4033  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4034  "setting active to false, setting local and team's pointer to NULL\n",
4035  __kmp_gtid_from_thread(this_thr), task_team));
4036  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
4037  task_team->tt.tt_found_proxy_tasks == TRUE ||
4038  task_team->tt.tt_hidden_helper_task_encountered == TRUE);
4039  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4040  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4041  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4042  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4043  KMP_MB();
4044 
4045  TCW_PTR(this_thr->th.th_task_team, NULL);
4046  }
4047 }
4048 
4049 // __kmp_tasking_barrier:
4050 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4051 // Internal function to execute all tasks prior to a regular barrier or a join
4052 // barrier. It is a full barrier itself, which unfortunately turns regular
4053 // barriers into double barriers and join barriers into 1 1/2 barriers.
4054 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4055  std::atomic<kmp_uint32> *spin = RCAST(
4056  std::atomic<kmp_uint32> *,
4057  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4058  int flag = FALSE;
4059  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4060 
4061 #if USE_ITT_BUILD
4062  KMP_FSYNC_SPIN_INIT(spin, NULL);
4063 #endif /* USE_ITT_BUILD */
4064  kmp_flag_32<false, false> spin_flag(spin, 0U);
4065  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4066  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4067 #if USE_ITT_BUILD
4068  // TODO: What about itt_sync_obj??
4069  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4070 #endif /* USE_ITT_BUILD */
4071 
4072  if (TCR_4(__kmp_global.g.g_done)) {
4073  if (__kmp_global.g.g_abort)
4074  __kmp_abort_thread();
4075  break;
4076  }
4077  KMP_YIELD(TRUE);
4078  }
4079 #if USE_ITT_BUILD
4080  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4081 #endif /* USE_ITT_BUILD */
4082 }
4083 
4084 // __kmp_give_task puts a task into a given thread queue if:
4085 // - the queue for that thread was created
4086 // - there's space in that queue
4087 // Because of this, __kmp_push_task needs to check if there's space after
4088 // getting the lock
4089 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4090  kmp_int32 pass) {
4091  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4092  kmp_task_team_t *task_team = taskdata->td_task_team;
4093 
4094  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4095  taskdata, tid));
4096 
4097  // If task_team is NULL something went really bad...
4098  KMP_DEBUG_ASSERT(task_team != NULL);
4099 
4100  bool result = false;
4101  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4102 
4103  if (thread_data->td.td_deque == NULL) {
4104  // There's no queue in this thread, go find another one
4105  // We're guaranteed that at least one thread has a queue
4106  KA_TRACE(30,
4107  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4108  tid, taskdata));
4109  return result;
4110  }
4111 
4112  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4113  TASK_DEQUE_SIZE(thread_data->td)) {
4114  KA_TRACE(
4115  30,
4116  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4117  taskdata, tid));
4118 
4119  // if this deque has already reached pass times its initial size, give
4120  // another thread a chance
4121  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4122  return result;
4123 
4124  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4125  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4126  TASK_DEQUE_SIZE(thread_data->td)) {
4127  // expand deque to push the task which is not allowed to execute
4128  __kmp_realloc_task_deque(thread, thread_data);
4129  }
4130 
4131  } else {
4132 
4133  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4134 
4135  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4136  TASK_DEQUE_SIZE(thread_data->td)) {
4137  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4138  "thread %d.\n",
4139  taskdata, tid));
4140 
4141  // if this deque has already reached pass times its initial size, give
4142  // another thread a chance
4143  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4144  goto release_and_exit;
4145 
4146  __kmp_realloc_task_deque(thread, thread_data);
4147  }
4148  }
4149 
4150  // lock is held here, and there is space in the deque
4151 
4152  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4153  // Wrap index.
4154  thread_data->td.td_deque_tail =
4155  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4156  TCW_4(thread_data->td.td_deque_ntasks,
4157  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4158 
4159  result = true;
4160  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4161  taskdata, tid));
4162 
4163 release_and_exit:
4164  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4165 
4166  return result;
4167 }
4168 
4169 #define PROXY_TASK_FLAG 0x40000000
4170 /* The finish of the proxy tasks is divided in two pieces:
4171  - the top half is the one that can be done from a thread outside the team
4172  - the bottom half must be run from a thread within the team
4173 
4174  In order to run the bottom half the task gets queued back into one of the
4175  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4176  is decremented, the threads can leave the barriers. So, the bottom half needs
4177  to be queued before the counter is decremented. The top half is therefore
4178  divided in two parts:
4179  - things that can be run before queuing the bottom half
4180  - things that must be run after queuing the bottom half
4181 
4182  This creates a second race as the bottom half can free the task before the
4183  second top half is executed. To avoid this we use the
4184  td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
4185  half. */
4186 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4187  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4188  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4189  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4190  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4191 
4192  taskdata->td_flags.complete = 1; // mark the task as completed
4193 
4194  if (taskdata->td_taskgroup)
4195  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4196 
4197  // Create an imaginary child for this task so the bottom half cannot
4198  // release the task before we have completed the second top half
4199  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4200 }
4201 
4202 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4203 #if KMP_DEBUG
4204  kmp_int32 children = 0;
4205  // Predecrement simulated by "- 1" calculation
4206  children = -1 +
4207 #endif
4208  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4209  KMP_DEBUG_ASSERT(children >= 0);
4210 
4211  // Remove the imaginary child
4212  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4213 }
4214 
4215 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4216  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4217  kmp_info_t *thread = __kmp_threads[gtid];
4218 
4219  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4220  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4221  1); // top half must run before bottom half
4222 
4223  // We need to wait to make sure the top half is finished
4224  // Spinning here should be ok as this should happen quickly
4225  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4226  PROXY_TASK_FLAG) > 0)
4227  ;
4228 
4229  __kmp_release_deps(gtid, taskdata);
4230  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4231 }
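// Illustrative sketch (not part of the runtime, guarded out of the build):
// the three helpers above coordinate through one atomic word: the first top
// half ORs in a marker bit, the bottom half spins until the bit is gone, and
// the second top half ANDs it away, so the bottom half cannot free the task
// while the second top half is still running. Condensed standalone model
// using std::atomic and std::thread; kMarker stands in for PROXY_TASK_FLAG.
#if 0
#include <atomic>
#include <cassert>
#include <thread>

static const int kMarker = 0x40000000;

int main() {
  std::atomic<int> incomplete_children(0);
  std::atomic<bool> freed(false);

  // First top half: publish the marker before the bottom half is queued.
  incomplete_children.fetch_or(kMarker);

  std::thread bottom_half([&] {
    // Bottom half: wait for the marker to clear, then "free" the task.
    while ((incomplete_children.load(std::memory_order_acquire) & kMarker) != 0)
      std::this_thread::yield();
    freed.store(true);
  });

  // Second top half: finish bookkeeping, then clear the marker.
  assert(!freed.load()); // cannot have been freed while the marker is set
  incomplete_children.fetch_and(~kMarker);

  bottom_half.join();
  assert(freed.load());
  return 0;
}
#endif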
4232 
4241 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4242  KMP_DEBUG_ASSERT(ptask != NULL);
4243  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4244  KA_TRACE(
4245  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4246  gtid, taskdata));
4247  __kmp_assert_valid_gtid(gtid);
4248  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4249 
4250  __kmp_first_top_half_finish_proxy(taskdata);
4251  __kmp_second_top_half_finish_proxy(taskdata);
4252  __kmp_bottom_half_finish_proxy(gtid, ptask);
4253 
4254  KA_TRACE(10,
4255  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4256  gtid, taskdata));
4257 }
4258 
4259 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4260  KMP_DEBUG_ASSERT(ptask != NULL);
4261  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4262 
4263  // Enqueue task to complete bottom half completion from a thread within the
4264  // corresponding team
4265  kmp_team_t *team = taskdata->td_team;
4266  kmp_int32 nthreads = team->t.t_nproc;
4267  kmp_info_t *thread;
4268 
4269  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4270  // but we cannot use __kmp_get_random here
4271  kmp_int32 start_k = start % nthreads;
4272  kmp_int32 pass = 1;
4273  kmp_int32 k = start_k;
4274 
4275  do {
4276  // For now we're just linearly trying to find a thread
4277  thread = team->t.t_threads[k];
4278  k = (k + 1) % nthreads;
4279 
4280  // we did a full pass through all the threads
4281  if (k == start_k)
4282  pass = pass << 1;
4283 
4284  } while (!__kmp_give_task(thread, k, ptask, pass));
4285 }
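// Note on the loop above: the search is a plain round robin starting at
// 'start'; 'pass' doubles after every full sweep of the team and is handed to
// __kmp_give_task (defined earlier in this file), which treats it as a limit
// on how far a full deque may be grown before the task is offered to the next
// thread, so repeated sweeps relax that limit and the loop eventually
// succeeds.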
4286 
4294 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4295  KMP_DEBUG_ASSERT(ptask != NULL);
4296  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4297 
4298  KA_TRACE(
4299  10,
4300  ("__kmpc_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4301  taskdata));
4302 
4303  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4304 
4305  __kmp_first_top_half_finish_proxy(taskdata);
4306 
4307  __kmpc_give_task(ptask);
4308 
4309  __kmp_second_top_half_finish_proxy(taskdata);
4310 
4311  KA_TRACE(
4312  10,
4313  ("__kmpc_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4314  taskdata));
4315 }
4316 
4317 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4318  kmp_task_t *task) {
4319  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4320  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4321  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4322  td->td_allow_completion_event.ed.task = task;
4323  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4324  }
4325  return &td->td_allow_completion_event;
4326 }
4327 
4328 void __kmp_fulfill_event(kmp_event_t *event) {
4329  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4330  kmp_task_t *ptask = event->ed.task;
4331  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4332  bool detached = false;
4333  int gtid = __kmp_get_gtid();
4334 
4335  // The associated task might have completed or could be completing at this
4336  // point.
4337  // We need to take the lock to avoid races
4338  __kmp_acquire_tas_lock(&event->lock, gtid);
4339  if (taskdata->td_flags.proxy == TASK_PROXY) {
4340  detached = true;
4341  } else {
4342 #if OMPT_SUPPORT
4343  // The OMPT event must occur under mutual exclusion,
4344  // otherwise the tool might access ptask after free
4345  if (UNLIKELY(ompt_enabled.enabled))
4346  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4347 #endif
4348  }
4349  event->type = KMP_EVENT_UNINITIALIZED;
4350  __kmp_release_tas_lock(&event->lock, gtid);
4351 
4352  if (detached) {
4353 #if OMPT_SUPPORT
4354  // We free ptask afterwards and know the task is finished,
4355  // so locking is not necessary
4356  if (UNLIKELY(ompt_enabled.enabled))
4357  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4358 #endif
4359  // If the task is detached, complete the proxy task
4360  if (gtid >= 0) {
4361  kmp_team_t *team = taskdata->td_team;
4362  kmp_info_t *thread = __kmp_get_thread();
4363  if (thread->th.th_team == team) {
4364  __kmpc_proxy_task_completed(gtid, ptask);
4365  return;
4366  }
4367  }
4368 
4369  // fallback: complete the proxy task from outside the team
4370  __kmpc_proxy_task_completed_ooo(ptask);
4371  }
4372  }
4373 }
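// Illustrative user-level sketch (not part of the library): the usual way to
// reach __kmpc_task_allow_completion_event and __kmp_fulfill_event is the
// OpenMP 5.0 'detach' clause together with omp_fulfill_event(). The helper
// start_async_work() is hypothetical.
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   {
//     start_async_work(ev); // hypothetical; its completion callback is
//                           // expected to call omp_fulfill_event(ev)
//   }
//   #pragma omp taskwait    // released only once 'ev' has been fulfilled
//
// If the task body has already finished when the event is fulfilled, the task
// is a proxy (td_flags.proxy == TASK_PROXY) and the code above completes it
// via __kmpc_proxy_task_completed or __kmpc_proxy_task_completed_ooo;
// otherwise only the early-fulfill bookkeeping is done and the task completes
// normally when it finishes execution.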
4374 
4375 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4376 // for taskloop
4377 //
4378 // thread: allocating thread
4379 // task_src: pointer to source task to be duplicated
4380 // returns: a pointer to the allocated kmp_task_t structure (task).
4381 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4382  kmp_task_t *task;
4383  kmp_taskdata_t *taskdata;
4384  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4385  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4386  size_t shareds_offset;
4387  size_t task_size;
4388 
4389  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4390  task_src));
4391  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4392  TASK_FULL); // it should not be a proxy task
4393  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4394  task_size = taskdata_src->td_size_alloc;
4395 
4396  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4397  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4398  task_size));
4399 #if USE_FAST_MEMORY
4400  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4401 #else
4402  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4403 #endif /* USE_FAST_MEMORY */
4404  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4405 
4406  task = KMP_TASKDATA_TO_TASK(taskdata);
4407 
4408  // Initialize new task (only specific fields not affected by memcpy)
4409  taskdata->td_task_id = KMP_GEN_TASK_ID();
4410  if (task->shareds != NULL) { // need to set up the shareds pointer
4411  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4412  task->shareds = &((char *)taskdata)[shareds_offset];
4413  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4414  0);
4415  }
4416  taskdata->td_alloc_thread = thread;
4417  taskdata->td_parent = parent_task;
4418  // task inherits the taskgroup from the parent task
4419  taskdata->td_taskgroup = parent_task->td_taskgroup;
4420  // a tied task needs to initialize td_last_tied at creation,
4421  // an untied one does this when it is scheduled for execution
4422  if (taskdata->td_flags.tiedness == TASK_TIED)
4423  taskdata->td_last_tied = taskdata;
4424 
4425  // Only need to keep track of child task counts if team parallel and tasking
4426  // not serialized
4427  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4428  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4429  if (parent_task->td_taskgroup)
4430  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4431  // Only need to keep track of allocated child tasks for explicit tasks,
4432  // since implicit tasks are not deallocated
4433  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4434  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4435  }
4436 
4437  KA_TRACE(20,
4438  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4439  thread, taskdata, taskdata->td_parent));
4440 #if OMPT_SUPPORT
4441  if (UNLIKELY(ompt_enabled.enabled))
4442  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4443 #endif
4444  return task;
4445 }
4446 
4447 // Routine optionally generated by the compiler for setting the lastprivate flag
4448 // and calling needed constructors for private/firstprivate objects
4449 // (used to form taskloop tasks from pattern task)
4450 // Parameters: dest task, src task, lastprivate flag.
4451 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4452 
4453 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4454 
4455 // Class to encapsulate manipulation of loop bounds in a taskloop task.
4456 // It abstracts away the Intel vs GOMP taskloop interface for setting/getting
4457 // the loop bound variables.
4458 class kmp_taskloop_bounds_t {
4459  kmp_task_t *task;
4460  const kmp_taskdata_t *taskdata;
4461  size_t lower_offset;
4462  size_t upper_offset;
4463 
4464 public:
4465  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4466  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4467  lower_offset((char *)lb - (char *)task),
4468  upper_offset((char *)ub - (char *)task) {
4469  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4470  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4471  }
4472  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4473  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4474  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4475  size_t get_lower_offset() const { return lower_offset; }
4476  size_t get_upper_offset() const { return upper_offset; }
4477  kmp_uint64 get_lb() const {
4478  kmp_int64 retval;
4479 #if defined(KMP_GOMP_COMPAT)
4480  // Intel task just returns the lower bound normally
4481  if (!taskdata->td_flags.native) {
4482  retval = *(kmp_int64 *)((char *)task + lower_offset);
4483  } else {
4484  // GOMP task has to take into account the sizeof(long)
4485  if (taskdata->td_size_loop_bounds == 4) {
4486  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4487  retval = (kmp_int64)*lb;
4488  } else {
4489  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4490  retval = (kmp_int64)*lb;
4491  }
4492  }
4493 #else
4494  (void)taskdata;
4495  retval = *(kmp_int64 *)((char *)task + lower_offset);
4496 #endif // defined(KMP_GOMP_COMPAT)
4497  return retval;
4498  }
4499  kmp_uint64 get_ub() const {
4500  kmp_int64 retval;
4501 #if defined(KMP_GOMP_COMPAT)
4502  // Intel task just returns the upper bound normally
4503  if (!taskdata->td_flags.native) {
4504  retval = *(kmp_int64 *)((char *)task + upper_offset);
4505  } else {
4506  // GOMP task has to take into account the sizeof(long)
4507  if (taskdata->td_size_loop_bounds == 4) {
4508  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4509  retval = (kmp_int64)*ub;
4510  } else {
4511  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4512  retval = (kmp_int64)*ub;
4513  }
4514  }
4515 #else
4516  retval = *(kmp_int64 *)((char *)task + upper_offset);
4517 #endif // defined(KMP_GOMP_COMPAT)
4518  return retval;
4519  }
4520  void set_lb(kmp_uint64 lb) {
4521 #if defined(KMP_GOMP_COMPAT)
4522  // Intel task just sets the lower bound normally
4523  if (!taskdata->td_flags.native) {
4524  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4525  } else {
4526  // GOMP task has to take into account the sizeof(long)
4527  if (taskdata->td_size_loop_bounds == 4) {
4528  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4529  *lower = (kmp_uint32)lb;
4530  } else {
4531  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4532  *lower = (kmp_uint64)lb;
4533  }
4534  }
4535 #else
4536  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4537 #endif // defined(KMP_GOMP_COMPAT)
4538  }
4539  void set_ub(kmp_uint64 ub) {
4540 #if defined(KMP_GOMP_COMPAT)
4541  // Intel task just sets the upper bound normally
4542  if (!taskdata->td_flags.native) {
4543  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4544  } else {
4545  // GOMP task has to take into account the sizeof(long)
4546  if (taskdata->td_size_loop_bounds == 4) {
4547  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4548  *upper = (kmp_uint32)ub;
4549  } else {
4550  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4551  *upper = (kmp_uint64)ub;
4552  }
4553  }
4554 #else
4555  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4556 #endif // defined(KMP_GOMP_COMPAT)
4557  }
4558 };
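// Layout sketch (illustrative summary of the accessors above): for the
// Intel/LLVM taskloop entry points the loop bounds are two 64-bit fields at
// fixed byte offsets inside the kmp_task_t, read and written directly at
// lower_offset/upper_offset. For GOMP-compatible tasks (td_flags.native) the
// bounds are the first two 'long' slots of task->shareds, whose width is
// recorded in td_size_loop_bounds:
//
//   // native (GOMP) task, sizeof(long) == 8:
//   //   ((kmp_int64 *)task->shareds)[0]   lower bound
//   //   ((kmp_int64 *)task->shareds)[1]   upper bound
//   // native (GOMP) task, sizeof(long) == 4 (td_size_loop_bounds == 4):
//   //   ((kmp_int32 *)task->shareds)[0]   lower bound
//   //   ((kmp_int32 *)task->shareds)[1]   upper bound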
4559 
4560 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4561 //
4562 // loc Source location information
4563 // gtid Global thread ID
4564 // task Pattern task, exposes the loop iteration range
4565 // lb Pointer to loop lower bound in task structure
4566 // ub Pointer to loop upper bound in task structure
4567 // st Loop stride
4568 // ub_glob Global upper bound (used for lastprivate check)
4569 // num_tasks Number of tasks to execute
4570 // grainsize Number of loop iterations per task
4571 // extras Number of chunks with grainsize+1 iterations
4572 // last_chunk Reduction of grainsize for last task
4573 // tc Iterations count
4574 // task_dup Tasks duplication routine
4575 // codeptr_ra Return address for OMPT events
4576 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4577  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4578  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4579  kmp_uint64 grainsize, kmp_uint64 extras,
4580  kmp_int64 last_chunk, kmp_uint64 tc,
4581 #if OMPT_SUPPORT
4582  void *codeptr_ra,
4583 #endif
4584  void *task_dup) {
4585  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4586  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4587  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4588  // compiler provides global bounds here
4589  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4590  kmp_uint64 lower = task_bounds.get_lb();
4591  kmp_uint64 upper = task_bounds.get_ub();
4592  kmp_uint64 i;
4593  kmp_info_t *thread = __kmp_threads[gtid];
4594  kmp_taskdata_t *current_task = thread->th.th_current_task;
4595  kmp_task_t *next_task;
4596  kmp_int32 lastpriv = 0;
4597 
4598  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4599  (last_chunk < 0 ? last_chunk : extras));
4600  KMP_DEBUG_ASSERT(num_tasks > extras);
4601  KMP_DEBUG_ASSERT(num_tasks > 0);
4602  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4603  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4604  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4605  ub_glob, st, task_dup));
4606 
4607  // Launch num_tasks tasks, assigning grainsize iterations to each task
4608  for (i = 0; i < num_tasks; ++i) {
4609  kmp_uint64 chunk_minus_1;
4610  if (extras == 0) {
4611  chunk_minus_1 = grainsize - 1;
4612  } else {
4613  chunk_minus_1 = grainsize;
4614  --extras; // first extras iterations get bigger chunk (grainsize+1)
4615  }
4616  upper = lower + st * chunk_minus_1;
4617  if (upper > *ub) {
4618  upper = *ub;
4619  }
4620  if (i == num_tasks - 1) {
4621  // schedule the last task, set lastprivate flag if needed
4622  if (st == 1) { // most common case
4623  KMP_DEBUG_ASSERT(upper == *ub);
4624  if (upper == ub_glob)
4625  lastpriv = 1;
4626  } else if (st > 0) { // positive loop stride
4627  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4628  if ((kmp_uint64)st > ub_glob - upper)
4629  lastpriv = 1;
4630  } else { // negative loop stride
4631  KMP_DEBUG_ASSERT(upper + st < *ub);
4632  if (upper - ub_glob < (kmp_uint64)(-st))
4633  lastpriv = 1;
4634  }
4635  }
4636  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4637  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4638  kmp_taskloop_bounds_t next_task_bounds =
4639  kmp_taskloop_bounds_t(next_task, task_bounds);
4640 
4641  // adjust task-specific bounds
4642  next_task_bounds.set_lb(lower);
4643  if (next_taskdata->td_flags.native) {
4644  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4645  } else {
4646  next_task_bounds.set_ub(upper);
4647  }
4648  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4649  // etc.
4650  ptask_dup(next_task, task, lastpriv);
4651  KA_TRACE(40,
4652  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4653  "upper %lld stride %lld, (offsets %p %p)\n",
4654  gtid, i, next_task, lower, upper, st,
4655  next_task_bounds.get_lower_offset(),
4656  next_task_bounds.get_upper_offset()));
4657 #if OMPT_SUPPORT
4658  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4659  codeptr_ra); // schedule new task
4660 #if OMPT_OPTIONAL
4661  if (ompt_enabled.ompt_callback_dispatch) {
4662  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4663  lower, upper, st);
4664  }
4665 #endif // OMPT_OPTIONAL
4666 #else
4667  __kmp_omp_task(gtid, next_task, true); // schedule new task
4668 #endif
4669  lower = upper + st; // adjust lower bound for the next iteration
4670  }
4671  // free the pattern task and exit
4672  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4673  // do not execute the pattern task, just do internal bookkeeping
4674  __kmp_task_finish<false>(gtid, task, current_task);
4675 }
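// Worked example (illustrative) of the chunking loop above, assuming st == 1,
// lower == 0, tc == 10, num_tasks == 3, grainsize == 3, extras == 1 and
// last_chunk == 0, so that tc == num_tasks * grainsize + extras holds:
//   i == 0: extras > 0  -> chunk of grainsize + 1 = 4 iterations, bounds [0, 3]
//   i == 1: extras == 0 -> chunk of grainsize     = 3 iterations, bounds [4, 6]
//   i == 2: extras == 0 -> chunk of grainsize     = 3 iterations, bounds [7, 9]
// For the last task upper == *ub; if it also equals ub_glob, the lastprivate
// flag is passed to the task duplication routine ptask_dup.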
4676 
4677 // Structure to keep taskloop parameters for auxiliary task
4678 // kept in the shareds of the task structure.
4679 typedef struct __taskloop_params {
4680  kmp_task_t *task;
4681  kmp_uint64 *lb;
4682  kmp_uint64 *ub;
4683  void *task_dup;
4684  kmp_int64 st;
4685  kmp_uint64 ub_glob;
4686  kmp_uint64 num_tasks;
4687  kmp_uint64 grainsize;
4688  kmp_uint64 extras;
4689  kmp_int64 last_chunk;
4690  kmp_uint64 tc;
4691  kmp_uint64 num_t_min;
4692 #if OMPT_SUPPORT
4693  void *codeptr_ra;
4694 #endif
4695 } __taskloop_params_t;
4696 
4697 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4698  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4699  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4700  kmp_uint64,
4701 #if OMPT_SUPPORT
4702  void *,
4703 #endif
4704  void *);
4705 
4706 // Execute part of the taskloop submitted as a task.
4707 int __kmp_taskloop_task(int gtid, void *ptask) {
4708  __taskloop_params_t *p =
4709  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4710  kmp_task_t *task = p->task;
4711  kmp_uint64 *lb = p->lb;
4712  kmp_uint64 *ub = p->ub;
4713  void *task_dup = p->task_dup;
4714  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4715  kmp_int64 st = p->st;
4716  kmp_uint64 ub_glob = p->ub_glob;
4717  kmp_uint64 num_tasks = p->num_tasks;
4718  kmp_uint64 grainsize = p->grainsize;
4719  kmp_uint64 extras = p->extras;
4720  kmp_int64 last_chunk = p->last_chunk;
4721  kmp_uint64 tc = p->tc;
4722  kmp_uint64 num_t_min = p->num_t_min;
4723 #if OMPT_SUPPORT
4724  void *codeptr_ra = p->codeptr_ra;
4725 #endif
4726 #if KMP_DEBUG
4727  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4728  KMP_DEBUG_ASSERT(task != NULL);
4729  KA_TRACE(20,
4730  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4731  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4732  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4733  st, task_dup));
4734 #endif
4735  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4736  if (num_tasks > num_t_min)
4737  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4738  grainsize, extras, last_chunk, tc, num_t_min,
4739 #if OMPT_SUPPORT
4740  codeptr_ra,
4741 #endif
4742  task_dup);
4743  else
4744  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4745  grainsize, extras, last_chunk, tc,
4746 #if OMPT_SUPPORT
4747  codeptr_ra,
4748 #endif
4749  task_dup);
4750 
4751  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4752  return 0;
4753 }
4754 
4755 // Schedule part of the taskloop as a task,
4756 // execute the rest of the taskloop.
4757 //
4758 // loc Source location information
4759 // gtid Global thread ID
4760 // task Pattern task, exposes the loop iteration range
4761 // lb Pointer to loop lower bound in task structure
4762 // ub Pointer to loop upper bound in task structure
4763 // st Loop stride
4764 // ub_glob Global upper bound (used for lastprivate check)
4765 // num_tasks Number of tasks to execute
4766 // grainsize Number of loop iterations per task
4767 // extras Number of chunks with grainsize+1 iterations
4768 // last_chunk Reduction of grainsize for last task
4769 // tc Iterations count
4770 // num_t_min Threshold to launch tasks recursively
4771 // task_dup Tasks duplication routine
4772 // codeptr_ra Return address for OMPT events
4773 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4774  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4775  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4776  kmp_uint64 grainsize, kmp_uint64 extras,
4777  kmp_int64 last_chunk, kmp_uint64 tc,
4778  kmp_uint64 num_t_min,
4779 #if OMPT_SUPPORT
4780  void *codeptr_ra,
4781 #endif
4782  void *task_dup) {
4783  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4784  KMP_DEBUG_ASSERT(task != NULL);
4785  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4786  KA_TRACE(20,
4787  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4788  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4789  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4790  st, task_dup));
4791  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4792  kmp_uint64 lower = *lb;
4793  kmp_info_t *thread = __kmp_threads[gtid];
4794  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4795  kmp_task_t *next_task;
4796  size_t lower_offset =
4797  (char *)lb - (char *)task; // remember offset of lb in the task structure
4798  size_t upper_offset =
4799  (char *)ub - (char *)task; // remember offset of ub in the task structure
4800 
4801  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4802  (last_chunk < 0 ? last_chunk : extras));
4803  KMP_DEBUG_ASSERT(num_tasks > extras);
4804  KMP_DEBUG_ASSERT(num_tasks > 0);
4805 
4806  // split the loop in two halves
4807  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4808  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4809  kmp_uint64 gr_size0 = grainsize;
4810  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4811  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4812  if (last_chunk < 0) {
4813  ext0 = ext1 = 0;
4814  last_chunk1 = last_chunk;
4815  tc0 = grainsize * n_tsk0;
4816  tc1 = tc - tc0;
4817  } else if (n_tsk0 <= extras) {
4818  gr_size0++; // integrate extras into grainsize
4819  ext0 = 0; // no extra iters in 1st half
4820  ext1 = extras - n_tsk0; // remaining extras
4821  tc0 = gr_size0 * n_tsk0;
4822  tc1 = tc - tc0;
4823  } else { // n_tsk0 > extras
4824  ext1 = 0; // no extra iters in 2nd half
4825  ext0 = extras;
4826  tc1 = grainsize * n_tsk1;
4827  tc0 = tc - tc1;
4828  }
4829  ub0 = lower + st * (tc0 - 1);
4830  lb1 = ub0 + st;
4831 
4832  // create pattern task for 2nd half of the loop
4833  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4834  // adjust lower bound (upper bound is not changed) for the 2nd half
4835  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4836  if (ptask_dup != NULL) // construct firstprivates, etc.
4837  ptask_dup(next_task, task, 0);
4838  *ub = ub0; // adjust upper bound for the 1st half
4839 
4840  // create auxiliary task for 2nd half of the loop
4841  // make sure new task has same parent task as the pattern task
4842  kmp_taskdata_t *current_task = thread->th.th_current_task;
4843  thread->th.th_current_task = taskdata->td_parent;
4844  kmp_task_t *new_task =
4845  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4846  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4847  // restore current task
4848  thread->th.th_current_task = current_task;
4849  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4850  p->task = next_task;
4851  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4852  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4853  p->task_dup = task_dup;
4854  p->st = st;
4855  p->ub_glob = ub_glob;
4856  p->num_tasks = n_tsk1;
4857  p->grainsize = grainsize;
4858  p->extras = ext1;
4859  p->last_chunk = last_chunk1;
4860  p->tc = tc1;
4861  p->num_t_min = num_t_min;
4862 #if OMPT_SUPPORT
4863  p->codeptr_ra = codeptr_ra;
4864 #endif
4865 
4866 #if OMPT_SUPPORT
4867  // schedule new task with correct return address for OMPT events
4868  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4869 #else
4870  __kmp_omp_task(gtid, new_task, true); // schedule new task
4871 #endif
4872 
4873  // execute the 1st half of current subrange
4874  if (n_tsk0 > num_t_min)
4875  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4876  ext0, last_chunk0, tc0, num_t_min,
4877 #if OMPT_SUPPORT
4878  codeptr_ra,
4879 #endif
4880  task_dup);
4881  else
4882  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4883  gr_size0, ext0, last_chunk0, tc0,
4884 #if OMPT_SUPPORT
4885  codeptr_ra,
4886 #endif
4887  task_dup);
4888 
4889  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4890 }
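// Worked example (illustrative) of the half/half split above, for
// num_tasks == 9, grainsize == 3, extras == 1, tc == 28, last_chunk == 0:
//   n_tsk0 = 4 and n_tsk1 = 5; since n_tsk0 > extras the third branch applies:
//   ext0 = 1, ext1 = 0, tc1 = grainsize * n_tsk1 = 15, tc0 = tc - tc1 = 13.
// The current thread keeps 4 tasks covering 13 iterations (one of them of
// grainsize + 1), and the auxiliary task schedules the remaining 5 tasks
// covering 15 iterations. Each half again satisfies
// tc == num_tasks * grainsize + extras, so the recursion invariant holds.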
4891 
4892 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4893  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4894  int nogroup, int sched, kmp_uint64 grainsize,
4895  int modifier, void *task_dup) {
4896  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4897  KMP_DEBUG_ASSERT(task != NULL);
4898  if (nogroup == 0) {
4899 #if OMPT_SUPPORT && OMPT_OPTIONAL
4900  OMPT_STORE_RETURN_ADDRESS(gtid);
4901 #endif
4902  __kmpc_taskgroup(loc, gtid);
4903  }
4904 
4905  // =========================================================================
4906  // calculate loop parameters
4907  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4908  kmp_uint64 tc;
4909  // compiler provides global bounds here
4910  kmp_uint64 lower = task_bounds.get_lb();
4911  kmp_uint64 upper = task_bounds.get_ub();
4912  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4913  kmp_uint64 num_tasks = 0, extras = 0;
4914  kmp_int64 last_chunk =
4915  0; // reduce grainsize of last task by last_chunk in strict mode
4916  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4917  kmp_info_t *thread = __kmp_threads[gtid];
4918  kmp_taskdata_t *current_task = thread->th.th_current_task;
4919 
4920  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4921  "grain %llu(%d, %d), dup %p\n",
4922  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4923  task_dup));
4924 
4925  // compute trip count
4926  if (st == 1) { // most common case
4927  tc = upper - lower + 1;
4928  } else if (st < 0) {
4929  tc = (lower - upper) / (-st) + 1;
4930  } else { // st > 0
4931  tc = (upper - lower) / st + 1;
4932  }
4933  if (tc == 0) {
4934  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4935  // free the pattern task and exit
4936  __kmp_task_start(gtid, task, current_task);
4937  // do not execute anything for zero-trip loop
4938  __kmp_task_finish<false>(gtid, task, current_task);
4939  return;
4940  }
4941 
4942 #if OMPT_SUPPORT && OMPT_OPTIONAL
4943  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4944  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4945  if (ompt_enabled.ompt_callback_work) {
4946  ompt_callbacks.ompt_callback(ompt_callback_work)(
4947  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4948  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4949  }
4950 #endif
4951 
4952  if (num_tasks_min == 0)
4953  // TODO: can we choose a better default heuristic?
4954  num_tasks_min =
4955  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4956 
4957  // compute num_tasks/grainsize based on the input provided
4958  switch (sched) {
4959  case 0: // no schedule clause specified, we can choose the default
4960  // let's try to schedule (team_size*10) tasks
4961  grainsize = thread->th.th_team_nproc * 10;
4962  KMP_FALLTHROUGH();
4963  case 2: // num_tasks provided
4964  if (grainsize > tc) {
4965  num_tasks = tc; // too big num_tasks requested, adjust values
4966  grainsize = 1;
4967  extras = 0;
4968  } else {
4969  num_tasks = grainsize;
4970  grainsize = tc / num_tasks;
4971  extras = tc % num_tasks;
4972  }
4973  break;
4974  case 1: // grainsize provided
4975  if (grainsize > tc) {
4976  num_tasks = 1;
4977  grainsize = tc; // too big grainsize requested, adjust values
4978  extras = 0;
4979  } else {
4980  if (modifier) {
4981  num_tasks = (tc + grainsize - 1) / grainsize;
4982  last_chunk = tc - (num_tasks * grainsize);
4983  extras = 0;
4984  } else {
4985  num_tasks = tc / grainsize;
4986  // adjust grainsize for balanced distribution of iterations
4987  grainsize = tc / num_tasks;
4988  extras = tc % num_tasks;
4989  }
4990  }
4991  break;
4992  default:
4993  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4994  }
4995 
4996  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4997  (last_chunk < 0 ? last_chunk : extras));
4998  KMP_DEBUG_ASSERT(num_tasks > extras);
4999  KMP_DEBUG_ASSERT(num_tasks > 0);
5000  // =========================================================================
5001 
5002  // check the if-clause value first
5003  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5004  if (if_val == 0) { // if(0) specified, mark task as serial
5005  taskdata->td_flags.task_serial = 1;
5006  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5007  // always start serial tasks linearly
5008  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5009  grainsize, extras, last_chunk, tc,
5010 #if OMPT_SUPPORT
5011  OMPT_GET_RETURN_ADDRESS(0),
5012 #endif
5013  task_dup);
5014  // !taskdata->td_flags.native => currently force linear spawning of tasks
5015  // for GOMP_taskloop
5016  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5017  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5018  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5019  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5020  last_chunk));
5021  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5022  grainsize, extras, last_chunk, tc, num_tasks_min,
5023 #if OMPT_SUPPORT
5024  OMPT_GET_RETURN_ADDRESS(0),
5025 #endif
5026  task_dup);
5027  } else {
5028  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5029  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5030  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5031  last_chunk));
5032  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5033  grainsize, extras, last_chunk, tc,
5034 #if OMPT_SUPPORT
5035  OMPT_GET_RETURN_ADDRESS(0),
5036 #endif
5037  task_dup);
5038  }
5039 
5040 #if OMPT_SUPPORT && OMPT_OPTIONAL
5041  if (ompt_enabled.ompt_callback_work) {
5042  ompt_callbacks.ompt_callback(ompt_callback_work)(
5043  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5044  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5045  }
5046 #endif
5047 
5048  if (nogroup == 0) {
5049 #if OMPT_SUPPORT && OMPT_OPTIONAL
5050  OMPT_STORE_RETURN_ADDRESS(gtid);
5051 #endif
5052  __kmpc_end_taskgroup(loc, gtid);
5053  }
5054  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5055 }
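// Worked example (illustrative) of the schedule computation above, for a loop
// with tc == 10 iterations and grainsize(4) requested (sched == 1):
//   - without 'strict' (modifier == 0): num_tasks = 10 / 4 = 2, grainsize is
//     rebalanced to 10 / 2 = 5 and extras = 0, i.e. two tasks of 5 iterations;
//   - with 'strict' (modifier != 0): num_tasks = (10 + 4 - 1) / 4 = 3,
//     last_chunk = 10 - 3 * 4 = -2 and extras = 0, i.e. two tasks of exactly 4
//     iterations and a last task shortened by 2.
// Both cases satisfy the assertion
// tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras).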
5056 
5073 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5074  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5075  int sched, kmp_uint64 grainsize, void *task_dup) {
5076  __kmp_assert_valid_gtid(gtid);
5077  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5078  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5079  0, task_dup);
5080  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5081 }
5082 
5100 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5101  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5102  int nogroup, int sched, kmp_uint64 grainsize,
5103  int modifier, void *task_dup) {
5104  __kmp_assert_valid_gtid(gtid);
5105  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5106  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5107  modifier, task_dup);
5108  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5109 }
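// Illustrative user-level sketch (assumed lowering, not taken from a compiler):
// a taskloop with a strict grainsize is typically compiled into a pattern task
// plus a call to __kmpc_taskloop_5 with sched == 1 and modifier == 1, which
// then drives the splitting logic above.
//
//   void saxpy(int n, float a, const float *x, float *y) {
//   #pragma omp parallel
//   #pragma omp single
//   #pragma omp taskloop grainsize(strict: 512)
//     for (int i = 0; i < n; ++i)
//       y[i] = a * x[i] + y[i];
//   }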