1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
27 void __kmp_init_target_task() {
28  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
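
// The indirection above mirrors a plain dlsym() lookup: libomp does not link
// against libomptarget, so the symbol is resolved lazily at runtime and the
// function pointer simply stays NULL when the symbol is unavailable. A minimal
// standalone sketch of the same pattern (illustrative only; KMP_DLSYM hides
// the platform-specific details):
//
//   #include <dlfcn.h>
//   static void (*query_fn)(void **);
//   static void init_query(void) {
//     *(void **)(&query_fn) = dlsym(RTLD_DEFAULT, "__tgt_target_nowait_query");
//   }
//   // ... later, call through the pointer only if it was resolved:
//   //   if (query_fn) query_fn(&async_handle);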
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36  kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38  kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 // from top to bottom
49 //
50 // gtid: global thread identifier for thread containing stack
51 // thread_data: thread data for task team thread containing stack
52 // threshold: value above which the trace statement triggers
53 // location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55  kmp_thread_data_t *thread_data,
56  int threshold, char *location) {
57  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58  kmp_taskdata_t **stack_top = task_stack->ts_top;
59  kmp_int32 entries = task_stack->ts_entries;
60  kmp_taskdata_t *tied_task;
61 
62  KA_TRACE(
63  threshold,
64  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65  "first_block = %p, stack_top = %p \n",
66  location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68  KMP_DEBUG_ASSERT(stack_top != NULL);
69  KMP_DEBUG_ASSERT(entries > 0);
70 
71  while (entries != 0) {
72  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73  // fix up ts_top if we need to pop from previous block
74  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77  stack_block = stack_block->sb_prev;
78  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79  }
80 
81  // finish bookkeeping
82  stack_top--;
83  entries--;
84 
85  tied_task = *stack_top;
86 
87  KMP_DEBUG_ASSERT(tied_task != NULL);
88  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92  "stack_top=%p, tied_task=%p\n",
93  location, gtid, entries, stack_top, tied_task));
94  }
95  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97  KA_TRACE(threshold,
98  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99  location, gtid));
100 }
101 
102 // __kmp_init_task_stack: initialize the task stack for the first time
103 // after a thread_data structure is created.
104 // It should not be necessary to do this again (assuming the stack works).
105 //
106 // gtid: global thread identifier of calling thread
107 // thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109  kmp_thread_data_t *thread_data) {
110  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111  kmp_stack_block_t *first_block;
112 
113  // set up the first block of the stack
114  first_block = &task_stack->ts_first_block;
115  task_stack->ts_top = (kmp_taskdata_t **)first_block;
116  memset((void *)first_block, '\0',
117  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119  // initialize the stack to be empty
120  task_stack->ts_entries = TASK_STACK_EMPTY;
121  first_block->sb_next = NULL;
122  first_block->sb_prev = NULL;
123 }
124 
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130  kmp_thread_data_t *thread_data) {
131  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135  // free from the second block of the stack
136  while (stack_block != NULL) {
137  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139  stack_block->sb_next = NULL;
140  stack_block->sb_prev = NULL;
141  if (stack_block != &task_stack->ts_first_block) {
142  __kmp_thread_free(__kmp_threads[gtid],
143  stack_block); // free the block, if not the first
144  }
145  stack_block = next_block;
146  }
147  // initialize the stack to be empty
148  task_stack->ts_entries = 0;
149  task_stack->ts_top = NULL;
150 }
151 
152 // __kmp_push_task_stack: Push the tied task onto the task stack.
153 // Grow the stack if necessary by allocating another block.
154 //
155 // gtid: global thread identifier for calling thread
156 // thread: thread info for thread containing stack
157 // tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159  kmp_taskdata_t *tied_task) {
160  // GEH - need to consider what to do if tt_threads_data not allocated yet
161  kmp_thread_data_t *thread_data =
162  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166  return; // Don't push anything on stack if team or team tasks are serialized
167  }
168 
169  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172  KA_TRACE(20,
173  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174  gtid, thread, tied_task));
175  // Store entry
176  *(task_stack->ts_top) = tied_task;
177 
178  // Do bookkeeping for next push
179  task_stack->ts_top++;
180  task_stack->ts_entries++;
181 
182  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183  // Find beginning of this task block
184  kmp_stack_block_t *stack_block =
185  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187  // Check if we already have a block
188  if (stack_block->sb_next !=
189  NULL) { // reset ts_top to beginning of next block
190  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191  } else { // Alloc new block and link it up
192  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193  thread, sizeof(kmp_stack_block_t));
194 
195  task_stack->ts_top = &new_block->sb_block[0];
196  stack_block->sb_next = new_block;
197  new_block->sb_prev = stack_block;
198  new_block->sb_next = NULL;
199 
200  KA_TRACE(
201  30,
202  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203  gtid, tied_task, new_block));
204  }
205  }
206  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207  tied_task));
208 }
209 
210 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211 // the task, just check to make sure it matches the ending task passed in.
212 //
213 // gtid: global thread identifier for the calling thread
214 // thread: thread info structure containing stack
215 // ending_task: the task that is ending; it should match the task popped
216 //   off the stack (checked with a debug assertion)
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218  kmp_taskdata_t *ending_task) {
219  // GEH - need to consider what to do if tt_threads_data not allocated yet
220  kmp_thread_data_t *thread_data =
221  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223  kmp_taskdata_t *tied_task;
224 
225  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226  // Don't pop anything from stack if team or team tasks are serialized
227  return;
228  }
229 
230  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234  thread));
235 
236  // fix up ts_top if we need to pop from previous block
237  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240  stack_block = stack_block->sb_prev;
241  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242  }
243 
244  // finish bookkeeping
245  task_stack->ts_top--;
246  task_stack->ts_entries--;
247 
248  tied_task = *(task_stack->ts_top);
249 
250  KMP_DEBUG_ASSERT(tied_task != NULL);
251  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255  tied_task));
256  return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
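
// A brief illustration of the (normally disabled) tied-task stack used above.
// Entries live in fixed-size blocks linked through sb_next/sb_prev, and
// ts_entries & TASK_STACK_INDEX_MASK gives the slot index inside the current
// block, so a result of 0 means a block boundary was just crossed. Assuming a
// block size of 4 purely for the sake of the picture (the real
// TASK_STACK_BLOCK_SIZE is larger):
//
//   ts_first_block: [t0][t1][t2][t3] --sb_next--> block2: [t4][t5][..][..]
//                                    <--sb_prev--
//   ts_entries = 6, ts_top points one past t5.
//
//   // push: if ((ts_entries & TASK_STACK_INDEX_MASK) == 0), follow sb_next
//   //       (or allocate a new block) before storing the next entry;
//   // pop:  the symmetric check follows sb_prev back into the previous block.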
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264  const kmp_taskdata_t *tasknew,
265  const kmp_taskdata_t *taskcurr) {
266  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268  // only descendant of all deferred tied tasks can be scheduled, checking
269  // the last one is enough, as it in turn is the descendant of all others
270  kmp_taskdata_t *current = taskcurr->td_last_tied;
271  KMP_DEBUG_ASSERT(current != NULL);
272  // check if the task is not suspended on barrier
273  if (current->td_flags.tasktype == TASK_EXPLICIT ||
274  current->td_taskwait_thread > 0) { // <= 0 on barrier
275  kmp_int32 level = current->td_level;
276  kmp_taskdata_t *parent = tasknew->td_parent;
277  while (parent != current && parent->td_level > level) {
278  // check generation up to the level of the current task
279  parent = parent->td_parent;
280  KMP_DEBUG_ASSERT(parent != NULL);
281  }
282  if (parent != current)
283  return false;
284  }
285  }
286  // Check mutexinoutset dependencies, acquire locks
287  kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296  continue;
297  // could not get the lock, release previous locks
298  for (int j = i - 1; j >= 0; --j)
299  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300  return false;
301  }
302  // negative num_locks means all locks acquired successfully
303  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304  }
305  return true;
306 }
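
// Condensed sketch of the Task Scheduling Constraint test performed above
// (illustrative only): a new tied task may be scheduled only if the last
// deferred tied task of the current thread is one of its ancestors, which is
// determined by walking td_parent links no deeper than that task's level.
//
//   const kmp_taskdata_t *p = tasknew->td_parent;
//   while (p != current && p->td_level > current->td_level)
//     p = p->td_parent;                 // climb the ancestor chain
//   bool allowed = (p == current);      // true iff `current` is an ancestor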
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313  kmp_thread_data_t *thread_data) {
314  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316  kmp_int32 new_size = 2 * size;
317 
318  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319  "%d] for thread_data %p\n",
320  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322  kmp_taskdata_t **new_deque =
323  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325  int i, j;
326  for (i = thread_data->td.td_deque_head, j = 0; j < size;
327  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328  new_deque[j] = thread_data->td.td_deque[i];
329 
330  __kmp_free(thread_data->td.td_deque);
331 
332  thread_data->td.td_deque_head = 0;
333  thread_data->td.td_deque_tail = size;
334  thread_data->td.td_deque = new_deque;
335  thread_data->td.td_deque_size = new_size;
336 }
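
// The copy loop above rebases a power-of-two ring buffer while doubling it:
// elements are read in logical order starting at td_deque_head, written to
// index 0.. of the new array, and head/tail are reset to 0/size. A generic
// sketch of the same idea (hypothetical helper, not part of the runtime):
//
//   template <typename T>
//   static void grow_ring(T *&buf, kmp_int32 &cap, kmp_int32 &head,
//                         kmp_int32 &tail, kmp_int32 count) {
//     kmp_int32 new_cap = 2 * cap;                 // cap is a power of two
//     T *nb = (T *)__kmp_allocate(new_cap * sizeof(T));
//     for (kmp_int32 j = 0; j < count; ++j)
//       nb[j] = buf[(head + j) & (cap - 1)];       // unwrap in logical order
//     __kmp_free(buf);
//     buf = nb; head = 0; tail = count; cap = new_cap;
//   }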
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340  kmp_thread_data_t *thread_data = &l->td;
341  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342  thread_data->td.td_deque_last_stolen = -1;
343  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344  "for thread_data %p\n",
345  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349  return l;
350 }
351 
352 // The function finds the deque of priority tasks with the given priority, or
353 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
354 // Deques of non-default-priority tasks are shared between all threads in the
355 // team, as opposed to the per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359  kmp_thread_data_t *thread_data;
360  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361  if (lst->priority == pri) {
362  // Found queue of tasks with given priority.
363  thread_data = &lst->td;
364  } else if (lst->priority < pri) {
365  // All current priority queues contain tasks with lower priority.
366  // Allocate new one for given priority tasks.
367  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368  thread_data = &list->td;
369  list->priority = pri;
370  list->next = lst;
371  task_team->tt.tt_task_pri_list = list;
372  } else { // task_team->tt.tt_task_pri_list->priority > pri
373  kmp_task_pri_t *next_queue = lst->next;
374  while (next_queue && next_queue->priority > pri) {
375  lst = next_queue;
376  next_queue = lst->next;
377  }
378  // lst->priority > pri && (next == NULL || pri >= next->priority)
379  if (next_queue == NULL) {
380  // No queue with pri priority, need to allocate new one.
381  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382  thread_data = &list->td;
383  list->priority = pri;
384  list->next = NULL;
385  lst->next = list;
386  } else if (next_queue->priority == pri) {
387  // Found queue of tasks with given priority.
388  thread_data = &next_queue->td;
389  } else { // lst->priority > pri > next->priority
390 // insert the newly allocated queue between existing queues
391  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392  thread_data = &list->td;
393  list->priority = pri;
394  list->next = next_queue;
395  lst->next = list;
396  }
397  }
398  return thread_data;
399 }
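
// Worked example of how the priority list evolves (kept in high -> low order).
// Suppose tasks arrive with priorities 5, 9, 5, 7:
//
//   push pri 5:  [5]                  (first queue becomes the head)
//   push pri 9:  [9] -> [5]           (higher than the head, new head)
//   push pri 5:  [9] -> [5]           (existing queue reused)
//   push pri 7:  [9] -> [7] -> [5]    (inserted between existing queues)
//
// All threads in the team share these per-priority deques, unlike the
// per-thread deques used for default-priority tasks.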
400 
401 // __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403  kmp_taskdata_t *taskdata,
404  kmp_task_team_t *task_team,
405  kmp_int32 pri) {
406  kmp_thread_data_t *thread_data = NULL;
407  KA_TRACE(20,
408  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409  gtid, taskdata, pri));
410 
411  // Find task queue specific to priority value
412  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413  if (UNLIKELY(lst == NULL)) {
414  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415  if (task_team->tt.tt_task_pri_list == NULL) {
416  // List of queues is still empty, allocate one.
417  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418  thread_data = &list->td;
419  list->priority = pri;
420  list->next = NULL;
421  task_team->tt.tt_task_pri_list = list;
422  } else {
423 // Another thread initialized a queue. Check if it fits and get thread_data.
424  thread_data = __kmp_get_priority_deque_data(task_team, pri);
425  }
426  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427  } else {
428  if (lst->priority == pri) {
429  // Found queue of tasks with given priority.
430  thread_data = &lst->td;
431  } else {
432  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433  thread_data = __kmp_get_priority_deque_data(task_team, pri);
434  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435  }
436  }
437  KMP_DEBUG_ASSERT(thread_data);
438 
439  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440  // Check if deque is full
441  if (TCR_4(thread_data->td.td_deque_ntasks) >=
442  TASK_DEQUE_SIZE(thread_data->td)) {
443  if (__kmp_enable_task_throttling &&
444  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445  thread->th.th_current_task)) {
446  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448  "TASK_NOT_PUSHED for task %p\n",
449  gtid, taskdata));
450  return TASK_NOT_PUSHED;
451  } else {
452  // expand deque to push the task which is not allowed to execute
453  __kmp_realloc_task_deque(thread, thread_data);
454  }
455  }
456  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457  TASK_DEQUE_SIZE(thread_data->td));
458  // Push taskdata.
459  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460  // Wrap index.
461  thread_data->td.td_deque_tail =
462  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463  TCW_4(thread_data->td.td_deque_ntasks,
464  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466  KMP_FSYNC_RELEASING(taskdata); // releasing child
467  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469  gtid, taskdata, thread_data->td.td_deque_ntasks,
470  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472  task_team->tt.tt_num_task_pri++; // atomic inc
473  return TASK_SUCCESSFULLY_PUSHED;
474 }
475 
476 // __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478  kmp_info_t *thread = __kmp_threads[gtid];
479  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481  // If we encounter a hidden helper task, and the current thread is not a
482  // hidden helper thread, we have to give the task to any hidden helper thread
483  // starting from its shadow one.
484  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488  // Signal the hidden helper threads.
489  __kmp_hidden_helper_worker_thread_signal();
490  return TASK_SUCCESSFULLY_PUSHED;
491  }
492 
493  kmp_task_team_t *task_team = thread->th.th_task_team;
494  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495  kmp_thread_data_t *thread_data;
496 
497  KA_TRACE(20,
498  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501  // untied task needs to increment counter so that the task structure is not
502  // freed prematurely
503  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504  KMP_DEBUG_USE_VAR(counter);
505  KA_TRACE(
506  20,
507  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508  gtid, counter, taskdata));
509  }
510 
511  // The first check avoids building task_team thread data if serialized
512  if (UNLIKELY(taskdata->td_flags.task_serial)) {
513  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514  "TASK_NOT_PUSHED for task %p\n",
515  gtid, taskdata));
516  return TASK_NOT_PUSHED;
517  }
518 
519  // Now that serialized tasks have returned, we can assume that we are not in
520  // immediate exec mode
521  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523  __kmp_enable_tasking(task_team, thread);
524  }
525  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529  __kmp_max_task_priority > 0) {
530  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532  }
533 
534  // Find tasking deque specific to encountering thread
535  thread_data = &task_team->tt.tt_threads_data[tid];
536 
537 // No lock needed since only the owner can allocate. If the task is
538 // hidden_helper, we don't need the lock either because the deque for the
539 // hidden helper thread data has already been initialized.
540  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541  __kmp_alloc_task_deque(thread, thread_data);
542  }
543 
544  int locked = 0;
545  // Check if deque is full
546  if (TCR_4(thread_data->td.td_deque_ntasks) >=
547  TASK_DEQUE_SIZE(thread_data->td)) {
548  if (__kmp_enable_task_throttling &&
549  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550  thread->th.th_current_task)) {
551  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552  "TASK_NOT_PUSHED for task %p\n",
553  gtid, taskdata));
554  return TASK_NOT_PUSHED;
555  } else {
556  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557  locked = 1;
558  if (TCR_4(thread_data->td.td_deque_ntasks) >=
559  TASK_DEQUE_SIZE(thread_data->td)) {
560  // expand deque to push the task which is not allowed to execute
561  __kmp_realloc_task_deque(thread, thread_data);
562  }
563  }
564  }
565  // Lock the deque for the task push operation
566  if (!locked) {
567  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568 // Need to recheck as we can get a proxy task from a thread outside of OpenMP
569  if (TCR_4(thread_data->td.td_deque_ntasks) >=
570  TASK_DEQUE_SIZE(thread_data->td)) {
571  if (__kmp_enable_task_throttling &&
572  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573  thread->th.th_current_task)) {
574  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576  "returning TASK_NOT_PUSHED for task %p\n",
577  gtid, taskdata));
578  return TASK_NOT_PUSHED;
579  } else {
580  // expand deque to push the task which is not allowed to execute
581  __kmp_realloc_task_deque(thread, thread_data);
582  }
583  }
584  }
585 // Must have room, since no thread other than the calling thread can add tasks
586  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587  TASK_DEQUE_SIZE(thread_data->td));
588 
589  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590  taskdata; // Push taskdata
591  // Wrap index.
592  thread_data->td.td_deque_tail =
593  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594  TCW_4(thread_data->td.td_deque_ntasks,
595  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597  KMP_FSYNC_RELEASING(taskdata); // releasing child
598  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599  "task=%p ntasks=%d head=%u tail=%u\n",
600  gtid, taskdata, thread_data->td.td_deque_ntasks,
601  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605  return TASK_SUCCESSFULLY_PUSHED;
606 }
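
// The push path above follows a check / lock / re-check discipline because
// proxy tasks may arrive in this deque from a thread outside of OpenMP, so an
// unlocked fullness test can go stale. A condensed sketch of the control flow
// (the helpers `full`, `grow`, and `enqueue` are illustrative names only):
//
//   if (full(deque) && throttling_allows_inline_exec)
//     return TASK_NOT_PUSHED;            // cheap unlocked reject
//   lock(deque);
//   if (full(deque)) {                   // re-check under the lock
//     if (throttling_allows_inline_exec) { unlock(deque); return TASK_NOT_PUSHED; }
//     grow(deque);                       // __kmp_realloc_task_deque
//   }
//   enqueue(deque, task); unlock(deque); return TASK_SUCCESSFULLY_PUSHED;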
607 
608 // __kmp_pop_current_task_from_thread: set up current task from called thread
609 // when team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614  "this_thread=%p, curtask=%p, "
615  "curtask_parent=%p\n",
616  0, this_thr, this_thr->th.th_current_task,
617  this_thr->th.th_current_task->td_parent));
618 
619  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622  "this_thread=%p, curtask=%p, "
623  "curtask_parent=%p\n",
624  0, this_thr, this_thr->th.th_current_task,
625  this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635  int tid) {
636 // the current task of the thread is the parent of the newly created implicit
637 // tasks of the new team
638  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639  "curtask=%p "
640  "parent_task=%p\n",
641  tid, this_thr, this_thr->th.th_current_task,
642  team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644  KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646  if (tid == 0) {
647  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648  team->t.t_implicit_task_taskdata[0].td_parent =
649  this_thr->th.th_current_task;
650  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651  }
652  } else {
653  team->t.t_implicit_task_taskdata[tid].td_parent =
654  team->t.t_implicit_task_taskdata[0].td_parent;
655  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656  }
657 
658  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659  "curtask=%p "
660  "parent_task=%p\n",
661  tid, this_thr, this_thr->th.th_current_task,
662  team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671  kmp_taskdata_t *current_task) {
672  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673  kmp_info_t *thread = __kmp_threads[gtid];
674 
675  KA_TRACE(10,
676  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677  gtid, taskdata, current_task));
678 
679  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681  // mark currently executing task as suspended
682  // TODO: GEH - make sure root team implicit task is initialized properly.
683  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684  current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688  if (taskdata->td_flags.tiedness == TASK_TIED) {
689  __kmp_push_task_stack(gtid, thread, taskdata);
690  }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693  // mark starting task as executing and as current task
694  thread->th.th_current_task = taskdata;
695 
696  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697  taskdata->td_flags.tiedness == TASK_UNTIED);
698  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699  taskdata->td_flags.tiedness == TASK_UNTIED);
700  taskdata->td_flags.started = 1;
701  taskdata->td_flags.executing = 1;
702  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705  // GEH TODO: shouldn't we pass some sort of location identifier here?
706  // APT: yes, we will pass location here.
707  // need to store current thread state (in a thread or taskdata structure)
708  // before setting work_state, otherwise wrong state is set after end of task
709 
710  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712  return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 // __ompt_task_init:
718 // Initialize OMPT fields maintained by a task. This will only be called after
719 // ompt_start_tool, so we already know whether ompt is enabled or not.
720 
721 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
722  // The calls to __ompt_task_init already have the ompt_enabled condition.
723  task->ompt_task_info.task_data.value = 0;
724  task->ompt_task_info.frame.exit_frame = ompt_data_none;
725  task->ompt_task_info.frame.enter_frame = ompt_data_none;
726  task->ompt_task_info.frame.exit_frame_flags =
727  ompt_frame_runtime | ompt_frame_framepointer;
728  task->ompt_task_info.frame.enter_frame_flags =
729  ompt_frame_runtime | ompt_frame_framepointer;
730  task->ompt_task_info.dispatch_chunk.start = 0;
731  task->ompt_task_info.dispatch_chunk.iterations = 0;
732 }
733 
734 // __ompt_task_start:
735 // Build and trigger task-begin event
736 static inline void __ompt_task_start(kmp_task_t *task,
737  kmp_taskdata_t *current_task,
738  kmp_int32 gtid) {
739  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
740  ompt_task_status_t status = ompt_task_switch;
741  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
742  status = ompt_task_yield;
743  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
744  }
745  /* let OMPT know that we're about to run this task */
746  if (ompt_enabled.ompt_callback_task_schedule) {
747  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
748  &(current_task->ompt_task_info.task_data), status,
749  &(taskdata->ompt_task_info.task_data));
750  }
751  taskdata->ompt_task_info.scheduling_parent = current_task;
752 }
753 
754 // __ompt_task_finish:
755 // Build and trigger final task-schedule event
756 static inline void __ompt_task_finish(kmp_task_t *task,
757  kmp_taskdata_t *resumed_task,
758  ompt_task_status_t status) {
759  if (ompt_enabled.ompt_callback_task_schedule) {
760  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
761  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
762  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
763  status = ompt_task_cancel;
764  }
765 
766  /* let OMPT know that we're returning to the callee task */
767  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
768  &(taskdata->ompt_task_info.task_data), status,
769  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
770  }
771 }
772 #endif
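
// Minimal sketch of a first-party OMPT tool that would receive the
// task-schedule events triggered by __ompt_task_start/__ompt_task_finish above
// (callback names and signatures follow <omp-tools.h>; the tool body itself is
// illustrative):
//
//   #include <omp-tools.h>
//   #include <stdio.h>
//   static void on_task_schedule(ompt_data_t *prior, ompt_task_status_t st,
//                                ompt_data_t *next) {
//     printf("task switch: status=%d\n", (int)st);
//   }
//   static int tool_init(ompt_function_lookup_t lookup, int initial_device_num,
//                        ompt_data_t *tool_data) {
//     ompt_set_callback_t set_cb =
//         (ompt_set_callback_t)lookup("ompt_set_callback");
//     set_cb(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
//     return 1; // non-zero keeps the tool active
//   }
//   static void tool_fini(ompt_data_t *tool_data) {}
//   ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
//                                             const char *runtime_version) {
//     static ompt_start_tool_result_t result = {tool_init, tool_fini, {0}};
//     return &result;
//   }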
773 
774 template <bool ompt>
775 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
776  kmp_task_t *task,
777  void *frame_address,
778  void *return_address) {
779  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
780  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
781 
782  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
783  "current_task=%p\n",
784  gtid, loc_ref, taskdata, current_task));
785 
786  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
787  // untied task needs to increment counter so that the task structure is not
788  // freed prematurely
789  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
790  KMP_DEBUG_USE_VAR(counter);
791  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
792  "incremented for task %p\n",
793  gtid, counter, taskdata));
794  }
795 
796  taskdata->td_flags.task_serial =
797  1; // Execute this task immediately, not deferred.
798  __kmp_task_start(gtid, task, current_task);
799 
800 #if OMPT_SUPPORT
801  if (ompt) {
802  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
803  current_task->ompt_task_info.frame.enter_frame.ptr =
804  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
805  current_task->ompt_task_info.frame.enter_frame_flags =
806  taskdata->ompt_task_info.frame.exit_frame_flags =
807  ompt_frame_application | ompt_frame_framepointer;
808  }
809  if (ompt_enabled.ompt_callback_task_create) {
810  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
811  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
812  &(parent_info->task_data), &(parent_info->frame),
813  &(taskdata->ompt_task_info.task_data),
814  TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
815  }
816  __ompt_task_start(task, current_task, gtid);
817  }
818 #endif // OMPT_SUPPORT
819 
820  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
821  loc_ref, taskdata));
822 }
823 
824 #if OMPT_SUPPORT
825 OMPT_NOINLINE
826 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
827  kmp_task_t *task,
828  void *frame_address,
829  void *return_address) {
830  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
831  return_address);
832 }
833 #endif // OMPT_SUPPORT
834 
835 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
836 // execution
837 //
838 // loc_ref: source location information; points to beginning of task block.
839 // gtid: global thread number.
840 // task: task thunk for the started task.
841 #ifdef __s390x__
842 // This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
843 // In order for it to work correctly, the caller also needs to be compiled with
844 // backchain. If a caller is compiled without backchain,
845 // OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
846 // crash.
847 __attribute__((target("backchain")))
848 #endif
849 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
850  kmp_task_t *task) {
851 #if OMPT_SUPPORT
852  if (UNLIKELY(ompt_enabled.enabled)) {
853  OMPT_STORE_RETURN_ADDRESS(gtid);
854  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
855  OMPT_GET_FRAME_ADDRESS(1),
856  OMPT_LOAD_RETURN_ADDRESS(gtid));
857  return;
858  }
859 #endif
860  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
861 }
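
// For reference, this entry point is paired with __kmpc_omp_task_complete_if0
// below: an OpenMP compiler typically lowers `#pragma omp task if(0)` into an
// inline invocation of the outlined task body bracketed by the two calls,
// roughly as follows (a sketch, not verbatim compiler output):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sizeof_kmp_task_t,
//                                         sizeof_shareds, &task_entry);
//   /* ... copy firstprivate/shared data into t ... */
//   __kmpc_omp_task_begin_if0(loc, gtid, t);
//   task_entry(gtid, t);                       // run the task body inline
//   __kmpc_omp_task_complete_if0(loc, gtid, t);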
862 
863 #ifdef TASK_UNUSED
864 // __kmpc_omp_task_begin: report that a given task has started execution
865 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
866 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
867  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
868 
869  KA_TRACE(
870  10,
871  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
872  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
873 
874  __kmp_task_start(gtid, task, current_task);
875 
876  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
877  loc_ref, KMP_TASK_TO_TASKDATA(task)));
878  return;
879 }
880 #endif // TASK_UNUSED
881 
882 // __kmp_free_task: free the current task space and the space for shareds
883 //
884 // gtid: Global thread ID of calling thread
885 // taskdata: task to free
886 // thread: thread data structure of caller
887 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
888  kmp_info_t *thread) {
889  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
890  taskdata));
891 
892  // Check to make sure all flags and counters have the correct values
893  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
894  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
895  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
896  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
897  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
898  taskdata->td_flags.task_serial == 1);
899  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
900  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
901  // Clear data to not be re-used later by mistake.
902  task->data1.destructors = NULL;
903  task->data2.priority = 0;
904 
905  taskdata->td_flags.freed = 1;
906 #if OMPX_TASKGRAPH
907  // do not free tasks in taskgraph
908  if (!taskdata->is_taskgraph) {
909 #endif
910 // deallocate the taskdata and shared variable blocks associated with this task
911 #if USE_FAST_MEMORY
912  __kmp_fast_free(thread, taskdata);
913 #else /* ! USE_FAST_MEMORY */
914  __kmp_thread_free(thread, taskdata);
915 #endif
916 #if OMPX_TASKGRAPH
917  } else {
918  taskdata->td_flags.complete = 0;
919  taskdata->td_flags.started = 0;
920  taskdata->td_flags.freed = 0;
921  taskdata->td_flags.executing = 0;
922  taskdata->td_flags.task_serial =
923  (taskdata->td_parent->td_flags.final ||
924  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
925 
926  // taskdata->td_allow_completion_event.pending_events_count = 1;
927  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
928  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
929  // start at one because counts current task and children
930  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
931  }
932 #endif
933 
934  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
935 }
936 
937 // __kmp_free_task_and_ancestors: free the current task and ancestors without
938 // children
939 //
940 // gtid: Global thread ID of calling thread
941 // taskdata: task to free
942 // thread: thread data structure of caller
943 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
944  kmp_taskdata_t *taskdata,
945  kmp_info_t *thread) {
946  // Proxy tasks must always be allowed to free their parents
947  // because they can be run in background even in serial mode.
948  kmp_int32 team_serial =
949  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
950  !taskdata->td_flags.proxy;
951  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
952 
953  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
954  KMP_DEBUG_ASSERT(children >= 0);
955 
956  // Now, go up the ancestor tree to see if any ancestors can now be freed.
957  while (children == 0) {
958  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
959 
960  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
961  "and freeing itself\n",
962  gtid, taskdata));
963 
964  // --- Deallocate my ancestor task ---
965  __kmp_free_task(gtid, taskdata, thread);
966 
967  taskdata = parent_taskdata;
968 
969  if (team_serial)
970  return;
971  // Stop checking ancestors at implicit task instead of walking up ancestor
972  // tree to avoid premature deallocation of ancestors.
973  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
974  if (taskdata->td_dephash) { // do we need to cleanup dephash?
975  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
976  kmp_tasking_flags_t flags_old = taskdata->td_flags;
977  if (children == 0 && flags_old.complete == 1) {
978  kmp_tasking_flags_t flags_new = flags_old;
979  flags_new.complete = 0;
980  if (KMP_COMPARE_AND_STORE_ACQ32(
981  RCAST(kmp_int32 *, &taskdata->td_flags),
982  *RCAST(kmp_int32 *, &flags_old),
983  *RCAST(kmp_int32 *, &flags_new))) {
984  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
985  "dephash of implicit task %p\n",
986  gtid, taskdata));
987  // cleanup dephash of finished implicit task
988  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
989  }
990  }
991  }
992  return;
993  }
994  // Predecrement simulated by "- 1" calculation
995  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
996  KMP_DEBUG_ASSERT(children >= 0);
997  }
998 
999  KA_TRACE(
1000  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
1001  "not freeing it yet\n",
1002  gtid, taskdata, children));
1003 }
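
// Note on the reference counting used here: td_allocated_child_tasks starts at
// one because it counts the current task plus its children, and KMP_ATOMIC_DEC
// returns the value *before* the decrement, hence the "- 1" to obtain the new
// count. Sketch:
//
//   kmp_int32 remaining = KMP_ATOMIC_DEC(&td->td_allocated_child_tasks) - 1;
//   if (remaining == 0) {
//     // last reference dropped: td can be freed, and its parent loses one
//     // reference in turn, which is what the loop above walks upward.
//   }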
1004 
1005 // Only need to keep track of child task counts if any of the following:
1006 // 1. team parallel and tasking not serialized;
1007 // 2. it is a proxy or detachable or hidden helper task
1008 // 3. the children counter of its parent task is greater than 0.
1009 // The reason for the 3rd case is a serialized team that encountered a detached
1010 // or hidden helper task T. The execution of T is still deferred, and a regular
1011 // task may depend on T. If we did not track the children in this case, task
1012 // synchronization would be broken.
1013 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
1014  kmp_tasking_flags_t flags = taskdata->td_flags;
1015  bool ret = !(flags.team_serial || flags.tasking_ser);
1016  ret = ret || flags.proxy == TASK_PROXY ||
1017  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1018  ret = ret ||
1019  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1020 #if OMPX_TASKGRAPH
1021  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1022  ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1023 #endif
1024  return ret;
1025 }
1026 
1027 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1028 //
1029 // gtid: global thread ID for calling thread
1030 // task: task to be finished
1031 // resumed_task: task to be resumed. (may be NULL if task is serialized)
1032 //
1033 // template<ompt>: effectively ompt_enabled.enabled!=0
1034 // the version with ompt=false is inlined, allowing the compiler to optimize
1035 // away all OMPT code in this case
1036 template <bool ompt>
1037 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1038  kmp_taskdata_t *resumed_task) {
1039  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1040  kmp_info_t *thread = __kmp_threads[gtid];
1041  kmp_task_team_t *task_team =
1042  thread->th.th_task_team; // might be NULL for serial teams...
1043 #if OMPX_TASKGRAPH
1044  // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
1045  bool is_taskgraph;
1046 #endif
1047 #if KMP_DEBUG
1048  kmp_int32 children = 0;
1049 #endif
1050  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1051  "task %p\n",
1052  gtid, taskdata, resumed_task));
1053 
1054  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1055 
1056 #if OMPX_TASKGRAPH
1057  is_taskgraph = taskdata->is_taskgraph;
1058 #endif
1059 
1060 // Pop task from stack if tied
1061 #ifdef BUILD_TIED_TASK_STACK
1062  if (taskdata->td_flags.tiedness == TASK_TIED) {
1063  __kmp_pop_task_stack(gtid, thread, taskdata);
1064  }
1065 #endif /* BUILD_TIED_TASK_STACK */
1066 
1067  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1068  // untied task needs to check the counter so that the task structure is not
1069  // freed prematurely
1070  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1071  KA_TRACE(
1072  20,
1073  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1074  gtid, counter, taskdata));
1075  if (counter > 0) {
1076  // untied task is not done, to be continued possibly by other thread, do
1077  // not free it now
1078  if (resumed_task == NULL) {
1079  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1080  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1081  // task is the parent
1082  }
1083  thread->th.th_current_task = resumed_task; // restore current_task
1084  resumed_task->td_flags.executing = 1; // resume previous task
1085  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1086  "resuming task %p\n",
1087  gtid, taskdata, resumed_task));
1088  return;
1089  }
1090  }
1091 
1092  // bookkeeping for resuming task:
1093  // GEH - note tasking_ser => task_serial
1094  KMP_DEBUG_ASSERT(
1095  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1096  taskdata->td_flags.task_serial);
1097  if (taskdata->td_flags.task_serial) {
1098  if (resumed_task == NULL) {
1099  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1100  // task is the parent
1101  }
1102  } else {
1103  KMP_DEBUG_ASSERT(resumed_task !=
1104  NULL); // verify that resumed task is passed as argument
1105  }
1106 
1107  /* If the task's destructor thunk flag has been set, we need to invoke the
1108  destructor thunk that has been generated by the compiler. The code is
1109  placed here, since at this point other tasks might have been released
1110  hence overlapping the destructor invocations with some other work in the
1111  released tasks. The OpenMP spec is not specific on when the destructors
1112  are invoked, so we should be free to choose. */
1113  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1114  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1115  KMP_ASSERT(destr_thunk);
1116  destr_thunk(gtid, task);
1117  }
1118 
1119  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1120  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1121  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1122 
1123  bool completed = true;
1124  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1125  if (taskdata->td_allow_completion_event.type ==
1126  KMP_EVENT_ALLOW_COMPLETION) {
1127  // event hasn't been fulfilled yet. Try to detach task.
1128  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1129  if (taskdata->td_allow_completion_event.type ==
1130  KMP_EVENT_ALLOW_COMPLETION) {
1131  // task finished execution
1132  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1133  taskdata->td_flags.executing = 0; // suspend the finishing task
1134 
1135 #if OMPT_SUPPORT
1136 // For a detached task that is not yet completed, we report a switch back
1137 // (ompt_task_detach); completion is signaled later by omp_fulfill_event.
1138 // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1139  if (ompt)
1140  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1141 #endif
1142 
1143  // no access to taskdata after this point!
1144  // __kmp_fulfill_event might free taskdata at any time from now
1145 
1146  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1147  completed = false;
1148  }
1149  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1150  }
1151  }
1152 
1153  // Tasks with valid target async handles must be re-enqueued.
1154  if (taskdata->td_target_data.async_handle != NULL) {
1155  // Note: no need to translate gtid to its shadow. If the current thread is a
1156  // hidden helper one, then the gtid is already correct. Otherwise, hidden
1157 // helper threads are disabled, and gtid refers to an OpenMP thread.
1158 #if OMPT_SUPPORT
1159  if (ompt) {
1160  __ompt_task_finish(task, resumed_task, ompt_task_switch);
1161  }
1162 #endif
1163  __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1164  if (KMP_HIDDEN_HELPER_THREAD(gtid))
1165  __kmp_hidden_helper_worker_thread_signal();
1166  completed = false;
1167  }
1168 
1169  if (completed) {
1170  taskdata->td_flags.complete = 1; // mark the task as completed
1171 #if OMPX_TASKGRAPH
1172  taskdata->td_flags.onced = 1; // mark the task as ran once already
1173 #endif
1174 
1175 #if OMPT_SUPPORT
1176  // This is not a detached task, we are done here
1177  if (ompt)
1178  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1179 #endif
1180  // TODO: What would be the balance between the conditions in the function
1181  // and an atomic operation?
1182  if (__kmp_track_children_task(taskdata)) {
1183  __kmp_release_deps(gtid, taskdata);
1184  // Predecrement simulated by "- 1" calculation
1185 #if KMP_DEBUG
1186  children = -1 +
1187 #endif
1188  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1189  KMP_DEBUG_ASSERT(children >= 0);
1190 #if OMPX_TASKGRAPH
1191  if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1192 #else
1193  if (taskdata->td_taskgroup)
1194 #endif
1195  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1196  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1197  task_team->tt.tt_hidden_helper_task_encountered)) {
1198  // if we found proxy or hidden helper tasks there could exist a dependency
1199  // chain with the proxy task as origin
1200  __kmp_release_deps(gtid, taskdata);
1201  }
1202  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1203 // called. Otherwise, if a task is executed immediately from the
1204  // release_deps code, the flag will be reset to 1 again by this same
1205  // function
1206  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1207  taskdata->td_flags.executing = 0; // suspend the finishing task
1208 
1209  // Decrement the counter of hidden helper tasks to be executed.
1210  if (taskdata->td_flags.hidden_helper) {
1211  // Hidden helper tasks can only be executed by hidden helper threads.
1212  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1213  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1214  }
1215  }
1216 
1217  KA_TRACE(
1218  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1219  gtid, taskdata, children));
1220 
1221  // Free this task and then ancestor tasks if they have no children.
1222  // Restore th_current_task first as suggested by John:
1223  // johnmc: if an asynchronous inquiry peers into the runtime system
1224  // it doesn't see the freed task as the current task.
1225  thread->th.th_current_task = resumed_task;
1226  if (completed)
1227  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1228 
1229  // TODO: GEH - make sure root team implicit task is initialized properly.
1230  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1231  resumed_task->td_flags.executing = 1; // resume previous task
1232 
1233 #if OMPX_TASKGRAPH
1234  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1235  taskdata->td_taskgroup) {
1236  // TDG: we only release taskgroup barrier here because
1237  // free_task_and_ancestors will call
1238  // __kmp_free_task, which resets all task parameters such as
1239  // taskdata->started, etc. If we release the barrier earlier, these
1240  // parameters could be read before being reset. This is not an issue for
1241  // non-TDG implementation because we never reuse a task(data) structure
1242  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1243  }
1244 #endif
1245 
1246  KA_TRACE(
1247  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1248  gtid, taskdata, resumed_task));
1249 
1250  return;
1251 }
1252 
1253 template <bool ompt>
1254 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1255  kmp_int32 gtid,
1256  kmp_task_t *task) {
1257  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1258  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1259  KMP_DEBUG_ASSERT(gtid >= 0);
1260  // this routine will provide task to resume
1261  __kmp_task_finish<ompt>(gtid, task, NULL);
1262 
1263  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1264  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1265 
1266 #if OMPT_SUPPORT
1267  if (ompt) {
1268  ompt_frame_t *ompt_frame;
1269  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1270  ompt_frame->enter_frame = ompt_data_none;
1271  ompt_frame->enter_frame_flags =
1272  ompt_frame_runtime | ompt_frame_framepointer;
1273  }
1274 #endif
1275 
1276  return;
1277 }
1278 
1279 #if OMPT_SUPPORT
1280 OMPT_NOINLINE
1281 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1282  kmp_task_t *task) {
1283  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1284 }
1285 #endif // OMPT_SUPPORT
1286 
1287 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1288 //
1289 // loc_ref: source location information; points to end of task block.
1290 // gtid: global thread number.
1291 // task: task thunk for the completed task.
1292 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1293  kmp_task_t *task) {
1294 #if OMPT_SUPPORT
1295  if (UNLIKELY(ompt_enabled.enabled)) {
1296  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1297  return;
1298  }
1299 #endif
1300  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1301 }
1302 
1303 #ifdef TASK_UNUSED
1304 // __kmpc_omp_task_complete: report that a task has completed execution
1305 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1306 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1307  kmp_task_t *task) {
1308  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1309  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1310 
1311  __kmp_task_finish<false>(gtid, task,
1312  NULL); // Not sure how to find task to resume
1313 
1314  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1315  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1316  return;
1317 }
1318 #endif // TASK_UNUSED
1319 
1320 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1321 // task for a given thread
1322 //
1323 // loc_ref: reference to source location of parallel region
1324 // this_thr: thread data structure corresponding to implicit task
1325 // team: team for this_thr
1326 // tid: thread id of given thread within team
1327 // set_curr_task: TRUE if need to push current task to thread
1328 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1329 // have already been done elsewhere.
1330 // TODO: Get better loc_ref. Value passed in may be NULL
1331 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1332  kmp_team_t *team, int tid, int set_curr_task) {
1333  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1334 
1335  KF_TRACE(
1336  10,
1337  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1338  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1339 
1340  task->td_task_id = KMP_GEN_TASK_ID();
1341  task->td_team = team;
1342  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1343  // in debugger)
1344  task->td_ident = loc_ref;
1345  task->td_taskwait_ident = NULL;
1346  task->td_taskwait_counter = 0;
1347  task->td_taskwait_thread = 0;
1348 
1349  task->td_flags.tiedness = TASK_TIED;
1350  task->td_flags.tasktype = TASK_IMPLICIT;
1351  task->td_flags.proxy = TASK_FULL;
1352 
1353  // All implicit tasks are executed immediately, not deferred
1354  task->td_flags.task_serial = 1;
1355  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1356  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1357 
1358  task->td_flags.started = 1;
1359  task->td_flags.executing = 1;
1360  task->td_flags.complete = 0;
1361  task->td_flags.freed = 0;
1362 #if OMPX_TASKGRAPH
1363  task->td_flags.onced = 0;
1364 #endif
1365 
1366  task->td_depnode = NULL;
1367  task->td_last_tied = task;
1368  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1369 
1370  if (set_curr_task) { // only do this init first time thread is created
1371  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1372  // Not used: don't need to deallocate implicit task
1373  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1374  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1375  task->td_dephash = NULL;
1376  __kmp_push_current_task_to_thread(this_thr, team, tid);
1377  } else {
1378  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1379  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1380  }
1381 
1382 #if OMPT_SUPPORT
1383  if (UNLIKELY(ompt_enabled.enabled))
1384  __ompt_task_init(task, tid);
1385 #endif
1386 
1387  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1388  team, task));
1389 }
1390 
1391 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1392 // at the end of parallel regions. Some resources are kept for reuse in the next
1393 // parallel region.
1394 //
1395 // thread: thread data structure corresponding to implicit task
1396 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1397  kmp_taskdata_t *task = thread->th.th_current_task;
1398  if (task->td_dephash) {
1399  int children;
1400  task->td_flags.complete = 1;
1401 #if OMPX_TASKGRAPH
1402  task->td_flags.onced = 1;
1403 #endif
1404  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1405  kmp_tasking_flags_t flags_old = task->td_flags;
1406  if (children == 0 && flags_old.complete == 1) {
1407  kmp_tasking_flags_t flags_new = flags_old;
1408  flags_new.complete = 0;
1409  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1410  *RCAST(kmp_int32 *, &flags_old),
1411  *RCAST(kmp_int32 *, &flags_new))) {
1412  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1413  "dephash of implicit task %p\n",
1414  thread->th.th_info.ds.ds_gtid, task));
1415  __kmp_dephash_free_entries(thread, task->td_dephash);
1416  }
1417  }
1418  }
1419 }
1420 
1421 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1422 // when the regions that own them are destroyed
1423 //
1424 // thread: thread data structure corresponding to implicit task
1425 void __kmp_free_implicit_task(kmp_info_t *thread) {
1426  kmp_taskdata_t *task = thread->th.th_current_task;
1427  if (task && task->td_dephash) {
1428  __kmp_dephash_free(thread, task->td_dephash);
1429  task->td_dephash = NULL;
1430  }
1431 }
1432 
1433 // Round a size up to a multiple of val (a power of two): used to insert
1434 // padding between structures co-allocated using a single malloc() call
1435 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1436  if (size & (val - 1)) {
1437  size &= ~(val - 1);
1438  if (size <= KMP_SIZE_T_MAX - val) {
1439  size += val; // Round up if there is no overflow.
1440  }
1441  }
1442  return size;
1443 } // __kmp_round_up_to_val
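// Illustrative sketch (editorial, not part of the runtime): a few worked values
// for __kmp_round_up_to_val, assuming val is a power of two as the callers
// guarantee:
//   __kmp_round_up_to_val(40, 8)  -> 40   (already a multiple of 8, unchanged)
//   __kmp_round_up_to_val(41, 8)  -> 48   (low bits cleared, then val added)
//   __kmp_round_up_to_val(41, 16) -> 48
// __kmp_task_alloc() below uses this to pad the combined taskdata/task block so
// that the shareds array placed after it stays pointer-aligned.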
1444 
1445 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1446 //
1447 // loc_ref: source location information
1448 // gtid: global thread number.
1449 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1450 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1451 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1452 // private vars accessed in task.
1453 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1454 // in task.
1455 // task_entry: Pointer to task code entry point generated by compiler.
1456 // returns: a pointer to the allocated kmp_task_t structure (task).
1457 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1458  kmp_tasking_flags_t *flags,
1459  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1460  kmp_routine_entry_t task_entry) {
1461  kmp_task_t *task;
1462  kmp_taskdata_t *taskdata;
1463  kmp_info_t *thread = __kmp_threads[gtid];
1464  kmp_team_t *team = thread->th.th_team;
1465  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1466  size_t shareds_offset;
1467 
1468  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1469  __kmp_middle_initialize();
1470 
1471  if (flags->hidden_helper) {
1472  if (__kmp_enable_hidden_helper) {
1473  if (!TCR_4(__kmp_init_hidden_helper))
1474  __kmp_hidden_helper_initialize();
1475  } else {
1476  // If the hidden helper task is not enabled, reset the flag to FALSE.
1477  flags->hidden_helper = FALSE;
1478  }
1479  }
1480 
1481  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1482  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1483  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1484  sizeof_shareds, task_entry));
1485 
1486  KMP_DEBUG_ASSERT(parent_task);
1487  if (parent_task->td_flags.final) {
1488  if (flags->merged_if0) {
1489  }
1490  flags->final = 1;
1491  }
1492 
1493  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1494  // An untied task causes the task stealing constraint (TSC) algorithm to
1495  // check the entire deque of the victim thread. If no untied task has been
1496  // encountered, checking the head of the deque is enough.
1497  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1498  }
1499 
1500  // Detachable tasks are not proxy tasks yet, but they could become proxy
1501  // tasks in the future. Doing the tasking setup when that happens would be
1502  // too late.
1503  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1504  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1505  if (flags->proxy == TASK_PROXY) {
1506  flags->tiedness = TASK_UNTIED;
1507  flags->merged_if0 = 1;
1508  }
1509  /* We may be running in a serialized parallel region or in
1510  tskm_immediate_exec mode; either way, tasking support must be enabled. */
1511  if ((thread->th.th_task_team) == NULL) {
1512  /* This should only happen if the team is serialized:
1513  set up a task team and propagate it to the thread. */
1514  KMP_DEBUG_ASSERT(team->t.t_serialized);
1515  KA_TRACE(30,
1516  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1517  gtid));
1518  __kmp_task_team_setup(thread, team);
1519  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1520  }
1521  kmp_task_team_t *task_team = thread->th.th_task_team;
1522 
1523  /* tasking must be enabled now as the task might not be pushed */
1524  if (!KMP_TASKING_ENABLED(task_team)) {
1525  KA_TRACE(
1526  30,
1527  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1528  __kmp_enable_tasking(task_team, thread);
1529  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1530  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1531  // No lock needed since only owner can allocate
1532  if (thread_data->td.td_deque == NULL) {
1533  __kmp_alloc_task_deque(thread, thread_data);
1534  }
1535  }
1536 
1537  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1538  task_team->tt.tt_found_proxy_tasks == FALSE)
1539  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1540  if (flags->hidden_helper &&
1541  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1542  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1543  }
1544 
1545  // Calculate shared structure offset including padding after kmp_task_t struct
1546  // to align pointers in shared struct
1547  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1548  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1549 
1550  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1551  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1552  shareds_offset));
1553  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1554  sizeof_shareds));
1555 
1556  // Avoid double allocation here by combining shareds with taskdata
1557 #if USE_FAST_MEMORY
1558  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1559  sizeof_shareds);
1560 #else /* ! USE_FAST_MEMORY */
1561  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1562  sizeof_shareds);
1563 #endif /* USE_FAST_MEMORY */
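// Rough picture of the single block allocated above (editorial sketch,
// illustrative only; widths are not to scale):
//
//   taskdata                  task = KMP_TASKDATA_TO_TASK(taskdata)
//   |                         |
//   v                         v
//   +----------------+-------------------------------+---------+--------------+
//   | kmp_taskdata_t | kmp_task_t (+ private copies) | padding | shared ptrs  |
//   +----------------+-------------------------------+---------+--------------+
//
// shareds_offset covers everything up to and including the padding; the array
// of pointers to shared variables occupies the final sizeof_shareds bytes.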
1564 
1565  task = KMP_TASKDATA_TO_TASK(taskdata);
1566 
1567 // Make sure task & taskdata are aligned appropriately
1568 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1569  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1570  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1571 #else
1572  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1573  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1574 #endif
1575  if (sizeof_shareds > 0) {
1576  // Avoid double allocation here by combining shareds with taskdata
1577  task->shareds = &((char *)taskdata)[shareds_offset];
1578  // Make sure shareds struct is aligned to pointer size
1579  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1580  0);
1581  } else {
1582  task->shareds = NULL;
1583  }
1584  task->routine = task_entry;
1585  task->part_id = 0; // AC: Always start with 0 part id
1586 
1587  taskdata->td_task_id = KMP_GEN_TASK_ID();
1588  taskdata->td_team = thread->th.th_team;
1589  taskdata->td_alloc_thread = thread;
1590  taskdata->td_parent = parent_task;
1591  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1592  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1593  taskdata->td_ident = loc_ref;
1594  taskdata->td_taskwait_ident = NULL;
1595  taskdata->td_taskwait_counter = 0;
1596  taskdata->td_taskwait_thread = 0;
1597  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1598  // avoid copying icvs for proxy tasks
1599  if (flags->proxy == TASK_FULL)
1600  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1601 
1602  taskdata->td_flags = *flags;
1603  taskdata->td_task_team = thread->th.th_task_team;
1604  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1605  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1606  // If it is hidden helper task, we need to set the team and task team
1607  // correspondingly.
1608  if (flags->hidden_helper) {
1609  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1610  taskdata->td_team = shadow_thread->th.th_team;
1611  taskdata->td_task_team = shadow_thread->th.th_task_team;
1612  }
1613 
1614  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1615  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1616 
1617  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1618  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1619 
1620  // GEH - Note we serialize the task if the team is serialized to make sure
1621  // implicit parallel region tasks are not left to execute until program
1622  // termination. Executing immediately also helps locality.
1623 
1624  taskdata->td_flags.task_serial =
1625  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1626  taskdata->td_flags.tasking_ser || flags->merged_if0);
1627 
1628  taskdata->td_flags.started = 0;
1629  taskdata->td_flags.executing = 0;
1630  taskdata->td_flags.complete = 0;
1631  taskdata->td_flags.freed = 0;
1632 #if OMPX_TASKGRAPH
1633  taskdata->td_flags.onced = 0;
1634 #endif
1635  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1636  // start at one because the count includes the current task and its children
1637  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1638  taskdata->td_taskgroup =
1639  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1640  taskdata->td_dephash = NULL;
1641  taskdata->td_depnode = NULL;
1642  taskdata->td_target_data.async_handle = NULL;
1643  if (flags->tiedness == TASK_UNTIED)
1644  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1645  else
1646  taskdata->td_last_tied = taskdata;
1647  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1648 #if OMPT_SUPPORT
1649  if (UNLIKELY(ompt_enabled.enabled))
1650  __ompt_task_init(taskdata, gtid);
1651 #endif
1652  // TODO: What would be the balance between the conditions in the function and
1653  // an atomic operation?
1654  if (__kmp_track_children_task(taskdata)) {
1655  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1656  if (parent_task->td_taskgroup)
1657  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1658  // Only need to keep track of allocated child tasks for explicit tasks,
1659  // since implicit tasks are never deallocated
1660  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1661  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1662  }
1663  if (flags->hidden_helper) {
1664  taskdata->td_flags.task_serial = FALSE;
1665  // Increment the number of hidden helper tasks to be executed
1666  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1667  }
1668  }
1669 
1670 #if OMPX_TASKGRAPH
1671  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1672  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1673  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1674  taskdata->is_taskgraph = 1;
1675  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1676  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1677  }
1678 #endif
1679  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1680  gtid, taskdata, taskdata->td_parent));
1681 
1682  return task;
1683 }
1684 
1685 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1686  kmp_int32 flags, size_t sizeof_kmp_task_t,
1687  size_t sizeof_shareds,
1688  kmp_routine_entry_t task_entry) {
1689  kmp_task_t *retval;
1690  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1691  __kmp_assert_valid_gtid(gtid);
1692  input_flags->native = FALSE;
1693  // __kmp_task_alloc() sets up all other runtime flags
1694  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1695  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1696  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1697  input_flags->proxy ? "proxy" : "",
1698  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1699  sizeof_shareds, task_entry));
1700 
1701  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1702  sizeof_shareds, task_entry);
1703 
1704  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1705 
1706  return retval;
1707 }
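// Illustrative sketch (editorial, not part of the runtime): roughly the code a
// compiler might emit for "#pragma omp task shared(x) firstprivate(y)".  The
// struct and function names below are hypothetical; the real layout and entry
// point are chosen by the compiler.
//
//   struct my_shareds { int *x; };                     // pointers to shared vars
//   struct my_task_with_privates { kmp_task_t task; int y; };
//
//   kmp_int32 my_task_entry(kmp_int32 gtid, void *tsk) {
//     kmp_task_t *t = (kmp_task_t *)tsk;
//     struct my_shareds *sh = (struct my_shareds *)t->shareds;
//     int y = ((struct my_task_with_privates *)t)->y;
//     *sh->x += y;                                      // the task body
//     return 0;
//   }
//
//   // ...at the task construct, with loc a compiler-generated ident_t:
//   kmp_task_t *t = __kmpc_omp_task_alloc(
//       &loc, gtid, /*flags=*/1 /* tied */,
//       sizeof(struct my_task_with_privates), sizeof(struct my_shareds),
//       (kmp_routine_entry_t)&my_task_entry);
//   ((struct my_shareds *)t->shareds)->x = &x;          // capture shared var
//   ((struct my_task_with_privates *)t)->y = y;         // copy firstprivate
//   __kmpc_omp_task(&loc, gtid, t);                     // hand the task off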
1708 
1709 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1710  kmp_int32 flags,
1711  size_t sizeof_kmp_task_t,
1712  size_t sizeof_shareds,
1713  kmp_routine_entry_t task_entry,
1714  kmp_int64 device_id) {
1715  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1716  // target tasks are untied, as defined in the specification
1717  input_flags.tiedness = TASK_UNTIED;
1718  input_flags.target = 1;
1719 
1720  if (__kmp_enable_hidden_helper)
1721  input_flags.hidden_helper = TRUE;
1722 
1723  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1724  sizeof_shareds, task_entry);
1725 }
1726 
1740 kmp_int32
1741 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1742  kmp_task_t *new_task, kmp_int32 naffins,
1743  kmp_task_affinity_info_t *affin_list) {
1744  return 0;
1745 }
1746 
1747 // __kmp_invoke_task: invoke the specified task
1748 //
1749 // gtid: global thread ID of caller
1750 // task: the task to invoke
1751 // current_task: the task to resume after task invocation
1752 #ifdef __s390x__
1753 __attribute__((target("backchain")))
1754 #endif
1755 static void
1756 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1757  kmp_taskdata_t *current_task) {
1758  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1759  kmp_info_t *thread;
1760  int discard = 0 /* false */;
1761  KA_TRACE(
1762  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1763  gtid, taskdata, current_task));
1764  KMP_DEBUG_ASSERT(task);
1765  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1766  taskdata->td_flags.complete == 1)) {
1767  // This is a proxy task that was already completed but it needs to run
1768  // its bottom-half finish
1769  KA_TRACE(
1770  30,
1771  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1772  gtid, taskdata));
1773 
1774  __kmp_bottom_half_finish_proxy(gtid, task);
1775 
1776  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1777  "proxy task %p, resuming task %p\n",
1778  gtid, taskdata, current_task));
1779 
1780  return;
1781  }
1782 
1783 #if OMPT_SUPPORT
1784  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1785  // does not execute code.
1786  ompt_thread_info_t oldInfo;
1787  if (UNLIKELY(ompt_enabled.enabled)) {
1788  // Store the threads states and restore them after the task
1789  thread = __kmp_threads[gtid];
1790  oldInfo = thread->th.ompt_thread_info;
1791  thread->th.ompt_thread_info.wait_id = 0;
1792  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1793  ? ompt_state_work_serial
1794  : ompt_state_work_parallel;
1795  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796  }
1797 #endif
1798 
1799  // Proxy tasks are not handled by the runtime
1800  if (taskdata->td_flags.proxy != TASK_PROXY) {
1801  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1802  }
1803 
1804  // TODO: cancel tasks if the parallel region has also been cancelled
1805  // TODO: check if this sequence can be hoisted above __kmp_task_start
1806  // if cancellation has been enabled for this run ...
1807  if (UNLIKELY(__kmp_omp_cancellation)) {
1808  thread = __kmp_threads[gtid];
1809  kmp_team_t *this_team = thread->th.th_team;
1810  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1811  if ((taskgroup && taskgroup->cancel_request) ||
1812  (this_team->t.t_cancel_request == cancel_parallel)) {
1813 #if OMPT_SUPPORT && OMPT_OPTIONAL
1814  ompt_data_t *task_data;
1815  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1816  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1817  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1818  task_data,
1819  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1820  : ompt_cancel_parallel) |
1821  ompt_cancel_discarded_task,
1822  NULL);
1823  }
1824 #endif
1825  KMP_COUNT_BLOCK(TASK_cancelled);
1826  // cancellation was requested for this task's taskgroup or enclosing parallel region; discard the task
1827  discard = 1 /* true */;
1828  }
1829  }
1830 
1831  // Invoke the task routine and pass in relevant data.
1832  // Thunks generated by gcc take a different argument list.
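// For reference (editorial note), the two entry-point shapes dispatched below
// are roughly:
//   kmp_int32 kmpc_style_entry(kmp_int32 gtid, kmp_task_t *task);  // default
//   void gomp_style_entry(void *shareds);             // td_flags.native != 0
// The names are placeholders; only the argument lists matter here.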
1833  if (!discard) {
1834  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1835  taskdata->td_last_tied = current_task->td_last_tied;
1836  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1837  }
1838 #if KMP_STATS_ENABLED
1839  KMP_COUNT_BLOCK(TASK_executed);
1840  switch (KMP_GET_THREAD_STATE()) {
1841  case FORK_JOIN_BARRIER:
1842  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1843  break;
1844  case PLAIN_BARRIER:
1845  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1846  break;
1847  case TASKYIELD:
1848  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1849  break;
1850  case TASKWAIT:
1851  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1852  break;
1853  case TASKGROUP:
1854  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1855  break;
1856  default:
1857  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1858  break;
1859  }
1860 #endif // KMP_STATS_ENABLED
1861 
1862 // OMPT task begin
1863 #if OMPT_SUPPORT
1864  if (UNLIKELY(ompt_enabled.enabled))
1865  __ompt_task_start(task, current_task, gtid);
1866 #endif
1867 #if OMPT_SUPPORT && OMPT_OPTIONAL
1868  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1869  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1870  ompt_data_t instance = ompt_data_none;
1871  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1872  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1873  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1874  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1875  ompt_dispatch_taskloop_chunk, instance);
1876  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1877  }
1878 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1879 
1880 #if OMPD_SUPPORT
1881  if (ompd_state & OMPD_ENABLE_BP)
1882  ompd_bp_task_begin();
1883 #endif
1884 
1885 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1886  kmp_uint64 cur_time;
1887  kmp_int32 kmp_itt_count_task =
1888  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1889  current_task->td_flags.tasktype == TASK_IMPLICIT;
1890  if (kmp_itt_count_task) {
1891  thread = __kmp_threads[gtid];
1892  // Time outer level explicit task on barrier for adjusting imbalance time
1893  if (thread->th.th_bar_arrive_time)
1894  cur_time = __itt_get_timestamp();
1895  else
1896  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1897  }
1898  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1899 #endif
1900 
1901 #if ENABLE_LIBOMPTARGET
1902  if (taskdata->td_target_data.async_handle != NULL) {
1903  // If we have a valid target async handle, that means that we have already
1904  // executed the task routine once. We must query for the handle completion
1905  // instead of re-executing the routine.
1906  KMP_ASSERT(tgt_target_nowait_query);
1907  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1908  } else
1909 #endif
1910  if (task->routine != NULL) {
1911 #ifdef KMP_GOMP_COMPAT
1912  if (taskdata->td_flags.native) {
1913  ((void (*)(void *))(*(task->routine)))(task->shareds);
1914  } else
1915 #endif /* KMP_GOMP_COMPAT */
1916  {
1917  (*(task->routine))(gtid, task);
1918  }
1919  }
1920  KMP_POP_PARTITIONED_TIMER();
1921 
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923  if (kmp_itt_count_task) {
1924  // Barrier imbalance - adjust arrive time with the task duration
1925  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1926  }
1927  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1928  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1929 #endif
1930  }
1931 
1932 #if OMPD_SUPPORT
1933  if (ompd_state & OMPD_ENABLE_BP)
1934  ompd_bp_task_end();
1935 #endif
1936 
1937  // Proxy tasks are not handled by the runtime
1938  if (taskdata->td_flags.proxy != TASK_PROXY) {
1939 #if OMPT_SUPPORT
1940  if (UNLIKELY(ompt_enabled.enabled)) {
1941  thread->th.ompt_thread_info = oldInfo;
1942  if (taskdata->td_flags.tiedness == TASK_TIED) {
1943  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1944  }
1945  __kmp_task_finish<true>(gtid, task, current_task);
1946  } else
1947 #endif
1948  __kmp_task_finish<false>(gtid, task, current_task);
1949  }
1950 #if OMPT_SUPPORT
1951  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1952  __ompt_task_finish(task, current_task, ompt_task_switch);
1953  }
1954 #endif
1955 
1956  KA_TRACE(
1957  30,
1958  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1959  gtid, taskdata, current_task));
1960  return;
1961 }
1962 
1963 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1964 //
1965 // loc_ref: location of original task pragma (ignored)
1966 // gtid: Global Thread ID of encountering thread
1967 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1968 // Returns:
1969 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1970 // be resumed later.
1971 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1972 // resumed later.
1973 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1974  kmp_task_t *new_task) {
1975  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1976 
1977  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1978  loc_ref, new_taskdata));
1979 
1980 #if OMPT_SUPPORT
1981  kmp_taskdata_t *parent;
1982  if (UNLIKELY(ompt_enabled.enabled)) {
1983  parent = new_taskdata->td_parent;
1984  if (ompt_enabled.ompt_callback_task_create) {
1985  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1986  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1987  &(new_taskdata->ompt_task_info.task_data),
1988  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1989  OMPT_GET_RETURN_ADDRESS(0));
1990  }
1991  }
1992 #endif
1993 
1994  /* Should we execute the new task or queue it? For now, let's just always try
1995  to queue it. If the queue fills up, then we'll execute it. */
1996 
1997  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1998  { // Execute this task immediately
1999  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2000  new_taskdata->td_flags.task_serial = 1;
2001  __kmp_invoke_task(gtid, new_task, current_task);
2002  }
2003 
2004  KA_TRACE(
2005  10,
2006  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
2007  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
2008  gtid, loc_ref, new_taskdata));
2009 
2010 #if OMPT_SUPPORT
2011  if (UNLIKELY(ompt_enabled.enabled)) {
2012  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2013  }
2014 #endif
2015  return TASK_CURRENT_NOT_QUEUED;
2016 }
2017 
2018 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
2019 //
2020 // gtid: Global Thread ID of encountering thread
2021 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2022 // serialize_immediate: if TRUE then if the task is executed immediately its
2023 // execution will be serialized
2024 // Returns:
2025 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2026 // be resumed later.
2027 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2028 // resumed later.
2029 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2030  bool serialize_immediate) {
2031  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2032 
2033 #if OMPX_TASKGRAPH
2034  if (new_taskdata->is_taskgraph &&
2035  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2036  kmp_tdg_info_t *tdg = new_taskdata->tdg;
2037  // extend the record_map if needed
2038  if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
2039  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2040  // map_size could have been updated by another thread in the case of a
2041  // recursive taskloop
2042  if (new_taskdata->td_task_id >= tdg->map_size) {
2043  kmp_uint old_size = tdg->map_size;
2044  kmp_uint new_size = old_size * 2;
2045  kmp_node_info_t *old_record = tdg->record_map;
2046  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2047  new_size * sizeof(kmp_node_info_t));
2048 
2049  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2050  tdg->record_map = new_record;
2051 
2052  __kmp_free(old_record);
2053 
2054  for (kmp_int i = old_size; i < new_size; i++) {
2055  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2056  __kmp_successors_size * sizeof(kmp_int32));
2057  new_record[i].task = nullptr;
2058  new_record[i].successors = successorsList;
2059  new_record[i].nsuccessors = 0;
2060  new_record[i].npredecessors = 0;
2061  new_record[i].successors_size = __kmp_successors_size;
2062  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2063  }
2064  // update the size at the end so that other threads do not use old_record
2065  // after map_size has been updated
2066  tdg->map_size = new_size;
2067  }
2068  __kmp_release_bootstrap_lock(&tdg->graph_lock);
2069  }
2070  // record a task
2071  if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
2072  tdg->record_map[new_taskdata->td_task_id].task = new_task;
2073  tdg->record_map[new_taskdata->td_task_id].parent_task =
2074  new_taskdata->td_parent;
2075  KMP_ATOMIC_INC(&tdg->num_tasks);
2076  }
2077  }
2078 #endif
2079 
2080  /* Should we execute the new task or queue it? For now, let's just always try
2081  to queue it. If the queue fills up, then we'll execute it. */
2082  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2083  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2084  { // Execute this task immediately
2085  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2086  if (serialize_immediate)
2087  new_taskdata->td_flags.task_serial = 1;
2088  __kmp_invoke_task(gtid, new_task, current_task);
2089  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2090  __kmp_wpolicy_passive) {
2091  kmp_info_t *this_thr = __kmp_threads[gtid];
2092  kmp_team_t *team = this_thr->th.th_team;
2093  kmp_int32 nthreads = this_thr->th.th_team_nproc;
2094  for (int i = 0; i < nthreads; ++i) {
2095  kmp_info_t *thread = team->t.t_threads[i];
2096  if (thread == this_thr)
2097  continue;
2098  if (thread->th.th_sleep_loc != NULL) {
2099  __kmp_null_resume_wrapper(thread);
2100  break; // awake one thread at a time
2101  }
2102  }
2103  }
2104  return TASK_CURRENT_NOT_QUEUED;
2105 }
2106 
2107 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2108 // non-thread-switchable task from the parent thread only!
2109 //
2110 // loc_ref: location of original task pragma (ignored)
2111 // gtid: Global Thread ID of encountering thread
2112 // new_task: non-thread-switchable task thunk allocated by
2113 // __kmp_omp_task_alloc()
2114 // Returns:
2115 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2116 // be resumed later.
2117 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2118 // resumed later.
2119 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2120  kmp_task_t *new_task) {
2121  kmp_int32 res;
2122  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2123 
2124 #if KMP_DEBUG || OMPT_SUPPORT
2125  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2126 #endif
2127  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2128  new_taskdata));
2129  __kmp_assert_valid_gtid(gtid);
2130 
2131 #if OMPT_SUPPORT
2132  kmp_taskdata_t *parent = NULL;
2133  if (UNLIKELY(ompt_enabled.enabled)) {
2134  if (!new_taskdata->td_flags.started) {
2135  OMPT_STORE_RETURN_ADDRESS(gtid);
2136  parent = new_taskdata->td_parent;
2137  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2138  parent->ompt_task_info.frame.enter_frame.ptr =
2139  OMPT_GET_FRAME_ADDRESS(0);
2140  }
2141  if (ompt_enabled.ompt_callback_task_create) {
2142  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2143  &(parent->ompt_task_info.task_data),
2144  &(parent->ompt_task_info.frame),
2145  &(new_taskdata->ompt_task_info.task_data),
2146  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2147  OMPT_LOAD_RETURN_ADDRESS(gtid));
2148  }
2149  } else {
2150  // We are scheduling the continuation of an UNTIED task.
2151  // Scheduling back to the parent task.
2152  __ompt_task_finish(new_task,
2153  new_taskdata->ompt_task_info.scheduling_parent,
2154  ompt_task_switch);
2155  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2156  }
2157  }
2158 #endif
2159 
2160  res = __kmp_omp_task(gtid, new_task, true);
2161 
2162  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2163  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2164  gtid, loc_ref, new_taskdata));
2165 #if OMPT_SUPPORT
2166  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2167  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2168  }
2169 #endif
2170  return res;
2171 }
2172 
2173 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2174 // a taskloop task with the correct OMPT return address
2175 //
2176 // loc_ref: location of original task pragma (ignored)
2177 // gtid: Global Thread ID of encountering thread
2178 // new_task: non-thread-switchable task thunk allocated by
2179 // __kmp_omp_task_alloc()
2180 // codeptr_ra: return address for OMPT callback
2181 // Returns:
2182 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2183 // be resumed later.
2184 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2185 // resumed later.
2186 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2187  kmp_task_t *new_task, void *codeptr_ra) {
2188  kmp_int32 res;
2189  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2190 
2191 #if KMP_DEBUG || OMPT_SUPPORT
2192  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2193 #endif
2194  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2195  new_taskdata));
2196 
2197 #if OMPT_SUPPORT
2198  kmp_taskdata_t *parent = NULL;
2199  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2200  parent = new_taskdata->td_parent;
2201  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2202  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2203  if (ompt_enabled.ompt_callback_task_create) {
2204  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2205  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2206  &(new_taskdata->ompt_task_info.task_data),
2207  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
2208  }
2209  }
2210 #endif
2211 
2212  res = __kmp_omp_task(gtid, new_task, true);
2213 
2214  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2215  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2216  gtid, loc_ref, new_taskdata));
2217 #if OMPT_SUPPORT
2218  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2219  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2220  }
2221 #endif
2222  return res;
2223 }
2224 
2225 template <bool ompt>
2226 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2227  void *frame_address,
2228  void *return_address) {
2229  kmp_taskdata_t *taskdata = nullptr;
2230  kmp_info_t *thread;
2231  int thread_finished = FALSE;
2232  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2233 
2234  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2235  KMP_DEBUG_ASSERT(gtid >= 0);
2236 
2237  if (__kmp_tasking_mode != tskm_immediate_exec) {
2238  thread = __kmp_threads[gtid];
2239  taskdata = thread->th.th_current_task;
2240 
2241 #if OMPT_SUPPORT && OMPT_OPTIONAL
2242  ompt_data_t *my_task_data;
2243  ompt_data_t *my_parallel_data;
2244 
2245  if (ompt) {
2246  my_task_data = &(taskdata->ompt_task_info.task_data);
2247  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2248 
2249  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2250 
2251  if (ompt_enabled.ompt_callback_sync_region) {
2252  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2253  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2254  my_task_data, return_address);
2255  }
2256 
2257  if (ompt_enabled.ompt_callback_sync_region_wait) {
2258  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2259  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2260  my_task_data, return_address);
2261  }
2262  }
2263 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2264 
2265 // Debugger: the taskwait is active. Record the location and the thread that
2266 // encountered the taskwait.
2267 #if USE_ITT_BUILD
2268 // Note: These values are used by ITT events as well.
2269 #endif /* USE_ITT_BUILD */
2270  taskdata->td_taskwait_counter += 1;
2271  taskdata->td_taskwait_ident = loc_ref;
2272  taskdata->td_taskwait_thread = gtid + 1;
2273 
2274 #if USE_ITT_BUILD
2275  void *itt_sync_obj = NULL;
2276 #if USE_ITT_NOTIFY
2277  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2278 #endif /* USE_ITT_NOTIFY */
2279 #endif /* USE_ITT_BUILD */
2280 
2281  bool must_wait =
2282  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2283 
2284  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2285  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2286  // If a hidden helper task has been encountered, we must wait here.
2287  must_wait =
2288  must_wait ||
2289  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2290  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2291 
2292  if (must_wait) {
2293  kmp_flag_32<false, false> flag(
2294  RCAST(std::atomic<kmp_uint32> *,
2295  &(taskdata->td_incomplete_child_tasks)),
2296  0U);
2297  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2298  flag.execute_tasks(thread, gtid, FALSE,
2299  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2300  __kmp_task_stealing_constraint);
2301  }
2302  }
2303 #if USE_ITT_BUILD
2304  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2305  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2306 #endif /* USE_ITT_BUILD */
2307 
2308  // Debugger: The taskwait is completed. Location remains, but thread is
2309  // negated.
2310  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2311 
2312 #if OMPT_SUPPORT && OMPT_OPTIONAL
2313  if (ompt) {
2314  if (ompt_enabled.ompt_callback_sync_region_wait) {
2315  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2316  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2317  my_task_data, return_address);
2318  }
2319  if (ompt_enabled.ompt_callback_sync_region) {
2320  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2321  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2322  my_task_data, return_address);
2323  }
2324  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2325  }
2326 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2327  }
2328 
2329  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2330  "returning TASK_CURRENT_NOT_QUEUED\n",
2331  gtid, taskdata));
2332 
2333  return TASK_CURRENT_NOT_QUEUED;
2334 }
2335 
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337 OMPT_NOINLINE
2338 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2339  void *frame_address,
2340  void *return_address) {
2341  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2342  return_address);
2343 }
2344 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2345 
2346 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2347 // complete
2348 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350  if (UNLIKELY(ompt_enabled.enabled)) {
2351  OMPT_STORE_RETURN_ADDRESS(gtid);
2352  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2353  OMPT_LOAD_RETURN_ADDRESS(gtid));
2354  }
2355 #endif
2356  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2357 }
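// Illustrative sketch (editorial): "#pragma omp taskwait" is typically lowered
// by the compiler to a single call of the form
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// where loc is the compiler-generated source location descriptor.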
2358 
2359 // __kmpc_omp_taskyield: switch to a different task
2360 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2361  kmp_taskdata_t *taskdata = NULL;
2362  kmp_info_t *thread;
2363  int thread_finished = FALSE;
2364 
2365  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2366  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2367 
2368  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2369  gtid, loc_ref, end_part));
2370  __kmp_assert_valid_gtid(gtid);
2371 
2372  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2373  thread = __kmp_threads[gtid];
2374  taskdata = thread->th.th_current_task;
2375 // Should we model this as a task wait or not?
2376 // Debugger: the taskwait is active. Record the location and the thread that
2377 // encountered the taskwait.
2378 #if USE_ITT_BUILD
2379 // Note: These values are used by ITT events as well.
2380 #endif /* USE_ITT_BUILD */
2381  taskdata->td_taskwait_counter += 1;
2382  taskdata->td_taskwait_ident = loc_ref;
2383  taskdata->td_taskwait_thread = gtid + 1;
2384 
2385 #if USE_ITT_BUILD
2386  void *itt_sync_obj = NULL;
2387 #if USE_ITT_NOTIFY
2388  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2389 #endif /* USE_ITT_NOTIFY */
2390 #endif /* USE_ITT_BUILD */
2391  if (!taskdata->td_flags.team_serial) {
2392  kmp_task_team_t *task_team = thread->th.th_task_team;
2393  if (task_team != NULL) {
2394  if (KMP_TASKING_ENABLED(task_team)) {
2395 #if OMPT_SUPPORT
2396  if (UNLIKELY(ompt_enabled.enabled))
2397  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2398 #endif
2399  __kmp_execute_tasks_32(
2400  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2401  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2402  __kmp_task_stealing_constraint);
2403 #if OMPT_SUPPORT
2404  if (UNLIKELY(ompt_enabled.enabled))
2405  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2406 #endif
2407  }
2408  }
2409  }
2410 #if USE_ITT_BUILD
2411  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2412 #endif /* USE_ITT_BUILD */
2413 
2414  // Debugger: The taskwait is completed. Location remains, but thread is
2415  // negated.
2416  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2417  }
2418 
2419  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2420  "returning TASK_CURRENT_NOT_QUEUED\n",
2421  gtid, taskdata));
2422 
2423  return TASK_CURRENT_NOT_QUEUED;
2424 }
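// Illustrative sketch (editorial): "#pragma omp taskyield" is typically lowered
// to
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
// end_part is only used for tracing in this implementation.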
2425 
2426 // Task Reduction implementation
2427 //
2428 // Note: the initial implementation did not account for the possibility of
2429 // specifying omp_orig for the initializer of a UDR (user-defined reduction).
2430 // The corrected implementation takes the omp_orig object into account. The
2431 // compiler is free to use the old implementation if omp_orig is not specified.
2432 
2441 typedef struct kmp_taskred_flags {
2443  unsigned lazy_priv : 1;
2444  unsigned reserved31 : 31;
2445 } kmp_taskred_flags_t;
2446 
2450 typedef struct kmp_task_red_input {
2451  void *reduce_shar;
2452  size_t reduce_size;
2453  // three compiler-generated routines (init, fini are optional):
2454  void *reduce_init;
2455  void *reduce_fini;
2456  void *reduce_comb;
2457  kmp_taskred_flags_t flags; // additional flags from the compiler
2458 } kmp_task_red_input_t;
2459 
2463 typedef struct kmp_taskred_data {
2464  void *reduce_shar;
2465  size_t reduce_size;
2466  kmp_taskred_flags_t flags; // additional flags from the compiler
2467  void *reduce_priv;
2468  void *reduce_pend;
2469  // three compiler-generated routines (init, fini are optional):
2470  void *reduce_comb;
2471  void *reduce_init;
2472  void *reduce_fini;
2473  void *reduce_orig;
2474 } kmp_taskred_data_t;
2475 
2481 typedef struct kmp_taskred_input {
2482  void *reduce_shar;
2483  void *reduce_orig;
2484  size_t reduce_size;
2485  // three compiler-generated routines (init, fini are optional):
2486  void *reduce_init;
2487  void *reduce_fini;
2488  void *reduce_comb;
2489  kmp_taskred_flags_t flags; // additional flags from the compiler
2490 } kmp_taskred_input_t;
2491 
2495 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2496 template <>
2497 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2498  kmp_task_red_input_t &src) {
2499  item.reduce_orig = NULL;
2500 }
2501 template <>
2502 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2503  kmp_taskred_input_t &src) {
2504  if (src.reduce_orig != NULL) {
2505  item.reduce_orig = src.reduce_orig;
2506  } else {
2507  item.reduce_orig = src.reduce_shar;
2508  } // non-NULL reduce_orig means new interface used
2509 }
2510 
2511 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2512 template <>
2513 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2514  size_t offset) {
2515  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2516 }
2517 template <>
2518 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2519  size_t offset) {
2520  ((void (*)(void *, void *))item.reduce_init)(
2521  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2522 }
2523 
2524 template <typename T>
2525 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2526  __kmp_assert_valid_gtid(gtid);
2527  kmp_info_t *thread = __kmp_threads[gtid];
2528  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2529  kmp_uint32 nth = thread->th.th_team_nproc;
2530  kmp_taskred_data_t *arr;
2531 
2532  // check input data just in case
2533  KMP_ASSERT(tg != NULL);
2534  KMP_ASSERT(data != NULL);
2535  KMP_ASSERT(num > 0);
2536  if (nth == 1 && !__kmp_enable_hidden_helper) {
2537  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2538  gtid, tg));
2539  return (void *)tg;
2540  }
2541  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2542  gtid, tg, num));
2543  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2544  thread, num * sizeof(kmp_taskred_data_t));
2545  for (int i = 0; i < num; ++i) {
2546  size_t size = data[i].reduce_size - 1;
2547  // round the size up to a cache-line multiple per thread-specific item
2548  size += CACHE_LINE - size % CACHE_LINE;
2549  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2550  arr[i].reduce_shar = data[i].reduce_shar;
2551  arr[i].reduce_size = size;
2552  arr[i].flags = data[i].flags;
2553  arr[i].reduce_comb = data[i].reduce_comb;
2554  arr[i].reduce_init = data[i].reduce_init;
2555  arr[i].reduce_fini = data[i].reduce_fini;
2556  __kmp_assign_orig<T>(arr[i], data[i]);
2557  if (!arr[i].flags.lazy_priv) {
2558  // allocate cache-line aligned block and fill it with zeros
2559  arr[i].reduce_priv = __kmp_allocate(nth * size);
2560  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2561  if (arr[i].reduce_init != NULL) {
2562  // initialize all thread-specific items
2563  for (size_t j = 0; j < nth; ++j) {
2564  __kmp_call_init<T>(arr[i], j * size);
2565  }
2566  }
2567  } else {
2568  // only allocate space for pointers now,
2569  // objects will be lazily allocated/initialized if/when requested
2570  // note that __kmp_allocate zeroes the allocated memory
2571  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2572  }
2573  }
2574  tg->reduce_data = (void *)arr;
2575  tg->reduce_num_data = num;
2576  return (void *)tg;
2577 }
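// Worked example of the sizing above (editorial, illustrative only): with
// reduce_size = 24 bytes, CACHE_LINE = 64 and nth = 4 threads, the per-item
// size becomes 64 (24 - 1 = 23, rounded up to the next cache-line multiple),
// so reduce_priv points at a zeroed 4 * 64 = 256 byte block and thread tid
// works on the slice starting at (char *)reduce_priv + tid * 64.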
2578 
2593 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2594 #if OMPX_TASKGRAPH
2595  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2596  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2597  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2598  this_tdg->rec_taskred_data =
2599  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2600  this_tdg->rec_num_taskred = num;
2601  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2602  sizeof(kmp_task_red_input_t) * num);
2603  }
2604 #endif
2605  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2606 }
2607 
2620 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2621 #if OMPX_TASKGRAPH
2622  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2623  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2624  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2625  this_tdg->rec_taskred_data =
2626  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2627  this_tdg->rec_num_taskred = num;
2628  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2629  sizeof(kmp_task_red_input_t) * num);
2630  }
2631 #endif
2632  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2633 }
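// Illustrative sketch (editorial, not part of the runtime): for something like
//   #pragma omp taskgroup task_reduction(+ : sum)   // sum is an int
// a compiler might fill one kmp_taskred_input_t per list item and call
// __kmpc_taskred_init right after entering the taskgroup (__kmpc_taskgroup).
// The helper names below are hypothetical compiler-generated routines.
//
//   void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//
//   kmp_taskred_input_t in = {};
//   in.reduce_shar = &sum;
//   in.reduce_orig = &sum;
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)&red_init;
//   in.reduce_fini = NULL;
//   in.reduce_comb = (void *)&red_comb;
//   in.flags.lazy_priv = 0;               // eager per-thread allocation
//   void *tg = __kmpc_taskred_init(gtid, /*num=*/1, &in);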
2634 
2635 // Copy task reduction data (except for shared pointers).
2636 template <typename T>
2637 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2638  kmp_taskgroup_t *tg, void *reduce_data) {
2639  kmp_taskred_data_t *arr;
2640  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2641  " from data %p\n",
2642  thr, tg, reduce_data));
2643  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2644  thr, num * sizeof(kmp_taskred_data_t));
2645  // threads will share private copies, thunk routines, sizes, flags, etc.:
2646  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2647  for (int i = 0; i < num; ++i) {
2648  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2649  }
2650  tg->reduce_data = (void *)arr;
2651  tg->reduce_num_data = num;
2652 }
2653 
2663 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2664  __kmp_assert_valid_gtid(gtid);
2665  kmp_info_t *thread = __kmp_threads[gtid];
2666  kmp_int32 nth = thread->th.th_team_nproc;
2667  if (nth == 1)
2668  return data; // nothing to do
2669 
2670  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2671  if (tg == NULL)
2672  tg = thread->th.th_current_task->td_taskgroup;
2673  KMP_ASSERT(tg != NULL);
2674  kmp_taskred_data_t *arr;
2675  kmp_int32 num;
2676  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2677 
2678 #if OMPX_TASKGRAPH
2679  if ((thread->th.th_current_task->is_taskgraph) &&
2680  (!__kmp_tdg_is_recording(
2681  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2682  tg = thread->th.th_current_task->td_taskgroup;
2683  KMP_ASSERT(tg != NULL);
2684  KMP_ASSERT(tg->reduce_data != NULL);
2685  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2686  num = tg->reduce_num_data;
2687  }
2688 #endif
2689 
2690  KMP_ASSERT(data != NULL);
2691  while (tg != NULL) {
2692  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2693  num = tg->reduce_num_data;
2694  for (int i = 0; i < num; ++i) {
2695  if (!arr[i].flags.lazy_priv) {
2696  if (data == arr[i].reduce_shar ||
2697  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2698  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2699  } else {
2700  // check shared location first
2701  void **p_priv = (void **)(arr[i].reduce_priv);
2702  if (data == arr[i].reduce_shar)
2703  goto found;
2704  // check if we get some thread specific location as parameter
2705  for (int j = 0; j < nth; ++j)
2706  if (data == p_priv[j])
2707  goto found;
2708  continue; // not found, continue search
2709  found:
2710  if (p_priv[tid] == NULL) {
2711  // allocate thread specific object lazily
2712  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2713  if (arr[i].reduce_init != NULL) {
2714  if (arr[i].reduce_orig != NULL) { // new interface
2715  ((void (*)(void *, void *))arr[i].reduce_init)(
2716  p_priv[tid], arr[i].reduce_orig);
2717  } else { // old interface (single parameter)
2718  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2719  }
2720  }
2721  }
2722  return p_priv[tid];
2723  }
2724  }
2725  KMP_ASSERT(tg->parent);
2726  tg = tg->parent;
2727  }
2728  KMP_ASSERT2(0, "Unknown task reduction item");
2729  return NULL; // ERROR, this line never executed
2730 }
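// Illustrative sketch (editorial): inside a participating task body the
// compiler fetches the current thread's private copy and accumulates into it
// instead of the shared variable:
//
//   int *p = (int *)__kmpc_task_reduction_get_th_data(
//       __kmpc_global_thread_num(NULL), /*tskgrp=*/NULL, /*data=*/&sum);
//   *p += local_contribution;   // local_contribution is a placeholder name
//
// Passing NULL for tskgrp starts the lookup from the current task's innermost
// taskgroup, as handled above.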
2731 
2732 // Finalize task reduction.
2733 // Called from __kmpc_end_taskgroup()
2734 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2735  kmp_int32 nth = th->th.th_team_nproc;
2736  KMP_DEBUG_ASSERT(
2737  nth > 1 ||
2738  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2739  // are using hidden helper threads
2740  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2741  kmp_int32 num = tg->reduce_num_data;
2742  for (int i = 0; i < num; ++i) {
2743  void *sh_data = arr[i].reduce_shar;
2744  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2745  void (*f_comb)(void *, void *) =
2746  (void (*)(void *, void *))(arr[i].reduce_comb);
2747  if (!arr[i].flags.lazy_priv) {
2748  void *pr_data = arr[i].reduce_priv;
2749  size_t size = arr[i].reduce_size;
2750  for (int j = 0; j < nth; ++j) {
2751  void *priv_data = (char *)pr_data + j * size;
2752  f_comb(sh_data, priv_data); // combine results
2753  if (f_fini)
2754  f_fini(priv_data); // finalize if needed
2755  }
2756  } else {
2757  void **pr_data = (void **)(arr[i].reduce_priv);
2758  for (int j = 0; j < nth; ++j) {
2759  if (pr_data[j] != NULL) {
2760  f_comb(sh_data, pr_data[j]); // combine results
2761  if (f_fini)
2762  f_fini(pr_data[j]); // finalize if needed
2763  __kmp_free(pr_data[j]);
2764  }
2765  }
2766  }
2767  __kmp_free(arr[i].reduce_priv);
2768  }
2769  __kmp_thread_free(th, arr);
2770  tg->reduce_data = NULL;
2771  tg->reduce_num_data = 0;
2772 }
2773 
2774 // Cleanup task reduction data for parallel or worksharing,
2775 // do not touch task-private data that other threads are still working with.
2776 // Called from __kmpc_end_taskgroup()
2777 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2778  __kmp_thread_free(th, tg->reduce_data);
2779  tg->reduce_data = NULL;
2780  tg->reduce_num_data = 0;
2781 }
2782 
2783 template <typename T>
2784 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2785  int num, T *data) {
2786  __kmp_assert_valid_gtid(gtid);
2787  kmp_info_t *thr = __kmp_threads[gtid];
2788  kmp_int32 nth = thr->th.th_team_nproc;
2789  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2790  if (nth == 1) {
2791  KA_TRACE(10,
2792  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2793  gtid, thr->th.th_current_task->td_taskgroup));
2794  return (void *)thr->th.th_current_task->td_taskgroup;
2795  }
2796  kmp_team_t *team = thr->th.th_team;
2797  void *reduce_data;
2798  kmp_taskgroup_t *tg;
2799  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2800  if (reduce_data == NULL &&
2801  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2802  (void *)1)) {
2803  // single thread enters this block to initialize common reduction data
2804  KMP_DEBUG_ASSERT(reduce_data == NULL);
2805  // first initialize own data, then make a copy other threads can use
2806  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2807  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2808  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2809  // fini counters should be 0 at this point
2810  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2811  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2812  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2813  } else {
2814  while (
2815  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2816  (void *)1) { // wait for task reduction initialization
2817  KMP_CPU_PAUSE();
2818  }
2819  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2820  tg = thr->th.th_current_task->td_taskgroup;
2821  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2822  }
2823  return tg;
2824 }
2825 
2842 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2843  int num, void *data) {
2844  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2845  (kmp_task_red_input_t *)data);
2846 }
2847 
2862 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2863  void *data) {
2864  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2865  (kmp_taskred_input_t *)data);
2866 }
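// Illustrative sketch (editorial): for "#pragma omp parallel reduction(task, + : sum)"
// each thread of the parallel region would call, with inputs like those shown
// for __kmpc_taskred_init above,
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/0, 1, &in);
// and pair it with __kmpc_task_reduction_modifier_fini(&loc, gtid, 0) at the
// end of the region; is_ws selects the worksharing (1) vs. parallel (0)
// bookkeeping slot in the team structure.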
2867 
2876 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2877  __kmpc_end_taskgroup(loc, gtid);
2878 }
2879 
2880 // __kmpc_taskgroup: Start a new taskgroup
2881 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2882  __kmp_assert_valid_gtid(gtid);
2883  kmp_info_t *thread = __kmp_threads[gtid];
2884  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2885  kmp_taskgroup_t *tg_new =
2886  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2887  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2888  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2889  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2890  tg_new->parent = taskdata->td_taskgroup;
2891  tg_new->reduce_data = NULL;
2892  tg_new->reduce_num_data = 0;
2893  tg_new->gomp_data = NULL;
2894  taskdata->td_taskgroup = tg_new;
2895 
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2898  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2899  if (!codeptr)
2900  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2901  kmp_team_t *team = thread->th.th_team;
2902  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2903  // FIXME: I think this is wrong for lwt!
2904  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2905 
2906  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2907  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2908  &(my_task_data), codeptr);
2909  }
2910 #endif
2911 }
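// Illustrative sketch (editorial): the construct
//   #pragma omp taskgroup
//   { /* ...child tasks... */ }
// is typically lowered to a bracketing pair around the generated tasks:
//   __kmpc_taskgroup(&loc, gtid);
//   // ...__kmpc_omp_task_alloc / __kmpc_omp_task for each child task...
//   __kmpc_end_taskgroup(&loc, gtid);   // waits for all descendants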
2912 
2913 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2914 // and its descendants are complete
2915 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2916  __kmp_assert_valid_gtid(gtid);
2917  kmp_info_t *thread = __kmp_threads[gtid];
2918  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2919  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2920  int thread_finished = FALSE;
2921 
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923  kmp_team_t *team;
2924  ompt_data_t my_task_data;
2925  ompt_data_t my_parallel_data;
2926  void *codeptr = nullptr;
2927  if (UNLIKELY(ompt_enabled.enabled)) {
2928  team = thread->th.th_team;
2929  my_task_data = taskdata->ompt_task_info.task_data;
2930  // FIXME: I think this is wrong for lwt!
2931  my_parallel_data = team->t.ompt_team_info.parallel_data;
2932  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2933  if (!codeptr)
2934  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2935  }
2936 #endif
2937 
2938  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2939  KMP_DEBUG_ASSERT(taskgroup != NULL);
2940  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2941 
2942  if (__kmp_tasking_mode != tskm_immediate_exec) {
2943  // mark task as waiting not on a barrier
2944  taskdata->td_taskwait_counter += 1;
2945  taskdata->td_taskwait_ident = loc;
2946  taskdata->td_taskwait_thread = gtid + 1;
2947 #if USE_ITT_BUILD
2948  // For ITT the taskgroup wait is similar to taskwait until we need to
2949  // distinguish them
2950  void *itt_sync_obj = NULL;
2951 #if USE_ITT_NOTIFY
2952  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2953 #endif /* USE_ITT_NOTIFY */
2954 #endif /* USE_ITT_BUILD */
2955 
2956 #if OMPT_SUPPORT && OMPT_OPTIONAL
2957  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2958  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2959  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2960  &(my_task_data), codeptr);
2961  }
2962 #endif
2963 
2964  if (!taskdata->td_flags.team_serial ||
2965  (thread->th.th_task_team != NULL &&
2966  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2967  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2968  kmp_flag_32<false, false> flag(
2969  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2970  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2971  flag.execute_tasks(thread, gtid, FALSE,
2972  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2973  __kmp_task_stealing_constraint);
2974  }
2975  }
2976  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2977 
2978 #if OMPT_SUPPORT && OMPT_OPTIONAL
2979  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2980  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2981  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2982  &(my_task_data), codeptr);
2983  }
2984 #endif
2985 
2986 #if USE_ITT_BUILD
2987  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2988  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2989 #endif /* USE_ITT_BUILD */
2990  }
2991  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2992 
2993  if (taskgroup->reduce_data != NULL &&
2994  !taskgroup->gomp_data) { // need to reduce?
2995  int cnt;
2996  void *reduce_data;
2997  kmp_team_t *t = thread->th.th_team;
2998  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2999  // check if <priv> data of the first reduction variable is shared by the team
3000  void *priv0 = arr[0].reduce_priv;
3001  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
3002  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3003  // finishing task reduction on parallel
3004  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
3005  if (cnt == thread->th.th_team_nproc - 1) {
3006  // we are the last thread passing __kmpc_reduction_modifier_fini()
3007  // finalize task reduction:
3008  __kmp_task_reduction_fini(thread, taskgroup);
3009  // cleanup fields in the team structure:
3010  // TODO: is relaxed store enough here (whole barrier should follow)?
3011  __kmp_thread_free(thread, reduce_data);
3012  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
3013  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
3014  } else {
3015  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3016  // so do not finalize reduction, just clean own copy of the data
3017  __kmp_task_reduction_clean(thread, taskgroup);
3018  }
3019  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
3020  NULL &&
3021  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3022  // finishing task reduction on worksharing
3023  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3024  if (cnt == thread->th.th_team_nproc - 1) {
3025  // we are the last thread passing __kmpc_reduction_modifier_fini()
3026  __kmp_task_reduction_fini(thread, taskgroup);
3027  // cleanup fields in team structure:
3028  // TODO: is relaxed store enough here (whole barrier should follow)?
3029  __kmp_thread_free(thread, reduce_data);
3030  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3031  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3032  } else {
3033  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3034  // so do not finalize reduction, just clean own copy of the data
3035  __kmp_task_reduction_clean(thread, taskgroup);
3036  }
3037  } else {
3038  // finishing task reduction on taskgroup
3039  __kmp_task_reduction_fini(thread, taskgroup);
3040  }
3041  }
3042  // Restore parent taskgroup for the current task
3043  taskdata->td_taskgroup = taskgroup->parent;
3044  __kmp_thread_free(thread, taskgroup);
3045 
3046  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3047  gtid, taskdata));
3048 
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3051  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3052  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3053  &(my_task_data), codeptr);
3054  }
3055 #endif
3056 }
3057 
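// Note on the reduction finalization above (descriptive, not normative):
// taskgroup->reduce_data is an array of kmp_taskred_data_t descriptors set up
// by the task-reduction initialization entry points. When the same descriptor
// array has also been published in the team (t_tg_reduce_data[0] for a
// reduction modifier on the parallel construct, t_tg_reduce_data[1] for one on
// a worksharing construct, judging by the comments above), the last thread
// through the path frees the shared array and resets the fini counter, while
// earlier threads only clean up their private copies via
// __kmp_task_reduction_clean().
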
3058 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3059  kmp_task_team_t *task_team,
3060  kmp_int32 is_constrained) {
3061  kmp_task_t *task = NULL;
3062  kmp_taskdata_t *taskdata;
3063  kmp_taskdata_t *current;
3064  kmp_thread_data_t *thread_data;
3065  int ntasks = task_team->tt.tt_num_task_pri;
3066  if (ntasks == 0) {
3067  KA_TRACE(
3068  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3069  return NULL;
3070  }
3071  do {
3072  // decrement num_tasks to "reserve" one task for execution
3073  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3074  ntasks - 1))
3075  break;
3076  ntasks = task_team->tt.tt_num_task_pri;
3077  } while (ntasks > 0);
3078  if (ntasks == 0) {
3079  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3080  __kmp_get_gtid()));
3081  return NULL;
3082  }
3083  // We got a "ticket" to get a "reserved" priority task
3084  int deque_ntasks;
3085  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3086  do {
3087  KMP_ASSERT(list != NULL);
3088  thread_data = &list->td;
3089  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3090  deque_ntasks = thread_data->td.td_deque_ntasks;
3091  if (deque_ntasks == 0) {
3092  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3093  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3094  __kmp_get_gtid(), thread_data));
3095  list = list->next;
3096  }
3097  } while (deque_ntasks == 0);
3098  KMP_DEBUG_ASSERT(deque_ntasks);
3099  int target = thread_data->td.td_deque_head;
3100  current = __kmp_threads[gtid]->th.th_current_task;
3101  taskdata = thread_data->td.td_deque[target];
3102  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3103  // Bump head pointer and wrap.
3104  thread_data->td.td_deque_head =
3105  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3106  } else {
3107  if (!task_team->tt.tt_untied_task_encountered) {
3108  // The TSC does not allow stealing the victim task
3109  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3110  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3111  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3112  gtid, thread_data, task_team, deque_ntasks, target,
3113  thread_data->td.td_deque_tail));
3114  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3115  return NULL;
3116  }
3117  int i;
3118  // walk through the deque trying to steal any task
3119  taskdata = NULL;
3120  for (i = 1; i < deque_ntasks; ++i) {
3121  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3122  taskdata = thread_data->td.td_deque[target];
3123  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3124  break; // found task to execute
3125  } else {
3126  taskdata = NULL;
3127  }
3128  }
3129  if (taskdata == NULL) {
3130  // No appropriate candidate found to execute
3131  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3132  KA_TRACE(
3133  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3134  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3135  gtid, thread_data, task_team, deque_ntasks,
3136  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3137  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3138  return NULL;
3139  }
3140  int prev = target;
3141  for (i = i + 1; i < deque_ntasks; ++i) {
3142  // shift remaining tasks in the deque left by 1
3143  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3144  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3145  prev = target;
3146  }
3147  KMP_DEBUG_ASSERT(
3148  thread_data->td.td_deque_tail ==
3149  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3150  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3151  }
3152  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3153  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3154  task = KMP_TASKDATA_TO_TASK(taskdata);
3155  return task;
3156 }
3157 
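// Note (descriptive, based on the code above): tt_num_task_pri acts as a
// ticket counter for the per-priority deques in tt_task_pri_list. The CAS loop
// at the top "reserves" one task by decrementing the counter before any deque
// lock is taken; if the reservation later turns out to be unusable because the
// task scheduling constraint rejects every candidate in the chosen deque, the
// counter is restored with an atomic increment so another thread can claim the
// task instead.
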
3158 // __kmp_remove_my_task: remove a task from my own deque
3159 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3160  kmp_task_team_t *task_team,
3161  kmp_int32 is_constrained) {
3162  kmp_task_t *task;
3163  kmp_taskdata_t *taskdata;
3164  kmp_thread_data_t *thread_data;
3165  kmp_uint32 tail;
3166 
3167  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3168  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3169  NULL); // Caller should check this condition
3170 
3171  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3172 
3173  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3174  gtid, thread_data->td.td_deque_ntasks,
3175  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3176 
3177  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3178  KA_TRACE(10,
3179  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3180  "ntasks=%d head=%u tail=%u\n",
3181  gtid, thread_data->td.td_deque_ntasks,
3182  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3183  return NULL;
3184  }
3185 
3186  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3187 
3188  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3189  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3190  KA_TRACE(10,
3191  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3192  "ntasks=%d head=%u tail=%u\n",
3193  gtid, thread_data->td.td_deque_ntasks,
3194  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3195  return NULL;
3196  }
3197 
3198  tail = (thread_data->td.td_deque_tail - 1) &
3199  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3200  taskdata = thread_data->td.td_deque[tail];
3201 
3202  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3203  thread->th.th_current_task)) {
3204  // The TSC does not allow stealing the victim task
3205  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3206  KA_TRACE(10,
3207  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3208  "ntasks=%d head=%u tail=%u\n",
3209  gtid, thread_data->td.td_deque_ntasks,
3210  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3211  return NULL;
3212  }
3213 
3214  thread_data->td.td_deque_tail = tail;
3215  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3216 
3217  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3218 
3219  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3220  "ntasks=%d head=%u tail=%u\n",
3221  gtid, taskdata, thread_data->td.td_deque_ntasks,
3222  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3223 
3224  task = KMP_TASKDATA_TO_TASK(taskdata);
3225  return task;
3226 }
3227 
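// Note on the deque discipline (illustrative): a thread removes its own work
// from the tail of its deque (most recently pushed task first), while
// __kmp_steal_task() below takes from the victim's head, so the owner works on
// the cache-warm end and thieves take the oldest entries. The index arithmetic
// assumes the deque size is a power of two, so TASK_DEQUE_MASK(td) can wrap an
// index, e.g.:
//
//   tail = (tail - 1) & TASK_DEQUE_MASK(td);  // owner pop, wraps 0 -> size-1
//   head = (head + 1) & TASK_DEQUE_MASK(td);  // thief pop
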
3228 // __kmp_steal_task: remove a task from another thread's deque
3229 // Assume that the calling thread has already checked the existence of the
3230 // task_team's thread_data before calling this routine.
3231 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3232  kmp_task_team_t *task_team,
3233  std::atomic<kmp_int32> *unfinished_threads,
3234  int *thread_finished,
3235  kmp_int32 is_constrained) {
3236  kmp_task_t *task;
3237  kmp_taskdata_t *taskdata;
3238  kmp_taskdata_t *current;
3239  kmp_thread_data_t *victim_td, *threads_data;
3240  kmp_int32 target;
3241  kmp_info_t *victim_thr;
3242 
3243  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3244 
3245  threads_data = task_team->tt.tt_threads_data;
3246  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3247  KMP_DEBUG_ASSERT(victim_tid >= 0);
3248  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_nproc);
3249 
3250  victim_td = &threads_data[victim_tid];
3251  victim_thr = victim_td->td.td_thr;
3252  (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3253 
3254  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3255  "task_team=%p ntasks=%d head=%u tail=%u\n",
3256  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3257  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3258  victim_td->td.td_deque_tail));
3259 
3260  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3261  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3262  "task_team=%p ntasks=%d head=%u tail=%u\n",
3263  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3264  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3265  victim_td->td.td_deque_tail));
3266  return NULL;
3267  }
3268 
3269  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3270 
3271  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3272  // Check again after we acquire the lock
3273  if (ntasks == 0) {
3274  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3275  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3276  "task_team=%p ntasks=%d head=%u tail=%u\n",
3277  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3278  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3279  return NULL;
3280  }
3281 
3282  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3283  current = __kmp_threads[gtid]->th.th_current_task;
3284  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3285  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3286  // Bump head pointer and wrap.
3287  victim_td->td.td_deque_head =
3288  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3289  } else {
3290  if (!task_team->tt.tt_untied_task_encountered) {
3291  // The TSC does not allow stealing the victim task
3292  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3293  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3294  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3295  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3296  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3297  return NULL;
3298  }
3299  int i;
3300  // walk through victim's deque trying to steal any task
3301  target = victim_td->td.td_deque_head;
3302  taskdata = NULL;
3303  for (i = 1; i < ntasks; ++i) {
3304  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3305  taskdata = victim_td->td.td_deque[target];
3306  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3307  break; // found victim task
3308  } else {
3309  taskdata = NULL;
3310  }
3311  }
3312  if (taskdata == NULL) {
3313  // No appropriate candidate to steal found
3314  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3315  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3316  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3317  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3318  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3319  return NULL;
3320  }
3321  int prev = target;
3322  for (i = i + 1; i < ntasks; ++i) {
3323  // shift remaining tasks in the deque left by 1
3324  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3325  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3326  prev = target;
3327  }
3328  KMP_DEBUG_ASSERT(
3329  victim_td->td.td_deque_tail ==
3330  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3331  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3332  }
3333  if (*thread_finished) {
3334  // We need to un-mark this victim as a finished victim. This must be done
3335  // before releasing the lock, or else other threads (starting with the
3336  // primary thread victim) might be prematurely released from the barrier!!!
3337 #if KMP_DEBUG
3338  kmp_int32 count =
3339 #endif
3340  KMP_ATOMIC_INC(unfinished_threads);
3341  KA_TRACE(
3342  20,
3343  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3344  gtid, count + 1, task_team));
3345  *thread_finished = FALSE;
3346  }
3347  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3348 
3349  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3350 
3351  KMP_COUNT_BLOCK(TASK_stolen);
3352  KA_TRACE(10,
3353  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3354  "task_team=%p ntasks=%d head=%u tail=%u\n",
3355  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3356  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3357 
3358  task = KMP_TASKDATA_TO_TASK(taskdata);
3359  return task;
3360 }
3361 
3362 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3363 // condition is satisfied (return true) or there are none left (return false).
3364 //
3365 // final_spin is TRUE if this is the spin at the release barrier.
3366 // thread_finished indicates whether the thread is finished executing all
3367 // the tasks it has on its deque, and is at the release barrier.
3368 // spinner is the location on which to spin.
3369 // spinner == NULL means only execute a single task and return.
3370 // checker is the value to check to terminate the spin.
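// Illustrative caller pattern (a sketch, mirroring __kmpc_end_taskgroup above):
// a waiter typically wraps the location it is spinning on in a flag object and
// lets the flag drive this template through its execute_tasks() member, e.g.
//
//   kmp_flag_32<false, false> flag(
//       RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
//   while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0)
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
//
// The explicit wrappers below (__kmp_execute_tasks_32/64/oncore) are what the
// flag classes call into.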
3371 template <class C>
3372 static inline int __kmp_execute_tasks_template(
3373  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3374  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3375  kmp_int32 is_constrained) {
3376  kmp_task_team_t *task_team = thread->th.th_task_team;
3377  kmp_thread_data_t *threads_data;
3378  kmp_task_t *task;
3379  kmp_info_t *other_thread;
3380  kmp_taskdata_t *current_task = thread->th.th_current_task;
3381  std::atomic<kmp_int32> *unfinished_threads;
3382  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3383  tid = thread->th.th_info.ds.ds_tid;
3384 
3385  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3386  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3387 
3388  if (task_team == NULL || current_task == NULL)
3389  return FALSE;
3390 
3391  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3392  "*thread_finished=%d\n",
3393  gtid, final_spin, *thread_finished));
3394 
3395  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3396  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3397 
3398  KMP_DEBUG_ASSERT(threads_data != NULL);
3399 
3400  nthreads = task_team->tt.tt_nproc;
3401  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3402  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3403 
3404  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3405  // getting tasks from target constructs
3406  while (1) { // Inner loop to find a task and execute it
3407  task = NULL;
3408  if (task_team->tt.tt_num_task_pri) { // get priority task first
3409  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3410  }
3411  if (task == NULL && use_own_tasks) { // check own queue next
3412  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3413  }
3414  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3415  int asleep = 1;
3416  use_own_tasks = 0;
3417  // Try to steal from the last place I stole from successfully.
3418  if (victim_tid == -2) { // haven't stolen anything yet
3419  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3420  if (victim_tid !=
3421  -1) // if we have a last stolen from victim, get the thread
3422  other_thread = threads_data[victim_tid].td.td_thr;
3423  }
3424  if (victim_tid != -1) { // found last victim
3425  asleep = 0;
3426  } else if (!new_victim) { // no recent steals and we haven't already
3427  // used a new victim; select a random thread
3428  do { // Find a different thread to steal work from.
3429  // Pick a random thread. Initial plan was to cycle through all the
3430  // threads, and only return if we tried to steal from every thread,
3431  // and failed. Arch says that's not such a great idea.
3432  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3433  if (victim_tid >= tid) {
3434  ++victim_tid; // Adjusts random distribution to exclude self
3435  }
3436  // Found a potential victim
3437  other_thread = threads_data[victim_tid].td.td_thr;
3438  // There is a slight chance that __kmp_enable_tasking() did not wake
3439  // up all threads waiting at the barrier. If victim is sleeping,
3440  // then wake it up. Since we were going to pay the cache miss
3441  // penalty for referencing another thread's kmp_info_t struct
3442  // anyway, the check shouldn't cost too much performance
3443  // at this point. In extra barrier mode, threads do not
3444  // sleep at the separate tasking barrier, so this isn't
3445  // a problem.
3446  asleep = 0;
3447  if ((__kmp_tasking_mode == tskm_task_teams) &&
3448  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3449  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3450  NULL)) {
3451  asleep = 1;
3452  __kmp_null_resume_wrapper(other_thread);
3453  // A sleeping thread should not have any tasks on its queue.
3454  // There is a slight possibility that it resumes, steals a task
3455  // from another thread, which spawns more tasks, all in the time
3456  // that it takes this thread to check => don't write an assertion
3457  // that the victim's queue is empty. Try stealing from a
3458  // different thread.
3459  }
3460  } while (asleep);
3461  }
3462 
3463  if (!asleep) {
3464  // We have a victim to try to steal from
3465  task =
3466  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3467  thread_finished, is_constrained);
3468  }
3469  if (task != NULL) { // set last stolen to victim
3470  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3471  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3472  // The pre-refactored code did not try more than 1 successful new
3473  // victim, unless the last one generated more local tasks;
3474  // new_victim keeps track of this
3475  new_victim = 1;
3476  }
3477  } else { // No tasks found; unset last_stolen
3478  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3479  victim_tid = -2; // no successful victim found
3480  }
3481  }
3482 
3483  if (task == NULL)
3484  break; // break out of tasking loop
3485 
3486 // Found a task; execute it
3487 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3488  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3489  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3490  // get the object reliably
3491  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3492  }
3493  __kmp_itt_task_starting(itt_sync_obj);
3494  }
3495 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3496  __kmp_invoke_task(gtid, task, current_task);
3497 #if USE_ITT_BUILD
3498  if (itt_sync_obj != NULL)
3499  __kmp_itt_task_finished(itt_sync_obj);
3500 #endif /* USE_ITT_BUILD */
3501  // If this thread is only partway through the barrier and the condition is
3502  // met, then return now, so that the barrier gather/release pattern can
3503  // proceed. If this thread is in the last spin loop in the barrier,
3504  // waiting to be released, we know that the termination condition will not
3505  // be satisfied, so don't waste any cycles checking it.
3506  if (flag == NULL || (!final_spin && flag->done_check())) {
3507  KA_TRACE(
3508  15,
3509  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3510  gtid));
3511  return TRUE;
3512  }
3513  if (thread->th.th_task_team == NULL) {
3514  break;
3515  }
3516  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3517  // If execution of a stolen task results in more tasks being placed on our
3518  // run queue, reset use_own_tasks
3519  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3520  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3521  "other tasks, restart\n",
3522  gtid));
3523  use_own_tasks = 1;
3524  new_victim = 0;
3525  }
3526  }
3527 
3528  // The task source has been exhausted. If in final spin loop of barrier,
3529  // check if termination condition is satisfied. The work queue may be empty
3530  // but there might be proxy tasks still executing.
3531  if (final_spin &&
3532  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3533  // First, decrement the #unfinished threads, if that has not already been
3534  // done. This decrement might be to the spin location, and result in the
3535  // termination condition being satisfied.
3536  if (!*thread_finished) {
3537 #if KMP_DEBUG
3538  kmp_int32 count = -1 +
3539 #endif
3540  KMP_ATOMIC_DEC(unfinished_threads);
3541  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3542  "unfinished_threads to %d task_team=%p\n",
3543  gtid, count, task_team));
3544  *thread_finished = TRUE;
3545  }
3546 
3547  // It is now unsafe to reference thread->th.th_team !!!
3548  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3549  // thread to pass through the barrier, where it might reset each thread's
3550  // th.th_team field for the next parallel region. If we can steal more
3551  // work, we know that this has not happened yet.
3552  if (flag != NULL && flag->done_check()) {
3553  KA_TRACE(
3554  15,
3555  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3556  gtid));
3557  return TRUE;
3558  }
3559  }
3560 
3561  // If this thread's task team is NULL, primary thread has recognized that
3562  // there are no more tasks; bail out
3563  if (thread->th.th_task_team == NULL) {
3564  KA_TRACE(15,
3565  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3566  return FALSE;
3567  }
3568 
3569  // Check the flag again in case it has already been satisfied, to avoid being
3570  // trapped in an infinite loop when an if0 task depends on a hidden helper
3571  // task outside any parallel region. Detached tasks are not affected here
3572  // because the only thread executing this function also has to execute the
3573  // proxy task, and that code path performs the same check.
3574  if (flag == NULL || (!final_spin && flag->done_check())) {
3575  KA_TRACE(15,
3576  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3577  gtid));
3578  return TRUE;
3579  }
3580 
3581  // We could be getting tasks from target constructs; if this is the only
3582  // thread, keep trying to execute tasks from own queue
3583  if (nthreads == 1 &&
3584  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3585  use_own_tasks = 1;
3586  else {
3587  KA_TRACE(15,
3588  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3589  return FALSE;
3590  }
3591  }
3592 }
3593 
3594 template <bool C, bool S>
3595 int __kmp_execute_tasks_32(
3596  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3597  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3598  kmp_int32 is_constrained) {
3599  return __kmp_execute_tasks_template(
3600  thread, gtid, flag, final_spin,
3601  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3602 }
3603 
3604 template <bool C, bool S>
3605 int __kmp_execute_tasks_64(
3606  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3607  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3608  kmp_int32 is_constrained) {
3609  return __kmp_execute_tasks_template(
3610  thread, gtid, flag, final_spin,
3611  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3612 }
3613 
3614 template <bool C, bool S>
3615 int __kmp_atomic_execute_tasks_64(
3616  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3617  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3618  kmp_int32 is_constrained) {
3619  return __kmp_execute_tasks_template(
3620  thread, gtid, flag, final_spin,
3621  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3622 }
3623 
3624 int __kmp_execute_tasks_oncore(
3625  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3626  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3627  kmp_int32 is_constrained) {
3628  return __kmp_execute_tasks_template(
3629  thread, gtid, flag, final_spin,
3630  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3631 }
3632 
3633 template int
3634 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3635  kmp_flag_32<false, false> *, int,
3636  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3637 
3638 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3639  kmp_flag_64<false, true> *,
3640  int,
3641  int *USE_ITT_BUILD_ARG(void *),
3642  kmp_int32);
3643 
3644 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3645  kmp_flag_64<true, false> *,
3646  int,
3647  int *USE_ITT_BUILD_ARG(void *),
3648  kmp_int32);
3649 
3650 template int __kmp_atomic_execute_tasks_64<false, true>(
3651  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3652  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3653 
3654 template int __kmp_atomic_execute_tasks_64<true, false>(
3655  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3656  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3657 
3658 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3659 // next barrier so they can assist in executing enqueued tasks.
3660 // First thread in allocates the task team atomically.
3661 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3662  kmp_info_t *this_thr) {
3663  kmp_thread_data_t *threads_data;
3664  int nthreads, i, is_init_thread;
3665 
3666  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3667  __kmp_gtid_from_thread(this_thr)));
3668 
3669  KMP_DEBUG_ASSERT(task_team != NULL);
3670  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3671 
3672  nthreads = task_team->tt.tt_nproc;
3673  KMP_DEBUG_ASSERT(nthreads > 0);
3674  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3675 
3676  // Allocate or increase the size of threads_data if necessary
3677  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3678 
3679  if (!is_init_thread) {
3680  // Some other thread already set up the array.
3681  KA_TRACE(
3682  20,
3683  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3684  __kmp_gtid_from_thread(this_thr)));
3685  return;
3686  }
3687  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3688  KMP_DEBUG_ASSERT(threads_data != NULL);
3689 
3690  if (__kmp_tasking_mode == tskm_task_teams &&
3691  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3692  // Release any threads sleeping at the barrier, so that they can steal
3693  // tasks and execute them. In extra barrier mode, threads do not sleep
3694  // at the separate tasking barrier, so this isn't a problem.
3695  for (i = 0; i < nthreads; i++) {
3696  void *sleep_loc;
3697  kmp_info_t *thread = threads_data[i].td.td_thr;
3698 
3699  if (i == this_thr->th.th_info.ds.ds_tid) {
3700  continue;
3701  }
3702  // Since we haven't locked the thread's suspend mutex at this
3703  // point, there is a small window where a thread might be putting
3704  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3705  // To work around this, __kmp_execute_tasks_template() periodically checks
3706  // to see if other threads are sleeping (using the same random mechanism that
3707  // is used for task stealing) and awakens them if they are.
3708  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3709  NULL) {
3710  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3711  __kmp_gtid_from_thread(this_thr),
3712  __kmp_gtid_from_thread(thread)));
3713  __kmp_null_resume_wrapper(thread);
3714  } else {
3715  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3716  __kmp_gtid_from_thread(this_thr),
3717  __kmp_gtid_from_thread(thread)));
3718  }
3719  }
3720  }
3721 
3722  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3723  __kmp_gtid_from_thread(this_thr)));
3724 }
3725 
3726 /* // TODO: Check the comment consistency
3727  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3728  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3729  * After a child thread checks into a barrier and calls __kmp_release() from
3730  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3731  * longer assume that the kmp_team_t structure is intact (at any moment, the
3732  * primary thread may exit the barrier code and free the team data structure,
3733  * and return the threads to the thread pool).
3734  *
3735  * This does not work with the tasking code, as the thread is still
3736  * expected to participate in the execution of any tasks that may have been
3737  * spawned by a member of the team, and the thread still needs access to
3738  * each thread in the team, so that it can steal work from it.
3739  *
3740  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3741  * counting mechanism, and is allocated by the primary thread before calling
3742  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3743  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3744  * of the kmp_task_team_t structs for consecutive barriers can overlap
3745  * (and will, unless the primary thread is the last thread to exit the barrier
3746  * release phase, which is not typical). Such a struct could also be
3747  * useful outside the context of tasking.
3748  *
3749  * We currently use the existence of the threads array as an indicator that
3750  * tasks were spawned since the last barrier. If the structure is to be
3751  * useful outside the context of tasking, then this will have to change, but
3752  * not setting the field minimizes the performance impact of tasking on
3753  * barriers, when no explicit tasks were spawned (pushed, actually).
3754  */
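// Illustrative summary of the lifetime scheme described above (a sketch, not
// normative): each kmp_team_t carries two task-team slots, t_task_team[0..1],
// and every thread records which slot it is currently using in th_task_state.
// After a barrier release, __kmp_task_team_sync() (below) flips the parity and
// re-reads the pointer:
//
//   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
//   TCW_PTR(this_thr->th.th_task_team,
//           team->t.t_task_team[this_thr->th.th_task_state]);
//
// so one slot can still be drained by late threads while the other is being
// set up for the next region (see __kmp_task_team_setup() below).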
3755 
3756 static kmp_task_team_t *__kmp_free_task_teams =
3757  NULL; // Free list for task_team data structures
3758 // Lock for task team data structures
3759 kmp_bootstrap_lock_t __kmp_task_team_lock =
3760  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3761 
3762 // __kmp_alloc_task_deque:
3763 // Allocates a task deque for a particular thread, and initializes the necessary
3764 // data structures relating to the deque. This only happens once per thread
3765 // per task team since task teams are recycled. No lock is needed during
3766 // allocation since each thread allocates its own deque.
3767 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3768  kmp_thread_data_t *thread_data) {
3769  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3770  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3771 
3772  // Initialize last stolen task field to "none"
3773  thread_data->td.td_deque_last_stolen = -1;
3774 
3775  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3776  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3777  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3778 
3779  KE_TRACE(
3780  10,
3781  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3782  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3783  // Allocate space for task deque, and zero the deque
3784  // Cannot use __kmp_thread_calloc() because threads not around for
3785  // kmp_reap_task_team( ).
3786  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3787  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3788  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3789 }
3790 
3791 // __kmp_free_task_deque:
3792 // Deallocates a task deque for a particular thread. Happens at library
3793 // deallocation, so there is no need to reset all thread data fields.
3794 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3795  if (thread_data->td.td_deque != NULL) {
3796  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3797  TCW_4(thread_data->td.td_deque_ntasks, 0);
3798  __kmp_free(thread_data->td.td_deque);
3799  thread_data->td.td_deque = NULL;
3800  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3801  }
3802 
3803 #ifdef BUILD_TIED_TASK_STACK
3804  // GEH: Figure out what to do here for td_susp_tied_tasks
3805  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3806  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3807  }
3808 #endif // BUILD_TIED_TASK_STACK
3809 }
3810 
3811 // __kmp_realloc_task_threads_data:
3812 // Allocates a threads_data array for a task team, either by allocating an
3813 // initial array or enlarging an existing array. Only the first thread to get
3814 // the lock allocs or enlarges the array and re-initializes the array elements.
3815 // That thread returns "TRUE", the rest return "FALSE".
3816 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3817 // The current size is given by task_team -> tt.tt_max_threads.
3818 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3819  kmp_task_team_t *task_team) {
3820  kmp_thread_data_t **threads_data_p;
3821  kmp_int32 nthreads, maxthreads;
3822  int is_init_thread = FALSE;
3823 
3824  if (TCR_4(task_team->tt.tt_found_tasks)) {
3825  // Already reallocated and initialized.
3826  return FALSE;
3827  }
3828 
3829  threads_data_p = &task_team->tt.tt_threads_data;
3830  nthreads = task_team->tt.tt_nproc;
3831  maxthreads = task_team->tt.tt_max_threads;
3832 
3833  // All threads must lock when they encounter the first task of the implicit
3834  // task region to make sure threads_data fields are (re)initialized before
3835  // being used.
3836  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3837 
3838  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3839  // first thread to enable tasking
3840  kmp_team_t *team = thread->th.th_team;
3841  int i;
3842 
3843  is_init_thread = TRUE;
3844  if (maxthreads < nthreads) {
3845 
3846  if (*threads_data_p != NULL) {
3847  kmp_thread_data_t *old_data = *threads_data_p;
3848  kmp_thread_data_t *new_data = NULL;
3849 
3850  KE_TRACE(
3851  10,
3852  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3853  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3854  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3855  // Reallocate threads_data to have more elements than current array
3856  // Cannot use __kmp_thread_realloc() because threads not around for
3857  // kmp_reap_task_team( ). Note all new array entries are initialized
3858  // to zero by __kmp_allocate().
3859  new_data = (kmp_thread_data_t *)__kmp_allocate(
3860  nthreads * sizeof(kmp_thread_data_t));
3861  // copy old data to new data
3862  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3863  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3864 
3865 #ifdef BUILD_TIED_TASK_STACK
3866  // GEH: Figure out if this is the right thing to do
3867  for (i = maxthreads; i < nthreads; i++) {
3868  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3869  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3870  }
3871 #endif // BUILD_TIED_TASK_STACK
3872  // Install the new data and free the old data
3873  (*threads_data_p) = new_data;
3874  __kmp_free(old_data);
3875  } else {
3876  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3877  "threads data for task_team %p, size = %d\n",
3878  __kmp_gtid_from_thread(thread), task_team, nthreads));
3879  // Make the initial allocate for threads_data array, and zero entries
3880  // Cannot use __kmp_thread_calloc() because threads not around for
3881  // kmp_reap_task_team( ).
3882  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3883  nthreads * sizeof(kmp_thread_data_t));
3884 #ifdef BUILD_TIED_TASK_STACK
3885  // GEH: Figure out if this is the right thing to do
3886  for (i = 0; i < nthreads; i++) {
3887  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3888  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3889  }
3890 #endif // BUILD_TIED_TASK_STACK
3891  }
3892  task_team->tt.tt_max_threads = nthreads;
3893  } else {
3894  // If array has (more than) enough elements, go ahead and use it
3895  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3896  }
3897 
3898  // initialize threads_data pointers back to thread_info structures
3899  for (i = 0; i < nthreads; i++) {
3900  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3901  thread_data->td.td_thr = team->t.t_threads[i];
3902 
3903  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3904  // The last stolen field survives across teams / barrier, and the number
3905  // of threads may have changed. It's possible (likely?) that a new
3906  // parallel region will exhibit the same behavior as previous region.
3907  thread_data->td.td_deque_last_stolen = -1;
3908  }
3909  }
3910 
3911  KMP_MB();
3912  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3913  }
3914 
3915  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3916  return is_init_thread;
3917 }
3918 
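// Note (descriptive): the routine above uses a double-checked pattern --
// tt_found_tasks is tested once without the lock as a fast path, and again
// under tt_threads_lock, so only the first thread to arrive performs the
// (re)allocation and returns TRUE, while every later thread returns FALSE.
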
3919 // __kmp_free_task_threads_data:
3920 // Deallocates a threads_data array for a task team, including any attached
3921 // tasking deques. Only occurs at library shutdown.
3922 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3923  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3924  if (task_team->tt.tt_threads_data != NULL) {
3925  int i;
3926  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3927  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3928  }
3929  __kmp_free(task_team->tt.tt_threads_data);
3930  task_team->tt.tt_threads_data = NULL;
3931  }
3932  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3933 }
3934 
3935 // __kmp_free_task_pri_list:
3936 // Deallocates tasking deques used for priority tasks.
3937 // Only occurs at library shutdown.
3938 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3939  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3940  if (task_team->tt.tt_task_pri_list != NULL) {
3941  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3942  while (list != NULL) {
3943  kmp_task_pri_t *next = list->next;
3944  __kmp_free_task_deque(&list->td);
3945  __kmp_free(list);
3946  list = next;
3947  }
3948  task_team->tt.tt_task_pri_list = NULL;
3949  }
3950  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3951 }
3952 
3953 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3954  kmp_team_t *team) {
3955  int team_nth = team->t.t_nproc;
3956  // Only need to init if task team isn't active or team size changed
3957  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3958  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3959  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3960  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3961  TCW_4(task_team->tt.tt_nproc, team_nth);
3962  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3963  TCW_4(task_team->tt.tt_active, TRUE);
3964  }
3965 }
3966 
3967 // __kmp_allocate_task_team:
3968 // Allocates a task team associated with a specific team, taking it from
3969 // the global task team free list if possible. Also initializes data
3970 // structures.
3971 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3972  kmp_team_t *team) {
3973  kmp_task_team_t *task_team = NULL;
3974 
3975  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3976  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3977 
3978  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3979  // Take a task team from the task team pool
3980  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3981  if (__kmp_free_task_teams != NULL) {
3982  task_team = __kmp_free_task_teams;
3983  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3984  task_team->tt.tt_next = NULL;
3985  }
3986  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3987  }
3988 
3989  if (task_team == NULL) {
3990  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3991  "task team for team %p\n",
3992  __kmp_gtid_from_thread(thread), team));
3993  // Allocate a new task team if one is not available. Cannot use
3994  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3995  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3996  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3997  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3998 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3999  // suppress race condition detection on synchronization flags in debug mode;
4000  // this helps to analyze library internals by eliminating false positives
4001  __itt_suppress_mark_range(
4002  __itt_suppress_range, __itt_suppress_threading_errors,
4003  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
4004  __itt_suppress_mark_range(__itt_suppress_range,
4005  __itt_suppress_threading_errors,
4006  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
4007  sizeof(task_team->tt.tt_active));
4008 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4009  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
4010  // task_team->tt.tt_threads_data = NULL;
4011  // task_team->tt.tt_max_threads = 0;
4012  // task_team->tt.tt_next = NULL;
4013  }
4014 
4015  __kmp_task_team_init(task_team, team);
4016 
4017  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4018  "unfinished_threads init'd to %d\n",
4019  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
4020  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4021  return task_team;
4022 }
4023 
4024 // __kmp_free_task_team:
4025 // Frees the task team associated with a specific thread, and adds it
4026 // to the global task team free list.
4027 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4028  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4029  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4030 
4031  // Put task team back on free list
4032  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4033 
4034  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4035  task_team->tt.tt_next = __kmp_free_task_teams;
4036  TCW_PTR(__kmp_free_task_teams, task_team);
4037 
4038  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4039 }
4040 
4041 // __kmp_reap_task_teams:
4042 // Free all the task teams on the task team free list.
4043 // Should only be done during library shutdown.
4044 // Cannot do anything that needs a thread structure or gtid since they are
4045 // already gone.
4046 void __kmp_reap_task_teams(void) {
4047  kmp_task_team_t *task_team;
4048 
4049  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4050  // Free all task_teams on the free list
4051  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4052  while ((task_team = __kmp_free_task_teams) != NULL) {
4053  __kmp_free_task_teams = task_team->tt.tt_next;
4054  task_team->tt.tt_next = NULL;
4055 
4056  // Free threads_data if necessary
4057  if (task_team->tt.tt_threads_data != NULL) {
4058  __kmp_free_task_threads_data(task_team);
4059  }
4060  if (task_team->tt.tt_task_pri_list != NULL) {
4061  __kmp_free_task_pri_list(task_team);
4062  }
4063  __kmp_free(task_team);
4064  }
4065  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4066  }
4067 }
4068 
4069 // View the array of two task team pointers as a pair of pointers:
4070 // 1) a single task_team pointer
4071 // 2) next pointer for stack
4072 // Serial teams can create a stack of task teams for nested serial teams.
4073 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4074  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4075  kmp_task_team_list_t *current =
4076  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4077  kmp_task_team_list_t *node =
4078  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
4079  node->task_team = current->task_team;
4080  node->next = current->next;
4081  thread->th.th_task_team = current->task_team = NULL;
4082  current->next = node;
4083 }
4084 
4085 // Serial team pops a task team off the stack
4086 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4087  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4088  kmp_task_team_list_t *current =
4089  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4090  if (current->task_team) {
4091  __kmp_free_task_team(thread, current->task_team);
4092  }
4093  kmp_task_team_list_t *next = current->next;
4094  if (next) {
4095  current->task_team = next->task_team;
4096  current->next = next->next;
4097  KMP_DEBUG_ASSERT(next != current);
4098  __kmp_free(next);
4099  thread->th.th_task_team = current->task_team;
4100  }
4101 }
4102 
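// Note (descriptive): for serial teams the two-element t_task_team[] array is
// reinterpreted as a kmp_task_team_list_t node {task_team, next}, so nested
// serial regions can treat it as a linked-list stack:
//   push: copy the current {task_team, next} into a fresh node, clear the
//         team's task_team pointer, and link the node in;
//   pop : free the team's current task team, then restore {task_team, next}
//         from the saved node and free that node.
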
4103 // __kmp_wait_to_unref_task_teams:
4104 // Some threads could still be in the fork barrier release code, possibly
4105 // trying to steal tasks. Wait for each thread to unreference its task team.
4106 void __kmp_wait_to_unref_task_teams(void) {
4107  kmp_info_t *thread;
4108  kmp_uint32 spins;
4109  kmp_uint64 time;
4110  int done;
4111 
4112  KMP_INIT_YIELD(spins);
4113  KMP_INIT_BACKOFF(time);
4114 
4115  for (;;) {
4116  done = TRUE;
4117 
4118  // TODO: GEH - this may be wrong because some sync would be necessary
4119  // in case threads are added to the pool during the traversal. Need to
4120  // verify that lock for thread pool is held when calling this routine.
4121  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4122  thread = thread->th.th_next_pool) {
4123 #if KMP_OS_WINDOWS
4124  DWORD exit_val;
4125 #endif
4126  if (TCR_PTR(thread->th.th_task_team) == NULL) {
4127  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4128  __kmp_gtid_from_thread(thread)));
4129  continue;
4130  }
4131 #if KMP_OS_WINDOWS
4132  // TODO: GEH - add this check for Linux* OS / OS X* as well?
4133  if (!__kmp_is_thread_alive(thread, &exit_val)) {
4134  thread->th.th_task_team = NULL;
4135  continue;
4136  }
4137 #endif
4138 
4139  done = FALSE; // Because th_task_team pointer is not NULL for this thread
4140 
4141  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4142  "unreference task_team\n",
4143  __kmp_gtid_from_thread(thread)));
4144 
4145  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4146  void *sleep_loc;
4147  // If the thread is sleeping, awaken it.
4148  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4149  NULL) {
4150  KA_TRACE(
4151  10,
4152  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4153  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4154  __kmp_null_resume_wrapper(thread);
4155  }
4156  }
4157  }
4158  if (done) {
4159  break;
4160  }
4161 
4162  // If oversubscribed or have waited a bit, yield.
4163  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4164  }
4165 }
4166 
4167 // __kmp_task_team_setup: Create a task_team for the current team, but use
4168 // an already created, unused one if it already exists.
4169 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
4170  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4171 
4172  // For the serial and root teams, set up the first task team pointer to point
4173  // to the task team. The other pointer is a stack of task teams from previous
4174  // serial levels.
4175  if (team == this_thr->th.th_serial_team ||
4176  team == this_thr->th.th_root->r.r_root_team) {
4177  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4178  if (team->t.t_task_team[0] == NULL) {
4179  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
4180  KA_TRACE(
4181  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4182  " for serial/root team %p\n",
4183  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
4184 
4185  } else
4186  __kmp_task_team_init(team->t.t_task_team[0], team);
4187  return;
4188  }
4189 
4190  // If this task_team hasn't been created yet, allocate it. It will be used in
4191  // the region after the next.
4192  // If it exists, it is the current task team and shouldn't be touched yet as
4193  // it may still be in use.
4194  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4195  team->t.t_task_team[this_thr->th.th_task_state] =
4196  __kmp_allocate_task_team(this_thr, team);
4197  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4198  " for team %d at parity=%d\n",
4199  __kmp_gtid_from_thread(this_thr),
4200  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4201  this_thr->th.th_task_state));
4202  }
4203 
4204  // After threads exit the release, they will call sync, and then point to this
4205  // other task_team; make sure it is allocated and properly initialized. As
4206  // threads spin in the barrier release phase, they will continue to use the
4207  // previous task_team struct(above), until they receive the signal to stop
4208  // checking for tasks (they can't safely reference the kmp_team_t struct,
4209  // which could be reallocated by the primary thread).
4210  int other_team = 1 - this_thr->th.th_task_state;
4211  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4212  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4213  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
4214  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4215  "task_team %p for team %d at parity=%d\n",
4216  __kmp_gtid_from_thread(this_thr),
4217  team->t.t_task_team[other_team], team->t.t_id, other_team));
4218  } else { // Leave the old task team struct in place for the upcoming region;
4219  // adjust as needed
4220  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4221  __kmp_task_team_init(task_team, team);
4222  // if team size has changed, the first thread to enable tasking will
4223  // realloc threads_data if necessary
4224  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4225  "%p for team %d at parity=%d\n",
4226  __kmp_gtid_from_thread(this_thr),
4227  team->t.t_task_team[other_team], team->t.t_id, other_team));
4228  }
4229 
4230  // For a regular thread, task enabling should be called when the task is going
4231  // to be pushed to a deque. However, for the hidden helper thread, we need
4232  // it ahead of time so that some operations can be performed without race
4233  // conditions.
4234  if (this_thr == __kmp_hidden_helper_main_thread) {
4235  for (int i = 0; i < 2; ++i) {
4236  kmp_task_team_t *task_team = team->t.t_task_team[i];
4237  if (KMP_TASKING_ENABLED(task_team)) {
4238  continue;
4239  }
4240  __kmp_enable_tasking(task_team, this_thr);
4241  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4242  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4243  if (thread_data->td.td_deque == NULL) {
4244  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4245  }
4246  }
4247  }
4248  }
4249 }
4250 
4251 // __kmp_task_team_sync: Propagation of task team data from team to threads
4252 // which happens just after the release phase of a team barrier. This may be
4253 // called by any thread. This is not called for serial or root teams.
4254 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4255  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4256  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4257  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4258 
4259  // Toggle the th_task_state field, to switch which task_team this thread
4260  // refers to
4261  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4262 
4263  // It is now safe to propagate the task team pointer from the team struct to
4264  // the current thread.
4265  TCW_PTR(this_thr->th.th_task_team,
4266  team->t.t_task_team[this_thr->th.th_task_state]);
4267  KA_TRACE(20,
4268  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4269  "%p from Team #%d (parity=%d)\n",
4270  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4271  team->t.t_id, this_thr->th.th_task_state));
4272 }
4273 
4274 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4275 // barrier gather phase. Only called by the primary thread.
4276 //
4277 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4278 // by passing in 0 optionally as the last argument. When wait is zero, primary
4279 // thread does not wait for unfinished_threads to reach 0.
4280 void __kmp_task_team_wait(
4281  kmp_info_t *this_thr,
4282  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4283  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4284 
4285  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4286  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4287 
4288  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4289  if (wait) {
4290  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4291  "(for unfinished_threads to reach 0) on task_team = %p\n",
4292  __kmp_gtid_from_thread(this_thr), task_team));
4293  // Worker threads may have dropped through to release phase, but could
4294  // still be executing tasks. Wait here for tasks to complete. To avoid
4295  // memory contention, only primary thread checks termination condition.
4296  kmp_flag_32<false, false> flag(
4297  RCAST(std::atomic<kmp_uint32> *,
4298  &task_team->tt.tt_unfinished_threads),
4299  0U);
4300  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4301  }
4302  // Deactivate the old task team, so that the worker threads will stop
4303  // referencing it while spinning.
4304  KA_TRACE(
4305  20,
4306  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4307  "setting active to false, setting local and team's pointer to NULL\n",
4308  __kmp_gtid_from_thread(this_thr), task_team));
4309  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4310  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4311  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4312  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4313  KMP_MB();
4314 
4315  TCW_PTR(this_thr->th.th_task_team, NULL);
4316  }
4317 }
4318 
4319 // __kmp_tasking_barrier:
4320 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4321 // Internal function to execute all tasks prior to a regular barrier or a join
4322 // barrier. It is a full barrier itself, which unfortunately turns regular
4323 // barriers into double barriers and join barriers into 1 1/2 barriers.
4324 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4325  std::atomic<kmp_uint32> *spin = RCAST(
4326  std::atomic<kmp_uint32> *,
4327  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4328  int flag = FALSE;
4329  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4330 
4331 #if USE_ITT_BUILD
4332  KMP_FSYNC_SPIN_INIT(spin, NULL);
4333 #endif /* USE_ITT_BUILD */
4334  kmp_flag_32<false, false> spin_flag(spin, 0U);
4335  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4336  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4337 #if USE_ITT_BUILD
4338  // TODO: What about itt_sync_obj??
4339  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4340 #endif /* USE_ITT_BUILD */
4341 
4342  if (TCR_4(__kmp_global.g.g_done)) {
4343  if (__kmp_global.g.g_abort)
4344  __kmp_abort_thread();
4345  break;
4346  }
4347  KMP_YIELD(TRUE);
4348  }
4349 #if USE_ITT_BUILD
4350  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4351 #endif /* USE_ITT_BUILD */
4352 }
4353 
4354 // __kmp_give_task puts a task into a given thread queue if:
4355 // - the queue for that thread was created
4356 // - there's space in that queue
4357 // Because of this, __kmp_push_task needs to check if there's space after
4358 // getting the lock
4359 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4360  kmp_int32 pass) {
4361  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4362  kmp_task_team_t *task_team = taskdata->td_task_team;
4363 
4364  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4365  taskdata, tid));
4366 
4367  // If task_team is NULL something went really bad...
4368  KMP_DEBUG_ASSERT(task_team != NULL);
4369 
4370  bool result = false;
4371  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4372 
4373  if (thread_data->td.td_deque == NULL) {
4374  // There's no queue in this thread, go find another one
4375  // We're guaranteed that at least one thread has a queue
4376  KA_TRACE(30,
4377  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4378  tid, taskdata));
4379  return result;
4380  }
4381 
4382  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4383  TASK_DEQUE_SIZE(thread_data->td)) {
4384  KA_TRACE(
4385  30,
4386  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4387  taskdata, tid));
4388 
4389  // if this deque is bigger than the pass ratio give a chance to another
4390  // thread
4391  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4392  return result;
4393 
4394  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4395  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4396  TASK_DEQUE_SIZE(thread_data->td)) {
4397  // expand deque to push the task which is not allowed to execute
4398  __kmp_realloc_task_deque(thread, thread_data);
4399  }
4400 
4401  } else {
4402 
4403  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4404 
4405  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4406  TASK_DEQUE_SIZE(thread_data->td)) {
4407  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4408  "thread %d.\n",
4409  taskdata, tid));
4410 
4411  // if this deque is bigger than the pass ratio give a chance to another
4412  // thread
4413  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4414  goto release_and_exit;
4415 
4416  __kmp_realloc_task_deque(thread, thread_data);
4417  }
4418  }
4419 
4420  // lock is held here, and there is space in the deque
4421 
4422  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4423  // Wrap index.
4424  thread_data->td.td_deque_tail =
4425  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4426  TCW_4(thread_data->td.td_deque_ntasks,
4427  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4428 
4429  result = true;
4430  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4431  taskdata, tid));
4432 
4433 release_and_exit:
4434  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4435 
4436  return result;
4437 }
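/* The pass-ratio checks above gate deque growth: a thread's deque is only
   reallocated when TASK_DEQUE_SIZE(td) / INITIAL_TASK_DEQUE_SIZE is below the
   caller-supplied pass value; otherwise the task is bounced to another
   thread. A worked example, assuming INITIAL_TASK_DEQUE_SIZE is 256 (its
   default) and that __kmp_realloc_task_deque doubles the size:

     pass = 1: a full 256-entry deque has ratio 1 >= 1, so it is never grown;
               the caller is told to try the next thread.
     pass = 2: ratio 1 < 2, so a 256-entry deque may grow to 512; a deque
               already at 512 (ratio 2 >= 2) is skipped again.
     pass = 4: 512-entry deques may grow to 1024, and so on.

   Because __kmpc_give_task doubles pass only after a full sweep of the team,
   growth is attempted only once every thread has declined the task at the
   current size. */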
4438 
4439 #define PROXY_TASK_FLAG 0x40000000
4440 /* The finish of the proxy tasks is divided in two pieces:
4441  - the top half is the one that can be done from a thread outside the team
4442  - the bottom half must be run from a thread within the team
4443 
4444  In order to run the bottom half the task gets queued back into one of the
4445  threads of the team. Once the td_incomplete_child_task counter of the parent
4446  is decremented the threads can leave the barriers. So, the bottom half needs
4447  to be queued before the counter is decremented. The top half is therefore
4448  divided in two parts:
4449  - things that can be run before queuing the bottom half
4450  - things that must be run after queuing the bottom half
4451 
4452  This creates a second race as the bottom half can free the task before the
4453  second top half is executed. To avoid this we use the
4454  td_incomplete_child_task of the proxy task to synchronize the top and bottom
4455  half. */
4456 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4457  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4458  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4459  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4460  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4461 
4462  taskdata->td_flags.complete = 1; // mark the task as completed
4463 #if OMPX_TASKGRAPH
4464  taskdata->td_flags.onced = 1;
4465 #endif
4466 
4467  if (taskdata->td_taskgroup)
4468  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4469 
4470  // Create an imaginary child for this task so the bottom half cannot
4471  // release the task before we have completed the second top half
4472  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4473 }
4474 
4475 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4476 #if KMP_DEBUG
4477  kmp_int32 children = 0;
4478  // Predecrement simulated by "- 1" calculation
4479  children = -1 +
4480 #endif
4481  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4482  KMP_DEBUG_ASSERT(children >= 0);
4483 
4484  // Remove the imaginary child
4485  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4486 }
4487 
4488 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4489  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4490  kmp_info_t *thread = __kmp_threads[gtid];
4491 
4492  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4493  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4494  1); // top half must run before bottom half
4495 
4496  // We need to wait to make sure the top half is finished
4497  // Spinning here should be ok as this should happen quickly
4498  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4499  PROXY_TASK_FLAG) > 0)
4500  ;
4501 
4502  __kmp_release_deps(gtid, taskdata);
4503  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4504 }
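/* The PROXY_TASK_FLAG bit in td_incomplete_child_tasks is the handshake
   described in the block comment above. The sequence, in order:

     first top half   : mark the task complete, OR in PROXY_TASK_FLAG
     (the bottom half may be queued and start running at any point from here)
     bottom half      : spin while PROXY_TASK_FLAG is still set
     second top half  : decrement the parent's child count, AND the flag out
     bottom half      : release dependences and free the task

   so the bottom half cannot free the task before the second top half has
   finished touching it. */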
4505 
4514 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4515  KMP_DEBUG_ASSERT(ptask != NULL);
4516  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4517  KA_TRACE(
4518  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4519  gtid, taskdata));
4520  __kmp_assert_valid_gtid(gtid);
4521  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4522 
4523  __kmp_first_top_half_finish_proxy(taskdata);
4524  __kmp_second_top_half_finish_proxy(taskdata);
4525  __kmp_bottom_half_finish_proxy(gtid, ptask);
4526 
4527  KA_TRACE(10,
4528  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4529  gtid, taskdata));
4530 }
4531 
4532 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4533  KMP_DEBUG_ASSERT(ptask != NULL);
4534  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4535 
4536  // Enqueue the task so that its bottom-half completion runs on a thread
4537  // within the corresponding team
4538  kmp_team_t *team = taskdata->td_team;
4539  kmp_int32 nthreads = team->t.t_nproc;
4540  kmp_info_t *thread;
4541 
4542  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4543  // but we cannot use __kmp_get_random here
4544  kmp_int32 start_k = start % nthreads;
4545  kmp_int32 pass = 1;
4546  kmp_int32 k = start_k;
4547 
4548  do {
4549  // For now we're just linearly trying to find a thread
4550  thread = team->t.t_threads[k];
4551  k = (k + 1) % nthreads;
4552 
4553  // we did a full pass through all the threads
4554  if (k == start_k)
4555  pass = pass << 1;
4556 
4557  } while (!__kmp_give_task(thread, k, ptask, pass));
4558 
4559  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4560  // wake up at least one thread to execute the given task
4561  for (int i = 0; i < nthreads; ++i) {
4562  thread = team->t.t_threads[i];
4563  if (thread->th.th_sleep_loc != NULL) {
4564  __kmp_null_resume_wrapper(thread);
4565  break;
4566  }
4567  }
4568  }
4569 }
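/* Taken together, __kmpc_give_task and __kmp_give_task implement a bounded
   round-robin placement: each thread is offered the bottom half once per
   sweep, and after every complete sweep the pass value doubles, progressively
   allowing larger deques to be reallocated until some thread accepts the
   task. A rough model of the overall strategy (illustrative only; the real
   loop above advances k before the give call):

     int k = start % nthreads, pass = 1;
     for (;;) {
       if (try_give(k, pass))       // plays the role of __kmp_give_task()
         break;
       k = (k + 1) % nthreads;
       if (k == start % nthreads)   // completed a full sweep of the team
         pass <<= 1;                // permit bigger deques next time around
     }
*/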
4570 
4578 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4579  KMP_DEBUG_ASSERT(ptask != NULL);
4580  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4581 
4582  KA_TRACE(
4583  10,
4584  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4585  taskdata));
4586 
4587  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4588 
4589  __kmp_first_top_half_finish_proxy(taskdata);
4590 
4591  __kmpc_give_task(ptask);
4592 
4593  __kmp_second_top_half_finish_proxy(taskdata);
4594 
4595  KA_TRACE(
4596  10,
4597  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4598  taskdata));
4599 }
4600 
4601 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4602  kmp_task_t *task) {
4603  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4604  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4605  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4606  td->td_allow_completion_event.ed.task = task;
4607  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4608  }
4609  return &td->td_allow_completion_event;
4610 }
4611 
4612 void __kmp_fulfill_event(kmp_event_t *event) {
4613  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4614  kmp_task_t *ptask = event->ed.task;
4615  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4616  bool detached = false;
4617  int gtid = __kmp_get_gtid();
4618 
4619  // The associated task might have completed or could be completing at this
4620  // point.
4621  // We need to take the lock to avoid races
4622  __kmp_acquire_tas_lock(&event->lock, gtid);
4623  if (taskdata->td_flags.proxy == TASK_PROXY) {
4624  detached = true;
4625  } else {
4626 #if OMPT_SUPPORT
4627  // The OMPT event must occur under mutual exclusion,
4628  // otherwise the tool might access ptask after free
4629  if (UNLIKELY(ompt_enabled.enabled))
4630  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4631 #endif
4632  }
4633  event->type = KMP_EVENT_UNINITIALIZED;
4634  __kmp_release_tas_lock(&event->lock, gtid);
4635 
4636  if (detached) {
4637 #if OMPT_SUPPORT
4638  // We free ptask afterwards and know the task is finished,
4639  // so locking is not necessary
4640  if (UNLIKELY(ompt_enabled.enabled))
4641  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4642 #endif
4643  // If the task detached complete the proxy task
4644  if (gtid >= 0) {
4645  kmp_team_t *team = taskdata->td_team;
4646  kmp_info_t *thread = __kmp_get_thread();
4647  if (thread->th.th_team == team) {
4648  __kmpc_proxy_task_completed(gtid, ptask);
4649  return;
4650  }
4651  }
4652 
4653  // fallback
4654  __kmpc_proxy_task_completed_ooo(ptask);
4655  }
4656  }
4657 }
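/* These two routines back detachable tasks (the OpenMP 5.0 detach clause).
   A hedged user-level sketch of how they are typically reached; the exact
   lowering is compiler-dependent and start_async_work is a hypothetical
   helper that merely stores the event handle:

     omp_event_handle_t ev;
     #pragma omp task detach(ev)   // lowered to a call that reaches
     {                             // __kmpc_task_allow_completion_event()
       start_async_work(ev);       // hand the handle to some async engine
     }                             // the task ends detached (proxy) here
     // ... later, from the asynchronous completion callback:
     omp_fulfill_event(ev);        // expected to funnel into __kmp_fulfill_event()

   If the task body has already finished when the event is fulfilled, the
   proxy completion paths above finish the task on behalf of the team. */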
4658 
4659 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4660 // for taskloop
4661 //
4662 // thread: allocating thread
4663 // task_src: pointer to source task to be duplicated
4664 // taskloop_recur: used only when dealing with taskgraph,
4665 // indicating whether we need to update task->td_task_id
4666 // returns: a pointer to the allocated kmp_task_t structure (task).
4667 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4668 #if OMPX_TASKGRAPH
4669  , int taskloop_recur
4670 #endif
4671 ) {
4672  kmp_task_t *task;
4673  kmp_taskdata_t *taskdata;
4674  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4675  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4676  size_t shareds_offset;
4677  size_t task_size;
4678 
4679  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4680  task_src));
4681  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4682  TASK_FULL); // it should not be proxy task
4683  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4684  task_size = taskdata_src->td_size_alloc;
4685 
4686  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4687  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4688  task_size));
4689 #if USE_FAST_MEMORY
4690  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4691 #else
4692  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4693 #endif /* USE_FAST_MEMORY */
4694  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4695 
4696  task = KMP_TASKDATA_TO_TASK(taskdata);
4697 
4698  // Initialize new task (only specific fields not affected by memcpy)
4699 #if OMPX_TASKGRAPH
4700  if (!taskdata->is_taskgraph || taskloop_recur)
4701  taskdata->td_task_id = KMP_GEN_TASK_ID();
4702  else if (taskdata->is_taskgraph &&
4703  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4704  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4705 #else
4706  taskdata->td_task_id = KMP_GEN_TASK_ID();
4707 #endif
4708  if (task->shareds != NULL) { // need setup shareds pointer
4709  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4710  task->shareds = &((char *)taskdata)[shareds_offset];
4711  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4712  0);
4713  }
4714  taskdata->td_alloc_thread = thread;
4715  taskdata->td_parent = parent_task;
4716  // task inherits the taskgroup from the parent task
4717  taskdata->td_taskgroup = parent_task->td_taskgroup;
4718  // tied task needs to initialize the td_last_tied at creation,
4719  // untied one does this when it is scheduled for execution
4720  if (taskdata->td_flags.tiedness == TASK_TIED)
4721  taskdata->td_last_tied = taskdata;
4722 
4723  // Only need to keep track of child task counts if team parallel and tasking
4724  // not serialized
4725  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4726  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4727  if (parent_task->td_taskgroup)
4728  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4729  // Only need to keep track of allocated child tasks for explicit tasks since
4730  // implicit not deallocated
4731  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4732  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4733  }
4734 
4735  KA_TRACE(20,
4736  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4737  thread, taskdata, taskdata->td_parent));
4738 #if OMPT_SUPPORT
4739  if (UNLIKELY(ompt_enabled.enabled))
4740  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4741 #endif
4742  return task;
4743 }
4744 
4745 // Routine optionally generated by the compiler for setting the lastprivate flag
4746 // and calling needed constructors for private/firstprivate objects
4747 // (used to form taskloop tasks from pattern task)
4748 // Parameters: dest task, src task, lastprivate flag.
4749 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4750 
4751 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4752 
4753 // class to encapsulate manipulating loop bounds in a taskloop task.
4754 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4755 // the loop bound variables.
4756 class kmp_taskloop_bounds_t {
4757  kmp_task_t *task;
4758  const kmp_taskdata_t *taskdata;
4759  size_t lower_offset;
4760  size_t upper_offset;
4761 
4762 public:
4763  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4764  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4765  lower_offset((char *)lb - (char *)task),
4766  upper_offset((char *)ub - (char *)task) {
4767  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4768  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4769  }
4770  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4771  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4772  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4773  size_t get_lower_offset() const { return lower_offset; }
4774  size_t get_upper_offset() const { return upper_offset; }
4775  kmp_uint64 get_lb() const {
4776  kmp_int64 retval;
4777 #if defined(KMP_GOMP_COMPAT)
4778  // Intel task just returns the lower bound normally
4779  if (!taskdata->td_flags.native) {
4780  retval = *(kmp_int64 *)((char *)task + lower_offset);
4781  } else {
4782  // GOMP task has to take into account the sizeof(long)
4783  if (taskdata->td_size_loop_bounds == 4) {
4784  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4785  retval = (kmp_int64)*lb;
4786  } else {
4787  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4788  retval = (kmp_int64)*lb;
4789  }
4790  }
4791 #else
4792  (void)taskdata;
4793  retval = *(kmp_int64 *)((char *)task + lower_offset);
4794 #endif // defined(KMP_GOMP_COMPAT)
4795  return retval;
4796  }
4797  kmp_uint64 get_ub() const {
4798  kmp_int64 retval;
4799 #if defined(KMP_GOMP_COMPAT)
4800  // Intel task just returns the upper bound normally
4801  if (!taskdata->td_flags.native) {
4802  retval = *(kmp_int64 *)((char *)task + upper_offset);
4803  } else {
4804  // GOMP task has to take into account the sizeof(long)
4805  if (taskdata->td_size_loop_bounds == 4) {
4806  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4807  retval = (kmp_int64)*ub;
4808  } else {
4809  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4810  retval = (kmp_int64)*ub;
4811  }
4812  }
4813 #else
4814  retval = *(kmp_int64 *)((char *)task + upper_offset);
4815 #endif // defined(KMP_GOMP_COMPAT)
4816  return retval;
4817  }
4818  void set_lb(kmp_uint64 lb) {
4819 #if defined(KMP_GOMP_COMPAT)
4820  // Intel task just sets the lower bound normally
4821  if (!taskdata->td_flags.native) {
4822  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4823  } else {
4824  // GOMP task has to take into account the sizeof(long)
4825  if (taskdata->td_size_loop_bounds == 4) {
4826  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4827  *lower = (kmp_uint32)lb;
4828  } else {
4829  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4830  *lower = (kmp_uint64)lb;
4831  }
4832  }
4833 #else
4834  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4835 #endif // defined(KMP_GOMP_COMPAT)
4836  }
4837  void set_ub(kmp_uint64 ub) {
4838 #if defined(KMP_GOMP_COMPAT)
4839  // Intel task just sets the upper bound normally
4840  if (!taskdata->td_flags.native) {
4841  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4842  } else {
4843  // GOMP task has to take into account the sizeof(long)
4844  if (taskdata->td_size_loop_bounds == 4) {
4845  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4846  *upper = (kmp_uint32)ub;
4847  } else {
4848  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4849  *upper = (kmp_uint64)ub;
4850  }
4851  }
4852 #else
4853  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4854 #endif // defined(KMP_GOMP_COMPAT)
4855  }
4856 };
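/* Layout summary for the accessors above: for the Intel entry points the
   loop bounds live inside the task structure itself at lower_offset /
   upper_offset as 64-bit values, while for GOMP tasks (td_flags.native) they
   live at the start of task->shareds as a pair of C longs whose width is
   recorded in td_size_loop_bounds:

     Intel:  *(kmp_uint64 *)((char *)task + lower_offset)   // lb
             *(kmp_uint64 *)((char *)task + upper_offset)   // ub
     GOMP:   ((long *)task->shareds)[0]                     // lb
             ((long *)task->shareds)[1]                     // ub
*/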
4857 
4858 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4859 //
4860 // loc Source location information
4861 // gtid Global thread ID
4862 // task Pattern task, exposes the loop iteration range
4863 // lb Pointer to loop lower bound in task structure
4864 // ub Pointer to loop upper bound in task structure
4865 // st Loop stride
4866 // ub_glob Global upper bound (used for lastprivate check)
4867 // num_tasks Number of tasks to execute
4868 // grainsize Number of loop iterations per task
4869 // extras Number of chunks with grainsize+1 iterations
4870 // last_chunk Reduction of grainsize for last task
4871 // tc Iterations count
4872 // task_dup Tasks duplication routine
4873 // codeptr_ra Return address for OMPT events
4874 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4875  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4876  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4877  kmp_uint64 grainsize, kmp_uint64 extras,
4878  kmp_int64 last_chunk, kmp_uint64 tc,
4879 #if OMPT_SUPPORT
4880  void *codeptr_ra,
4881 #endif
4882  void *task_dup) {
4883  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4884  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4885  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4886  // compiler provides global bounds here
4887  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4888  kmp_uint64 lower = task_bounds.get_lb();
4889  kmp_uint64 upper = task_bounds.get_ub();
4890  kmp_uint64 i;
4891  kmp_info_t *thread = __kmp_threads[gtid];
4892  kmp_taskdata_t *current_task = thread->th.th_current_task;
4893  kmp_task_t *next_task;
4894  kmp_int32 lastpriv = 0;
4895 
4896  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4897  (last_chunk < 0 ? last_chunk : extras));
4898  KMP_DEBUG_ASSERT(num_tasks > extras);
4899  KMP_DEBUG_ASSERT(num_tasks > 0);
4900  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4901  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4902  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4903  ub_glob, st, task_dup));
4904 
4905  // Launch num_tasks tasks, assign grainsize iterations each task
4906  for (i = 0; i < num_tasks; ++i) {
4907  kmp_uint64 chunk_minus_1;
4908  if (extras == 0) {
4909  chunk_minus_1 = grainsize - 1;
4910  } else {
4911  chunk_minus_1 = grainsize;
4912  --extras; // first extras iterations get bigger chunk (grainsize+1)
4913  }
4914  upper = lower + st * chunk_minus_1;
4915  if (upper > *ub) {
4916  upper = *ub;
4917  }
4918  if (i == num_tasks - 1) {
4919  // schedule the last task, set lastprivate flag if needed
4920  if (st == 1) { // most common case
4921  KMP_DEBUG_ASSERT(upper == *ub);
4922  if (upper == ub_glob)
4923  lastpriv = 1;
4924  } else if (st > 0) { // positive loop stride
4925  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4926  if ((kmp_uint64)st > ub_glob - upper)
4927  lastpriv = 1;
4928  } else { // negative loop stride
4929  KMP_DEBUG_ASSERT(upper + st < *ub);
4930  if (upper - ub_glob < (kmp_uint64)(-st))
4931  lastpriv = 1;
4932  }
4933  }
4934 
4935 #if OMPX_TASKGRAPH
4936  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4937 #else
4938  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4939 #endif
4940 
4941  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4942  kmp_taskloop_bounds_t next_task_bounds =
4943  kmp_taskloop_bounds_t(next_task, task_bounds);
4944 
4945  // adjust task-specific bounds
4946  next_task_bounds.set_lb(lower);
4947  if (next_taskdata->td_flags.native) {
4948  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4949  } else {
4950  next_task_bounds.set_ub(upper);
4951  }
4952  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4953  // etc.
4954  ptask_dup(next_task, task, lastpriv);
4955  KA_TRACE(40,
4956  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4957  "upper %lld stride %lld, (offsets %p %p)\n",
4958  gtid, i, next_task, lower, upper, st,
4959  next_task_bounds.get_lower_offset(),
4960  next_task_bounds.get_upper_offset()));
4961 #if OMPT_SUPPORT
4962  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4963  codeptr_ra); // schedule new task
4964 #if OMPT_OPTIONAL
4965  if (ompt_enabled.ompt_callback_dispatch) {
4966  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4967  lower, upper, st);
4968  }
4969 #endif // OMPT_OPTIONAL
4970 #else
4971  __kmp_omp_task(gtid, next_task, true); // schedule new task
4972 #endif
4973  lower = upper + st; // adjust lower bound for the next iteration
4974  }
4975  // free the pattern task and exit
4976  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4977  // do not execute the pattern task, just do internal bookkeeping
4978  __kmp_task_finish<false>(gtid, task, current_task);
4979 }
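/* A worked example of the loop above, assuming tc = 10 iterations, st = 1,
   num_tasks = 3, grainsize = 3 and extras = 1 as computed by the caller: the
   first generated task uses chunk_minus_1 = grainsize = 3, i.e. 4 iterations,
   after which extras drops to 0; the remaining two tasks use chunk_minus_1 =
   grainsize - 1 = 2, i.e. 3 iterations each, and the last task additionally
   carries the lastprivate flag when its upper bound reaches ub_glob. */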
4980 
4981 // Structure to keep taskloop parameters for auxiliary task
4982 // kept in the shareds of the task structure.
4983 typedef struct __taskloop_params {
4984  kmp_task_t *task;
4985  kmp_uint64 *lb;
4986  kmp_uint64 *ub;
4987  void *task_dup;
4988  kmp_int64 st;
4989  kmp_uint64 ub_glob;
4990  kmp_uint64 num_tasks;
4991  kmp_uint64 grainsize;
4992  kmp_uint64 extras;
4993  kmp_int64 last_chunk;
4994  kmp_uint64 tc;
4995  kmp_uint64 num_t_min;
4996 #if OMPT_SUPPORT
4997  void *codeptr_ra;
4998 #endif
4999 } __taskloop_params_t;
5000 
5001 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
5002  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
5003  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
5004  kmp_uint64,
5005 #if OMPT_SUPPORT
5006  void *,
5007 #endif
5008  void *);
5009 
5010 // Execute part of the taskloop submitted as a task.
5011 int __kmp_taskloop_task(int gtid, void *ptask) {
5012  __taskloop_params_t *p =
5013  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
5014  kmp_task_t *task = p->task;
5015  kmp_uint64 *lb = p->lb;
5016  kmp_uint64 *ub = p->ub;
5017  void *task_dup = p->task_dup;
5018  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5019  kmp_int64 st = p->st;
5020  kmp_uint64 ub_glob = p->ub_glob;
5021  kmp_uint64 num_tasks = p->num_tasks;
5022  kmp_uint64 grainsize = p->grainsize;
5023  kmp_uint64 extras = p->extras;
5024  kmp_int64 last_chunk = p->last_chunk;
5025  kmp_uint64 tc = p->tc;
5026  kmp_uint64 num_t_min = p->num_t_min;
5027 #if OMPT_SUPPORT
5028  void *codeptr_ra = p->codeptr_ra;
5029 #endif
5030 #if KMP_DEBUG
5031  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5032  KMP_DEBUG_ASSERT(task != NULL);
5033  KA_TRACE(20,
5034  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5035  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5036  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5037  st, task_dup));
5038 #endif
5039  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5040  if (num_tasks > num_t_min)
5041  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5042  grainsize, extras, last_chunk, tc, num_t_min,
5043 #if OMPT_SUPPORT
5044  codeptr_ra,
5045 #endif
5046  task_dup);
5047  else
5048  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5049  grainsize, extras, last_chunk, tc,
5050 #if OMPT_SUPPORT
5051  codeptr_ra,
5052 #endif
5053  task_dup);
5054 
5055  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5056  return 0;
5057 }
5058 
5059 // Schedule part of the taskloop as a task,
5060 // execute the rest of the taskloop.
5061 //
5062 // loc Source location information
5063 // gtid Global thread ID
5064 // task Pattern task, exposes the loop iteration range
5065 // lb Pointer to loop lower bound in task structure
5066 // ub Pointer to loop upper bound in task structure
5067 // st Loop stride
5068 // ub_glob Global upper bound (used for lastprivate check)
5069 // num_tasks Number of tasks to execute
5070 // grainsize Number of loop iterations per task
5071 // extras Number of chunks with grainsize+1 iterations
5072 // last_chunk Reduction of grainsize for last task
5073 // tc Iterations count
5074 // num_t_min Threshold to launch tasks recursively
5075 // task_dup Tasks duplication routine
5076 // codeptr_ra Return address for OMPT events
5077 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5078  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5079  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5080  kmp_uint64 grainsize, kmp_uint64 extras,
5081  kmp_int64 last_chunk, kmp_uint64 tc,
5082  kmp_uint64 num_t_min,
5083 #if OMPT_SUPPORT
5084  void *codeptr_ra,
5085 #endif
5086  void *task_dup) {
5087  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5088  KMP_DEBUG_ASSERT(task != NULL);
5089  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5090  KA_TRACE(20,
5091  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5092  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5093  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5094  st, task_dup));
5095  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5096  kmp_uint64 lower = *lb;
5097  kmp_info_t *thread = __kmp_threads[gtid];
5098  // kmp_taskdata_t *current_task = thread->th.th_current_task;
5099  kmp_task_t *next_task;
5100  size_t lower_offset =
5101  (char *)lb - (char *)task; // remember offset of lb in the task structure
5102  size_t upper_offset =
5103  (char *)ub - (char *)task; // remember offset of ub in the task structure
5104 
5105  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5106  (last_chunk < 0 ? last_chunk : extras));
5107  KMP_DEBUG_ASSERT(num_tasks > extras);
5108  KMP_DEBUG_ASSERT(num_tasks > 0);
5109 
5110  // split the loop in two halves
5111  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5112  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5113  kmp_uint64 gr_size0 = grainsize;
5114  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5115  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5116  if (last_chunk < 0) {
5117  ext0 = ext1 = 0;
5118  last_chunk1 = last_chunk;
5119  tc0 = grainsize * n_tsk0;
5120  tc1 = tc - tc0;
5121  } else if (n_tsk0 <= extras) {
5122  gr_size0++; // integrate extras into grainsize
5123  ext0 = 0; // no extra iters in 1st half
5124  ext1 = extras - n_tsk0; // remaining extras
5125  tc0 = gr_size0 * n_tsk0;
5126  tc1 = tc - tc0;
5127  } else { // n_tsk0 > extras
5128  ext1 = 0; // no extra iters in 2nd half
5129  ext0 = extras;
5130  tc1 = grainsize * n_tsk1;
5131  tc0 = tc - tc1;
5132  }
5133  ub0 = lower + st * (tc0 - 1);
5134  lb1 = ub0 + st;
5135 
5136  // create pattern task for 2nd half of the loop
5137 #if OMPX_TASKGRAPH
5138  next_task = __kmp_task_dup_alloc(thread, task,
5139  /* taskloop_recur */ 1);
5140 #else
5141  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5142 #endif
5143  // adjust lower bound (upper bound is not changed) for the 2nd half
5144  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5145  if (ptask_dup != NULL) // construct firstprivates, etc.
5146  ptask_dup(next_task, task, 0);
5147  *ub = ub0; // adjust upper bound for the 1st half
5148 
5149  // create auxiliary task for 2nd half of the loop
5150  // make sure new task has same parent task as the pattern task
5151  kmp_taskdata_t *current_task = thread->th.th_current_task;
5152  thread->th.th_current_task = taskdata->td_parent;
5153  kmp_task_t *new_task =
5154  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5155  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5156  // restore current task
5157  thread->th.th_current_task = current_task;
5158  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5159  p->task = next_task;
5160  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5161  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5162  p->task_dup = task_dup;
5163  p->st = st;
5164  p->ub_glob = ub_glob;
5165  p->num_tasks = n_tsk1;
5166  p->grainsize = grainsize;
5167  p->extras = ext1;
5168  p->last_chunk = last_chunk1;
5169  p->tc = tc1;
5170  p->num_t_min = num_t_min;
5171 #if OMPT_SUPPORT
5172  p->codeptr_ra = codeptr_ra;
5173 #endif
5174 
5175 #if OMPX_TASKGRAPH
5176  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5177  new_task_data->tdg = taskdata->tdg;
5178  new_task_data->is_taskgraph = 0;
5179 #endif
5180 
5181 #if OMPT_SUPPORT
5182  // schedule new task with correct return address for OMPT events
5183  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5184 #else
5185  __kmp_omp_task(gtid, new_task, true); // schedule new task
5186 #endif
5187 
5188  // execute the 1st half of current subrange
5189  if (n_tsk0 > num_t_min)
5190  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5191  ext0, last_chunk0, tc0, num_t_min,
5192 #if OMPT_SUPPORT
5193  codeptr_ra,
5194 #endif
5195  task_dup);
5196  else
5197  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5198  gr_size0, ext0, last_chunk0, tc0,
5199 #if OMPT_SUPPORT
5200  codeptr_ra,
5201 #endif
5202  task_dup);
5203 
5204  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5205 }
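/* A worked example of the split above, assuming num_tasks = 5, grainsize = 4,
   extras = 2, last_chunk = 0 and tc = 22: n_tsk0 = 2 tasks stay with the
   current thread and n_tsk1 = 3 are packaged into the auxiliary task. Since
   n_tsk0 <= extras, the first half folds its extra iterations into
   gr_size0 = 5, giving tc0 = 10 and ext0 = 0, while the second half keeps
   grainsize = 4 with ext1 = 0 and tc1 = 12. The auxiliary task re-enters
   __kmp_taskloop_task, which recurses again or falls back to
   __kmp_taskloop_linear once its share is num_t_min tasks or fewer. */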
5206 
5207 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5208  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5209  int nogroup, int sched, kmp_uint64 grainsize,
5210  int modifier, void *task_dup) {
5211  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5212  KMP_DEBUG_ASSERT(task != NULL);
5213  if (nogroup == 0) {
5214 #if OMPT_SUPPORT && OMPT_OPTIONAL
5215  OMPT_STORE_RETURN_ADDRESS(gtid);
5216 #endif
5217  __kmpc_taskgroup(loc, gtid);
5218  }
5219 
5220 #if OMPX_TASKGRAPH
5221  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5222 #endif
5223  // =========================================================================
5224  // calculate loop parameters
5225  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5226  kmp_uint64 tc;
5227  // compiler provides global bounds here
5228  kmp_uint64 lower = task_bounds.get_lb();
5229  kmp_uint64 upper = task_bounds.get_ub();
5230  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5231  kmp_uint64 num_tasks = 0, extras = 0;
5232  kmp_int64 last_chunk =
5233  0; // reduce grainsize of last task by last_chunk in strict mode
5234  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5235  kmp_info_t *thread = __kmp_threads[gtid];
5236  kmp_taskdata_t *current_task = thread->th.th_current_task;
5237 
5238  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5239  "grain %llu(%d, %d), dup %p\n",
5240  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5241  task_dup));
5242 
5243  // compute trip count
5244  if (st == 1) { // most common case
5245  tc = upper - lower + 1;
5246  } else if (st < 0) {
5247  tc = (lower - upper) / (-st) + 1;
5248  } else { // st > 0
5249  tc = (upper - lower) / st + 1;
5250  }
5251  if (tc == 0) {
5252  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5253  // free the pattern task and exit
5254  __kmp_task_start(gtid, task, current_task);
5255  // do not execute anything for zero-trip loop
5256  __kmp_task_finish<false>(gtid, task, current_task);
5257  return;
5258  }
5259 
5260 #if OMPT_SUPPORT && OMPT_OPTIONAL
5261  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5262  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5263  if (ompt_enabled.ompt_callback_work) {
5264  ompt_callbacks.ompt_callback(ompt_callback_work)(
5265  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5266  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5267  }
5268 #endif
5269 
5270  if (num_tasks_min == 0)
5271  // TODO: can we choose a better default heuristic?
5272  num_tasks_min =
5273  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5274 
5275  // compute num_tasks/grainsize based on the input provided
5276  switch (sched) {
5277  case 0: // no schedule clause specified, we can choose the default
5278  // let's try to schedule (team_size*10) tasks
5279  grainsize = thread->th.th_team_nproc * 10;
5280  KMP_FALLTHROUGH();
5281  case 2: // num_tasks provided
5282  if (grainsize > tc) {
5283  num_tasks = tc; // too big num_tasks requested, adjust values
5284  grainsize = 1;
5285  extras = 0;
5286  } else {
5287  num_tasks = grainsize;
5288  grainsize = tc / num_tasks;
5289  extras = tc % num_tasks;
5290  }
5291  break;
5292  case 1: // grainsize provided
5293  if (grainsize > tc) {
5294  num_tasks = 1;
5295  grainsize = tc; // too big grainsize requested, adjust values
5296  extras = 0;
5297  } else {
5298  if (modifier) {
5299  num_tasks = (tc + grainsize - 1) / grainsize;
5300  last_chunk = tc - (num_tasks * grainsize);
5301  extras = 0;
5302  } else {
5303  num_tasks = tc / grainsize;
5304  // adjust grainsize for balanced distribution of iterations
5305  grainsize = tc / num_tasks;
5306  extras = tc % num_tasks;
5307  }
5308  }
5309  break;
5310  default:
5311  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5312  }
5313 
5314  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5315  (last_chunk < 0 ? last_chunk : extras));
5316  KMP_DEBUG_ASSERT(num_tasks > extras);
5317  KMP_DEBUG_ASSERT(num_tasks > 0);
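  /* A worked example of the switch above for a loop of tc = 10 iterations
     with grainsize(4), i.e. sched == 1: without the strict modifier,
     num_tasks = 10 / 4 = 2 and grainsize is rebalanced to 10 / 2 = 5 with
     extras = 0, so two 5-iteration tasks are created rather than 4+4+2; with
     the strict modifier, num_tasks = (10 + 3) / 4 = 3, grainsize stays 4 and
     last_chunk = 10 - 12 = -2, so the last task runs only 2 iterations. A
     num_tasks(n) clause arrives here as sched == 2 with n passed in the
     grainsize argument, and the absence of either clause as sched == 0. */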
5318  // =========================================================================
5319 
5320  // check the if-clause value first
5321  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5322  if (if_val == 0) { // if(0) specified, mark task as serial
5323  taskdata->td_flags.task_serial = 1;
5324  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5325  // always start serial tasks linearly
5326  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5327  grainsize, extras, last_chunk, tc,
5328 #if OMPT_SUPPORT
5329  OMPT_GET_RETURN_ADDRESS(0),
5330 #endif
5331  task_dup);
5332  // !taskdata->td_flags.native => currently force linear spawning of tasks
5333  // for GOMP_taskloop
5334  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5335  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5336  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5337  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5338  last_chunk));
5339  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5340  grainsize, extras, last_chunk, tc, num_tasks_min,
5341 #if OMPT_SUPPORT
5342  OMPT_GET_RETURN_ADDRESS(0),
5343 #endif
5344  task_dup);
5345  } else {
5346  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5347  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5348  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5349  last_chunk));
5350  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5351  grainsize, extras, last_chunk, tc,
5352 #if OMPT_SUPPORT
5353  OMPT_GET_RETURN_ADDRESS(0),
5354 #endif
5355  task_dup);
5356  }
5357 
5358 #if OMPT_SUPPORT && OMPT_OPTIONAL
5359  if (ompt_enabled.ompt_callback_work) {
5360  ompt_callbacks.ompt_callback(ompt_callback_work)(
5361  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5362  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5363  }
5364 #endif
5365 
5366  if (nogroup == 0) {
5367 #if OMPT_SUPPORT && OMPT_OPTIONAL
5368  OMPT_STORE_RETURN_ADDRESS(gtid);
5369 #endif
5370  __kmpc_end_taskgroup(loc, gtid);
5371  }
5372  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5373 }
5374 
5391 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5392  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5393  int sched, kmp_uint64 grainsize, void *task_dup) {
5394  __kmp_assert_valid_gtid(gtid);
5395  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5396  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5397  0, task_dup);
5398  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5399 }
5400 
5418 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5419  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5420  int nogroup, int sched, kmp_uint64 grainsize,
5421  int modifier, void *task_dup) {
5422  __kmp_assert_valid_gtid(gtid);
5423  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5424  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5425  modifier, task_dup);
5426  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5427 }
5428 
5437 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5438  if (gtid == KMP_GTID_DNE)
5439  return NULL;
5440 
5441  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5442  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5443 
5444  if (!taskdata)
5445  return NULL;
5446 
5447  return &taskdata->td_target_data.async_handle;
5448 }
5449 
5458 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5459  if (gtid == KMP_GTID_DNE)
5460  return FALSE;
5461 
5462  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5463  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5464 
5465  if (!taskdata)
5466  return FALSE;
5467 
5468  return taskdata->td_task_team != NULL;
5469 }
5470 
5471 #if OMPX_TASKGRAPH
5472 // __kmp_find_tdg: identify a TDG through its ID
5473 // tdg_id: ID of the TDG
5474 // returns: if a TDG corresponding to this ID is found and it is not in
5475 // its initial state, return a pointer to it; otherwise return
5476 // nullptr
5477 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5478  kmp_tdg_info_t *res = nullptr;
5479  if (__kmp_max_tdgs == 0)
5480  return res;
5481 
5482  if (__kmp_global_tdgs == NULL)
5483  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5484  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5485 
5486  if ((__kmp_global_tdgs[tdg_id]) &&
5487  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5488  res = __kmp_global_tdgs[tdg_id];
5489  return res;
5490 }
5491 
5492 // __kmp_print_tdg_dot: prints the TDG to a dot file
5493 // tdg: Pointer to the TDG to print
5494 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) {
5495  kmp_int32 tdg_id = tdg->tdg_id;
5496  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", __kmp_get_gtid(), tdg_id));
5497 
5498  char file_name[20];
5499  sprintf(file_name, "tdg_%d.dot", tdg_id);
5500  kmp_safe_raii_file_t tdg_file(file_name, "w");
5501 
5502  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5503  fprintf(tdg_file,
5504  "digraph TDG {\n"
5505  " compound=true\n"
5506  " subgraph cluster {\n"
5507  " label=TDG_%d\n",
5508  tdg_id);
5509  for (kmp_int32 i = 0; i < num_tasks; i++) {
5510  fprintf(tdg_file, " %d[style=bold]\n", i);
5511  }
5512  fprintf(tdg_file, " }\n");
5513  for (kmp_int32 i = 0; i < num_tasks; i++) {
5514  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5515  kmp_int32 *successors = tdg->record_map[i].successors;
5516  if (nsuccessors > 0) {
5517  for (kmp_int32 j = 0; j < nsuccessors; j++)
5518  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5519  }
5520  }
5521  fprintf(tdg_file, "}");
5522  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", __kmp_get_gtid(), tdg_id));
5523 }
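/* For a three-task TDG in which task 0 precedes tasks 1 and 2 (tdg_id = 0),
   the routine above emits roughly the following dot text:

     digraph TDG {
       compound=true
       subgraph cluster {
         label=TDG_0
         0[style=bold]
         1[style=bold]
         2[style=bold]
       }
       0 -> 1
       0 -> 2
     }
*/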
5524 
5525 // __kmp_exec_tdg: launch the execution of a previously
5526 // recorded TDG
5527 // gtid: Global Thread ID
5528 // tdg: Pointer to the TDG to execute
5529 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5530  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5531  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5532  tdg->tdg_id, tdg->num_roots));
5533  kmp_node_info_t *this_record_map = tdg->record_map;
5534  kmp_int32 *this_root_tasks = tdg->root_tasks;
5535  kmp_int32 this_num_roots = tdg->num_roots;
5536  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5537 
5538  kmp_info_t *thread = __kmp_threads[gtid];
5539  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5540 
5541  if (tdg->rec_taskred_data) {
5542  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5543  }
5544 
5545  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5546  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5547 
5548  td->td_parent = parent_task;
5549  this_record_map[j].parent_task = parent_task;
5550 
5551  kmp_taskgroup_t *parent_taskgroup =
5552  this_record_map[j].parent_task->td_taskgroup;
5553 
5554  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5555  this_record_map[j].npredecessors);
5556  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5557 
5558  if (parent_taskgroup) {
5559  KMP_ATOMIC_INC(&parent_taskgroup->count);
5560  // The taskgroup is different so we must update it
5561  td->td_taskgroup = parent_taskgroup;
5562  } else if (td->td_taskgroup != nullptr) {
5563  // If the parent doesn't have a taskgroup, clear it on the task as well
5564  td->td_taskgroup = nullptr;
5565  }
5566  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5567  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5568  }
5569 
5570  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5571  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5572  }
5573  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5574  tdg->tdg_id, tdg->num_roots));
5575 }
5576 
5577 // __kmp_start_record: set up a TDG structure and set the
5578 // recording flag to true
5579 // gtid: Global Thread ID of the encountering thread
5580 // flags: Flags associated with the TDG
5581 // tdg_id: ID of the TDG to record
5582 static inline void __kmp_start_record(kmp_int32 gtid,
5583  kmp_taskgraph_flags_t *flags,
5584  kmp_int32 tdg_id) {
5585  kmp_tdg_info_t *tdg =
5586  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5587  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5588  // Initializing the TDG structure
5589  tdg->tdg_id = tdg_id;
5590  tdg->map_size = INIT_MAPSIZE;
5591  tdg->num_roots = -1;
5592  tdg->root_tasks = nullptr;
5593  tdg->tdg_status = KMP_TDG_RECORDING;
5594  tdg->rec_num_taskred = 0;
5595  tdg->rec_taskred_data = nullptr;
5596  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5597 
5598  // Initializing the list of nodes in this TDG
5599  kmp_node_info_t *this_record_map =
5600  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5601  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5602  kmp_int32 *successorsList =
5603  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5604  this_record_map[i].task = nullptr;
5605  this_record_map[i].successors = successorsList;
5606  this_record_map[i].nsuccessors = 0;
5607  this_record_map[i].npredecessors = 0;
5608  this_record_map[i].successors_size = __kmp_successors_size;
5609  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5610  }
5611 
5612  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5613 }
5614 
5615 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5616 // the beginning of the record process of a task region
5617 // loc_ref: Location of TDG, not used yet
5618 // gtid: Global Thread ID of the encountering thread
5619 // input_flags: Flags associated with the TDG
5620 // tdg_id: ID of the TDG to record, for now, incremental integer
5621 // returns: 1 if we record, otherwise, 0
5622 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5623  kmp_int32 input_flags, kmp_int32 tdg_id) {
5624 
5625  kmp_int32 res;
5626  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5627  KA_TRACE(10,
5628  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5629  gtid, loc_ref, input_flags, tdg_id));
5630 
5631  if (__kmp_max_tdgs == 0) {
5632  KA_TRACE(
5633  10,
5634  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5635  "__kmp_max_tdgs = 0\n",
5636  gtid, loc_ref, input_flags, tdg_id));
5637  return 1;
5638  }
5639 
5640  __kmpc_taskgroup(loc_ref, gtid);
5641  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5642  // TODO: use re_record flag
5643  __kmp_exec_tdg(gtid, tdg);
5644  res = 0;
5645  } else {
5646  __kmp_curr_tdg_idx = tdg_id;
5647  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5648  __kmp_start_record(gtid, flags, tdg_id);
5649  __kmp_num_tdg++;
5650  res = 1;
5651  }
5652  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5653  gtid, tdg_id, res ? "record" : "execute"));
5654  return res;
5655 }
5656 
5657 // __kmp_end_record: set up a TDG after recording it
5658 // gtid: Global thread ID
5659 // tdg: Pointer to the TDG
5660 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5661  // Store roots
5662  kmp_node_info_t *this_record_map = tdg->record_map;
5663  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5664  kmp_int32 *this_root_tasks =
5665  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5666  kmp_int32 this_map_size = tdg->map_size;
5667  kmp_int32 this_num_roots = 0;
5668  kmp_info_t *thread = __kmp_threads[gtid];
5669 
5670  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5671  if (this_record_map[i].npredecessors == 0) {
5672  this_root_tasks[this_num_roots++] = i;
5673  }
5674  }
5675 
5676  // Update with roots info and mapsize
5677  tdg->map_size = this_map_size;
5678  tdg->num_roots = this_num_roots;
5679  tdg->root_tasks = this_root_tasks;
5680  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5681  tdg->tdg_status = KMP_TDG_READY;
5682 
5683  if (thread->th.th_current_task->td_dephash) {
5684  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5685  thread->th.th_current_task->td_dephash = NULL;
5686  }
5687 
5688  // Reset predecessor counter
5689  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5690  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5691  this_record_map[i].npredecessors);
5692  }
5693  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5694 
5695  if (__kmp_tdg_dot)
5696  __kmp_print_tdg_dot(tdg);
5697 }
5698 
5699 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5700 // the end of recording phase
5701 //
5702 // loc_ref: Source location information
5703 // gtid: Global thread ID
5704 // input_flags: Flags attached to the graph
5705 // tdg_id: ID of the TDG just finished recording
5706 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5707  kmp_int32 input_flags, kmp_int32 tdg_id) {
5708  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5709 
5710  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5711  " tdg=%d with flags=%d\n",
5712  gtid, loc_ref, tdg_id, input_flags));
5713  if (__kmp_max_tdgs) {
5714  // TODO: use input_flags->nowait
5715  __kmpc_end_taskgroup(loc_ref, gtid);
5716  if (__kmp_tdg_is_recording(tdg->tdg_status))
5717  __kmp_end_record(gtid, tdg);
5718  }
5719  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5720  " tdg=%d, its status is now READY\n",
5721  gtid, loc_ref, tdg_id));
5722 }
5723 #endif