LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24  class Mask : public KMPAffinity::Mask {
25  hwloc_cpuset_t mask;
26 
27  public:
28  Mask() {
29  mask = hwloc_bitmap_alloc();
30  this->zero();
31  }
32  ~Mask() { hwloc_bitmap_free(mask); }
33  void set(int i) override { hwloc_bitmap_set(mask, i); }
34  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36  void zero() override { hwloc_bitmap_zero(mask); }
37  bool empty() const override { return hwloc_bitmap_iszero(mask); }
38  void copy(const KMPAffinity::Mask *src) override {
39  const Mask *convert = static_cast<const Mask *>(src);
40  hwloc_bitmap_copy(mask, convert->mask);
41  }
42  void bitwise_and(const KMPAffinity::Mask *rhs) override {
43  const Mask *convert = static_cast<const Mask *>(rhs);
44  hwloc_bitmap_and(mask, mask, convert->mask);
45  }
46  void bitwise_or(const KMPAffinity::Mask *rhs) override {
47  const Mask *convert = static_cast<const Mask *>(rhs);
48  hwloc_bitmap_or(mask, mask, convert->mask);
49  }
50  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51  bool is_equal(const KMPAffinity::Mask *rhs) const override {
52  const Mask *convert = static_cast<const Mask *>(rhs);
53  return hwloc_bitmap_isequal(mask, convert->mask);
54  }
55  int begin() const override { return hwloc_bitmap_first(mask); }
56  int end() const override { return -1; }
57  int next(int previous) const override {
58  return hwloc_bitmap_next(mask, previous);
59  }
60  int get_system_affinity(bool abort_on_error) override {
61  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62  "Illegal get affinity operation when not capable");
63  long retval =
64  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65  if (retval >= 0) {
66  return 0;
67  }
68  int error = errno;
69  if (abort_on_error) {
70  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71  KMP_ERR(error), __kmp_msg_null);
72  }
73  return error;
74  }
75  int set_system_affinity(bool abort_on_error) const override {
76  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77  "Illegal set affinity operation when not capable");
78  long retval =
79  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80  if (retval >= 0) {
81  return 0;
82  }
83  int error = errno;
84  if (abort_on_error) {
85  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86  KMP_ERR(error), __kmp_msg_null);
87  }
88  return error;
89  }
90 #if KMP_OS_WINDOWS
91  int set_process_affinity(bool abort_on_error) const override {
92  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93  "Illegal set process affinity operation when not capable");
94  int error = 0;
95  const hwloc_topology_support *support =
96  hwloc_topology_get_support(__kmp_hwloc_topology);
97  if (support->cpubind->set_proc_cpubind) {
98  int retval;
99  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100  HWLOC_CPUBIND_PROCESS);
101  if (retval >= 0)
102  return 0;
103  error = errno;
104  if (abort_on_error)
105  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106  KMP_ERR(error), __kmp_msg_null);
107  }
108  return error;
109  }
110 #endif
111  int get_proc_group() const override {
112  int group = -1;
113 #if KMP_OS_WINDOWS
114  if (__kmp_num_proc_groups == 1) {
115  return 1;
116  }
117  for (int i = 0; i < __kmp_num_proc_groups; i++) {
118  // On Windows, the long type is always 32 bits
119  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120  unsigned long second_32_bits =
121  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122  if (first_32_bits == 0 && second_32_bits == 0) {
123  continue;
124  }
125  if (group >= 0) {
126  return -1;
127  }
128  group = i;
129  }
130 #endif /* KMP_OS_WINDOWS */
131  return group;
132  }
133  };
134  void determine_capable(const char *var) override {
135  const hwloc_topology_support *topology_support;
136  if (__kmp_hwloc_topology == NULL) {
137  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138  __kmp_hwloc_error = TRUE;
139  if (__kmp_affinity.flags.verbose) {
140  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141  }
142  }
143  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144  __kmp_hwloc_error = TRUE;
145  if (__kmp_affinity.flags.verbose) {
146  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147  }
148  }
149  }
150  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151  // Is the system capable of setting/getting this thread's affinity?
152  // Also, is topology discovery possible? (pu indicates ability to discover
153  // processing units). And finally, were there no errors when calling any
154  // hwloc_* API functions?
155  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156  topology_support->cpubind->get_thisthread_cpubind &&
157  topology_support->discovery->pu && !__kmp_hwloc_error) {
158  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159  KMP_AFFINITY_ENABLE(TRUE);
160  } else {
161  // indicate that hwloc didn't work and disable affinity
162  __kmp_hwloc_error = TRUE;
163  KMP_AFFINITY_DISABLE();
164  }
165  }
166  void bind_thread(int which) override {
167  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168  "Illegal set affinity operation when not capable");
169  KMPAffinity::Mask *mask;
170  KMP_CPU_ALLOC_ON_STACK(mask);
171  KMP_CPU_ZERO(mask);
172  KMP_CPU_SET(which, mask);
173  __kmp_set_system_affinity(mask, TRUE);
174  KMP_CPU_FREE_FROM_STACK(mask);
175  }
176  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178  KMPAffinity::Mask *allocate_mask_array(int num) override {
179  return new Mask[num];
180  }
181  void deallocate_mask_array(KMPAffinity::Mask *array) override {
182  Mask *hwloc_array = static_cast<Mask *>(array);
183  delete[] hwloc_array;
184  }
185  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186  int index) override {
187  Mask *hwloc_array = static_cast<Mask *>(array);
188  return &(hwloc_array[index]);
189  }
190  api_type get_api_type() const override { return HWLOC; }
191 };
192 #endif /* KMP_USE_HWLOC */
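// Illustrative sketch of how a concrete KMPAffinity implementation such as
// KMPHwlocAffinity above is typically driven through the abstract interface.
// The helper name below is hypothetical and kept inert under #if 0.
#if 0
static void example_bind_to_proc_zero(KMPAffinity *affinity) {
  KMPAffinity::Mask *mask = affinity->allocate_mask();
  mask->zero();
  mask->set(0);                     // request logical processor 0 only
  mask->set_system_affinity(false); // best effort; do not abort on failure
  affinity->deallocate_mask(mask);
}
#endif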
193 
194 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
195  KMP_OS_AIX
196 #if KMP_OS_LINUX
197 /* On some of the older OSes that we build on, these constants aren't present
198  in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
199  all systems of the same arch where they are defined, and they cannot change;
200  they are set in stone forever. */
201 #include <sys/syscall.h>
202 #if KMP_ARCH_X86 || KMP_ARCH_ARM
203 #ifndef __NR_sched_setaffinity
204 #define __NR_sched_setaffinity 241
205 #elif __NR_sched_setaffinity != 241
206 #error Wrong code for setaffinity system call.
207 #endif /* __NR_sched_setaffinity */
208 #ifndef __NR_sched_getaffinity
209 #define __NR_sched_getaffinity 242
210 #elif __NR_sched_getaffinity != 242
211 #error Wrong code for getaffinity system call.
212 #endif /* __NR_sched_getaffinity */
213 #elif KMP_ARCH_AARCH64
214 #ifndef __NR_sched_setaffinity
215 #define __NR_sched_setaffinity 122
216 #elif __NR_sched_setaffinity != 122
217 #error Wrong code for setaffinity system call.
218 #endif /* __NR_sched_setaffinity */
219 #ifndef __NR_sched_getaffinity
220 #define __NR_sched_getaffinity 123
221 #elif __NR_sched_getaffinity != 123
222 #error Wrong code for getaffinity system call.
223 #endif /* __NR_sched_getaffinity */
224 #elif KMP_ARCH_RISCV64
225 #ifndef __NR_sched_setaffinity
226 #define __NR_sched_setaffinity 122
227 #elif __NR_sched_setaffinity != 122
228 #error Wrong code for setaffinity system call.
229 #endif /* __NR_sched_setaffinity */
230 #ifndef __NR_sched_getaffinity
231 #define __NR_sched_getaffinity 123
232 #elif __NR_sched_getaffinity != 123
233 #error Wrong code for getaffinity system call.
234 #endif /* __NR_sched_getaffinity */
235 #elif KMP_ARCH_X86_64
236 #ifndef __NR_sched_setaffinity
237 #define __NR_sched_setaffinity 203
238 #elif __NR_sched_setaffinity != 203
239 #error Wrong code for setaffinity system call.
240 #endif /* __NR_sched_setaffinity */
241 #ifndef __NR_sched_getaffinity
242 #define __NR_sched_getaffinity 204
243 #elif __NR_sched_getaffinity != 204
244 #error Wrong code for getaffinity system call.
245 #endif /* __NR_sched_getaffinity */
246 #elif KMP_ARCH_PPC64
247 #ifndef __NR_sched_setaffinity
248 #define __NR_sched_setaffinity 222
249 #elif __NR_sched_setaffinity != 222
250 #error Wrong code for setaffinity system call.
251 #endif /* __NR_sched_setaffinity */
252 #ifndef __NR_sched_getaffinity
253 #define __NR_sched_getaffinity 223
254 #elif __NR_sched_getaffinity != 223
255 #error Wrong code for getaffinity system call.
256 #endif /* __NR_sched_getaffinity */
257 #elif KMP_ARCH_MIPS
258 #ifndef __NR_sched_setaffinity
259 #define __NR_sched_setaffinity 4239
260 #elif __NR_sched_setaffinity != 4239
261 #error Wrong code for setaffinity system call.
262 #endif /* __NR_sched_setaffinity */
263 #ifndef __NR_sched_getaffinity
264 #define __NR_sched_getaffinity 4240
265 #elif __NR_sched_getaffinity != 4240
266 #error Wrong code for getaffinity system call.
267 #endif /* __NR_sched_getaffinity */
268 #elif KMP_ARCH_MIPS64
269 #ifndef __NR_sched_setaffinity
270 #define __NR_sched_setaffinity 5195
271 #elif __NR_sched_setaffinity != 5195
272 #error Wrong code for setaffinity system call.
273 #endif /* __NR_sched_setaffinity */
274 #ifndef __NR_sched_getaffinity
275 #define __NR_sched_getaffinity 5196
276 #elif __NR_sched_getaffinity != 5196
277 #error Wrong code for getaffinity system call.
278 #endif /* __NR_sched_getaffinity */
279 #elif KMP_ARCH_LOONGARCH64
280 #ifndef __NR_sched_setaffinity
281 #define __NR_sched_setaffinity 122
282 #elif __NR_sched_setaffinity != 122
283 #error Wrong code for setaffinity system call.
284 #endif /* __NR_sched_setaffinity */
285 #ifndef __NR_sched_getaffinity
286 #define __NR_sched_getaffinity 123
287 #elif __NR_sched_getaffinity != 123
288 #error Wrong code for getaffinity system call.
289 #endif /* __NR_sched_getaffinity */
290 #elif KMP_ARCH_RISCV64
291 #ifndef __NR_sched_setaffinity
292 #define __NR_sched_setaffinity 122
293 #elif __NR_sched_setaffinity != 122
294 #error Wrong code for setaffinity system call.
295 #endif /* __NR_sched_setaffinity */
296 #ifndef __NR_sched_getaffinity
297 #define __NR_sched_getaffinity 123
298 #elif __NR_sched_getaffinity != 123
299 #error Wrong code for getaffinity system call.
300 #endif /* __NR_sched_getaffinity */
301 #elif KMP_ARCH_VE
302 #ifndef __NR_sched_setaffinity
303 #define __NR_sched_setaffinity 203
304 #elif __NR_sched_setaffinity != 203
305 #error Wrong code for setaffinity system call.
306 #endif /* __NR_sched_setaffinity */
307 #ifndef __NR_sched_getaffinity
308 #define __NR_sched_getaffinity 204
309 #elif __NR_sched_getaffinity != 204
310 #error Wrong code for getaffinity system call.
311 #endif /* __NR_sched_getaffinity */
312 #elif KMP_ARCH_S390X
313 #ifndef __NR_sched_setaffinity
314 #define __NR_sched_setaffinity 239
315 #elif __NR_sched_setaffinity != 239
316 #error Wrong code for setaffinity system call.
317 #endif /* __NR_sched_setaffinity */
318 #ifndef __NR_sched_getaffinity
319 #define __NR_sched_getaffinity 240
320 #elif __NR_sched_getaffinity != 240
321 #error Wrong code for getaffinity system call.
322 #endif /* __NR_sched_getaffinity */
323 #else
324 #error Unknown or unsupported architecture
325 #endif /* KMP_ARCH_* */
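// Illustrative sketch of how these syscall numbers are used (the real calls
// live in KMPNativeAffinity::Mask::get/set_system_affinity() further below).
// The wrapper function and buffer size here are assumptions for the example;
// syscall() itself is declared in <unistd.h>.
#if 0
static int example_query_affinity(void) {
  unsigned long bits[16] = {0}; // assumed large enough for the system mask
  long ret = syscall(__NR_sched_getaffinity, 0 /* calling thread */,
                     sizeof(bits), bits);
  return (ret >= 0) ? 0 : -1; // on success, bits now holds the affinity mask
}
#endif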
326 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
327 #include <pthread.h>
328 #include <pthread_np.h>
329 #elif KMP_OS_NETBSD
330 #include <pthread.h>
331 #include <sched.h>
332 #elif KMP_OS_AIX
333 #include <sys/dr.h>
334 #include <sys/rset.h>
335 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
336 #define GET_NUMBER_SMT_SETS 0x0004
337 extern "C" int syssmt(int flags, int, int, int *);
338 #endif
339 class KMPNativeAffinity : public KMPAffinity {
340  class Mask : public KMPAffinity::Mask {
341  typedef unsigned long mask_t;
342  typedef decltype(__kmp_affin_mask_size) mask_size_type;
343  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
344  static const mask_t ONE = 1;
345  mask_size_type get_num_mask_types() const {
346  return __kmp_affin_mask_size / sizeof(mask_t);
347  }
348 
349  public:
350  mask_t *mask;
351  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
352  ~Mask() {
353  if (mask)
354  __kmp_free(mask);
355  }
356  void set(int i) override {
357  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
358  }
359  bool is_set(int i) const override {
360  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
361  }
362  void clear(int i) override {
363  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
364  }
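  // Worked example of the index math above, assuming a 64-bit mask_t
  // (BITS_PER_MASK_T == 64): for i == 70, i / 64 == 1 and i % 64 == 6, so
  // set(70) ORs bit 6 of mask[1], is_set(70) tests that same bit, and
  // clear(70) ANDs it away.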
365  void zero() override {
366  mask_size_type e = get_num_mask_types();
367  for (mask_size_type i = 0; i < e; ++i)
368  mask[i] = (mask_t)0;
369  }
370  bool empty() const override {
371  mask_size_type e = get_num_mask_types();
372  for (mask_size_type i = 0; i < e; ++i)
373  if (mask[i] != (mask_t)0)
374  return false;
375  return true;
376  }
377  void copy(const KMPAffinity::Mask *src) override {
378  const Mask *convert = static_cast<const Mask *>(src);
379  mask_size_type e = get_num_mask_types();
380  for (mask_size_type i = 0; i < e; ++i)
381  mask[i] = convert->mask[i];
382  }
383  void bitwise_and(const KMPAffinity::Mask *rhs) override {
384  const Mask *convert = static_cast<const Mask *>(rhs);
385  mask_size_type e = get_num_mask_types();
386  for (mask_size_type i = 0; i < e; ++i)
387  mask[i] &= convert->mask[i];
388  }
389  void bitwise_or(const KMPAffinity::Mask *rhs) override {
390  const Mask *convert = static_cast<const Mask *>(rhs);
391  mask_size_type e = get_num_mask_types();
392  for (mask_size_type i = 0; i < e; ++i)
393  mask[i] |= convert->mask[i];
394  }
395  void bitwise_not() override {
396  mask_size_type e = get_num_mask_types();
397  for (mask_size_type i = 0; i < e; ++i)
398  mask[i] = ~(mask[i]);
399  }
400  bool is_equal(const KMPAffinity::Mask *rhs) const override {
401  const Mask *convert = static_cast<const Mask *>(rhs);
402  mask_size_type e = get_num_mask_types();
403  for (mask_size_type i = 0; i < e; ++i)
404  if (mask[i] != convert->mask[i])
405  return false;
406  return true;
407  }
408  int begin() const override {
409  int retval = 0;
410  while (retval < end() && !is_set(retval))
411  ++retval;
412  return retval;
413  }
414  int end() const override {
415  int e;
416  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
417  return e;
418  }
419  int next(int previous) const override {
420  int retval = previous + 1;
421  while (retval < end() && !is_set(retval))
422  ++retval;
423  return retval;
424  }
425 #if KMP_OS_AIX
426  // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
427  // This routine is only used to get the full mask.
428  int get_system_affinity(bool abort_on_error) override {
429  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
430  "Illegal get affinity operation when not capable");
431 
432  (void)abort_on_error;
433 
434  // Set the mask with all CPUs that are available.
435  for (int i = 0; i < __kmp_xproc; ++i)
436  KMP_CPU_SET(i, this);
437  return 0;
438  }
439  int set_system_affinity(bool abort_on_error) const override {
440  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
441 
442  "Illegal set affinity operation when not capable");
443 
444  int location;
445  int gtid = __kmp_entry_gtid();
446  int tid = thread_self();
447 
448  // Unbind the thread if it was bound to any processors before so that
449  // we can bind the thread only to the CPUs specified by the mask.
450  int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
451 
452  // On AIX, we can only bind to one CPU at a time (not to a set of CPUs)
453  // with the bindprocessor() system call.
454  KMP_CPU_SET_ITERATE(location, this) {
455  if (KMP_CPU_ISSET(location, this)) {
456  retval = bindprocessor(BINDTHREAD, tid, location);
457  if (retval == -1 && errno == 1) {
458  rsid_t rsid;
459  rsethandle_t rsh;
460  // Put something in rsh to prevent compiler warning
461  // about uninitialized use
462  rsh = rs_alloc(RS_EMPTY);
463  rsid.at_pid = getpid();
464  if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
465  retval = ra_detachrset(R_PROCESS, rsid, 0);
466  retval = bindprocessor(BINDTHREAD, tid, location);
467  }
468  }
469  if (retval == 0) {
470  KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
471  "T#%d to cpu=%d.\n",
472  gtid, location));
473  continue;
474  }
475  int error = errno;
476  if (abort_on_error) {
477  __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
478  KMP_ERR(error), __kmp_msg_null);
479  KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
480  "T#%d to cpu=%d, errno=%d.\n",
481  gtid, location, error));
482  return error;
483  }
484  }
485  }
486  return 0;
487  }
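  // Illustrative example of the loop above: with a mask containing CPUs
  // {2, 5}, the code issues bindprocessor(BINDTHREAD, tid, 2) and then
  // bindprocessor(BINDTHREAD, tid, 5); each call binds the thread to that
  // single processor, which is why the whole mask is walked.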
488 #else // !KMP_OS_AIX
489  int get_system_affinity(bool abort_on_error) override {
490  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
491  "Illegal get affinity operation when not capable");
492 #if KMP_OS_LINUX
493  long retval =
494  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
495 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
496  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
497  reinterpret_cast<cpuset_t *>(mask));
498  int retval = (r == 0 ? 0 : -1);
499 #endif
500  if (retval >= 0) {
501  return 0;
502  }
503  int error = errno;
504  if (abort_on_error) {
505  __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
506  KMP_ERR(error), __kmp_msg_null);
507  }
508  return error;
509  }
510  int set_system_affinity(bool abort_on_error) const override {
511  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
512  "Illegal set affinity operation when not capable");
513 #if KMP_OS_LINUX
514  long retval =
515  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
516 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
517  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
518  reinterpret_cast<cpuset_t *>(mask));
519  int retval = (r == 0 ? 0 : -1);
520 #endif
521  if (retval >= 0) {
522  return 0;
523  }
524  int error = errno;
525  if (abort_on_error) {
526  __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
527  KMP_ERR(error), __kmp_msg_null);
528  }
529  return error;
530  }
531 #endif // KMP_OS_AIX
532  };
533  void determine_capable(const char *env_var) override {
534  __kmp_affinity_determine_capable(env_var);
535  }
536  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
537  KMPAffinity::Mask *allocate_mask() override {
538  KMPNativeAffinity::Mask *retval = new Mask();
539  return retval;
540  }
541  void deallocate_mask(KMPAffinity::Mask *m) override {
542  KMPNativeAffinity::Mask *native_mask =
543  static_cast<KMPNativeAffinity::Mask *>(m);
544  delete native_mask;
545  }
546  KMPAffinity::Mask *allocate_mask_array(int num) override {
547  return new Mask[num];
548  }
549  void deallocate_mask_array(KMPAffinity::Mask *array) override {
550  Mask *linux_array = static_cast<Mask *>(array);
551  delete[] linux_array;
552  }
553  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
554  int index) override {
555  Mask *linux_array = static_cast<Mask *>(array);
556  return &(linux_array[index]);
557  }
558  api_type get_api_type() const override { return NATIVE_OS; }
559 };
560 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
561  || KMP_OS_AIX */
562 
563 #if KMP_OS_WINDOWS
564 class KMPNativeAffinity : public KMPAffinity {
565  class Mask : public KMPAffinity::Mask {
566  typedef ULONG_PTR mask_t;
567  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
568  mask_t *mask;
569 
570  public:
571  Mask() {
572  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
573  }
574  ~Mask() {
575  if (mask)
576  __kmp_free(mask);
577  }
578  void set(int i) override {
579  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
580  }
581  bool is_set(int i) const override {
582  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
583  }
584  void clear(int i) override {
585  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
586  }
587  void zero() override {
588  for (int i = 0; i < __kmp_num_proc_groups; ++i)
589  mask[i] = 0;
590  }
591  bool empty() const override {
592  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
593  if (mask[i])
594  return false;
595  return true;
596  }
597  void copy(const KMPAffinity::Mask *src) override {
598  const Mask *convert = static_cast<const Mask *>(src);
599  for (int i = 0; i < __kmp_num_proc_groups; ++i)
600  mask[i] = convert->mask[i];
601  }
602  void bitwise_and(const KMPAffinity::Mask *rhs) override {
603  const Mask *convert = static_cast<const Mask *>(rhs);
604  for (int i = 0; i < __kmp_num_proc_groups; ++i)
605  mask[i] &= convert->mask[i];
606  }
607  void bitwise_or(const KMPAffinity::Mask *rhs) override {
608  const Mask *convert = static_cast<const Mask *>(rhs);
609  for (int i = 0; i < __kmp_num_proc_groups; ++i)
610  mask[i] |= convert->mask[i];
611  }
612  void bitwise_not() override {
613  for (int i = 0; i < __kmp_num_proc_groups; ++i)
614  mask[i] = ~(mask[i]);
615  }
616  bool is_equal(const KMPAffinity::Mask *rhs) const override {
617  const Mask *convert = static_cast<const Mask *>(rhs);
618  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
619  if (mask[i] != convert->mask[i])
620  return false;
621  return true;
622  }
623  int begin() const override {
624  int retval = 0;
625  while (retval < end() && !is_set(retval))
626  ++retval;
627  return retval;
628  }
629  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
630  int next(int previous) const override {
631  int retval = previous + 1;
632  while (retval < end() && !is_set(retval))
633  ++retval;
634  return retval;
635  }
636  int set_process_affinity(bool abort_on_error) const override {
637  if (__kmp_num_proc_groups <= 1) {
638  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
639  DWORD error = GetLastError();
640  if (abort_on_error) {
641  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
642  __kmp_msg_null);
643  }
644  return error;
645  }
646  }
647  return 0;
648  }
649  int set_system_affinity(bool abort_on_error) const override {
650  if (__kmp_num_proc_groups > 1) {
651  // Check for a valid mask.
652  GROUP_AFFINITY ga;
653  int group = get_proc_group();
654  if (group < 0) {
655  if (abort_on_error) {
656  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
657  }
658  return -1;
659  }
660  // Transform the bit vector into a GROUP_AFFINITY struct
661  // and make the system call to set affinity.
662  ga.Group = group;
663  ga.Mask = mask[group];
664  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
665 
666  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
667  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
668  DWORD error = GetLastError();
669  if (abort_on_error) {
670  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
671  __kmp_msg_null);
672  }
673  return error;
674  }
675  } else {
676  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
677  DWORD error = GetLastError();
678  if (abort_on_error) {
679  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
680  __kmp_msg_null);
681  }
682  return error;
683  }
684  }
685  return 0;
686  }
687  int get_system_affinity(bool abort_on_error) override {
688  if (__kmp_num_proc_groups > 1) {
689  this->zero();
690  GROUP_AFFINITY ga;
691  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
692  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
693  DWORD error = GetLastError();
694  if (abort_on_error) {
695  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
696  KMP_ERR(error), __kmp_msg_null);
697  }
698  return error;
699  }
700  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
701  (ga.Mask == 0)) {
702  return -1;
703  }
704  mask[ga.Group] = ga.Mask;
705  } else {
706  mask_t newMask, sysMask, retval;
707  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
708  DWORD error = GetLastError();
709  if (abort_on_error) {
710  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
711  KMP_ERR(error), __kmp_msg_null);
712  }
713  return error;
714  }
715  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
716  if (!retval) {
717  DWORD error = GetLastError();
718  if (abort_on_error) {
719  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
720  KMP_ERR(error), __kmp_msg_null);
721  }
722  return error;
723  }
724  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
725  if (!newMask) {
726  DWORD error = GetLastError();
727  if (abort_on_error) {
728  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
729  KMP_ERR(error), __kmp_msg_null);
730  }
731  }
732  *mask = retval;
733  }
734  return 0;
735  }
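  // Note on the single-group path above: SetThreadAffinityMask() returns the
  // thread's previous affinity mask, so the first call temporarily widens the
  // thread to the process mask while capturing the old thread mask in
  // 'retval', and the second call restores that old mask; 'retval' is what
  // this routine stores into *mask.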
736  int get_proc_group() const override {
737  int group = -1;
738  if (__kmp_num_proc_groups == 1) {
739  return 1;
740  }
741  for (int i = 0; i < __kmp_num_proc_groups; i++) {
742  if (mask[i] == 0)
743  continue;
744  if (group >= 0)
745  return -1;
746  group = i;
747  }
748  return group;
749  }
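  // Example of the convention above: on a system with two processor groups,
  // a mask with bits only in group 0 yields 0, bits only in group 1 yield 1,
  // and a mask spanning both groups yields -1 (no single owning group).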
750  };
751  void determine_capable(const char *env_var) override {
752  __kmp_affinity_determine_capable(env_var);
753  }
754  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
755  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
756  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
757  KMPAffinity::Mask *allocate_mask_array(int num) override {
758  return new Mask[num];
759  }
760  void deallocate_mask_array(KMPAffinity::Mask *array) override {
761  Mask *windows_array = static_cast<Mask *>(array);
762  delete[] windows_array;
763  }
764  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
765  int index) override {
766  Mask *windows_array = static_cast<Mask *>(array);
767  return &(windows_array[index]);
768  }
769  api_type get_api_type() const override { return NATIVE_OS; }
770 };
771 #endif /* KMP_OS_WINDOWS */
772 #endif /* KMP_AFFINITY_SUPPORTED */
773 
774 // Describe an attribute for a level in the machine topology
775 struct kmp_hw_attr_t {
776  int core_type : 8;
777  int core_eff : 8;
778  unsigned valid : 1;
779  unsigned reserved : 15;
780 
781  static const int UNKNOWN_CORE_EFF = -1;
782 
783  kmp_hw_attr_t()
784  : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
785  valid(0), reserved(0) {}
786  void set_core_type(kmp_hw_core_type_t type) {
787  valid = 1;
788  core_type = type;
789  }
790  void set_core_eff(int eff) {
791  valid = 1;
792  core_eff = eff;
793  }
794  kmp_hw_core_type_t get_core_type() const {
795  return (kmp_hw_core_type_t)core_type;
796  }
797  int get_core_eff() const { return core_eff; }
798  bool is_core_type_valid() const {
799  return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
800  }
801  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
802  operator bool() const { return valid; }
803  void clear() {
804  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
805  core_eff = UNKNOWN_CORE_EFF;
806  valid = 0;
807  }
808  bool contains(const kmp_hw_attr_t &other) const {
809  if (!valid && !other.valid)
810  return true;
811  if (valid && other.valid) {
812  if (other.is_core_type_valid()) {
813  if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
814  return false;
815  }
816  if (other.is_core_eff_valid()) {
817  if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
818  return false;
819  }
820  return true;
821  }
822  return false;
823  }
824 #if KMP_AFFINITY_SUPPORTED
825  bool contains(const kmp_affinity_attrs_t &attr) const {
826  if (!valid && !attr.valid)
827  return true;
828  if (valid && attr.valid) {
829  if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
830  return (is_core_type_valid() &&
831  (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
832  if (attr.core_eff != UNKNOWN_CORE_EFF)
833  return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
834  return true;
835  }
836  return false;
837  }
838 #endif // KMP_AFFINITY_SUPPORTED
839  bool operator==(const kmp_hw_attr_t &rhs) const {
840  return (rhs.valid == valid && rhs.core_eff == core_eff &&
841  rhs.core_type == core_type);
842  }
843  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
844 };
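// Illustrative sketch of the contains() semantics above; the function name is
// hypothetical and the x86 core-type enumerator is assumed to be available.
#if 0
static bool example_core_matches() {
  kmp_hw_attr_t requested, core;
  requested.set_core_type(KMP_HW_CORE_TYPE_CORE); // only the type is requested
  core.set_core_type(KMP_HW_CORE_TYPE_CORE);
  core.set_core_eff(1);
  // Returns true: the core matches the requested type and no efficiency
  // constraint was requested.
  return core.contains(requested);
}
#endif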
845 
846 #if KMP_AFFINITY_SUPPORTED
847 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
848 #endif
849 
850 class kmp_hw_thread_t {
851 public:
852  static const int UNKNOWN_ID = -1;
853  static const int MULTIPLE_ID = -2;
854  static int compare_ids(const void *a, const void *b);
855  static int compare_compact(const void *a, const void *b);
856  int ids[KMP_HW_LAST];
857  int sub_ids[KMP_HW_LAST];
858  bool leader;
859  int os_id;
860  kmp_hw_attr_t attrs;
861 
862  void print() const;
863  void clear() {
864  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
865  ids[i] = UNKNOWN_ID;
866  leader = false;
867  attrs.clear();
868  }
869 };
870 
871 class kmp_topology_t {
872 
873  struct flags_t {
874  int uniform : 1;
875  int reserved : 31;
876  };
877 
878  int depth;
879 
880  // The following arrays are all 'depth' long. They are allocated to hold
881  // up to KMP_HW_LAST objects so that layers can be added later without
882  // reallocating any array.
883 
884  // Ordered array of the types in the topology
885  kmp_hw_t *types;
886 
887  // Quick topology ratios; for non-uniform topologies, each entry holds
888  // the max number of itemAs per itemB,
889  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
890  int *ratio;
891 
892  // Storage containing the absolute number of each topology layer
893  int *count;
894 
895  // The number of core efficiencies. This is only useful for hybrid
896  // topologies. Core efficiencies range from 0 to num_core_efficiencies - 1.
897  int num_core_efficiencies;
898  int num_core_types;
899  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
900 
901  // The hardware threads array
902  // hw_threads is num_hw_threads long
903  // Each hw_thread's ids and sub_ids are depth deep
904  int num_hw_threads;
905  kmp_hw_thread_t *hw_threads;
906 
907  // Equivalence hash where the key is the hardware topology item
908  // and the value is the equivalent hardware topology type in the
909  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
910  // known equivalence for the topology type
911  kmp_hw_t equivalent[KMP_HW_LAST];
912 
913  // Flags describing the topology
914  flags_t flags;
915 
916  // Compact value used during sort_compact()
917  int compact;
918 
919  // Insert a new topology layer after allocation
920  void _insert_layer(kmp_hw_t type, const int *ids);
921 
922 #if KMP_GROUP_AFFINITY
923  // Insert topology information about Windows Processor groups
924  void _insert_windows_proc_groups();
925 #endif
926 
927  // Count each item & get the num x's per y
928  // e.g., get the number of cores and the number of threads per core
929  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
930  void _gather_enumeration_information();
931 
932  // Remove layers that don't add information to the topology.
933  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
934  void _remove_radix1_layers();
935 
936  // Find out if the topology is uniform
937  void _discover_uniformity();
938 
939  // Set all the sub_ids for each hardware thread
940  void _set_sub_ids();
941 
942  // Set global affinity variables describing the number of threads per
943  // core, the number of packages, the number of cores per package, and
944  // the number of cores.
945  void _set_globals();
946 
947  // Set the last level cache equivalent type
948  void _set_last_level_cache();
949 
950  // Return the number of cores with a particular attribute, 'attr'.
951  // If 'find_all' is true, then find all cores on the machine, otherwise find
952  // all cores per the layer 'above'
953  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
954  bool find_all = false) const;
955 
956 public:
957  // Force use of allocate()/deallocate()
958  kmp_topology_t() = delete;
959  kmp_topology_t(const kmp_topology_t &t) = delete;
960  kmp_topology_t(kmp_topology_t &&t) = delete;
961  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
962  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
963 
964  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
965  static void deallocate(kmp_topology_t *);
966 
967  // Functions used in create_map() routines
968  kmp_hw_thread_t &at(int index) {
969  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
970  return hw_threads[index];
971  }
972  const kmp_hw_thread_t &at(int index) const {
973  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
974  return hw_threads[index];
975  }
976  int get_num_hw_threads() const { return num_hw_threads; }
977  void sort_ids() {
978  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
979  kmp_hw_thread_t::compare_ids);
980  }
981  // Check if the hardware ids are unique; if they are,
982  // return true, otherwise return false
983  bool check_ids() const;
984 
985  // Function to call after the create_map() routine
986  void canonicalize();
987  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
988 
989 // Functions used after canonicalize() called
990 
991 #if KMP_AFFINITY_SUPPORTED
992  // Set the granularity for affinity settings
993  void set_granularity(kmp_affinity_t &stgs) const;
994  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
995  bool restrict_to_mask(const kmp_affin_mask_t *mask);
996  bool filter_hw_subset();
997 #endif
998  bool is_uniform() const { return flags.uniform; }
999  // Return the equivalent type for 'type' in the topology;
1000  // returns KMP_HW_UNKNOWN when there is no equivalent type
1001  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1002  if (type == KMP_HW_UNKNOWN)
1003  return KMP_HW_UNKNOWN;
1004  return equivalent[type];
1005  }
1006  // Set type1 = type2
1007  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1008  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1009  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1010  kmp_hw_t real_type2 = equivalent[type2];
1011  if (real_type2 == KMP_HW_UNKNOWN)
1012  real_type2 = type2;
1013  equivalent[type1] = real_type2;
1014  // This loop is required since any of the types may have been set to
1015  // be equivalent to type1. They all must be checked and reset to type2.
1016  KMP_FOREACH_HW_TYPE(type) {
1017  if (equivalent[type] == type1) {
1018  equivalent[type] = real_type2;
1019  }
1020  }
1021  }
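  // Example of the equivalence mechanism above: after
  // set_equivalent_type(KMP_HW_TILE, KMP_HW_CORE), both
  // get_equivalent_type(KMP_HW_TILE) and get_level(KMP_HW_TILE) resolve to
  // the core layer (assuming both types are valid in this topology).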
1022  // Calculate number of types corresponding to level1
1023  // per types corresponding to level2 (e.g., number of threads per core)
1024  int calculate_ratio(int level1, int level2) const {
1025  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1026  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1027  int r = 1;
1028  for (int level = level1; level > level2; --level)
1029  r *= ratio[level];
1030  return r;
1031  }
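  // Worked example: with ratio = { 4 packages, 6 cores/package,
  // 2 threads/core } and depth == 3, the number of threads per package is
  // calculate_ratio(2, 0) = ratio[2] * ratio[1] = 2 * 6 = 12.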
1032  int get_ratio(int level) const {
1033  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1034  return ratio[level];
1035  }
1036  int get_depth() const { return depth; }
1037  kmp_hw_t get_type(int level) const {
1038  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1039  return types[level];
1040  }
1041  int get_level(kmp_hw_t type) const {
1042  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1043  int eq_type = equivalent[type];
1044  if (eq_type == KMP_HW_UNKNOWN)
1045  return -1;
1046  for (int i = 0; i < depth; ++i)
1047  if (types[i] == eq_type)
1048  return i;
1049  return -1;
1050  }
1051  int get_count(int level) const {
1052  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1053  return count[level];
1054  }
1055  // Return the total number of cores with attribute 'attr'
1056  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1057  return _get_ncores_with_attr(attr, -1, true);
1058  }
1059  // Return the number of cores with attribute
1060  // 'attr' per topology level 'above'
1061  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1062  return _get_ncores_with_attr(attr, above, false);
1063  }
1064 
1065 #if KMP_AFFINITY_SUPPORTED
1066  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1067  void sort_compact(kmp_affinity_t &affinity) {
1068  compact = affinity.compact;
1069  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1070  kmp_hw_thread_t::compare_compact);
1071  }
1072 #endif
1073  void print(const char *env_var = "KMP_AFFINITY") const;
1074  void dump() const;
1075 };
1076 extern kmp_topology_t *__kmp_topology;
1077 
1078 class kmp_hw_subset_t {
1079  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1080 
1081 public:
1082  // Describe a machine topology item in KMP_HW_SUBSET
1083  struct item_t {
1084  kmp_hw_t type;
1085  int num_attrs;
1086  int num[MAX_ATTRS];
1087  int offset[MAX_ATTRS];
1088  kmp_hw_attr_t attr[MAX_ATTRS];
1089  };
1090  // Put parentheses around max to avoid accidental use of the Windows max macro.
1091  const static int USE_ALL = (std::numeric_limits<int>::max)();
1092 
1093 private:
1094  int depth;
1095  int capacity;
1096  item_t *items;
1097  kmp_uint64 set;
1098  bool absolute;
1099  // The set must be able to handle up to KMP_HW_LAST number of layers
1100  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1101  // Comparator used to sort the KMP_HW_SUBSET items into topology order;
1102  // all unknown topology types end up at the beginning of the subset
1103  static int hw_subset_compare(const void *i1, const void *i2) {
1104  kmp_hw_t type1 = ((const item_t *)i1)->type;
1105  kmp_hw_t type2 = ((const item_t *)i2)->type;
1106  int level1 = __kmp_topology->get_level(type1);
1107  int level2 = __kmp_topology->get_level(type2);
1108  return level1 - level2;
1109  }
1110 
1111 public:
1112  // Force use of allocate()/deallocate()
1113  kmp_hw_subset_t() = delete;
1114  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1115  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1116  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1117  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1118 
1119  static kmp_hw_subset_t *allocate() {
1120  int initial_capacity = 5;
1121  kmp_hw_subset_t *retval =
1122  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1123  retval->depth = 0;
1124  retval->capacity = initial_capacity;
1125  retval->set = 0ull;
1126  retval->absolute = false;
1127  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1128  return retval;
1129  }
1130  static void deallocate(kmp_hw_subset_t *subset) {
1131  __kmp_free(subset->items);
1132  __kmp_free(subset);
1133  }
1134  void set_absolute() { absolute = true; }
1135  bool is_absolute() const { return absolute; }
1136  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1137  for (int i = 0; i < depth; ++i) {
1138  // Found an existing item for this layer type
1139  // Add the num, offset, and attr to this item
1140  if (items[i].type == type) {
1141  int idx = items[i].num_attrs++;
1142  if ((size_t)idx >= MAX_ATTRS)
1143  return;
1144  items[i].num[idx] = num;
1145  items[i].offset[idx] = offset;
1146  items[i].attr[idx] = attr;
1147  return;
1148  }
1149  }
1150  if (depth == capacity - 1) {
1151  capacity *= 2;
1152  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1153  for (int i = 0; i < depth; ++i)
1154  new_items[i] = items[i];
1155  __kmp_free(items);
1156  items = new_items;
1157  }
1158  items[depth].num_attrs = 1;
1159  items[depth].type = type;
1160  items[depth].num[0] = num;
1161  items[depth].offset[0] = offset;
1162  items[depth].attr[0] = attr;
1163  depth++;
1164  set |= (1ull << type);
1165  }
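  // Illustrative example (the parsing itself happens elsewhere): a value such
  // as KMP_HW_SUBSET=2s,4c,2t would be recorded as push_back(2, KMP_HW_SOCKET,
  // 0, attr), push_back(4, KMP_HW_CORE, 0, attr) and push_back(2,
  // KMP_HW_THREAD, 0, attr) with a default-constructed kmp_hw_attr_t; an
  // offset spelling such as 4c@2 arrives through the 'offset' argument.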
1166  int get_depth() const { return depth; }
1167  const item_t &at(int index) const {
1168  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1169  return items[index];
1170  }
1171  item_t &at(int index) {
1172  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1173  return items[index];
1174  }
1175  void remove(int index) {
1176  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1177  set &= ~(1ull << items[index].type);
1178  for (int j = index + 1; j < depth; ++j) {
1179  items[j - 1] = items[j];
1180  }
1181  depth--;
1182  }
1183  void sort() {
1184  KMP_DEBUG_ASSERT(__kmp_topology);
1185  qsort(items, depth, sizeof(item_t), hw_subset_compare);
1186  }
1187  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1188 
1189  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1190  // This means putting each of {sockets, cores, threads} in the topology if
1191  // they are not specified:
1192  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1193  // e.g., 3module => *s,3module,*c,*t
1194  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1195  // are expecting the traditional sockets/cores/threads topology. For newer
1196  // hardware, there can be intervening layers like dies/tiles/modules
1197  // (usually corresponding to a cache level). So when a user asks for
1198  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1199  // should get 12 hardware threads across 6 cores and effectively ignore the
1200  // module layer.
1201  void canonicalize(const kmp_topology_t *top) {
1202  // Layers to target for KMP_HW_SUBSET canonicalization
1203  kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1204 
1205  // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1206  if (is_absolute())
1207  return;
1208 
1209  // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1210  // topology doesn't have these layers
1211  for (kmp_hw_t type : targeted)
1212  if (top->get_level(type) == KMP_HW_UNKNOWN)
1213  return;
1214 
1215  // Put targeted layers in topology if they do not exist
1216  for (kmp_hw_t type : targeted) {
1217  bool found = false;
1218  for (int i = 0; i < get_depth(); ++i) {
1219  if (top->get_equivalent_type(items[i].type) == type) {
1220  found = true;
1221  break;
1222  }
1223  }
1224  if (!found) {
1225  push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1226  }
1227  }
1228  sort();
1229  // Set as an absolute topology that only targets the targeted layers
1230  set_absolute();
1231  }
1232  void dump() const {
1233  printf("**********************\n");
1234  printf("*** kmp_hw_subset: ***\n");
1235  printf("* depth: %d\n", depth);
1236  printf("* items:\n");
1237  for (int i = 0; i < depth; ++i) {
1238  printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1239  for (int j = 0; j < items[i].num_attrs; ++j) {
1240  printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1241  items[i].offset[j]);
1242  if (!items[i].attr[j]) {
1243  printf(" (none)\n");
1244  } else {
1245  printf(
1246  " core_type = %s, core_eff = %d\n",
1247  __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1248  items[i].attr[j].get_core_eff());
1249  }
1250  }
1251  }
1252  printf("* set: 0x%llx\n", set);
1253  printf("* absolute: %d\n", absolute);
1254  printf("**********************\n");
1255  }
1256 };
1257 extern kmp_hw_subset_t *__kmp_hw_subset;
1258 
1259 /* A structure for holding machine-specific hierarchy info to be computed once
1260  at init. This structure represents a mapping of threads to the actual machine
1261  hierarchy, or to our best guess at what the hierarchy might be, for the
1262  purpose of performing an efficient barrier. In the worst case, when there is
1263  no machine hierarchy information, it produces a tree suitable for a barrier,
1264  similar to the tree used in the hyper barrier. */
1265 class hierarchy_info {
1266 public:
1267  /* Good default values for number of leaves and branching factor, given no
1268  affinity information. Behaves a bit like hyper barrier. */
1269  static const kmp_uint32 maxLeaves = 4;
1270  static const kmp_uint32 minBranch = 4;
1276  kmp_uint32 maxLevels;
1277 
1282  kmp_uint32 depth;
1283  kmp_uint32 base_num_threads;
1284  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1285  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1286  // 2=initialization in progress
1287  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1288 
1293  kmp_uint32 *numPerLevel;
1294  kmp_uint32 *skipPerLevel;
1295 
1296  void deriveLevels() {
1297  int hier_depth = __kmp_topology->get_depth();
1298  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1299  numPerLevel[level] = __kmp_topology->get_ratio(i);
1300  }
1301  }
1302 
1303  hierarchy_info()
1304  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1305 
1306  void fini() {
1307  if (!uninitialized && numPerLevel) {
1308  __kmp_free(numPerLevel);
1309  numPerLevel = NULL;
1310  uninitialized = not_initialized;
1311  }
1312  }
1313 
1314  void init(int num_addrs) {
1315  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1316  &uninitialized, not_initialized, initializing);
1317  if (bool_result == 0) { // Wait for initialization
1318  while (TCR_1(uninitialized) != initialized)
1319  KMP_CPU_PAUSE();
1320  return;
1321  }
1322  KMP_DEBUG_ASSERT(bool_result == 1);
1323 
1324  /* Explicitly initialize the data fields here to prevent use of dirty values
1325  observed when a static library is re-initialized multiple times (e.g., when
1326  a non-OpenMP thread repeatedly launches/joins a thread that uses
1327  OpenMP). */
1328  depth = 1;
1329  resizing = 0;
1330  maxLevels = 7;
1331  numPerLevel =
1332  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1333  skipPerLevel = &(numPerLevel[maxLevels]);
1334  for (kmp_uint32 i = 0; i < maxLevels;
1335  ++i) { // init numPerLevel[*] to 1 item per level
1336  numPerLevel[i] = 1;
1337  skipPerLevel[i] = 1;
1338  }
1339 
1340  // Derive the per-level counts from the machine topology if it is available
1341  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1342  deriveLevels();
1343  } else {
1344  numPerLevel[0] = maxLeaves;
1345  numPerLevel[1] = num_addrs / maxLeaves;
1346  if (num_addrs % maxLeaves)
1347  numPerLevel[1]++;
1348  }
1349 
1350  base_num_threads = num_addrs;
1351  for (int i = maxLevels - 1; i >= 0;
1352  --i) // count non-empty levels to get depth
1353  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1354  depth++;
1355 
1356  kmp_uint32 branch = minBranch;
1357  if (numPerLevel[0] == 1)
1358  branch = num_addrs / maxLeaves;
1359  if (branch < minBranch)
1360  branch = minBranch;
1361  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1362  while (numPerLevel[d] > branch ||
1363  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1364  if (numPerLevel[d] & 1)
1365  numPerLevel[d]++;
1366  numPerLevel[d] = numPerLevel[d] >> 1;
1367  if (numPerLevel[d + 1] == 1)
1368  depth++;
1369  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1370  }
1371  if (numPerLevel[0] == 1) {
1372  branch = branch >> 1;
1373  if (branch < 4)
1374  branch = minBranch;
1375  }
1376  }
1377 
1378  for (kmp_uint32 i = 1; i < depth; ++i)
1379  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1380  // Fill in hierarchy in the case of oversubscription
1381  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1382  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1383 
1384  uninitialized = initialized; // One writer
1385  }
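  // Worked example of init() when no topology information is available
  // (assume num_addrs == 8): numPerLevel starts as {4, 2, 1, ...} (maxLeaves
  // leaves per node, ceil(8 / 4) == 2 nodes above them), depth ends up as 3,
  // and skipPerLevel becomes {1, 4, 8, 16, ...}: each in-use entry is the
  // previous entry times the fan-out below it, and the oversubscription
  // entries keep doubling.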
1386 
1387  // Resize the hierarchy if nproc changes to something larger than before
1388  void resize(kmp_uint32 nproc) {
1389  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1390  while (bool_result == 0) { // someone else is trying to resize
1391  KMP_CPU_PAUSE();
1392  if (nproc <= base_num_threads) // happy with other thread's resize
1393  return;
1394  else // try to resize
1395  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1396  }
1397  KMP_DEBUG_ASSERT(bool_result != 0);
1398  if (nproc <= base_num_threads)
1399  return; // happy with other thread's resize
1400 
1401  // Calculate new maxLevels
1402  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1403  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1404  // First see if old maxLevels is enough to contain new size
1405  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1406  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1407  numPerLevel[i - 1] *= 2;
1408  old_sz *= 2;
1409  depth++;
1410  }
1411  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1412  while (nproc > old_sz) {
1413  old_sz *= 2;
1414  incs++;
1415  depth++;
1416  }
1417  maxLevels += incs;
1418 
1419  // Resize arrays
1420  kmp_uint32 *old_numPerLevel = numPerLevel;
1421  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1422  numPerLevel = skipPerLevel = NULL;
1423  numPerLevel =
1424  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1425  skipPerLevel = &(numPerLevel[maxLevels]);
1426 
1427  // Copy old elements from old arrays
1428  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1429  // copy the existing per-level values
1430  numPerLevel[i] = old_numPerLevel[i];
1431  skipPerLevel[i] = old_skipPerLevel[i];
1432  }
1433 
1434  // Init new elements in arrays to 1
1435  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1436  // init numPerLevel[*] to 1 item per level
1437  numPerLevel[i] = 1;
1438  skipPerLevel[i] = 1;
1439  }
1440 
1441  // Free old arrays
1442  __kmp_free(old_numPerLevel);
1443  }
1444 
1445  // Fill in oversubscription levels of hierarchy
1446  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1447  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1448 
1449  base_num_threads = nproc;
1450  resizing = 0; // One writer
1451  }
1452 };
1453 #endif // KMP_AFFINITY_H