nlib
SimdAlgorithm.h
Go to the documentation of this file.
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
18 
19 #include <algorithm>
20 #include <functional>
21 #include "nn/nlib/simd/SimdInt.h"
23 
24 #if defined(NLIB_SIMD)
25 
26 NLIB_NAMESPACE_BEGIN
27 namespace simd {
28 namespace detail {
29 
30 NLIB_VIS_PUBLIC void sortUint32A16(uint32_t* buf, uint32_t* src, size_t N) NLIB_NOEXCEPT;
31 NLIB_VIS_PUBLIC void sortUint32A16_1(uint32_t* buf, uint32_t* src, size_t N) NLIB_NOEXCEPT;
32 
33 template <size_t NumElem, bool flag>
34 struct MergeSortHelper {
35  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) NLIB_NOEXCEPT {
36  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
37  sortUint32A16(tmp, data, NumElem);
38  }
39 };
40 
41 template <size_t NumElem>
42 struct MergeSortHelper<NumElem, false> {
43  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) NLIB_NOEXCEPT {
44  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
45  sortUint32A16_1(tmp, data, NumElem);
46  }
47 };
48 
49 } // namespace detail
50 
51 template <size_t NumElem>
52 // data must be 16 bytes aligned, NumElem must be multiple of 16
53 NLIB_ALWAYS_INLINE void MergeSortUint32A16(uint32_t* data) NLIB_NOEXCEPT {
54  NLIB_STATIC_ASSERT((NumElem > 0) && (NumElem % 16 == 0));
55  NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
56  detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
57 }
58 
60 
61 template <class PRED>
62 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
63 const void* nlib_memchr_pred(const void* s, PRED pred, size_t n) NLIB_NOEXCEPT {
64  if (!s) return NULL;
65 
66  const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
67  i128 a1, a2;
68  i128 cmp1, cmp2;
69  uint32_t mask;
70  if (reinterpret_cast<uintptr_t>(p) & 15) {
71  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
72  a1 = I128::LoadA16(p - r);
73  cmp1 = pred(a1);
74  mask = I128::MoveMask8(cmp1);
75  mask = mask >> r;
76  size_t rr = 16 - r;
77  if (n < rr) {
78  mask &= (1 << n) - 1;
79  if (mask) return p + nlib_ctz32(mask);
80  return NULL;
81  }
82  if (mask) return p + nlib_ctz32(mask);
83  p += rr;
84  n -= rr;
85  }
86  if (n >= 16) {
87  if ((reinterpret_cast<uintptr_t>(p) & 32)) {
88  a1 = I128::LoadA16(p);
89  cmp1 = pred(a1);
90  if (!I128::IsZero(cmp1)) {
91  mask = I128::MoveMask8(cmp1);
92  return p + nlib_ctz32(mask);
93  }
94  p += 16;
95  n -= 16;
96  }
97  while (n >= 32) {
98  a1 = I128::LoadA16(p);
99  a2 = I128::LoadA16(p + 16);
100  cmp1 = I128::SetZero();
101  cmp2 = I128::SetZero();
102  cmp1 = pred(a1);
103  cmp2 = pred(a2);
104  if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
105  mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
106  return p + nlib_ctz32(mask);
107  }
108  p += 32;
109  n -= 32;
110  }
111  if (n >= 16) {
112  a1 = I128::LoadA16(p);
113  cmp1 = pred(a1);
114  if (!I128::IsZero(cmp1)) {
115  mask = I128::MoveMask8(cmp1);
116  return p + nlib_ctz32(mask);
117  }
118  p += 16;
119  n -= 16;
120  }
121  }
122  if (n > 0) {
123  a1 = I128::LoadA16(p);
124  cmp1 = pred(a1);
125  mask = I128::MoveMask8(cmp1);
126  mask &= (1 << n) - 1;
127  if (mask) return p + nlib_ctz32(mask);
128  }
129  return NULL;
130 }
131 
132 template <class PRED>
133 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
134 const void* nlib_memchr_pred_not(const void* s, PRED pred, size_t n) NLIB_NOEXCEPT {
135  if (!s) return NULL;
136 
137  const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
138  i128 a1, a2;
139  i128 cmp1, cmp2;
140  uint32_t mask;
141  if (reinterpret_cast<uintptr_t>(p) & 15) {
142  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
143  a1 = I128::LoadA16(p - r);
144  cmp1 = I128::Not(pred(a1));
145  mask = I128::MoveMask8(cmp1);
146  mask = mask >> r;
147  size_t rr = 16 - r;
148  if (n < rr) {
149  mask &= (1 << n) - 1;
150  if (mask) return p + nlib_ctz32(mask);
151  return NULL;
152  }
153  if (mask) return p + nlib_ctz32(mask);
154  p += rr;
155  n -= rr;
156  }
157  if (n >= 16) {
158  if ((reinterpret_cast<uintptr_t>(p) & 32)) {
159  a1 = I128::LoadA16(p);
160  cmp1 = pred(a1);
161  if (!I128::IsFull(cmp1)) {
162  mask = I128::MoveMask8(I128::Not(cmp1));
163  return p + nlib_ctz32(mask);
164  }
165  p += 16;
166  n -= 16;
167  }
168  while (n >= 32) {
169  a1 = I128::LoadA16(p);
170  a2 = I128::LoadA16(p + 16);
171  cmp1 = pred(a1);
172  cmp2 = pred(a2);
173  if (!I128::IsFull(I128::And(cmp1, cmp2))) {
174  mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
175  return p + nlib_ctz32(mask);
176  }
177  p += 32;
178  n -= 32;
179  }
180  if (n >= 16) {
181  a1 = I128::LoadA16(p);
182  cmp1 = pred(a1);
183  if (!I128::IsFull(cmp1)) {
184  mask = I128::MoveMask8(I128::Not(cmp1));
185  return p + nlib_ctz32(mask);
186  }
187  p += 16;
188  n -= 16;
189  }
190  }
191  if (n > 0) {
192  a1 = I128::LoadA16(p);
193  cmp1 = I128::Not(pred(a1));
194  mask = I128::MoveMask8(cmp1);
195  mask &= (1 << n) - 1;
196  if (mask) return p + nlib_ctz32(mask);
197  }
198  return NULL;
199 }
200 
201 // mask if c.u8[i] is a-zA-Z
202 inline i128 __vectorcall IsAlpha(i128 c) NLIB_NOEXCEPT {
203  i128 result = I128::CmpLtInt8(c, I128::SetValue('{', each_int8));
204  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
205 
206  i128 tmp = I128::CmpLtInt8(c, I128::SetValue('[', each_int8));
207  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
208  result = I128::Or(result, tmp);
209 
210  return result;
211 }
212 
213 // mask if c.u8[i] is 0-9
214 inline i128 __vectorcall IsDigit(i128 c) NLIB_NOEXCEPT {
215  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
216  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
217 
218  return result;
219 }
220 
221 // mask if c.u8[i] is a-zA-Z0-9
222 inline i128 __vectorcall IsAlnum(i128 c) NLIB_NOEXCEPT { return I128::Or(IsDigit(c), IsAlpha(c)); }
223 
224 // mask if c.u8[i] is space, CR, LF, or tab
225 inline i128 __vectorcall IsSpace(i128 c) NLIB_NOEXCEPT {
226  i128 result = I128::CmpEq8(c, I128::SetValue(' ', each_int8));
227  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\r', each_int8)));
228  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\n', each_int8)));
229  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\t', each_int8)));
230 
231  return result;
232 }
233 
234 // mask if c.u8[i] is 0-9A-Fa-f
235 inline i128 __vectorcall IsXdigit(i128 c) NLIB_NOEXCEPT {
236  i128 tmp;
237  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
238  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
239 
240  tmp = I128::CmpLtInt8(c, I128::SetValue('G', each_int8));
241  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
242  result = I128::Or(result, tmp);
243 
244  tmp = I128::CmpLtInt8(c, I128::SetValue('g', each_int8));
245  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
246  result = I128::Or(result, tmp);
247 
248  return result;
249 }
250 
251 namespace detail {
252 template<class T, class Compare>
253 class KeyIdxSortLess {
254  public:
255  KeyIdxSortLess(T* const* src, uint32_t mask,
256  Compare comp) NLIB_NOEXCEPT : src_(src), mask_(mask), comp_(comp) {
257  }
258  NLIB_ALWAYS_INLINE bool operator()(uint32_t lhs, uint32_t rhs) const NLIB_NOEXCEPT {
259  return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
260  }
261 
262  private:
263  T* const* src_;
264  uint32_t mask_;
265  Compare comp_;
266 };
267 
268 NLIB_ALIGNAS(16) extern NLIB_VIS_PUBLIC const uint32_t keyidxsort_0123[4];
269 } // namespace detail
270 
271 template<class T, class Compare>
272 errno_t KeyIdxSortN(T** dst, T* const* src, size_t n, Compare comp) NLIB_NOEXCEPT {
273  // T must have a member function such that
274  // uint32_t GetKey32() const;
275 #ifdef NLIB_64BIT
276  if (n == 0 || n > 0x7FFFFFFFU) return EINVAL;
277 #else
278  if (n == 0 || n > RSIZE_MAX) return EINVAL;
279 #endif
280  errno_t e;
281  size_t n16 = (n + 15) & ~15;
282 
284  e = mem_.Init(n16 * sizeof(uint32_t), 16); // NOLINT
285  if (NLIB_UNLIKELY(e != 0)) return e;
286  uint32_t* mem = reinterpret_cast<uint32_t*>(mem_.Get());
287  int idx_width = 32 - nlib_clz32(static_cast<uint32_t>(n16));
288  uint32_t idx_mask = (1U << idx_width) - 1;
289  uint32_t idx_mask_inv = ~idx_mask;
290 
291  uint32_t max_key = 0U;
292  uint32_t min_key = 0xFFFFFFFFU;
293  for (size_t i = 0; i < n; ++i) {
294  uint32_t key = src[i]->GetKey32();
295  mem[i] = key;
296  if (min_key > key)
297  min_key = key;
298  else if (max_key < key)
299  max_key = key;
300  }
301  // several number of the bits could be omitted because they are the same.
302  int left_shift = nlib_clz32(min_key ^ max_key);
303 
304  // embed indices
305 #if 1
306  i128 vecmask = I128::SetValue(idx_mask_inv, each_uint32);
307  i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
308  i128 vec_i = I128::SetZero();
309  i128 four = I128::SetValue(4, each_uint32);
310  i128 vecidx4 = I128::Add32(vecidx0, four);
311  i128 vecidx8 = I128::Add32(vecidx4, four);
312  i128 vecidx12 = I128::Add32(vecidx8, four);
313  i128 d16 = I128::Mult32(four, four);
314 
315  for (size_t i = 0; i < n16; i += 16) {
316  i128 m0 = I128::LoadA16(&mem[i]);
317  i128 m1 = I128::LoadA16(&mem[i + 4]);
318  i128 m2 = I128::LoadA16(&mem[i + 8]);
319  i128 m3 = I128::LoadA16(&mem[i + 12]);
320 
321  m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
322  m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
323  m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
324  m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
325  m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
326  m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
327  m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
328  m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
329 
330  I128::StoreA16(&mem[i], m0);
331  I128::StoreA16(&mem[i + 4], m1);
332  I128::StoreA16(&mem[i + 8], m2);
333  I128::StoreA16(&mem[i + 12], m3);
334  vec_i = I128::Add16(vec_i, d16);
335  }
336 #else
337  for (size_t i = 0; i < n; ++i) {
338  mem[i] = ((mem[i] << left_shift) & idx_mask_inv) | static_cast<uint32_t>(i);
339  }
340 #endif
341  for (size_t i = n; i < n16; ++i) {
342  // It is ok because MergeSortUint32A16 is a stable sort algorithm.
343  mem[i] = idx_mask_inv | static_cast<uint32_t>(i);
344  }
345 
346  e = MergeSortUint32A16(mem, n16);
347  if (NLIB_UNLIKELY(e != 0)) return e;
348 
349  // they must be sorted if the keys are the same.
350  uint32_t prev_key = mem[0] & idx_mask_inv;
351  size_t i = 1;
352  detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
353  while (i < n) {
354  uint32_t key = mem[i] & idx_mask_inv;
355  if (NLIB_UNLIKELY(key == prev_key)) {
356  size_t from = i - 1;
357  do {
358  ++i;
359  if (i == n) break;
360  key = mem[i] & idx_mask_inv;
361  } while (key == prev_key);
362  // if the sort algorithm is not stable, mem[i -> n16] might be swapped.
363  std::stable_sort(&mem[from], &mem[i], myless);
364  }
365  prev_key = key;
366  ++i;
367  }
368 
369  for (i = 0; i < n; ++i) {
370  dst[i] = src[mem[i] & idx_mask];
371  }
372  return 0;
373 }
374 
375 template<class T>
376 NLIB_ALWAYS_INLINE errno_t KeyIdxSortN(T** dst, T* const* src, size_t n) NLIB_NOEXCEPT {
377  return KeyIdxSortN(dst, src, n, std::less<T>());
378 }
379 
380 template<class T, class Compare>
381 inline errno_t KeyIdxSort(T** first, T** last, Compare comp) NLIB_NOEXCEPT {
382  size_t n = last - first;
383  T** tmp = reinterpret_cast<T**>(nlib_malloc(n * sizeof(*first)));
384  if (NLIB_UNLIKELY(!tmp)) return ENOMEM;
385  errno_t e = KeyIdxSortN(tmp, first, n, comp);
386  if (NLIB_LIKELY(e == 0)) {
387  nlib_memcpy(first, n * sizeof(*first), tmp, n * sizeof(*tmp));
388  }
389  nlib_free(tmp);
390  return e;
391 }
392 
393 template<class T>
394 NLIB_ALWAYS_INLINE errno_t KeyIdxSort(T** first, T** last) NLIB_NOEXCEPT {
395  return KeyIdxSort(first, last, std::less<T>());
396 }
397 
398 } // namespace simd
399 NLIB_NAMESPACE_END
400 
401 #endif
402 
403 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
i128 IsXdigit(i128 c) noexcept
Masks hexadecimal characters in c.
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:97
errno_t Init(size_t size, size_t align) noexcept
Allocates memory.
i128 IsDigit(i128 c) noexcept
Masks the characters 0 through 9 in c.
errno_t KeyIdxSort(T **first, T **last) noexcept
Executes KeyIdxSort(first, last, std::less<T>()).
Implements the class and functions for SIMD computations on integers.
#define NLIB_UNLIKELY(x)
Indicates to the compiler that condition x is likely to be false.
A class for obtaining aligned memory.
#define RSIZE_MAX
Defines a value somewhat smaller than the maximum value of size_t.
Definition: Platform.h:216
i128 IsAlpha(i128 c) noexcept
Masks alphabetic letters in c.
static int nlib_clz32(uint32_t x)
Returns the number of consecutive zero bits, with respect to the most significant bit (MSB)...
Definition: Platform.h:2535
i128 IsSpace(i128 c) noexcept
Masks space characters (0x20, 0x09, 0x0A, 0x0D) in c.
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:89
i128 IsAlnum(i128 c) noexcept
Masks alphabetic letters or the characters 0 through 9 in c.
NLIB_CHECK_RESULT void * nlib_malloc(size_t size)
A weak function that calls the C standard function malloc. nlib calls malloc via this function...
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:74
#define NLIB_LIKELY(x)
Indicates to the compiler that condition x is likely to be true.
Definition: Platform_unix.h:99
static errno_t nlib_memcpy(void *s1, size_t s1max, const void *s2, size_t n)
An implementation corresponding to N1078 memcpy_s.
Definition: Platform.h:2357
static int nlib_ctz32(uint32_t x)
Returns the number of consecutive zero bits, with respect to the least significant bit (LSB)...
Definition: Platform.h:2536
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
Definition: SimdInt.h:53
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Config.h:99
void * Get() noexcept
Returns a pointer to the allocated region.
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
Executes KeyIdxSortN(dst, src, n, std::less<T>()).
Used when aligned memory needs to be obtained.
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:239
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
Definition: SimdInt.h:47
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:154
void nlib_free(void *ptr)
A weak function that calls the C standard function free. nlib calls free via this function...
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
Uses SIMD to merge and sort 32-bit unsigned integer strings in the ascending order.
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.
Definition: SimdAlgorithm.h:63
int errno_t
Indicates with an int-type typedef that a POSIX error value is returned as the return value...
Definition: NMalloc.h:37