nlib
SimdAlgorithm.h
Go to the documentation of this file.
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
18 
19 #include <algorithm>
20 #include <functional>
21 #include "nn/nlib/simd/SimdInt.h"
23 
24 #if defined(NLIB_SIMD)
25 
26 NLIB_NAMESPACE_BEGIN
27 namespace simd {
28 namespace detail {
29 
30 NLIB_VIS_PUBLIC void sortUint32A16(uint32_t* buf, uint32_t* src, size_t N) NLIB_NOEXCEPT;
31 NLIB_VIS_PUBLIC void sortUint32A16_1(uint32_t* buf, uint32_t* src, size_t N) NLIB_NOEXCEPT;
32 
33 template<size_t NumElem, bool flag>
34 struct MergeSortHelper {
35  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) NLIB_NOEXCEPT {
36  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
37  sortUint32A16(tmp, data, NumElem);
38  }
39 };
40 
41 template<size_t NumElem>
42 struct MergeSortHelper<NumElem, false> {
43  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) NLIB_NOEXCEPT {
44  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
45  sortUint32A16_1(tmp, data, NumElem);
46  }
47 };
48 
49 } // namespace detail
50 
51 template<size_t NumElem>
52 // data must be 16 bytes aligned, NumElem must be multiple of 16
54  NLIB_STATIC_ASSERT((NumElem > 0) && (NumElem % 16 == 0));
55  NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
56  detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
57 }
58 
60 
61 namespace detail {
62 inline i128 SimdFindSub2(i128, i128 cmp) NLIB_NOEXCEPT {
63  return cmp;
64 }
65 template<class Pred, class... Tail>
66 inline i128 SimdFindSub2(i128 a, i128 cmp, Pred pred, Tail... tail) NLIB_NOEXCEPT {
67  return SimdFindSub(a, I128::Or(cmp, pred(a)), tail...);
68 }
69 template<class Pred, class... Tail>
70 inline i128 SimdFindSub(i128 a, Pred pred, Tail... tail) NLIB_NOEXCEPT {
71  return SimdFindSub2(a, pred(a), tail...);
72 }
73 inline i128 PartialLoad(const nlib_byte_t* p, size_t n) {
74  uint8_t tmpbuf[16];
75  nlib_memset(&tmpbuf[0], 0, 16);
76  nlib_memcpy(&tmpbuf[0], 16, p, n);
77  return I128::LoadA1(tmpbuf);
78 }
79 } // namespace detail
80 
81 template<class... PredList>
82 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
83 const nlib_byte_t* SimdFind(const void* buf, size_t n, PredList... predlist) NLIB_NOEXCEPT {
84  if (!buf) return nullptr;
85  const nlib_byte_t* p = static_cast<const nlib_byte_t*>(buf);
86  while (n >= 16) {
87  i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
88  if (!I128::IsZero(cmp)) return p + nlib_ctz32(I128::MoveMask8(cmp));
89  p += 16;
90  n -= 16;
91  }
92  if (n > 0) {
93  // NOTE: NO OVERLAP
94  // 'pred' might count the occurence of the characters
95  i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
96  if (!I128::IsZero(cmp)) {
97  int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
98  if (mask != 0) return p + nlib_ctz32(mask);
99  }
100  }
101  return nullptr;
102 }
103 
104 template<class... PredList>
105 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
106 const nlib_byte_t* SimdRfind(const void* buf, size_t n, PredList... predlist) NLIB_NOEXCEPT {
107  if (!buf) return nullptr;
108  const nlib_byte_t* p = static_cast<const nlib_byte_t*>(buf) + n;
109  while (n >= 16) {
110  p -= 16;
111  n -= 16;
112  i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
113  if (!I128::IsZero(cmp)) return p + (31 - nlib_clz32(I128::MoveMask8(cmp)));
114  }
115  if (n > 0) {
116  // NOTE: NO OVERLAP
117  // 'pred' might count the occurence of the characters
118  i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
119  if (!I128::IsZero(cmp)) {
120  int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
121  if (mask != 0) return static_cast<const nlib_byte_t*>(buf) + (31 - nlib_clz32(mask));
122  }
123  }
124  return nullptr;
125 }
126 
127 template<class... PredList>
128 inline const char* SimdFind(const char* s, PredList... predlist) NLIB_NOEXCEPT {
129  return SimdFind(s, nlib_strlen(s), predlist...);
130 }
131 template<class... PredList>
132 inline const char* SimdRfind(const char* s, PredList... predlist) NLIB_NOEXCEPT {
133  return SimdRfind(s, nlib_strlen(s), predlist...);
134 }
135 
136 template<class PRED>
137 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
138 inline const void* nlib_memchr_pred(const void* s, PRED pred, size_t n) NLIB_NOEXCEPT {
139  return SimdFind(s, n, pred);
140 }
141 
142 template<class PRED>
143 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
144 inline const void* nlib_memchr_pred_not(const void* s, PRED pred, size_t n) NLIB_NOEXCEPT {
145  return SimdFind(s, n, [&pred](i128 c) { return I128::Not(pred(c)); });
146 }
147 
148 // mask if c.u8[i] is a-zA-Z
149 inline i128 __vectorcall IsAlpha(i128 c) NLIB_NOEXCEPT {
150  i128 result = I128::CmpLtInt8(c, I128::SetValue('{', each_int8));
151  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
152 
153  i128 tmp = I128::CmpLtInt8(c, I128::SetValue('[', each_int8));
154  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
155  result = I128::Or(result, tmp);
156 
157  return result;
158 }
159 
160 // mask if c.u8[i] is 0-9
161 inline i128 __vectorcall IsDigit(i128 c) NLIB_NOEXCEPT {
162  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
163  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
164 
165  return result;
166 }
167 
168 // mask if c.u8[i] is a-zA-Z0-9
169 inline i128 __vectorcall IsAlnum(i128 c) NLIB_NOEXCEPT {
170  return I128::Or(IsDigit(c), IsAlpha(c));
171 }
172 
173 // mask if c.u8[i] is space, CR, LF, or tab
174 inline i128 __vectorcall IsSpace(i128 c) NLIB_NOEXCEPT {
175  i128 result = I128::CmpEq8(c, I128::SetValue(' ', each_int8));
176  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\r', each_int8)));
177  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\n', each_int8)));
178  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\t', each_int8)));
179 
180  return result;
181 }
182 
183 // mask if c.u8[i] is 0-9A-Fa-f
184 inline i128 __vectorcall IsXdigit(i128 c) NLIB_NOEXCEPT {
185  i128 tmp;
186  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
187  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
188 
189  tmp = I128::CmpLtInt8(c, I128::SetValue('G', each_int8));
190  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
191  result = I128::Or(result, tmp);
192 
193  tmp = I128::CmpLtInt8(c, I128::SetValue('g', each_int8));
194  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
195  result = I128::Or(result, tmp);
196 
197  return result;
198 }
199 
200 inline i128 __vectorcall IsNotAscii(i128 c) NLIB_NOEXCEPT {
201  return I128::CmpLtInt8(c, I128::SetZero());
202 }
203 
204 namespace detail {
205 template<class T, class Compare>
206 class KeyIdxSortLess {
207  public:
208  KeyIdxSortLess(T* const* src, uint32_t mask, Compare comp) NLIB_NOEXCEPT : src_(src),
209  mask_(mask),
210  comp_(comp) {}
211  NLIB_ALWAYS_INLINE bool operator()(uint32_t lhs, uint32_t rhs) const NLIB_NOEXCEPT {
212  return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
213  }
214 
215  private:
216  T* const* src_;
217  uint32_t mask_;
218  Compare comp_;
219 };
220 
221 NLIB_ALIGNAS(16) extern NLIB_VIS_PUBLIC const uint32_t keyidxsort_0123[4];
222 } // namespace detail
223 
224 template<class T, class Compare>
225 errno_t KeyIdxSortN(T** dst, T* const* src, size_t n, Compare comp) NLIB_NOEXCEPT {
226  // T must have a member function such that
227  // uint32_t GetKey32() const;
228 #ifdef NLIB_64BIT
229  if (n == 0 || n > 0x7FFFFFFFU) return EINVAL;
230 #else
231  if (n == 0 || n > RSIZE_MAX) return EINVAL;
232 #endif
233  errno_t e;
234  size_t n16 = (n + 15) & ~15;
235 
237  e = mem_.Init(n16 * sizeof(uint32_t), 16);
238  if (NLIB_UNLIKELY(e != 0)) return e;
239  uint32_t* mem = reinterpret_cast<uint32_t*>(mem_.Get());
240  int idx_width = 32 - nlib_clz32(static_cast<uint32_t>(n16));
241  uint32_t idx_mask = (1U << idx_width) - 1;
242  uint32_t idx_mask_inv = ~idx_mask;
243 
244  uint32_t max_key = 0U;
245  uint32_t min_key = 0xFFFFFFFFU;
246  for (size_t i = 0; i < n; ++i) {
247  uint32_t key = src[i]->GetKey32();
248  mem[i] = key;
249  if (min_key > key)
250  min_key = key;
251  else if (max_key < key)
252  max_key = key;
253  }
254  // several number of the bits could be omitted because they are the same.
255  int left_shift = nlib_clz32(min_key ^ max_key);
256 
257  // embed indices
258 #if 1
259  i128 vecmask = I128::SetValue(idx_mask_inv, each_uint32);
260  i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
261  i128 vec_i = I128::SetZero();
262  i128 four = I128::SetValue(4, each_uint32);
263  i128 vecidx4 = I128::Add32(vecidx0, four);
264  i128 vecidx8 = I128::Add32(vecidx4, four);
265  i128 vecidx12 = I128::Add32(vecidx8, four);
266  i128 d16 = I128::Mult32(four, four);
267 
268  for (size_t i = 0; i < n16; i += 16) {
269  i128 m0 = I128::LoadA16(&mem[i]);
270  i128 m1 = I128::LoadA16(&mem[i + 4]);
271  i128 m2 = I128::LoadA16(&mem[i + 8]);
272  i128 m3 = I128::LoadA16(&mem[i + 12]);
273 
274  m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
275  m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
276  m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
277  m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
278  m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
279  m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
280  m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
281  m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
282 
283  I128::StoreA16(&mem[i], m0);
284  I128::StoreA16(&mem[i + 4], m1);
285  I128::StoreA16(&mem[i + 8], m2);
286  I128::StoreA16(&mem[i + 12], m3);
287  vec_i = I128::Add16(vec_i, d16);
288  }
289 #else
290  for (size_t i = 0; i < n; ++i) {
291  mem[i] = ((mem[i] << left_shift) & idx_mask_inv) | static_cast<uint32_t>(i);
292  }
293 #endif
294  for (size_t i = n; i < n16; ++i) {
295  // It is ok because MergeSortUint32A16 is a stable sort algorithm.
296  mem[i] = idx_mask_inv | static_cast<uint32_t>(i);
297  }
298 
299  e = MergeSortUint32A16(mem, n16);
300  if (NLIB_UNLIKELY(e != 0)) return e;
301 
302  // they must be sorted if the keys are the same.
303  uint32_t prev_key = mem[0] & idx_mask_inv;
304  size_t i = 1;
305  detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
306  while (i < n) {
307  uint32_t key = mem[i] & idx_mask_inv;
308  if (NLIB_UNLIKELY(key == prev_key)) {
309  size_t from = i - 1;
310  do {
311  ++i;
312  if (i == n) break;
313  key = mem[i] & idx_mask_inv;
314  } while (key == prev_key);
315  // if the sort algorithm is not stable, mem[i -> n16] might be swapped.
316  std::stable_sort(&mem[from], &mem[i], myless);
317  }
318  prev_key = key;
319  ++i;
320  }
321 
322  for (i = 0; i < n; ++i) {
323  dst[i] = src[mem[i] & idx_mask];
324  }
325  return 0;
326 }
327 
328 template<class T>
329 NLIB_ALWAYS_INLINE errno_t KeyIdxSortN(T** dst, T* const* src, size_t n) NLIB_NOEXCEPT {
330  return KeyIdxSortN(dst, src, n, std::less<T>());
331 }
332 
333 template<class T, class Compare>
334 inline errno_t KeyIdxSort(T** first, T** last, Compare comp) NLIB_NOEXCEPT {
335  size_t n = last - first;
336  T** tmp = reinterpret_cast<T**>(nlib_malloc(n * sizeof(*first)));
337  if (NLIB_UNLIKELY(!tmp)) return ENOMEM;
338  errno_t e = KeyIdxSortN(tmp, first, n, comp);
339  if (NLIB_LIKELY(e == 0)) {
340  nlib_memcpy(first, n * sizeof(*first), tmp, n * sizeof(*tmp));
341  }
342  nlib_free(tmp);
343  return e;
344 }
345 
346 template<class T>
348  return KeyIdxSort(first, last, std::less<T>());
349 }
350 
351 } // namespace simd
352 NLIB_NAMESPACE_END
353 
354 #endif
355 
356 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
i128 IsXdigit(i128 c) noexcept
Masks hexadecimal characters in c.
static errno_t nlib_memset(void *buf, int ch, size_t n)
Makes a function call corresponding to memset(buf, ch, n).
Definition: Platform.h:2544
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:95
errno_t Init(size_t size, size_t align) noexcept
Allocates memory.
i128 IsDigit(i128 c) noexcept
Masks the characters 0 through 9 in c.
errno_t KeyIdxSort(T **first, T **last) noexcept
Executes KeyIdxSort(first, last, std::less<T>()).
Implements the class and functions for SIMD computations on integers.
#define NLIB_UNLIKELY(x)
Indicates to the compiler that condition x is likely to be false.
Definition: Platform_unix.h:98
A class for obtaining aligned memory.
#define RSIZE_MAX
Defines a value somewhat smaller than the maximum value of size_t.
Definition: Platform.h:219
i128 IsAlpha(i128 c) noexcept
Masks alphabetic letters in c.
static int nlib_clz32(uint32_t x)
Returns the number of consecutive zero bits, with respect to the most significant bit (MSB)...
Definition: Platform.h:2690
i128 IsSpace(i128 c) noexcept
Masks space characters (0x20, 0x09, 0x0A, 0x0D) in c.
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:87
i128 IsAlnum(i128 c) noexcept
Masks alphabetic letters or the characters 0 through 9 in c.
void * nlib_malloc(size_t size)
A weak function that calls the C standard function malloc. nlib calls malloc via this function...
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:74
#define NLIB_LIKELY(x)
Indicates to the compiler that condition x is likely to be true.
Definition: Platform_unix.h:97
static int nlib_ctz32(uint32_t x)
Returns the number of consecutive zero bits, with respect to the least significant bit (LSB)...
Definition: Platform.h:2691
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
Definition: SimdInt.h:53
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Config.h:109
void * Get() noexcept
Returns a pointer to the allocated region.
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
Executes KeyIdxSortN(dst, src, n, std::less<T>()).
Used when aligned memory needs to be obtained.
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:260
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
Definition: SimdInt.h:47
size_t nlib_strlen(const char *s)
Internally calls strlen(). In some cases, it may operate as an independent implementation.
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:174
void nlib_free(void *ptr)
A weak function that calls the C standard function free. nlib calls free via this function...
unsigned char nlib_byte_t
This type will be defined as std::byte in a typedef of C++17 or later.
Definition: Platform.h:314
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
Uses SIMD to merge and sort 32-bit unsigned integer strings in the ascending order.
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.
int errno_t
Indicates with an int-type typedef that a POSIX error value is returned as the return value...
Definition: NMalloc.h:37