nlib
SimdAlgorithm.h
Go to the documentation of this file.
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
5 
6 #include "nn/nlib/simd/SimdInt.h"
7 
8 #if defined(NLIB_SIMD)
9 
10 NLIB_NAMESPACE_BEGIN
11 namespace simd {
12 
13 namespace detail {
14 NLIB_VIS_PUBLIC void sortUint32A16(uint32_t* buf, uint32_t* src, size_t N);
15 NLIB_VIS_PUBLIC void sortUint32A16_1(uint32_t* buf, uint32_t* src, size_t N);
16 
17 template <size_t NumElem, bool flag>
18 struct MergeSortHelper {
19  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) {
20  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
21  sortUint32A16(tmp, data, NumElem);
22  }
23 };
24 
25 template <size_t NumElem>
26 struct MergeSortHelper<NumElem, false> {
27  static NLIB_ALWAYS_INLINE void Sort(uint32_t* data) {
28  NLIB_ALIGNAS(16) uint32_t tmp[NumElem];
29  sortUint32A16_1(tmp, data, NumElem);
30  }
31 };
32 
33 } // namespace detail
34 
35 template <size_t NumElem>
36 // data must be 16 bytes aligned, NumElem must be multiple of 16
37 NLIB_ALWAYS_INLINE void MergeSortUint32A16(uint32_t* data) {
38  NLIB_STATIC_ASSERT((NumElem > 0) && (NumElem % 16 == 0));
39  NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
40  detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
41 }
42 
43 template <class PRED>
44 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
45 const void* nlib_memchr_pred(const void* s, PRED pred, size_t n) {
46  if (!s) return NULL;
47 
48  const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
49  i128 a1, a2;
50  i128 cmp1, cmp2;
51  uint32_t mask;
52  if (reinterpret_cast<uintptr_t>(p) & 15) {
53  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
54  a1 = I128::LoadA16(p - r);
55  cmp1 = pred(a1);
56  mask = I128::MoveMask8(cmp1);
57  mask = mask >> r;
58  size_t rr = 16 - r;
59  if (n < rr) {
60  mask &= (1 << n) - 1;
61  if (mask) return p + nlib_ctz(mask);
62  return NULL;
63  }
64  if (mask) return p + nlib_ctz(mask);
65  p += rr;
66  n -= rr;
67  }
68  if (n >= 16) {
69  if ((reinterpret_cast<uintptr_t>(p) & 32)) {
70  a1 = I128::LoadA16(p);
71  cmp1 = pred(a1);
72  if (!I128::IsZero(cmp1)) {
73  mask = I128::MoveMask8(cmp1);
74  return p + nlib_ctz(mask);
75  }
76  p += 16;
77  n -= 16;
78  }
79  while (n >= 32) {
80  a1 = I128::LoadA16(p);
81  a2 = I128::LoadA16(p + 16);
82  cmp1 = I128::SetZero();
83  cmp2 = I128::SetZero();
84  cmp1 = pred(a1);
85  cmp2 = pred(a2);
86  if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
87  mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
88  return p + nlib_ctz(mask);
89  }
90  p += 32;
91  n -= 32;
92  }
93  if (n >= 16) {
94  a1 = I128::LoadA16(p);
95  cmp1 = pred(a1);
96  if (!I128::IsZero(cmp1)) {
97  mask = I128::MoveMask8(cmp1);
98  return p + nlib_ctz(mask);
99  }
100  p += 16;
101  n -= 16;
102  }
103  }
104  if (n > 0) {
105  a1 = I128::LoadA16(p);
106  cmp1 = pred(a1);
107  mask = I128::MoveMask8(cmp1);
108  mask &= (1 << n) - 1;
109  if (mask) return p + nlib_ctz(mask);
110  }
111  return NULL;
112 }
113 
114 template <class PRED>
115 // i128 PRED(i128 c) { mask(c.u8[i]) if matched }
116 const void* nlib_memchr_pred_not(const void* s, PRED pred, size_t n) {
117  if (!s) return NULL;
118 
119  const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
120  i128 a1, a2;
121  i128 cmp1, cmp2;
122  uint32_t mask;
123  if (reinterpret_cast<uintptr_t>(p) & 15) {
124  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
125  a1 = I128::LoadA16(p - r);
126  cmp1 = I128::Not(pred(a1));
127  mask = I128::MoveMask8(cmp1);
128  mask = mask >> r;
129  size_t rr = 16 - r;
130  if (n < rr) {
131  mask &= (1 << n) - 1;
132  if (mask) return p + nlib_ctz(mask);
133  return NULL;
134  }
135  if (mask) return p + nlib_ctz(mask);
136  p += rr;
137  n -= rr;
138  }
139  if (n >= 16) {
140  if ((reinterpret_cast<uintptr_t>(p) & 32)) {
141  a1 = I128::LoadA16(p);
142  cmp1 = pred(a1);
143  if (!I128::IsFull(cmp1)) {
144  mask = I128::MoveMask8(I128::Not(cmp1));
145  return p + nlib_ctz(mask);
146  }
147  p += 16;
148  n -= 16;
149  }
150  while (n >= 32) {
151  a1 = I128::LoadA16(p);
152  a2 = I128::LoadA16(p + 16);
153  cmp1 = pred(a1);
154  cmp2 = pred(a2);
155  if (!I128::IsFull(I128::And(cmp1, cmp2))) {
156  mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
157  return p + nlib_ctz(mask);
158  }
159  p += 32;
160  n -= 32;
161  }
162  if (n >= 16) {
163  a1 = I128::LoadA16(p);
164  cmp1 = pred(a1);
165  if (!I128::IsFull(cmp1)) {
166  mask = I128::MoveMask8(I128::Not(cmp1));
167  return p + nlib_ctz(mask);
168  }
169  p += 16;
170  n -= 16;
171  }
172  }
173  if (n > 0) {
174  a1 = I128::LoadA16(p);
175  cmp1 = I128::Not(pred(a1));
176  mask = I128::MoveMask8(cmp1);
177  mask &= (1 << n) - 1;
178  if (mask) return p + nlib_ctz(mask);
179  }
180  return NULL;
181 }
182 
183 // mask if c.u8[i] is a-zA-Z
184 inline i128 __vectorcall IsAlpha(i128 c) NLIB_NOEXCEPT {
185  i128 result = I128::CmpLtInt8(c, I128::SetValue('{', each_int8));
186  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
187 
188  i128 tmp = I128::CmpLtInt8(c, I128::SetValue('[', each_int8));
189  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
190  result = I128::Or(result, tmp);
191 
192  return result;
193 }
194 
195 // mask if c.u8[i] is 0-9
196 inline i128 __vectorcall IsDigit(i128 c) NLIB_NOEXCEPT {
197  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
198  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
199 
200  return result;
201 }
202 
203 // mask if c.u8[i] is a-zA-Z0-9
204 inline i128 __vectorcall IsAlnum(i128 c) NLIB_NOEXCEPT { return I128::Or(IsDigit(c), IsAlpha(c)); }
205 
206 // mask if c.u8[i] is space, CR, LF, or tab
207 inline i128 __vectorcall IsSpace(i128 c) NLIB_NOEXCEPT {
208  i128 result = I128::CmpEq8(c, I128::SetValue(' ', each_int8));
209  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\r', each_int8)));
210  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\n', each_int8)));
211  result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\t', each_int8)));
212 
213  return result;
214 }
215 
216 // mask if c.u8[i] is 0-9A-Fa-f
217 inline i128 __vectorcall IsXdigit(i128 c) NLIB_NOEXCEPT {
218  i128 tmp;
219  i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
220  result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
221 
222  tmp = I128::CmpLtInt8(c, I128::SetValue('G', each_int8));
223  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
224  result = I128::Or(result, tmp);
225 
226  tmp = I128::CmpLtInt8(c, I128::SetValue('g', each_int8));
227  tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
228  result = I128::Or(result, tmp);
229 
230  return result;
231 }
232 
233 } // namespace simd
234 NLIB_NAMESPACE_END
235 
236 #endif
237 
238 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Platform.h:2151
i128 IsXdigit(i128 c) noexcept
Masks hexadecimal characters in c.
i128 IsDigit(i128 c) noexcept
Masks the characters 0 though 9 in c.
Implements the class and functions for SIMD computations on integers.
i128 IsAlpha(i128 c) noexcept
Masks alphabetic letters in c.
int nlib_ctz(uint32_t x)
Returns the number of consecutive zero bits, with respect to the least significant bit (LSB)...
i128 IsSpace(i128 c) noexcept
Masks space characters (0x20, 0x09, 0x0A, 0x0D) in c.
i128 IsAlnum(i128 c) noexcept
Masks alphabetic letters or the characters 0 though 9 in c.
void MergeSortUint32A16(uint32_t *data)
Uses SIMD to merge sort a sequence of 32-bit unsigned integers.
Definition: SimdAlgorithm.h:37
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n)
A function template for examining the bytes in byte strings using SIMD instructions.
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:71
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n)
A function template for examining the bytes in byte strings using SIMD instructions.
Definition: SimdAlgorithm.h:45
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:209
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
Definition: SimdInt.h:40
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:59
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:51
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:117