3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 4 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 11 #if defined(NLIB_SIMD) 20 template <
size_t NumElem,
bool flag>
21 struct MergeSortHelper {
24 sortUint32A16(tmp, data, NumElem);
28 template <
size_t NumElem>
29 struct MergeSortHelper<NumElem, false> {
32 sortUint32A16_1(tmp, data, NumElem);
38 template <
size_t NumElem>
42 NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
43 detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
53 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
57 if (reinterpret_cast<uintptr_t>(p) & 15) {
58 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
59 a1 = I128::LoadA16(p - r);
61 mask = I128::MoveMask8(cmp1);
66 if (mask)
return p + nlib_ctz(mask);
69 if (mask)
return p + nlib_ctz(mask);
74 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
75 a1 = I128::LoadA16(p);
77 if (!I128::IsZero(cmp1)) {
78 mask = I128::MoveMask8(cmp1);
79 return p + nlib_ctz(mask);
85 a1 = I128::LoadA16(p);
86 a2 = I128::LoadA16(p + 16);
87 cmp1 = I128::SetZero();
88 cmp2 = I128::SetZero();
91 if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
92 mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
93 return p + nlib_ctz(mask);
99 a1 = I128::LoadA16(p);
101 if (!I128::IsZero(cmp1)) {
102 mask = I128::MoveMask8(cmp1);
103 return p + nlib_ctz(mask);
110 a1 = I128::LoadA16(p);
112 mask = I128::MoveMask8(cmp1);
113 mask &= (1 << n) - 1;
114 if (mask)
return p + nlib_ctz(mask);
119 template <
class PRED>
124 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
128 if (reinterpret_cast<uintptr_t>(p) & 15) {
129 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
130 a1 = I128::LoadA16(p - r);
131 cmp1 = I128::Not(pred(a1));
132 mask = I128::MoveMask8(cmp1);
136 mask &= (1 << n) - 1;
137 if (mask)
return p + nlib_ctz(mask);
140 if (mask)
return p + nlib_ctz(mask);
145 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
146 a1 = I128::LoadA16(p);
148 if (!I128::IsFull(cmp1)) {
149 mask = I128::MoveMask8(I128::Not(cmp1));
150 return p + nlib_ctz(mask);
156 a1 = I128::LoadA16(p);
157 a2 = I128::LoadA16(p + 16);
160 if (!I128::IsFull(I128::And(cmp1, cmp2))) {
161 mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
162 return p + nlib_ctz(mask);
168 a1 = I128::LoadA16(p);
170 if (!I128::IsFull(cmp1)) {
171 mask = I128::MoveMask8(I128::Not(cmp1));
172 return p + nlib_ctz(mask);
179 a1 = I128::LoadA16(p);
180 cmp1 = I128::Not(pred(a1));
181 mask = I128::MoveMask8(cmp1);
182 mask &= (1 << n) - 1;
183 if (mask)
return p + nlib_ctz(mask);
190 i128 result = I128::CmpLtInt8(c, I128::SetValue(
'{',
each_int8));
191 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
193 i128 tmp = I128::CmpLtInt8(c, I128::SetValue(
'[',
each_int8));
194 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
195 result = I128::Or(result, tmp);
202 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
203 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
213 i128 result = I128::CmpEq8(c, I128::SetValue(
' ',
each_int8));
214 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\r',
each_int8)));
215 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\n',
each_int8)));
216 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\t',
each_int8)));
224 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
225 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
227 tmp = I128::CmpLtInt8(c, I128::SetValue(
'G',
each_int8));
228 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
229 result = I128::Or(result, tmp);
231 tmp = I128::CmpLtInt8(c, I128::SetValue(
'g',
each_int8));
232 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
233 result = I128::Or(result, tmp);
239 template<
class T,
class Compare>
240 class KeyIdxSortLess {
242 KeyIdxSortLess(T*
const* src, uint32_t mask,
243 Compare comp) NLIB_NOEXCEPT : src_(src), mask_(mask), comp_(comp) {
246 return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
258 template<
class T,
class Compare>
263 if (n == 0 || n > 0x7FFFFFFFU)
return EINVAL;
265 if (n == 0 || n >
RSIZE_MAX)
return EINVAL;
268 size_t n16 = (n + 15) & ~15;
271 e = mem_.
Init(n16 *
sizeof(uint32_t), 16);
273 uint32_t* mem =
reinterpret_cast<uint32_t*
>(mem_.
Get());
274 int idx_width = 32 - nlib_clz(static_cast<uint32_t>(n16));
275 uint32_t idx_mask = (1U << idx_width) - 1;
276 uint32_t idx_mask_inv = ~idx_mask;
278 uint32_t max_key = 0U;
279 uint32_t min_key = 0xFFFFFFFFU;
280 for (
size_t i = 0; i < n; ++i) {
281 uint32_t key = src[i]->GetKey32();
285 else if (max_key < key)
289 int left_shift = nlib_clz(min_key ^ max_key);
294 i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
295 i128 vec_i = I128::SetZero();
297 i128 vecidx4 = I128::Add32(vecidx0, four);
298 i128 vecidx8 = I128::Add32(vecidx4, four);
299 i128 vecidx12 = I128::Add32(vecidx8, four);
300 i128 d16 = I128::Mult32(four, four);
302 for (
size_t i = 0; i < n16; i += 16) {
303 i128 m0 = I128::LoadA16(&mem[i]);
304 i128 m1 = I128::LoadA16(&mem[i + 4]);
305 i128 m2 = I128::LoadA16(&mem[i + 8]);
306 i128 m3 = I128::LoadA16(&mem[i + 12]);
308 m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
309 m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
310 m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
311 m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
312 m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
313 m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
314 m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
315 m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
317 I128::StoreA16(&mem[i], m0);
318 I128::StoreA16(&mem[i + 4], m1);
319 I128::StoreA16(&mem[i + 8], m2);
320 I128::StoreA16(&mem[i + 12], m3);
321 vec_i = I128::Add16(vec_i, d16);
324 for (
size_t i = 0; i < n; ++i) {
325 mem[i] = ((mem[i] << left_shift) & idx_mask_inv) |
static_cast<uint32_t
>(i);
328 for (
size_t i = n; i < n16; ++i) {
330 mem[i] = idx_mask_inv |
static_cast<uint32_t
>(i);
337 uint32_t prev_key = mem[0] & idx_mask_inv;
339 detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
341 uint32_t key = mem[i] & idx_mask_inv;
347 key = mem[i] & idx_mask_inv;
348 }
while (key == prev_key);
350 std::stable_sort(&mem[from], &mem[i], myless);
356 for (i = 0; i < n; ++i) {
357 dst[i] = src[mem[i] & idx_mask];
367 template<
class T,
class Compare>
369 size_t n = last - first;
370 T** tmp =
reinterpret_cast<T**
>(
nlib_malloc(n *
sizeof(*first)));
374 nlib_memcpy(first, n *
sizeof(*first), tmp, n *
sizeof(*tmp));
382 return KeyIdxSort(first, last, std::less<T>());
390 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ i128 IsXdigit(i128 c) noexcept
Masks hexadecimal digit characters (0-9, A-F, a-f) in c.
errno_t Init(size_t size, size_t align) noexcept
Allocates memory.
i128 IsDigit(i128 c) noexcept
Masks the characters 0 through 9 in c.
errno_t KeyIdxSort(T **first, T **last) noexcept
Executes KeyIdxSort(first, last, std::less<T>()).
Implements the class and functions for SIMD computations on integers.
A class for obtaining aligned memory.
i128 IsAlpha(i128 c) noexcept
Masks ASCII alphabetic letters (A-Z, a-z) in c.
i128 IsSpace(i128 c) noexcept
Masks space characters (0x20, 0x09, 0x0A, 0x0D) in c.
i128 IsAlnum(i128 c) noexcept
Masks alphabetic letters or the characters 0 through 9 in c.
nlib_i128_t i128
nlib_i128_t is defined using typedef.
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
void * Get() noexcept
Returns a pointer to the allocated region.
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
Executes KeyIdxSortN(dst, src, n, std::less<T>()).
Used when aligned memory needs to be obtained.
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
A function template that uses SIMD instructions to scan a byte string and return a pointer to the first byte for which pred does not hold (or a null pointer if no such byte exists within n bytes — confirm against the implementation).
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
Uses SIMD to merge and sort 32-bit unsigned integer strings in the ascending order.
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
A function template that uses SIMD instructions to scan a byte string and return a pointer to the first byte for which pred holds (or a null pointer if no such byte exists within n bytes — confirm against the implementation).