#ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_

#if defined(NLIB_SIMD)

// Primary template: selected when NumElem is a power of two (flag == true).
template <size_t NumElem, bool flag>
struct MergeSortHelper {
  // ...
  sortUint32A16(tmp, data, NumElem);
  // ...
};

// Partial specialization for element counts that are not a power of two.
template <size_t NumElem>
struct MergeSortHelper<NumElem, false> {
  // ...
  sortUint32A16_1(tmp, data, NumElem);
  // ...
};

// Caller-facing sort entry: data must be 16-byte aligned; the helper is
// chosen by testing whether NumElem is a power of two.
template <size_t NumElem>
// ...
  NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
  detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
// ...
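// For reference, a minimal self-contained sketch (not the nlib code; the
// names SortDispatch and SortArray are invented for illustration) of the
// dispatch pattern above: the bool template argument is computed as
// (NumElem & (NumElem - 1)) == 0, so the primary template serves
// power-of-two element counts and the partial specialization the rest.

#include <cstddef>
#include <cstdio>

template <std::size_t NumElem, bool IsPow2>
struct SortDispatch {  // IsPow2 == true: power-of-two element count
  static void Sort(unsigned* /*data*/) { std::puts("power-of-two path"); }
};

template <std::size_t NumElem>
struct SortDispatch<NumElem, false> {  // any other element count
  static void Sort(unsigned* /*data*/) { std::puts("general path"); }
};

template <std::size_t NumElem>
void SortArray(unsigned* data) {
  SortDispatch<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
}

int main() {
  unsigned buf[24] = {};
  SortArray<16>(buf);  // selects the power-of-two path
  SortArray<24>(buf);  // selects the general path
  return 0;
}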
// Body of a SIMD byte-scan function (signature elided):
if (!s) return nullptr;
// ...
const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
// ...
// Unaligned head: load from the preceding 16-byte boundary (r bytes early).
if (reinterpret_cast<uintptr_t>(p) & 15) {
  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
  a1 = I128::LoadA16(p - r);
  // ...
  mask = I128::MoveMask8(cmp1);
  // ...
}
// ...
if ((reinterpret_cast<uintptr_t>(p) & 32)) {
  a1 = I128::LoadA16(p);
  // ...
  if (!I128::IsZero(cmp1)) {
    mask = I128::MoveMask8(cmp1);
    // ...
  }
  // ...
}
// ...
// 32 bytes at a time; the two 16-byte compare masks are combined below.
a1 = I128::LoadA16(p);
a2 = I128::LoadA16(p + 16);
cmp1 = I128::SetZero();
cmp2 = I128::SetZero();
// ...
if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
  mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
  // ...
}
// ...
a1 = I128::LoadA16(p);
// ...
if (!I128::IsZero(cmp1)) {
  mask = I128::MoveMask8(cmp1);
  // ...
}
// ...
// Tail: keep only the mask bits that correspond to the n bytes still
// inside the buffer.
a1 = I128::LoadA16(p);
// ...
mask = I128::MoveMask8(cmp1);
mask &= (1 << n) - 1;
// ...
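// For comparison, a self-contained SSE2 sketch of the same scan technique
// (this is not the nlib code; LoadA16 / CmpEq8 / MoveMask8 correspond
// roughly to _mm_load_si128 / _mm_cmpeq_epi8 / _mm_movemask_epi8). It uses
// unaligned loads and a scalar tail for brevity, where the code above
// instead rounds down to an aligned boundary and masks off the extra bytes.

#include <emmintrin.h>  // SSE2
#include <cstddef>

const void* memchr_sse2_sketch(const void* s, unsigned char c, std::size_t n) {
  const unsigned char* p = static_cast<const unsigned char*>(s);
  const __m128i needle = _mm_set1_epi8(static_cast<char>(c));
  std::size_t i = 0;
  for (; i + 16 <= n; i += 16) {
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i));
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, needle));
    if (mask != 0) {
      // The lowest set bit gives the offset of the first matching byte
      // (__builtin_ctz is GCC/Clang; MSVC would use _BitScanForward).
      return p + i + __builtin_ctz(static_cast<unsigned>(mask));
    }
  }
  for (; i < n; ++i) {  // scalar tail for the final < 16 bytes
    if (p[i] == c) return p + i;
  }
  return nullptr;
}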
// Body of the predicate-based byte-scan template (signature elided). The
// stop condition is "pred(a1) is false for some byte": Not() turns those
// bytes into set bits for MoveMask8, and IsFull() means "pred holds for all
// 16 bytes, keep scanning".
template <class PRED>
// ...
if (!s) return nullptr;
// ...
const unsigned char* p = reinterpret_cast<const unsigned char*>(s);
// ...
if (reinterpret_cast<uintptr_t>(p) & 15) {
  size_t r = reinterpret_cast<uintptr_t>(p) & 15;
  a1 = I128::LoadA16(p - r);
  cmp1 = I128::Not(pred(a1));
  mask = I128::MoveMask8(cmp1);
  // ...
  mask &= (1 << n) - 1;
  // ...
}
// ...
if ((reinterpret_cast<uintptr_t>(p) & 32)) {
  a1 = I128::LoadA16(p);
  // ...
  if (!I128::IsFull(cmp1)) {
    mask = I128::MoveMask8(I128::Not(cmp1));
    // ...
  }
  // ...
}
// ...
a1 = I128::LoadA16(p);
a2 = I128::LoadA16(p + 16);
// ...
if (!I128::IsFull(I128::And(cmp1, cmp2))) {
  mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
  // ...
}
// ...
a1 = I128::LoadA16(p);
// ...
if (!I128::IsFull(cmp1)) {
  mask = I128::MoveMask8(I128::Not(cmp1));
  // ...
}
// ...
a1 = I128::LoadA16(p);
cmp1 = I128::Not(pred(a1));
mask = I128::MoveMask8(cmp1);
mask &= (1 << n) - 1;
// ...
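// A scalar sketch of those semantics: return the first byte for which the
// predicate fails. find_first_not is an invented name; the real entry
// points are the nlib_memchr_pred / nlib_memchr_pred_not templates listed
// in the reference notes at the end of this file.

#include <cstddef>

template <class Pred>
const void* find_first_not(const void* s, Pred pred, std::size_t n) {
  const unsigned char* p = static_cast<const unsigned char*>(s);
  for (std::size_t i = 0; i < n; ++i) {
    if (!pred(p[i])) return p + i;  // first byte where the predicate fails
  }
  return nullptr;                   // every byte satisfied the predicate
}

// Example: skip leading ASCII whitespace, the byte-level analogue of using
// IsSpace() as the vector predicate.
// const char text[] = "  \t value";
// const void* q = find_first_not(text, [](unsigned char ch) {
//   return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
// }, sizeof(text) - 1);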
// IsAlpha: mask bytes that are ASCII letters.
// 'a' <= c <= 'z'  via  '`' < c < '{'
i128 result = I128::CmpLtInt8(c, I128::SetValue('{', each_int8));
result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
// 'A' <= c <= 'Z'  via  '@' < c < '['
i128 tmp = I128::CmpLtInt8(c, I128::SetValue('[', each_int8));
tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
result = I128::Or(result, tmp);
// IsDigit: mask bytes in '0'..'9'  via  '/' < c < ':'
i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
// IsSpace: mask bytes equal to ' ', '\r', '\n', or '\t'
i128 result = I128::CmpEq8(c, I128::SetValue(' ', each_int8));
result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\r', each_int8)));
result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\n', each_int8)));
result = I128::Or(result, I128::CmpEq8(c, I128::SetValue('\t', each_int8)));
// IsXdigit: mask '0'..'9', 'A'..'F', and 'a'..'f'
i128 result = I128::CmpLtInt8(c, I128::SetValue(':', each_int8));
result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue('/', each_int8)));
// ...
tmp = I128::CmpLtInt8(c, I128::SetValue('G', each_int8));
tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('@', each_int8)));
result = I128::Or(result, tmp);
// ...
tmp = I128::CmpLtInt8(c, I128::SetValue('g', each_int8));
tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue('`', each_int8)));
result = I128::Or(result, tmp);
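// All of the classifiers above use the same bracketing trick: a byte lies in
// an inclusive range exactly when it is greater than the character just
// below the range and less than the character just above it, which maps
// directly onto the signed CmpGtInt8 / CmpLtInt8 compares ('/' < c < ':'
// for digits, '@' < c < '[' and '`' < c < '{' for letters). A self-contained
// SSE2 sketch of the digit case (not the nlib code):

#include <emmintrin.h>  // SSE2

// Bytes of the result are 0xFF where the corresponding byte of c is an
// ASCII digit and 0x00 elsewhere. Signed compares are safe here because
// bytes >= 0x80 are negative and therefore fail the c > '/' test.
static inline __m128i is_digit_mask_sketch(__m128i c) {
  __m128i lt = _mm_cmplt_epi8(c, _mm_set1_epi8(':'));  // c < ':'
  __m128i gt = _mm_cmpgt_epi8(c, _mm_set1_epi8('/'));  // c > '/'
  return _mm_and_si128(lt, gt);
}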
template <class T, class Compare>
class KeyIdxSortLess {
  // ...
  KeyIdxSortLess(T* const* src, uint32_t mask, Compare comp) NLIB_NOEXCEPT
      : src_(src), mask_(mask), comp_(comp) {
  // ...
    // The low bits of lhs/rhs hold the original element index; compare the
    // referenced objects with the user-supplied comparator.
    return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
  // ...
template <class T, class Compare>
// ...
if (n == 0 || n > 0x7FFFFFFFU) return EINVAL;
// ...
if (n == 0 || n > RSIZE_MAX) return EINVAL;
// ...
size_t n16 = (n + 15) & ~15;  // round n up to a multiple of 16
// ...
e = mem_.Init(n16 * sizeof(uint32_t), 16);  // 16-byte-aligned scratch buffer
// ...
uint32_t* mem = reinterpret_cast<uint32_t*>(mem_.Get());
// idx_width bits are enough to hold any index below n16; idx_mask selects
// the index field of a packed entry and idx_mask_inv the key field.
int idx_width = 32 - nlib_clz32(static_cast<uint32_t>(n16));
uint32_t idx_mask = (1U << idx_width) - 1;
uint32_t idx_mask_inv = ~idx_mask;
// ...
// Scan the keys once to find the range [min_key, max_key].
uint32_t max_key = 0U;
uint32_t min_key = 0xFFFFFFFFU;
for (size_t i = 0; i < n; ++i) {
  uint32_t key = src[i]->GetKey32();
  // ...
  else if (max_key < key)
  // ...
}
// ...
int left_shift = nlib_clz32(min_key ^ max_key);
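// Note: all keys lie in [min_key, max_key], so they share the leading bits
// on which min_key and max_key agree; nlib_clz32(min_key ^ max_key) counts
// those bits, and shifting each key left by left_shift discards that common
// prefix, keeping as many distinguishing key bits as possible above the
// idx_width index bits packed below. (Example with assumed values: keys
// 0x00012345 and 0x00013456 first differ at bit 12, so left_shift == 19.)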
// Vectorized packing: vecidx0/4/8/12 hold the lane offsets {0,1,2,3} + 4k,
// vec_i is the running base index, and d16 advances it by 16 per iteration.
i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
i128 vec_i = I128::SetZero();
// ...
i128 vecidx4 = I128::Add32(vecidx0, four);
i128 vecidx8 = I128::Add32(vecidx4, four);
i128 vecidx12 = I128::Add32(vecidx8, four);
i128 d16 = I128::Mult32(four, four);
// ...
for (size_t i = 0; i < n16; i += 16) {
  i128 m0 = I128::LoadA16(&mem[i]);
  i128 m1 = I128::LoadA16(&mem[i + 4]);
  i128 m2 = I128::LoadA16(&mem[i + 8]);
  i128 m3 = I128::LoadA16(&mem[i + 12]);
  // ...
  // Shift the keys past the shared prefix, keep only the key field, and OR
  // in the element index.
  m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
  m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
  m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
  m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
  m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
  m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
  m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
  m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
  // ...
  I128::StoreA16(&mem[i], m0);
  I128::StoreA16(&mem[i + 4], m1);
  I128::StoreA16(&mem[i + 8], m2);
  I128::StoreA16(&mem[i + 12], m3);
  vec_i = I128::Add16(vec_i, d16);
}
// ...
// Pack: shifted key in the high bits, original index in the low idx_width bits.
for (size_t i = 0; i < n; ++i) {
  mem[i] = ((mem[i] << left_shift) & idx_mask_inv) | static_cast<uint32_t>(i);
  // ...
}
// Padding entries up to n16 carry the all-ones key so they sort to the end.
for (size_t i = n; i < n16; ++i) {
  // ...
  mem[i] = idx_mask_inv | static_cast<uint32_t>(i);
  // ...
}
// ...
// After the packed array has been sorted (elided here), walk runs of entries
// whose packed keys collide and stable-sort each run with the user comparator.
uint32_t prev_key = mem[0] & idx_mask_inv;
// ...
detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
// ...
uint32_t key = mem[i] & idx_mask_inv;
// ...
  key = mem[i] & idx_mask_inv;
} while (key == prev_key);
// ...
std::stable_sort(&mem[from], &mem[i], myless);
// ...
// Gather: the low bits of each sorted entry index the original element.
for (i = 0; i < n; ++i) {
  dst[i] = src[mem[i] & idx_mask];
  // ...
}
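// The structure of KeyIdxSortN, in brief: pack (shifted key | index) into one
// 32-bit word per element, sort the packed words, stable-sort only the runs
// whose packed keys collide using the user comparator, then gather through
// the index bits. A compact scalar sketch of that pack/sort/gather idea (not
// the nlib code: it packs into 64 bits instead of squeezing both fields into
// 32, and it resolves equal keys by original index rather than by the user
// comparator; Item and key_idx_sort_sketch are invented names):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Item {
  std::uint32_t key;
  std::uint32_t GetKey32() const { return key; }  // same accessor as above
};

inline void key_idx_sort_sketch(Item** dst, Item* const* src, std::size_t n) {
  std::vector<std::uint64_t> packed(n);
  for (std::size_t i = 0; i < n; ++i) {
    // Key in the high 32 bits, original index in the low 32 bits, so a plain
    // sort orders by key and breaks ties by index (i.e. it is stable).
    packed[i] = (static_cast<std::uint64_t>(src[i]->GetKey32()) << 32) |
                static_cast<std::uint32_t>(i);
  }
  std::sort(packed.begin(), packed.end());
  for (std::size_t i = 0; i < n; ++i) {
    dst[i] = src[packed[i] & 0xFFFFFFFFu];  // gather in sorted order
  }
}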
template <class T, class Compare>
// ...
size_t n = last - first;
T** tmp = reinterpret_cast<T**>(nlib_malloc(n * sizeof(*first)));
// ...
nlib_memcpy(first, n * sizeof(*first), tmp, n * sizeof(*tmp));
// ...
// Two-argument overload: forwards to the comparator form with std::less<T>.
return KeyIdxSort(first, last, std::less<T>());
// ...
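// Hedged usage sketch, based only on the signatures in the reference notes
// below: the two-argument KeyIdxSort forwards to std::less<T>, the element
// type must provide GetKey32(), and std::less<T> additionally requires
// operator< on T. MyItem and items are invented names, and the namespace
// and include details of the real header are omitted here.
//
// struct MyItem {
//   uint32_t key;
//   uint32_t GetKey32() const NLIB_NOEXCEPT { return key; }
//   bool operator<(const MyItem& rhs) const NLIB_NOEXCEPT { return key < rhs.key; }
// };
//
// MyItem* items[64];
// // ... fill items with pointers ...
// errno_t e = KeyIdxSort(&items[0], &items[64]);  // ascending by GetKey32()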
#endif  // NLIB_SIMD
#endif  // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_

Reference notes for the symbols above (from the API documentation):

errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
    Uses SIMD to merge-sort an array of 32-bit unsigned integers in ascending order.
const void *nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
    A function template for examining the bytes of a byte string using SIMD instructions.
const void *nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
    A function template for examining the bytes of a byte string using SIMD instructions.
i128 IsAlpha(i128 c) noexcept
    Masks alphabetic letters in c.
i128 IsDigit(i128 c) noexcept
    Masks the characters 0 through 9 in c.
i128 IsAlnum(i128 c) noexcept
    Masks alphabetic letters and the characters 0 through 9 in c.
i128 IsSpace(i128 c) noexcept
    Masks whitespace characters (0x20, 0x09, 0x0A, 0x0D) in c.
i128 IsXdigit(i128 c) noexcept
    Masks hexadecimal characters in c.
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
    Executes KeyIdxSortN(dst, src, n, std::less<T>()).
errno_t KeyIdxSort(T **first, T **last) noexcept
    Executes KeyIdxSort(first, last, std::less<T>()).
errno_t Init(size_t size, size_t align) noexcept
    Allocates memory (member of the class used to obtain aligned memory).
void *Get() noexcept
    Returns a pointer to the allocated region.
nlib_i128_t i128
    i128 is a typedef of nlib_i128_t; the I128 class implements the SIMD computations on integers.
constexpr const each_int8_tag each_int8
    Tag constant for treating each lane as a signed 8-bit integer.
constexpr const each_uint32_tag each_uint32
    Tag constant for treating each lane as an unsigned 32-bit integer.
#define NLIB_NOEXCEPT
    Defines noexcept, or the closest equivalent for the environment.
#define NLIB_ALIGNAS(x)
    Defines alignas(x) or the equivalent.
#define NLIB_STATIC_ASSERT(exp)
    Defines a static assertion; uses static_assert where available.