16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 24 #if defined(NLIB_SIMD) 33 template<
size_t NumElem,
bool flag>
34 struct MergeSortHelper {
37 sortUint32A16(tmp, data, NumElem);
41 template<
size_t NumElem>
42 struct MergeSortHelper<NumElem, false> {
45 sortUint32A16_1(tmp, data, NumElem);
51 template<
size_t NumElem>
55 NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
56 detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
65 template<
class Pred,
class... Tail>
67 return SimdFindSub(a, I128::Or(cmp, pred(a)), tail...);
69 template<
class Pred,
class... Tail>
71 return SimdFindSub2(a, pred(a), tail...);
76 nlib_memcpy(&tmpbuf[0], 16, p, n);
77 return I128::LoadA1(tmpbuf);
81 template<
class... PredList>
84 if (!buf)
return nullptr;
87 i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
88 if (!I128::IsZero(cmp))
return p +
nlib_ctz32(I128::MoveMask8(cmp));
95 i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
96 if (!I128::IsZero(cmp)) {
97 int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
104 template<
class... PredList>
107 if (!buf)
return nullptr;
112 i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
113 if (!I128::IsZero(cmp))
return p + (31 -
nlib_clz32(I128::MoveMask8(cmp)));
118 i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
119 if (!I128::IsZero(cmp)) {
120 int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
127 template<
class... PredList>
128 inline const char* SimdFind(
const char* s, PredList... predlist)
NLIB_NOEXCEPT {
131 template<
class... PredList>
132 inline const char* SimdRfind(
const char* s, PredList... predlist)
NLIB_NOEXCEPT {
139 return SimdFind(s, n, pred);
145 return SimdFind(s, n, [&pred](
i128 c) {
return I128::Not(pred(c)); });
150 i128 result = I128::CmpLtInt8(c, I128::SetValue(
'{',
each_int8));
151 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
153 i128 tmp = I128::CmpLtInt8(c, I128::SetValue(
'[',
each_int8));
154 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
155 result = I128::Or(result, tmp);
162 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
163 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
175 i128 result = I128::CmpEq8(c, I128::SetValue(
' ',
each_int8));
176 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\r',
each_int8)));
177 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\n',
each_int8)));
178 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\t',
each_int8)));
186 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
187 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
189 tmp = I128::CmpLtInt8(c, I128::SetValue(
'G',
each_int8));
190 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
191 result = I128::Or(result, tmp);
193 tmp = I128::CmpLtInt8(c, I128::SetValue(
'g',
each_int8));
194 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
195 result = I128::Or(result, tmp);
201 return I128::CmpLtInt8(c, I128::SetZero());
205 template<
class T,
class Compare>
206 class KeyIdxSortLess {
208 KeyIdxSortLess(T*
const* src, uint32_t mask, Compare comp)
NLIB_NOEXCEPT : src_(src),
212 return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
224 template<class T, class Compare>
229 if (n == 0 || n > 0x7FFFFFFFU)
return EINVAL;
231 if (n == 0 || n >
RSIZE_MAX)
return EINVAL;
234 size_t n16 = (n + 15) & ~15;
237 e = mem_.
Init(n16 *
sizeof(uint32_t), 16);
239 uint32_t* mem =
reinterpret_cast<uint32_t*
>(mem_.
Get());
240 int idx_width = 32 -
nlib_clz32(static_cast<uint32_t>(n16));
241 uint32_t idx_mask = (1U << idx_width) - 1;
242 uint32_t idx_mask_inv = ~idx_mask;
244 uint32_t max_key = 0U;
245 uint32_t min_key = 0xFFFFFFFFU;
246 for (
size_t i = 0; i < n; ++i) {
247 uint32_t key = src[i]->GetKey32();
251 else if (max_key < key)
255 int left_shift =
nlib_clz32(min_key ^ max_key);
260 i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
261 i128 vec_i = I128::SetZero();
263 i128 vecidx4 = I128::Add32(vecidx0, four);
264 i128 vecidx8 = I128::Add32(vecidx4, four);
265 i128 vecidx12 = I128::Add32(vecidx8, four);
266 i128 d16 = I128::Mult32(four, four);
268 for (
size_t i = 0; i < n16; i += 16) {
269 i128 m0 = I128::LoadA16(&mem[i]);
270 i128 m1 = I128::LoadA16(&mem[i + 4]);
271 i128 m2 = I128::LoadA16(&mem[i + 8]);
272 i128 m3 = I128::LoadA16(&mem[i + 12]);
274 m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
275 m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
276 m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
277 m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
278 m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
279 m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
280 m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
281 m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
283 I128::StoreA16(&mem[i], m0);
284 I128::StoreA16(&mem[i + 4], m1);
285 I128::StoreA16(&mem[i + 8], m2);
286 I128::StoreA16(&mem[i + 12], m3);
287 vec_i = I128::Add16(vec_i, d16);
290 for (
size_t i = 0; i < n; ++i) {
291 mem[i] = ((mem[i] << left_shift) & idx_mask_inv) |
static_cast<uint32_t
>(i);
294 for (
size_t i = n; i < n16; ++i) {
296 mem[i] = idx_mask_inv |
static_cast<uint32_t
>(i);
303 uint32_t prev_key = mem[0] & idx_mask_inv;
305 detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
307 uint32_t key = mem[i] & idx_mask_inv;
313 key = mem[i] & idx_mask_inv;
314 }
while (key == prev_key);
316 std::stable_sort(&mem[from], &mem[i], myless);
322 for (i = 0; i < n; ++i) {
323 dst[i] = src[mem[i] & idx_mask];
333 template<
class T,
class Compare>
335 size_t n = last - first;
336 T** tmp =
reinterpret_cast<T**
>(
nlib_malloc(n *
sizeof(*first)));
340 nlib_memcpy(first, n *
sizeof(*first), tmp, n *
sizeof(*tmp));
348 return KeyIdxSort(first, last, std::less<T>());
356 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ i128 IsXdigit(i128 c) noexcept
Masks hexadecimal characters in c.
errno_t Init(size_t size, size_t align) noexcept
Allocates memory.
i128 IsDigit(i128 c) noexcept
Masks the characters 0 through 9 in c.
errno_t KeyIdxSort(T **first, T **last) noexcept
Executes KeyIdxSort(first, last, std::less<T>()).
Implements the class and functions for SIMD computations on integers.
A class for obtaining aligned memory.
i128 IsAlpha(i128 c) noexcept
Masks alphabetic letters in c.
i128 IsSpace(i128 c) noexcept
Masks space characters (0x20, 0x09, 0x0A, 0x0D) in c.
i128 IsAlnum(i128 c) noexcept
Masks alphabetic letters or the characters 0 through 9 in c.
nlib_i128_t i128
nlib_i128_t is defined using typedef.
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
void * Get() noexcept
Returns a pointer to the allocated region.
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
Executes KeyIdxSortN(dst, src, n, std::less<T>()).
Used when aligned memory needs to be obtained.
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
Uses SIMD to merge and sort 32-bit unsigned integer strings in the ascending order.
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
A function template for examining the bytes in byte strings using SIMD instructions.