3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 4 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 11 #if defined(NLIB_SIMD) 20 template <
size_t NumElem,
bool flag>
21 struct MergeSortHelper {
24 sortUint32A16(tmp, data, NumElem);
28 template <
size_t NumElem>
29 struct MergeSortHelper<NumElem, false> {
32 sortUint32A16_1(tmp, data, NumElem);
38 template <
size_t NumElem>
42 NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
43 detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
53 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
57 if (reinterpret_cast<uintptr_t>(p) & 15) {
58 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
59 a1 = I128::LoadA16(p - r);
61 mask = I128::MoveMask8(cmp1);
66 if (mask)
return p + nlib_ctz(mask);
69 if (mask)
return p + nlib_ctz(mask);
74 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
75 a1 = I128::LoadA16(p);
77 if (!I128::IsZero(cmp1)) {
78 mask = I128::MoveMask8(cmp1);
79 return p + nlib_ctz(mask);
85 a1 = I128::LoadA16(p);
86 a2 = I128::LoadA16(p + 16);
87 cmp1 = I128::SetZero();
88 cmp2 = I128::SetZero();
91 if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
92 mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
93 return p + nlib_ctz(mask);
99 a1 = I128::LoadA16(p);
101 if (!I128::IsZero(cmp1)) {
102 mask = I128::MoveMask8(cmp1);
103 return p + nlib_ctz(mask);
110 a1 = I128::LoadA16(p);
112 mask = I128::MoveMask8(cmp1);
113 mask &= (1 << n) - 1;
114 if (mask)
return p + nlib_ctz(mask);
119 template <
class PRED>
124 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
128 if (reinterpret_cast<uintptr_t>(p) & 15) {
129 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
130 a1 = I128::LoadA16(p - r);
131 cmp1 = I128::Not(pred(a1));
132 mask = I128::MoveMask8(cmp1);
136 mask &= (1 << n) - 1;
137 if (mask)
return p + nlib_ctz(mask);
140 if (mask)
return p + nlib_ctz(mask);
145 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
146 a1 = I128::LoadA16(p);
148 if (!I128::IsFull(cmp1)) {
149 mask = I128::MoveMask8(I128::Not(cmp1));
150 return p + nlib_ctz(mask);
156 a1 = I128::LoadA16(p);
157 a2 = I128::LoadA16(p + 16);
160 if (!I128::IsFull(I128::And(cmp1, cmp2))) {
161 mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
162 return p + nlib_ctz(mask);
168 a1 = I128::LoadA16(p);
170 if (!I128::IsFull(cmp1)) {
171 mask = I128::MoveMask8(I128::Not(cmp1));
172 return p + nlib_ctz(mask);
179 a1 = I128::LoadA16(p);
180 cmp1 = I128::Not(pred(a1));
181 mask = I128::MoveMask8(cmp1);
182 mask &= (1 << n) - 1;
183 if (mask)
return p + nlib_ctz(mask);
190 i128 result = I128::CmpLtInt8(c, I128::SetValue(
'{',
each_int8));
191 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
193 i128 tmp = I128::CmpLtInt8(c, I128::SetValue(
'[',
each_int8));
194 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
195 result = I128::Or(result, tmp);
202 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
203 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
213 i128 result = I128::CmpEq8(c, I128::SetValue(
' ',
each_int8));
214 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\r',
each_int8)));
215 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\n',
each_int8)));
216 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\t',
each_int8)));
224 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
225 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
227 tmp = I128::CmpLtInt8(c, I128::SetValue(
'G',
each_int8));
228 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
229 result = I128::Or(result, tmp);
231 tmp = I128::CmpLtInt8(c, I128::SetValue(
'g',
each_int8));
232 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
233 result = I128::Or(result, tmp);
239 template<
class T,
class Compare>
240 class KeyIdxSortLess {
242 KeyIdxSortLess(T*
const* src, uint32_t mask,
243 Compare comp) NLIB_NOEXCEPT : src_(src), mask_(mask), comp_(comp) {
246 return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
258 template<
class T,
class Compare>
263 if (n == 0 || n > 0x7FFFFFFFU)
return EINVAL;
265 if (n == 0 || n >
RSIZE_MAX)
return EINVAL;
268 size_t n16 = (n + 15) & ~15;
271 e = mem_.
Init(n16 *
sizeof(uint32_t), 16);
273 uint32_t* mem =
reinterpret_cast<uint32_t*
>(mem_.
Get());
274 int idx_width = 32 - nlib_clz(static_cast<uint32_t>(n16));
275 uint32_t idx_mask = (1U << idx_width) - 1;
276 uint32_t idx_mask_inv = ~idx_mask;
278 uint32_t max_key = 0U;
279 uint32_t min_key = 0xFFFFFFFFU;
280 for (
size_t i = 0; i < n; ++i) {
281 uint32_t key = src[i]->GetKey32();
285 else if (max_key < key)
289 int left_shift = nlib_clz(min_key ^ max_key);
294 i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
295 i128 vec_i = I128::SetZero();
297 i128 vecidx4 = I128::Add32(vecidx0, four);
298 i128 vecidx8 = I128::Add32(vecidx4, four);
299 i128 vecidx12 = I128::Add32(vecidx8, four);
300 i128 d16 = I128::Mult32(four, four);
302 for (
size_t i = 0; i < n16; i += 16) {
303 i128 m0 = I128::LoadA16(&mem[i]);
304 i128 m1 = I128::LoadA16(&mem[i + 4]);
305 i128 m2 = I128::LoadA16(&mem[i + 8]);
306 i128 m3 = I128::LoadA16(&mem[i + 12]);
308 m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
309 m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
310 m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
311 m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
312 m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
313 m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
314 m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
315 m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
317 I128::StoreA16(&mem[i], m0);
318 I128::StoreA16(&mem[i + 4], m1);
319 I128::StoreA16(&mem[i + 8], m2);
320 I128::StoreA16(&mem[i + 12], m3);
321 vec_i = I128::Add16(vec_i, d16);
324 for (
size_t i = 0; i < n; ++i) {
325 mem[i] = ((mem[i] << left_shift) & idx_mask_inv) |
static_cast<uint32_t
>(i);
328 for (
size_t i = n; i < n16; ++i) {
330 mem[i] = idx_mask_inv |
static_cast<uint32_t
>(i);
337 uint32_t prev_key = mem[0] & idx_mask_inv;
339 detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
341 uint32_t key = mem[i] & idx_mask_inv;
347 key = mem[i] & idx_mask_inv;
348 }
while (key == prev_key);
350 std::stable_sort(&mem[from], &mem[i], myless);
356 for (i = 0; i < n; ++i) {
357 dst[i] = src[mem[i] & idx_mask];
367 template<
class T,
class Compare>
369 size_t n = last - first;
370 T** tmp =
reinterpret_cast<T**
>(
nlib_malloc(n *
sizeof(*first)));
374 nlib_memcpy(first, n *
sizeof(*first), tmp, n *
sizeof(*tmp));
382 return KeyIdxSort(first, last, std::less<T>());
390 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ i128 IsXdigit(i128 c) noexcept
c 内の16進数の文字をマスクします。
errno_t Init(size_t size, size_t align) noexcept
メモリの割り当てを行います。
i128 IsDigit(i128 c) noexcept
c 内の'0'-'9'の文字をマスクします。
errno_t KeyIdxSort(T **first, T **last) noexcept
KeyIdxSort(first, last, std::less<T>())を実行します。
整数のSIMD演算を行うためのクラスや関数が実装されています。
i128 IsAlpha(i128 c) noexcept
c 内のアルファベットをマスクします。
i128 IsSpace(i128 c) noexcept
c 内の空白文字(0x20, 0x09, 0x0A, 0x0D)をマスクします。
i128 IsAlnum(i128 c) noexcept
c 内のアルファベットか'0'-'9'の文字をマスクします。
nlib_i128_t i128
nlib_i128_tがtypedefされています。
constexpr const each_uint32_tag each_uint32
each_uint32_tag型の定数オブジェクトで、32bitの符号なし整数を示すためのタグです。
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
void * Get() noexcept
割り当てられた領域へのポインタを返します。
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
KeyIdxSortN(dst, src, n, std::less<T>())を実行します。
#define NLIB_ALIGNAS(x)
alignas(x)又は同等の定義がされます。
constexpr const each_int8_tag each_int8
each_int8_tag型の定数オブジェクトで、8bitの符号付き整数を示すためのタグです。
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
SIMDを利用して32bit符号なし整数の並びを昇順にマージソートします。
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。