16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 24 #if defined(NLIB_SIMD) 33 template <
size_t NumElem,
bool flag>
34 struct MergeSortHelper {
37 sortUint32A16(tmp, data, NumElem);
41 template <
size_t NumElem>
42 struct MergeSortHelper<NumElem, false> {
45 sortUint32A16_1(tmp, data, NumElem);
51 template <
size_t NumElem>
55 NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
56 detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
64 if (!s)
return nullptr;
66 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
70 if (reinterpret_cast<uintptr_t>(p) & 15) {
71 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
72 a1 = I128::LoadA16(p - r);
74 mask = I128::MoveMask8(cmp1);
87 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
88 a1 = I128::LoadA16(p);
90 if (!I128::IsZero(cmp1)) {
91 mask = I128::MoveMask8(cmp1);
98 a1 = I128::LoadA16(p);
99 a2 = I128::LoadA16(p + 16);
100 cmp1 = I128::SetZero();
101 cmp2 = I128::SetZero();
104 if (!I128::IsZero(I128::Or(cmp1, cmp2))) {
105 mask = I128::MoveMask8(cmp1) | (I128::MoveMask8(cmp2) << 16);
112 a1 = I128::LoadA16(p);
114 if (!I128::IsZero(cmp1)) {
115 mask = I128::MoveMask8(cmp1);
123 a1 = I128::LoadA16(p);
125 mask = I128::MoveMask8(cmp1);
126 mask &= (1 << n) - 1;
132 template <
class PRED>
135 if (!s)
return nullptr;
137 const unsigned char* p =
reinterpret_cast<const unsigned char*
>(s);
141 if (reinterpret_cast<uintptr_t>(p) & 15) {
142 size_t r =
reinterpret_cast<uintptr_t
>(p) & 15;
143 a1 = I128::LoadA16(p - r);
144 cmp1 = I128::Not(pred(a1));
145 mask = I128::MoveMask8(cmp1);
149 mask &= (1 << n) - 1;
158 if ((reinterpret_cast<uintptr_t>(p) & 32)) {
159 a1 = I128::LoadA16(p);
161 if (!I128::IsFull(cmp1)) {
162 mask = I128::MoveMask8(I128::Not(cmp1));
169 a1 = I128::LoadA16(p);
170 a2 = I128::LoadA16(p + 16);
173 if (!I128::IsFull(I128::And(cmp1, cmp2))) {
174 mask = I128::MoveMask8(I128::Not(cmp1)) | (I128::MoveMask8(I128::Not(cmp2)) << 16);
181 a1 = I128::LoadA16(p);
183 if (!I128::IsFull(cmp1)) {
184 mask = I128::MoveMask8(I128::Not(cmp1));
192 a1 = I128::LoadA16(p);
193 cmp1 = I128::Not(pred(a1));
194 mask = I128::MoveMask8(cmp1);
195 mask &= (1 << n) - 1;
203 i128 result = I128::CmpLtInt8(c, I128::SetValue(
'{',
each_int8));
204 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
206 i128 tmp = I128::CmpLtInt8(c, I128::SetValue(
'[',
each_int8));
207 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
208 result = I128::Or(result, tmp);
215 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
216 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
226 i128 result = I128::CmpEq8(c, I128::SetValue(
' ',
each_int8));
227 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\r',
each_int8)));
228 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\n',
each_int8)));
229 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\t',
each_int8)));
237 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
238 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
240 tmp = I128::CmpLtInt8(c, I128::SetValue(
'G',
each_int8));
241 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
242 result = I128::Or(result, tmp);
244 tmp = I128::CmpLtInt8(c, I128::SetValue(
'g',
each_int8));
245 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
246 result = I128::Or(result, tmp);
252 template<
class T,
class Compare>
253 class KeyIdxSortLess {
255 KeyIdxSortLess(T*
const* src, uint32_t mask,
256 Compare comp)
NLIB_NOEXCEPT : src_(src), mask_(mask), comp_(comp) {
259 return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
271 template<
class T,
class Compare>
276 if (n == 0 || n > 0x7FFFFFFFU)
return EINVAL;
278 if (n == 0 || n >
RSIZE_MAX)
return EINVAL;
281 size_t n16 = (n + 15) & ~15;
284 e = mem_.
Init(n16 *
sizeof(uint32_t), 16);
286 uint32_t* mem =
reinterpret_cast<uint32_t*
>(mem_.
Get());
287 int idx_width = 32 -
nlib_clz32(static_cast<uint32_t>(n16));
288 uint32_t idx_mask = (1U << idx_width) - 1;
289 uint32_t idx_mask_inv = ~idx_mask;
291 uint32_t max_key = 0U;
292 uint32_t min_key = 0xFFFFFFFFU;
293 for (
size_t i = 0; i < n; ++i) {
294 uint32_t key = src[i]->GetKey32();
298 else if (max_key < key)
302 int left_shift =
nlib_clz32(min_key ^ max_key);
307 i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
308 i128 vec_i = I128::SetZero();
310 i128 vecidx4 = I128::Add32(vecidx0, four);
311 i128 vecidx8 = I128::Add32(vecidx4, four);
312 i128 vecidx12 = I128::Add32(vecidx8, four);
313 i128 d16 = I128::Mult32(four, four);
315 for (
size_t i = 0; i < n16; i += 16) {
316 i128 m0 = I128::LoadA16(&mem[i]);
317 i128 m1 = I128::LoadA16(&mem[i + 4]);
318 i128 m2 = I128::LoadA16(&mem[i + 8]);
319 i128 m3 = I128::LoadA16(&mem[i + 12]);
321 m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
322 m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
323 m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
324 m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
325 m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
326 m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
327 m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
328 m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
330 I128::StoreA16(&mem[i], m0);
331 I128::StoreA16(&mem[i + 4], m1);
332 I128::StoreA16(&mem[i + 8], m2);
333 I128::StoreA16(&mem[i + 12], m3);
334 vec_i = I128::Add16(vec_i, d16);
337 for (
size_t i = 0; i < n; ++i) {
338 mem[i] = ((mem[i] << left_shift) & idx_mask_inv) |
static_cast<uint32_t
>(i);
341 for (
size_t i = n; i < n16; ++i) {
343 mem[i] = idx_mask_inv |
static_cast<uint32_t
>(i);
350 uint32_t prev_key = mem[0] & idx_mask_inv;
352 detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
354 uint32_t key = mem[i] & idx_mask_inv;
360 key = mem[i] & idx_mask_inv;
361 }
while (key == prev_key);
363 std::stable_sort(&mem[from], &mem[i], myless);
369 for (i = 0; i < n; ++i) {
370 dst[i] = src[mem[i] & idx_mask];
380 template<
class T,
class Compare>
382 size_t n = last - first;
383 T** tmp =
reinterpret_cast<T**
>(
nlib_malloc(n *
sizeof(*first)));
387 nlib_memcpy(first, n *
sizeof(*first), tmp, n *
sizeof(*tmp));
395 return KeyIdxSort(first, last, std::less<T>());
403 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ i128 IsXdigit(i128 c) noexcept
c 内の16進数の文字をマスクします。
errno_t Init(size_t size, size_t align) noexcept
メモリの割り当てを行います。
i128 IsDigit(i128 c) noexcept
c 内の'0'-'9'の文字をマスクします。
errno_t KeyIdxSort(T **first, T **last) noexcept
KeyIdxSort(first, last, std::less<T>())を実行します。
整数のSIMD演算を行うためのクラスや関数が実装されています。
i128 IsAlpha(i128 c) noexcept
c 内のアルファベットをマスクします。
i128 IsSpace(i128 c) noexcept
c 内の空白文字(0x20, 0x09, 0x0A, 0x0D)をマスクします。
i128 IsAlnum(i128 c) noexcept
c 内のアルファベットか'0'-'9'の文字をマスクします。
nlib_i128_t i128
nlib_i128_tがtypedefされています。
constexpr const each_uint32_tag each_uint32
each_uint32_tag型の定数オブジェクトで、32bitの符号なし整数を示すためのタグです。
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
void * Get() noexcept
割り当てられた領域へのポインタを返します。
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
KeyIdxSortN(dst, src, n, std::less<T>())を実行します。
#define NLIB_ALIGNAS(x)
alignas(x)又は同等の定義がされます。
constexpr const each_int8_tag each_int8
each_int8_tag型の定数オブジェクトで、8bitの符号付き整数を示すためのタグです。
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
SIMDを利用して32bit符号なし整数の並びを昇順にマージソートします。
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。