16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ 24 #if defined(NLIB_SIMD) 33 template<
size_t NumElem,
bool flag>
34 struct MergeSortHelper {
37 sortUint32A16(tmp, data, NumElem);
41 template<
size_t NumElem>
42 struct MergeSortHelper<NumElem, false> {
45 sortUint32A16_1(tmp, data, NumElem);
51 template<
size_t NumElem>
55 NLIB_ASSERT(!(reinterpret_cast<uintptr_t>(data) & 15));
56 detail::MergeSortHelper<NumElem, ((NumElem & (NumElem - 1)) == 0)>::Sort(data);
65 template<
class Pred,
class... Tail>
67 return SimdFindSub(a, I128::Or(cmp, pred(a)), tail...);
69 template<
class Pred,
class... Tail>
71 return SimdFindSub2(a, pred(a), tail...);
76 nlib_memcpy(&tmpbuf[0], 16, p, n);
77 return I128::LoadA1(tmpbuf);
81 template<
class... PredList>
84 if (!buf)
return nullptr;
87 i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
88 if (!I128::IsZero(cmp))
return p +
nlib_ctz32(I128::MoveMask8(cmp));
95 i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
96 if (!I128::IsZero(cmp)) {
97 int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
104 template<
class... PredList>
107 if (!buf)
return nullptr;
112 i128 cmp = detail::SimdFindSub(I128::LoadA1(p), predlist...);
113 if (!I128::IsZero(cmp))
return p + (31 -
nlib_clz32(I128::MoveMask8(cmp)));
118 i128 cmp = detail::SimdFindSub(detail::PartialLoad(p, n), predlist...);
119 if (!I128::IsZero(cmp)) {
120 int mask = I128::MoveMask8(cmp) & ((1 << n) - 1);
127 template<
class... PredList>
128 inline const char* SimdFind(
const char* s, PredList... predlist)
NLIB_NOEXCEPT {
131 template<
class... PredList>
132 inline const char* SimdRfind(
const char* s, PredList... predlist)
NLIB_NOEXCEPT {
139 return SimdFind(s, n, pred);
145 return SimdFind(s, n, [&pred](
i128 c) {
return I128::Not(pred(c)); });
150 i128 result = I128::CmpLtInt8(c, I128::SetValue(
'{',
each_int8));
151 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
153 i128 tmp = I128::CmpLtInt8(c, I128::SetValue(
'[',
each_int8));
154 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
155 result = I128::Or(result, tmp);
162 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
163 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
175 i128 result = I128::CmpEq8(c, I128::SetValue(
' ',
each_int8));
176 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\r',
each_int8)));
177 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\n',
each_int8)));
178 result = I128::Or(result, I128::CmpEq8(c, I128::SetValue(
'\t',
each_int8)));
186 i128 result = I128::CmpLtInt8(c, I128::SetValue(
':',
each_int8));
187 result = I128::And(result, I128::CmpGtInt8(c, I128::SetValue(
'/',
each_int8)));
189 tmp = I128::CmpLtInt8(c, I128::SetValue(
'G',
each_int8));
190 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'@',
each_int8)));
191 result = I128::Or(result, tmp);
193 tmp = I128::CmpLtInt8(c, I128::SetValue(
'g',
each_int8));
194 tmp = I128::And(tmp, I128::CmpGtInt8(c, I128::SetValue(
'`',
each_int8)));
195 result = I128::Or(result, tmp);
201 return I128::CmpLtInt8(c, I128::SetZero());
205 template<
class T,
class Compare>
206 class KeyIdxSortLess {
208 KeyIdxSortLess(T*
const* src, uint32_t mask, Compare comp)
NLIB_NOEXCEPT : src_(src),
212 return comp_(*src_[lhs & mask_], *src_[rhs & mask_]);
224 template<class T, class Compare>
229 if (n == 0 || n > 0x7FFFFFFFU)
return EINVAL;
231 if (n == 0 || n >
RSIZE_MAX)
return EINVAL;
234 size_t n16 = (n + 15) & ~15;
237 e = mem_.
Init(n16 *
sizeof(uint32_t), 16);
239 uint32_t* mem =
reinterpret_cast<uint32_t*
>(mem_.
Get());
240 int idx_width = 32 -
nlib_clz32(static_cast<uint32_t>(n16));
241 uint32_t idx_mask = (1U << idx_width) - 1;
242 uint32_t idx_mask_inv = ~idx_mask;
244 uint32_t max_key = 0U;
245 uint32_t min_key = 0xFFFFFFFFU;
246 for (
size_t i = 0; i < n; ++i) {
247 uint32_t key = src[i]->GetKey32();
251 else if (max_key < key)
255 int left_shift =
nlib_clz32(min_key ^ max_key);
260 i128 vecidx0 = I128::LoadA16(&detail::keyidxsort_0123[0]);
261 i128 vec_i = I128::SetZero();
263 i128 vecidx4 = I128::Add32(vecidx0, four);
264 i128 vecidx8 = I128::Add32(vecidx4, four);
265 i128 vecidx12 = I128::Add32(vecidx8, four);
266 i128 d16 = I128::Mult32(four, four);
268 for (
size_t i = 0; i < n16; i += 16) {
269 i128 m0 = I128::LoadA16(&mem[i]);
270 i128 m1 = I128::LoadA16(&mem[i + 4]);
271 i128 m2 = I128::LoadA16(&mem[i + 8]);
272 i128 m3 = I128::LoadA16(&mem[i + 12]);
274 m0 = I128::And(I128::ShiftLeftLogical32(m0, left_shift), vecmask);
275 m1 = I128::And(I128::ShiftLeftLogical32(m1, left_shift), vecmask);
276 m2 = I128::And(I128::ShiftLeftLogical32(m2, left_shift), vecmask);
277 m3 = I128::And(I128::ShiftLeftLogical32(m3, left_shift), vecmask);
278 m0 = I128::Or(m0, I128::Add32(vec_i, vecidx0));
279 m1 = I128::Or(m1, I128::Add32(vec_i, vecidx4));
280 m2 = I128::Or(m2, I128::Add32(vec_i, vecidx8));
281 m3 = I128::Or(m3, I128::Add32(vec_i, vecidx12));
283 I128::StoreA16(&mem[i], m0);
284 I128::StoreA16(&mem[i + 4], m1);
285 I128::StoreA16(&mem[i + 8], m2);
286 I128::StoreA16(&mem[i + 12], m3);
287 vec_i = I128::Add16(vec_i, d16);
290 for (
size_t i = 0; i < n; ++i) {
291 mem[i] = ((mem[i] << left_shift) & idx_mask_inv) |
static_cast<uint32_t
>(i);
294 for (
size_t i = n; i < n16; ++i) {
296 mem[i] = idx_mask_inv |
static_cast<uint32_t
>(i);
303 uint32_t prev_key = mem[0] & idx_mask_inv;
305 detail::KeyIdxSortLess<T, Compare> myless(src, idx_mask, comp);
307 uint32_t key = mem[i] & idx_mask_inv;
313 key = mem[i] & idx_mask_inv;
314 }
while (key == prev_key);
316 std::stable_sort(&mem[from], &mem[i], myless);
322 for (i = 0; i < n; ++i) {
323 dst[i] = src[mem[i] & idx_mask];
333 template<
class T,
class Compare>
335 size_t n = last - first;
336 T** tmp =
reinterpret_cast<T**
>(
nlib_malloc(n *
sizeof(*first)));
340 nlib_memcpy(first, n *
sizeof(*first), tmp, n *
sizeof(*tmp));
348 return KeyIdxSort(first, last, std::less<T>());
356 #endif // INCLUDE_NN_NLIB_SIMD_SIMDALGORITHM_H_ i128 IsXdigit(i128 c) noexcept
c 内の16進数の文字をマスクします。
errno_t Init(size_t size, size_t align) noexcept
メモリの割り当てを行います。
i128 IsDigit(i128 c) noexcept
c 内の'0'-'9'の文字をマスクします。
errno_t KeyIdxSort(T **first, T **last) noexcept
KeyIdxSort(first, last, std::less<T>())を実行します。
整数のSIMD演算を行うためのクラスや関数が実装されています。
i128 IsAlpha(i128 c) noexcept
c 内のアルファベットをマスクします。
i128 IsSpace(i128 c) noexcept
c 内の空白文字(0x20, 0x09, 0x0A, 0x0D)をマスクします。
i128 IsAlnum(i128 c) noexcept
c 内のアルファベットか'0'-'9'の文字をマスクします。
nlib_i128_t i128
nlib_i128_tがtypedefされています。
constexpr const each_uint32_tag each_uint32
each_uint32_tag型の定数オブジェクトで、32bitの符号なし整数を示すためのタグです。
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
void * Get() noexcept
割り当てられた領域へのポインタを返します。
errno_t KeyIdxSortN(T **dst, T *const *src, size_t n) noexcept
KeyIdxSortN(dst, src, n, std::less<T>())を実行します。
#define NLIB_ALIGNAS(x)
alignas(x)又は同等の定義がされます。
constexpr const each_int8_tag each_int8
each_int8_tag型の定数オブジェクトで、8bitの符号付き整数を示すためのタグです。
const void * nlib_memchr_pred_not(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
errno_t MergeSortUint32A16(uint32_t *data, size_t n) noexcept
SIMDを利用して32bit符号なし整数の並びを昇順にマージソートします。
const void * nlib_memchr_pred(const void *s, PRED pred, size_t n) noexcept
バイト列内のバイトの検査をSIMD命令を使って行うための関数テンプレートです。