#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#if defined(NLIB_SSE41)
#include <smmintrin.h>
#elif defined(NLIB_NEON)
#include <arm_neon.h>
#endif

#if !defined(_MSC_VER) || _MSC_VER < 1800
#define __vectorcall
#endif

#if defined(NLIB_SIMD)
typedef const i64& i64arg;
typedef const i128& i128arg;
#else
typedef const i64 i64arg;
typedef const i128 i128arg;
#endif
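// i64arg/i128arg: vector parameters are taken by const reference in one
// configuration and by value in the other, so the member-function
// signatures below stay identical across builds.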
static NLIB_ALWAYS_INLINE i64 __vectorcall SetFull(i64arg dummy) NLIB_NOEXCEPT {
  return I64::CmpEq8(dummy, dummy);
}
static i64 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
static i64 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
static i64 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
static i64 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA2(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA2(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA1(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA1(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA2(intptr_t p) NLIB_NOEXCEPT {
  return LoadA2(reinterpret_cast<void*>(p));
}
static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA1(intptr_t p) NLIB_NOEXCEPT {
  return LoadA1(reinterpret_cast<void*>(p));
}
static void __vectorcall StoreA8(void* p, i64arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(void* p, i64arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA2(void* p, i64arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA1(void* p, i64arg value) NLIB_NOEXCEPT;
static NLIB_ALWAYS_INLINE void __vectorcall StoreA8(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA4(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA2(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA2(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA1(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA1(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA8(intptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA4(intptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA2(intptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA2(reinterpret_cast<void*>(p), value);
}
static NLIB_ALWAYS_INLINE void __vectorcall StoreA1(intptr_t p, i64arg value) NLIB_NOEXCEPT {
  StoreA1(reinterpret_cast<void*>(p), value);
}
template <size_t N>
static uint8_t __vectorcall GetUint8FromLane(i64arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint16_t __vectorcall GetUint16FromLane(i64arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint32_t __vectorcall GetUint32FromLane(i64arg value) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall SetUint8ToLane(i64arg value, uint8_t v) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall SetUint16ToLane(i64arg value, uint16_t v) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall SetUint32ToLane(i64arg value, uint32_t v) NLIB_NOEXCEPT;
static i64 __vectorcall Add8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Add16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Add32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Add64(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AddInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AddInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AddUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AddUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Sub8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Sub16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Sub32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Sub64(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall SubInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall SubInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall SubUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall SubUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall PairwiseAdd8(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall PairwiseAdd16(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall PairwiseAdd32(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall Mult16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MultAdd16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
static i64 __vectorcall MultSub16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
static i64 __vectorcall Mult32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MultAdd32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
static i64 __vectorcall MultSub32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
static i64 __vectorcall MaxInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MaxInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MaxInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MaxUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MaxUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MaxUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall MinUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AbsInt8(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall AbsInt16(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall AbsInt32(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall AbsDiffInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AbsDiffInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall AbsDiffInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall NegateInt8(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall NegateInt16(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall NegateInt32(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall And(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Or(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Xor(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall Not(i64arg a) NLIB_NOEXCEPT;
static i64 __vectorcall AndNot(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall OrNot(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpEq8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpEq16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpEq32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpLeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall CmpGeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftLeftLogical8(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftRightLogical8(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftLeftLogical16(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftRightLogical16(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftRightArithmetic16(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftLeftLogical32(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftRightLogical32(i64arg value, int count) NLIB_NOEXCEPT;
static i64 __vectorcall ShiftRightArithmetic32(i64arg value, int count) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall ByteShiftLeft(i64arg value) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall ByteShiftRight(i64arg value) NLIB_NOEXCEPT;
template <size_t N>
static i64 __vectorcall ByteRotateRight(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall NarrowFrom16To8(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall NarrowFrom32To16(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall NarrowFrom64To32(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg value) NLIB_NOEXCEPT;
static i64 __vectorcall Reverse16(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall Reverse32(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall Reverse64(i64arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask8(i64arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsZero(i64arg value) NLIB_NOEXCEPT;
static i64 __vectorcall Select(i64arg mask, i64arg a, i64arg b) NLIB_NOEXCEPT;
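// Usage sketch (illustrative only; assumes an NLIB_SIMD build):
//   i64 a  = I64::SetValue(int16_t(7), each_int16_tag());
//   i64 b  = I64::SetValue(int16_t(9), each_int16_tag());
//   i64 lt = I64::CmpLtInt16(a, b);   // all-ones lanes where a < b
//   i64 m  = I64::Select(lt, a, b);   // per-lane minimum of a and b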
#ifndef NLIB_DOXYGEN

#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall

#if defined(NLIB_NEON)
#define vreinterpret_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
                                     vreinterpret_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
                                     vreinterpret_##tp##_s8(b), \
                                     vreinterpret_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpret_s8_##utp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
                                      vreinterpret_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), vdup_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
#endif
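// Example expansion, for reference: NLIB_OP2(vadd, s16, a, b) becomes
//   vreinterpret_s8_s16(vadd_s16(vreinterpret_s16_s8(a), vreinterpret_s16_s8(b)))
// i.e. the helpers cast the canonical int8x8_t representation to the lane
// type an intrinsic expects, apply it, and cast back.  NLIB_CMP casts the
// result back from the unsigned type that NEON comparisons return.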
NLIB_M(i64) I64::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdup_n_s8(v);
#endif
}
NLIB_M(i64) I64::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_s16(vdup_n_s16(v));
#endif
}

NLIB_M(i64) I64::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_s32(vdup_n_s32(v));
#endif
}

NLIB_M(i64) I64::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u8(vdup_n_u8(v));
#endif
}

NLIB_M(i64) I64::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u16(vdup_n_u16(v));
#endif
}

NLIB_M(i64) I64::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u32(vdup_n_u32(v));
#endif
}
template <size_t N>
NLIB_M(i64) I64::SetValue(i64 value, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u32(vdup_lane_u32(vreinterpret_u32_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i64) I64::SetValue(i64 value, each_select16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shufflelo_epi16(value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u16(vdup_lane_u16(vreinterpret_u16_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i64) I64::SetValue(i64 value, each_select8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
  return vdup_lane_s8(value, N);
#endif
}
NLIB_M(i64) I64::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdup_n_s8(0);
#endif
}
NLIB_M(i64) I64::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
#endif
}

NLIB_M(i64) I64::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
#endif
}

NLIB_M(i64) I64::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
#endif
}

NLIB_M(i64) I64::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1_s8(reinterpret_cast<const int8_t*>(p));
#endif
}
NLIB_M(void) I64::StoreA8(void* p, i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1_u64(reinterpret_cast<uint64_t*>(p), vreinterpret_u64_s8(value));
#endif
}

NLIB_M(void) I64::StoreA4(void* p, i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1_u32(reinterpret_cast<uint32_t*>(p), vreinterpret_u32_s8(value));
#endif
}

NLIB_M(void) I64::StoreA2(void* p, i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1_u16(reinterpret_cast<uint16_t*>(p), vreinterpret_u16_s8(value));
#endif
}

NLIB_M(void) I64::StoreA1(void* p, i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1_s8(reinterpret_cast<int8_t*>(p), value);
#endif
}
template <size_t N>
NLIB_M(uint8_t) I64::GetUint8FromLane(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vget_lane_u8(vreinterpret_u8_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint16_t) I64::GetUint16FromLane(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vget_lane_u16(vreinterpret_u16_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint32_t) I64::GetUint32FromLane(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vget_lane_u32(vreinterpret_u32_s8(value), N);
#endif
}
template <size_t N>
NLIB_M(i64) I64::SetUint8ToLane(i64arg value, uint8_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  return vset_lane_u8(v, vreinterpret_u8_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(i64) I64::SetUint16ToLane(i64arg value, uint16_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return vset_lane_u16(v, vreinterpret_u16_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(i64) I64::SetUint32ToLane(i64arg value, uint32_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
#elif defined(NLIB_NEON)
  return vset_lane_u32(v, vreinterpret_u32_s8(value), N);
#endif
}
NLIB_M(i64) I64::Add8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vadd_s8(a, b);
#endif
}

NLIB_M(i64) I64::Add16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vadd, s16, a, b);
#endif
}

NLIB_M(i64) I64::Add32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vadd, s32, a, b);
#endif
}

NLIB_M(i64) I64::Add64(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vadd, s64, a, b);
#endif
}

NLIB_M(i64) I64::AddInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqadd_s8(a, b);
#endif
}

NLIB_M(i64) I64::AddInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqadd, s16, a, b);
#endif
}

NLIB_M(i64) I64::AddUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqadd, u8, a, b);
#endif
}

NLIB_M(i64) I64::AddUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqadd, u16, a, b);
#endif
}
NLIB_M(i64) I64::Sub8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsub_s8(a, b);
#endif
}

NLIB_M(i64) I64::Sub16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsub, s16, a, b);
#endif
}

NLIB_M(i64) I64::Sub32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsub, s32, a, b);
#endif
}

NLIB_M(i64) I64::Sub64(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsub, s64, a, b);
#endif
}

NLIB_M(i64) I64::SubInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqsub_s8(a, b);
#endif
}

NLIB_M(i64) I64::SubInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsub, s16, a, b);
#endif
}

NLIB_M(i64) I64::SubUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsub, u8, a, b);
#endif
}

NLIB_M(i64) I64::SubUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsub, u16, a, b);
#endif
}
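// PairwiseAdd8: SSE has no 8-bit horizontal add, so each 16-bit lane adds its
// own high byte (value >> 8) onto its low byte, and NarrowFrom16To8 then
// gathers the low byte of every 16-bit lane; the leftover high bytes are
// discarded.  NEON has vpadd directly.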
NLIB_M(i64) I64::PairwiseAdd8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ax = _mm_add_epi8(value, _mm_srli_epi16(value, 8));
  return I64::NarrowFrom16To8(ax);
#elif defined(NLIB_NEON)
  return vpadd_s8(vget_low_s8(value), vget_high_s8(value));
#endif
}

NLIB_M(i64) I64::PairwiseAdd16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(value, value);
#elif defined(NLIB_NEON)
  int8x8_t lo = vget_low_s8(value);
  int8x8_t hi = vget_high_s8(value);
  return NLIB_OP2(vpadd, s16, lo, hi);
#endif
}

NLIB_M(i64) I64::PairwiseAdd32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(value, value);
#elif defined(NLIB_NEON)
  int8x8_t lo = vget_low_s8(value);
  int8x8_t hi = vget_high_s8(value);
  return NLIB_OP2(vpadd, s32, lo, hi);
#endif
}
NLIB_M(i64) I64::Mult16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmul, s16, a, b);
#endif
}

NLIB_M(i64) I64::MultAdd16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmla, s16, c, a, b);
#endif
}

NLIB_M(i64) I64::MultSub16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmls, s16, c, a, b);
#endif
}

NLIB_M(i64) I64::Mult32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmul, s32, a, b);
#endif
}

NLIB_M(i64) I64::MultAdd32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmla, s32, c, a, b);
#endif
}

NLIB_M(i64) I64::MultSub32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmls, s32, c, a, b);
#endif
}
NLIB_M(i64) I64::MaxInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return vmax_s8(a, b);
#endif
}

NLIB_M(i64) I64::MaxInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmax, s16, a, b);
#endif
}

NLIB_M(i64) I64::MaxInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmax, s32, a, b);
#endif
}

NLIB_M(i64) I64::MaxUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmax, u8, a, b);
#endif
}

NLIB_M(i64) I64::MaxUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmax, u16, a, b);
#endif
}

NLIB_M(i64) I64::MaxUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmax, u32, a, b);
#endif
}

NLIB_M(i64) I64::MinInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi8(a, b);
#elif defined(NLIB_NEON)
  return vmin_s8(a, b);
#endif
}

NLIB_M(i64) I64::MinInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmin, s16, a, b);
#endif
}

NLIB_M(i64) I64::MinInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmin, s32, a, b);
#endif
}

NLIB_M(i64) I64::MinUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmin, u8, a, b);
#endif
}

NLIB_M(i64) I64::MinUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmin, u16, a, b);
#endif
}

NLIB_M(i64) I64::MinUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmin, u32, a, b);
#endif
}
NLIB_M(i64) I64::AbsInt8(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(value);
#elif defined(NLIB_NEON)
  return vabs_s8(value);
#endif
}

NLIB_M(i64) I64::AbsInt16(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabs, s16, value);
#endif
}

NLIB_M(i64) I64::AbsInt32(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabs, s32, value);
#endif
}

NLIB_M(i64) I64::AbsDiffInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(_mm_sub_epi8(a, b));
#elif defined(NLIB_NEON)
  return vabd_s8(a, b);
#endif
}

NLIB_M(i64) I64::AbsDiffInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(_mm_sub_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabd, s16, a, b);
#endif
}

NLIB_M(i64) I64::AbsDiffInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(_mm_sub_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabd, s32, a, b);
#endif
}
NLIB_M(i64) I64::NegateInt8(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return vneg_s8(value);
#endif
}

NLIB_M(i64) I64::NegateInt16(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vneg, s16, value);
#endif
}

NLIB_M(i64) I64::NegateInt32(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vneg, s32, value);
#endif
}
NLIB_M(i64) I64::And(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_and_si128(a, b);
#elif defined(NLIB_NEON)
  return vand_s8(a, b);
#endif
}

NLIB_M(i64) I64::Or(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(a, b);
#elif defined(NLIB_NEON)
  return vorr_s8(a, b);
#endif
}

NLIB_M(i64) I64::Xor(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_xor_si128(a, b);
#elif defined(NLIB_NEON)
  return veor_s8(a, b);
#endif
}

NLIB_M(i64) I64::Not(i64arg a) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
#elif defined(NLIB_NEON)
  return vmvn_s8(a);
#endif
}
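// Note: AndNot(a, b) computes (~a) & b and OrNot(a, b) computes (~a) | b;
// the complement applies to the first argument on both the SSE and NEON paths.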
NLIB_M(i64) I64::AndNot(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, b);
#elif defined(NLIB_NEON)
  return vbic_s8(b, a);
#endif
}

NLIB_M(i64) I64::OrNot(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i full = _mm_cmpeq_epi8(a, a);
  return _mm_or_si128(_mm_andnot_si128(a, full), b);
#elif defined(NLIB_NEON)
  return vorn_s8(b, a);
#endif
}
NLIB_M(i64) I64::CmpEq8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(a, b);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u8(vceq_s8(a, b));
#endif
}

NLIB_M(i64) I64::CmpEq16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceq, s16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpEq32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceq, s32, a, b, u32);
#endif
}

NLIB_M(i64) I64::CmpLtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, s8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpLtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, s16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpLtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, s32, a, b, u32);
#endif
}

NLIB_M(i64) I64::CmpGtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, s8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpGtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, s16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpGtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, s32, a, b, u32);
#endif
}
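// SSE has no unsigned compares.  The functions below bias each lane by its
// sign bit (0x80, INT16_MIN, INT32_MIN), after which a signed comparison of
// the biased values gives the same result as an unsigned comparison of the
// originals.  NEON provides vclt/vcgt on unsigned lanes directly.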
NLIB_M(i64) I64::CmpLtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_shuffle_epi8(_mm_cvtsi32_si128(0x80), _mm_setzero_si128());
  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, u8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpGtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_shuffle_epi8(_mm_cvtsi32_si128(0x80), _mm_setzero_si128());
  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, u8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpLtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_set1_epi16(INT16_MIN);
  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, u16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpGtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_set1_epi16(INT16_MIN);
  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, u16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpLtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_set1_epi32(INT32_MIN);
  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vclt, u32, a, b, u32);
#endif
}

NLIB_M(i64) I64::CmpGtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ofs = _mm_set1_epi32(INT32_MIN);
  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgt, u32, a, b, u32);
#endif
}
NLIB_M(i64) I64::CmpLeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, s8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpLeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, s16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpLeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, s32, a, b, u32);
#endif
}

NLIB_M(i64) I64::CmpGeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, s8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpGeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, s16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpGeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, s32, a, b, u32);
#endif
}
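// Unsigned <= and >= use the identity min(a, b) == a  <=>  a <= b
// (and max(a, b) == a  <=>  a >= b), which needs only the epu min/max
// instructions plus an equality compare.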
NLIB_M(i64) I64::CmpLeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, u8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpLeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, u16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpLeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcle, u32, a, b, u32);
#endif
}

NLIB_M(i64) I64::CmpGeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, u8, a, b, u8);
#endif
}

NLIB_M(i64) I64::CmpGeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, u16, a, b, u16);
#endif
}

NLIB_M(i64) I64::CmpGeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcge, u32, a, b, u32);
#endif
}
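// NEON has no separate variable right-shift intrinsic: vshl shifts each lane
// by a signed per-lane count, and a negative count shifts right.  NLIB_SFT
// therefore passes -count for the Right variants below.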
NLIB_M(i64) I64::ShiftLeftLogical16(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, u16, value, count, s16);
#endif
}

NLIB_M(i64) I64::ShiftRightLogical16(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, u16, value, -count, s16);
#endif
}

NLIB_M(i64) I64::ShiftRightArithmetic16(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, s16, value, -count, s16);
#endif
}

NLIB_M(i64) I64::ShiftLeftLogical32(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, u32, value, count, s32);
#endif
}

NLIB_M(i64) I64::ShiftRightLogical32(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, u32, value, -count, s32);
#endif
}

NLIB_M(i64) I64::ShiftRightArithmetic32(i64arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshl, s32, value, -count, s32);
#endif
}
template <size_t N>
NLIB_M(i64) I64::ByteShiftLeft(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi64(value, N * 8);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u64(vshl_n_u64(vreinterpret_u64_s8(value), N * 8));
#endif
}

template <size_t N>
NLIB_M(i64) I64::ByteShiftRight(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi64(value, N * 8);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(value), N * 8));
#endif
}
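// ByteRotateRight: SSE has no 8-byte rotate, so the low 64 bits are first
// duplicated into both register halves and _mm_alignr_epi8 then extracts the
// rotated window; NEON's vext does the same in one step.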
template <size_t N>
NLIB_M(i64) I64::ByteRotateRight(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i64 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 1, 0));
  return _mm_alignr_epi8(tmp, tmp, N);
#elif defined(NLIB_NEON)
  return vext_s8(value, value, N);
#endif
}
NLIB_M(i64) I64::NarrowFrom16To8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {
    0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u8(vmovn_u16(vreinterpretq_u16_s8(value)));
#endif
}

NLIB_M(i64) I64::NarrowFrom32To16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {
    0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u16(vmovn_u32(vreinterpretq_u32_s8(value)));
#endif
}

NLIB_M(i64) I64::NarrowFrom64To32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 1, 2, 0));
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u32(vmovn_u64(vreinterpretq_u64_s8(value)));
#endif
}
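// The saturating narrows below go through _mm_packs/_mm_packus on SSE, which
// read their source as signed 16/32-bit lanes; the unsigned variants first
// clear each lane's sign bit so that no input is interpreted as negative.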
NLIB_M(i64) I64::ConvertFromUint16ToUint8Saturated(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i64 masked = _mm_and_si128(value, _mm_set1_epi16(0x7FFFU));
  return _mm_packus_epi16(masked, masked);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u8(vqmovn_u16(vreinterpretq_u16_s8(value)));
#endif
}

NLIB_M(i64) I64::ConvertFromInt16ToInt8Saturated(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi16(value, value);
#elif defined(NLIB_NEON)
  return vqmovn_s16(vreinterpretq_s16_s8(value));
#endif
}

NLIB_M(i64) I64::ConvertFromUint32ToUint16Saturated(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i64 masked = _mm_and_si128(value, _mm_set1_epi32(0x7FFFFFFFU));
  return _mm_packus_epi32(masked, masked);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_u16(vqmovn_u32(vreinterpretq_u32_s8(value)));
#endif
}

NLIB_M(i64) I64::ConvertFromInt32ToInt16Saturated(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi32(value, value);
#elif defined(NLIB_NEON)
  return vreinterpret_s8_s16(vqmovn_s32(vreinterpretq_s32_s8(value)));
#endif
}
NLIB_M(i64) I64::Reverse16(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(8) static const uint8_t mask_[8] = {
    1, 0, 3, 2, 5, 4, 7, 6
  };
  i128 mask = I64::LoadA8(&mask_[0]);
  return _mm_shuffle_epi8(value, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16, u8, value);
#endif
}

NLIB_M(i64) I64::Reverse32(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(8) static const uint8_t mask_[8] = {
    3, 2, 1, 0, 7, 6, 5, 4
  };
  i128 mask = I64::LoadA8(&mask_[0]);
  return _mm_shuffle_epi8(value, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32, u8, value);
#endif
}

NLIB_M(i64) I64::Reverse64(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(8) static const uint8_t mask_[8] = {
    7, 6, 5, 4, 3, 2, 1, 0
  };
  i128 mask = I64::LoadA8(&mask_[0]);
  return _mm_shuffle_epi8(value, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64, u8, value);
#endif
}
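// MoveMask8: NEON lacks SSE's movemask.  Each lane of an all-ones/all-zeros
// comparison mask is ANDed with its bit weight (1, 2, 4, ..., 128), and three
// pairwise adds then fold the eight lanes into the single result byte.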
NLIB_M(int) I64::MoveMask8(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value) & 0xFF;
#elif defined(NLIB_NEON)
  NLIB_ALIGNAS(16) static const uint64_t powers_[1] = {0x8040201008040201ULL};
  uint8x8_t powers = vreinterpret_u8_u64(vld1_u64(powers_));
  uint8x8_t a = vand_u8(vreinterpret_u8_s8(value), powers);
  uint8x8_t tmp = vpadd_u8(a, a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
#endif
}
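// IsZero: only the low 64 bits of the SSE register are meaningful for an i64,
// so they are duplicated into the high half before _mm_testz_si128.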
NLIB_M(bool) I64::IsZero(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i64 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 1, 0));
  return _mm_testz_si128(tmp, tmp) != 0;
#elif defined(NLIB_NEON)
  return vget_lane_u64(vreinterpret_u64_s8(value), 0) == 0;
#endif
}
NLIB_M(i64) I64::Select(i64arg mask, i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
#endif
}
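// Select expects a full-lane mask such as the Cmp* results: SSE's blendv keys
// on the top bit of each byte while NEON's vbsl blends bitwise, and the two
// agree only when every mask byte is 0x00 or 0xFF.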
#endif  // NLIB_DOXYGEN
static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
static i128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA2(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA2(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA1(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA1(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA2(intptr_t p) NLIB_NOEXCEPT {
  return LoadA2(reinterpret_cast<void*>(p));
}
static i128 __vectorcall LoadA1(intptr_t p) NLIB_NOEXCEPT {
  return LoadA1(reinterpret_cast<void*>(p));
}
static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA8(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA4(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA2(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA2(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA1(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA1(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA16(intptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA8(intptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA4(intptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA2(intptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA2(reinterpret_cast<void*>(p), value);
}
static void __vectorcall StoreA1(intptr_t p, i128arg value) NLIB_NOEXCEPT {
  StoreA1(reinterpret_cast<void*>(p), value);
}
template <size_t N>
static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
template <size_t N>
static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt8ToInt16(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt16ToInt32(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt32ToInt64(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint8ToUint16(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint16ToUint32(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint32ToUint64(i64arg value) NLIB_NOEXCEPT;
static i128 __vectorcall Zip8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Unzip8(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Zip16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Unzip16(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Zip32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Unzip32(i64arg a, i64arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
#ifndef NLIB_DOXYGEN

#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

#if defined(NLIB_NEON)
#undef vreinterpret_s8_s8
#undef NLIB_OP1
#undef NLIB_OP2
#undef NLIB_OP3
#undef NLIB_CMP
#undef NLIB_SFT
#undef NLIB_CMB

#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                       vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
#endif
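// Same casting helpers as the 64-bit block above, redefined over the full
// 128-bit q registers (vreinterpretq_*, vdupq_n_*), so the I128 bodies can
// reuse the NLIB_OP*/NLIB_CMP/NLIB_SFT spellings.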
NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdupq_n_s8(v);
#endif
}

NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vdupq_n_s16(v));
#endif
}

NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vdupq_n_s32(v));
#endif
}

NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(_MSC_VER) && !defined(_M_X64)
  NLIB_ALIGNAS(16) const int64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(v);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vdupq_n_s64(v));
#endif
}
NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vdupq_n_u8(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vdupq_n_u16(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vdupq_n_u32(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(_MSC_VER) && !defined(_M_X64)
  NLIB_ALIGNAS(16) const uint64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(static_cast<int64_t>(v));
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vdupq_n_u64(v));
#endif
}
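// Lane-broadcast SetValue: SSE does this with shuffles.  On NEON,
// vdupq_laneq_* is AArch64-only, and 32-bit NEON can only vdupq_lane_* from
// a 64-bit half, so each lane index is specialized by hand via
// vget_low/vget_high on that path.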
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
}
#elif defined(NLIB_NEON)
#ifdef __aarch64__
template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x4_t v = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
}
#else
template <>
NLIB_M(i128) I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
template <>
NLIB_M(i128) I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
#endif
#endif
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
}
#elif defined(NLIB_NEON)
template <>
NLIB_M(i128) I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
#endif
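// Illustrative sketch (hypothetical values): SetValue<N>(v, each_select16)
// broadcasts lane N of v to all eight 16-bit lanes.
//   i128 x = I128::SetZero();                     // lanes: x0..x7
//   i128 y = I128::SetValue<3>(x, each_select16); // lanes: x3,x3,...,x3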
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
}
#elif defined(NLIB_NEON)
namespace detail {
template <size_t N, bool IsLower>
struct SetValue8Helper {
  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_low_s8(value), N);
  }
};
template <size_t N>
struct SetValue8Helper<N, false> {
  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_high_s8(value), N - 8);
  }
};
}  // namespace detail
template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#ifdef __aarch64__
  return vdupq_laneq_s8(value, N);
#else
  return detail::SetValue8Helper<N, (N < 8)>()(value);
#endif
}
#endif
NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdupq_n_s8(0);
#endif
}

NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
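// Sketch: SetFull only needs *some* register to compare against itself, so
// any value serves as the dummy argument; the result has every bit set.
//   i128 ones = I128::SetFull(I128::SetZero());  // 0xFF in all 16 bytes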
NLIB_M(i64) I128::GetLo(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // On SSE, i64 is a __m128i whose upper half is ignored.
  return value;
#elif defined(NLIB_NEON)
  return vget_low_s8(value);
#endif
}

NLIB_M(i64) I128::GetHi(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  return vget_high_s8(value);
#endif
}
NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
  return vreinterpretq_s8_u32(tmp);
#endif
}

NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
  return vreinterpretq_s8_u16(tmp);
#endif
}

NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
#endif
}
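// Note on the LoadA* family: the digit states the caller's alignment
// guarantee for p, in bytes. On SSE only LoadA16 can use the aligned load;
// A8/A4/A2/A1 all fall back to _mm_loadu_si128, while NEON picks the widest
// element type matching the guarantee. Illustrative call (hypothetical buffer):
//   NLIB_ALIGNAS(16) static const uint8_t buf[16] = {0};
//   i128 v = I128::LoadA16(buf);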
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
#endif
}
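// The StoreA* family mirrors LoadA*: StoreA16 requires a 16-byte-aligned
// pointer; the smaller-alignment variants use unaligned stores on SSE.
// Sketch (hypothetical buffer):
//   NLIB_ALIGNAS(16) uint8_t out[16];
//   I128::StoreA16(out, I128::SetZero());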
template <size_t N>
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 2);
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#endif
  // On 32-bit x86 the specializations below are used instead.
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
#endif
}

#if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
  return rval;
}
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  i128 tmp = I128::GetHi(value);
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
  return rval;
}
#endif
template <size_t N>
NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 2);
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)
  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
#else
  // 32-bit x86: insert the value as two 32-bit halves.
  union {
    int64_t i64;
    int32_t i32[2];
  } tmp;
  tmp.i64 = static_cast<int64_t>(v);
  i128 rval;
  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
#endif
}
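// Round-trip sketch (hypothetical values): the lane accessors take a
// compile-time lane index N.
//   i128 v = I128::SetZero();
//   v = I128::SetUint32ToLane<2>(v, 0xDEADBEEFU);
//   uint32_t x = I128::GetUint32FromLane<2>(v);  // x == 0xDEADBEEFU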
NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s64, a, b);
#endif
}

NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u8, a, b);
#endif
}

NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u16, a, b);
#endif
}
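// Worked example: with uint8 lanes holding 250 and 10, Add8 wraps to 4
// (260 mod 256), while AddUint8Saturated clamps to 255. The *Saturated
// variants never overflow past the element type's range.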
NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsubq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s64, a, b);
#endif
}

NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s8, a, b);
#endif
}

NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u8, a, b);
#endif
}

NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u16, a, b);
#endif
}
NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
  return I128::NarrowFrom16To8(ax, bx);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_s8(a, b);
#else
  int8x8_t al = vget_low_s8(a);
  int8x8_t ah = vget_high_s8(a);
  int8x8_t rl = vpadd_s8(al, ah);
  int8x8_t bl = vget_low_s8(b);
  int8x8_t bh = vget_high_s8(b);
  int8x8_t rh = vpadd_s8(bl, bh);
  return vcombine_s8(rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
#else
  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
  int16x4_t rl = vpadd_s16(al, ah);
  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
  int16x4_t rh = vpadd_s16(bl, bh);
  return NLIB_CMB(s16, rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
#else
  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
  int32x2_t rl = vpadd_s32(al, ah);
  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
  int32x2_t rh = vpadd_s32(bl, bh);
  return NLIB_CMB(s32, rl, rh);
#endif
#endif
}
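// Worked example for the pairwise adds: with 32-bit lanes a = (a0,a1,a2,a3)
// and b = (b0,b1,b2,b3), PairwiseAdd32 returns
//   (a0+a1, a2+a3, b0+b1, b2+b3)
// which matches both _mm_hadd_epi32 and the NEON vpadd formulation above.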
NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s32, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s32, c, a, b);
#endif
}
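// Semantics sketch: MultAdd computes c + a * b and MultSub computes
// c - a * b lane-wise; on NEON this maps to a single vmlaq/vmlsq, on SSE to
// a low-half multiply followed by an add/subtract.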
NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u32, a, b);
#endif
}

NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u32, a, b);
#endif
}
NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s8, value);
#endif
}

NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s16, value);
#endif
}

NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s32, value);
#endif
}

NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(_mm_sub_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s8, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(_mm_sub_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(_mm_sub_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s32, a, b);
#endif
}

NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s8, value);
#endif
}

NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s16, value);
#endif
}

NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s32, value);
#endif
}
NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_and_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vandq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vorrq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_xor_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(veorq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vmvnq, s8, a);
#endif
}

NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vbicq, s8, b, a);
#endif
}

NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
  return _mm_or_si128(not_a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vornq, s8, b, a);
#endif
}
NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s32, a, b, u32);
#endif
}
NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u32, a, b, u32);
#endif
}
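// Note on the SSE paths above: SSE4.1 lacks unsigned compares, so both inputs
// are biased by the sign bit (0x80/0x8000/0x80000000) and then compared as
// signed values, which preserves the unsigned ordering. Worked uint8 example:
// 200 and 100 become 72 and -28 as int8 after adding 0x80 (mod 256), and
// 72 > -28 just as 200 > 100.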
NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u32, a, b, u32);
#endif
}
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = I128::GetHi(value);
  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
  return I128::NarrowFrom16To8(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, count, s8);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = I128::GetHi(value);
  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
  return _mm_packus_epi16(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, -count, s8);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, count, s64);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, -count, s64);
#endif
}
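// Usage note: the shift count is a runtime value applied uniformly to every
// lane. On NEON a negative count passed to vshlq shifts right, which is how
// the Right variants are built. Sketch: ShiftLeftLogical16(v, 3) multiplies
// each 16-bit lane by 8 (modulo 2^16).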
template <size_t N>
NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N <= 16);
#if defined(NLIB_SSE41)
  return _mm_slli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
#endif
}

template <size_t N>
NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N <= 16);
#if defined(NLIB_SSE41)
  return _mm_srli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, vdupq_n_s8(0), N);
#endif
}

template <size_t N>
NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(value, value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, value, N);
#endif
}

template <size_t N>
NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(a, b, N);
#elif defined(NLIB_NEON)
  return vextq_s8(b, a, N);
#endif
}
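// Sketch of the byte-wise shuffles (hypothetical lanes): with bytes
// v = v0..v15, ByteShiftRight<2>(v) yields v2..v15,0,0, and AlignR<2>(a, b)
// extracts bytes 2..17 of the concatenation b:a, i.e. b2..b15,a0,a1.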
NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_unpacklo_epi64(lo_, hi_);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
#else
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
  return NLIB_CMB(u32, l, h);
#endif
#endif
}
NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
  __m128i lotmp = _mm_and_si128(lo, b7FFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFF);
  return _mm_packus_epi16(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi16(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
#else
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
  return NLIB_CMB(s8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
  __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
  return _mm_packus_epi32(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi32(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
#else
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
  return NLIB_CMB(s16, l, h);
#endif
#endif
}
NLIB_M(i128) I128::ConvertFromInt8ToInt16(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vmovl_s8(value));
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt32(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vmovl_s16(vreinterpret_s16_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt64(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vmovl_s32(vreinterpret_s32_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromUint8ToUint16(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vmovl_u8(vreinterpret_u8_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromUint16ToUint32(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vmovl_u16(vreinterpret_u16_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint64(i64arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vmovl_u32(vreinterpret_u32_s8(value)));
#endif
}
NLIB_M(i128) I128::Zip8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi8(a, b);
#elif defined(NLIB_NEON)
  int8x8x2_t tmp = vzip_s8(a, b);
  return vcombine_s8(tmp.val[0], tmp.val[1]);
#endif
}

NLIB_M(i128) I128::Unzip8(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_unpacklo_epi16(a, b);
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {0, 4, 8, 12, 2, 6, 10, 14,
                                                   1, 5, 9, 13, 3, 7, 11, 15};
  return _mm_shuffle_epi8(tmp, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
  int8x8x2_t tmp = vuzp_s8(a, b);
  return vcombine_s8(tmp.val[0], tmp.val[1]);
#endif
}

NLIB_M(i128) I128::Zip16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi16(a, b);
#elif defined(NLIB_NEON)
  uint16x4x2_t tmp = vzip_u16(vreinterpret_u16_s8(a), vreinterpret_u16_s8(b));
  return NLIB_CMB(u16, tmp.val[0], tmp.val[1]);
#endif
}

NLIB_M(i128) I128::Unzip16(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_unpacklo_epi16(a, b);
  NLIB_ALIGNAS(16) static const int8_t mask[16] = {0, 1, 8, 9, 2, 3, 10, 11,
                                                   4, 5, 12, 13, 6, 7, 14, 15};
  return _mm_shuffle_epi8(tmp, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
  uint16x4x2_t tmp = vuzp_u16(vreinterpret_u16_s8(a), vreinterpret_u16_s8(b));
  return NLIB_CMB(u16, tmp.val[0], tmp.val[1]);
#endif
}

NLIB_M(i128) I128::Zip32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi32(a, b);
#elif defined(NLIB_NEON)
  // For two-element vectors, zip and unzip coincide.
  uint32x2x2_t tmp = vuzp_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
  return NLIB_CMB(u32, tmp.val[0], tmp.val[1]);
#endif
}

NLIB_M(i128) I128::Unzip32(i64arg a, i64arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi32(a, b);
#elif defined(NLIB_NEON)
  uint32x2x2_t tmp = vuzp_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
  return NLIB_CMB(u32, tmp.val[0], tmp.val[1]);
#endif
}
NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16q, u8, value);
#endif
}

NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32q, u8, value);
#endif
}

NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64q, u8, value);
#endif
}
NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x8040201008040201ULL,
                                                       0x8040201008040201ULL};
  uint64x2_t pwrtmp = vld1q_u64(powers_);
  uint8x16_t powers = vreinterpretq_u8_u64(pwrtmp);
  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
  uint8x16_t tmp = vpaddq_u8(a, a);
  tmp = vpaddq_u8(tmp, tmp);
  tmp = vpaddq_u8(tmp, tmp);
  return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
#else
  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x8040201008040201ULL,
                                                       0x8040201008040201ULL};
  uint64x2_t pwrtmp = vld1q_u64(powers_);
  uint8x16_t powers = vreinterpretq_u8_u64(pwrtmp);
  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
  uint8x8_t al = vget_low_u8(a);
  uint8x8_t ah = vget_high_u8(a);
  uint8x8_t tmp = vpadd_u8(al, ah);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
#endif
#endif
}
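// How the NEON fallback works: each 0x00/0xFF comparison byte is ANDed with
// 2^(lane mod 8), then repeated pairwise adds fold the 16 bytes down to two
// bytes holding the low- and high-half bitmasks; reading the first u16 lane
// combines them into the same 16-bit result _mm_movemask_epi8 produces.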
NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  return _mm_movemask_epi8(tmp) & 255;
#elif defined(NLIB_NEON)
  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x0008000400020001ULL,
                                                       0x0080004000200010ULL};
  uint64x2_t pwrtmp = vld1q_u64(powers_);
  uint16x8_t powers = vreinterpretq_u16_u64(pwrtmp);
  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
  uint8x8_t tmp = vmovn_u16(a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
#endif
}

NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  tmp = _mm_packs_epi16(tmp, tmp);
  return _mm_movemask_epi8(tmp) & 15;
#elif defined(NLIB_NEON)
  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x0000000200000001ULL,
                                                       0x0000000800000004ULL};
  uint64x2_t pwrtmp = vld1q_u64(powers_);
  uint32x4_t powers = vreinterpretq_u32_u64(pwrtmp);
  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
  uint16x4_t tmp = vmovn_u32(a);
  tmp = vpadd_u16(tmp, tmp);
  tmp = vpadd_u16(tmp, tmp);
  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
}
NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_testz_si128(value, value) != 0;
#elif defined(NLIB_NEON)
  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
#endif
}

NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
#elif defined(NLIB_NEON)
  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
#endif
}
NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP3(vbslq, u32, mask, a, b);
#endif
}

NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(value, shuffle);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
#else
  int8x8x2_t x;
  x.val[0] = vget_low_s8(value);
  x.val[1] = vget_high_s8(value);
  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
  return vcombine_s8(lo, hi);
#endif
#endif
}
#if defined(NLIB_NEON)
# undef vreinterpret_s8_s8
# undef vreinterpretq_s8_s8
# undef NLIB_OP1
# undef NLIB_OP2
# undef NLIB_OP3
# undef NLIB_CMP
# undef NLIB_SFT
# undef NLIB_CMB
#endif

#endif  // NLIB_DOXYGEN
#if defined(NLIB_SSE41)
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
    row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
    __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
    __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
    __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
    __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
    row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
    row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
    row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
    row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
  }
#elif defined(NLIB_NEON)
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint32x4_t row0_, row1_, row2_, row3_; \
    uint32x2_t lo, hi; \
    lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
    row0_ = vcombine_u32(lo, hi); \
    row0 = vreinterpretq_s8_u32(row0_); \
    lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
    row1_ = vcombine_u32(lo, hi); \
    row1 = vreinterpretq_s8_u32(row1_); \
    lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
    row2_ = vcombine_u32(lo, hi); \
    row2 = vreinterpretq_s8_u32(row2_); \
    lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
    row3_ = vcombine_u32(lo, hi); \
    row3 = vreinterpretq_s8_u32(row3_); \
  }
#endif
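// Usage sketch: NLIB_I128_TRANSPOSE32 transposes a 4x4 matrix of 32-bit lanes
// held in four registers, in place.
//   i128 r0, r1, r2, r3;  // rows of the matrix, loaded elsewhere
//   NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);  // r0 now holds the first column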
#endif  // NLIB_SIMD

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
each_int32_tag
An empty structure used as a tag for 32-bit signed integers.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent, to match the environment.
static i128 LoadA1(intptr_t p) noexcept
Wraps LoadA1(const void* p).
each_int64_tag
An empty structure used as a tag for 64-bit signed integers.
__m128i nlib_i64_t
The type for 64-bit integer SIMD registers.
static i128 LoadA4(intptr_t p) noexcept
Wraps LoadA4(const void* p).
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag; a tag for 8-bit unsigned integers.
int nlib_sse42_supported
On x86-family architectures, set to a nonzero value when SSE4.2 is supported. ...
static void StoreA16(uintptr_t p, i128arg value) noexcept
Wraps StoreA16(void* p, i128arg value).
each_select8_tag
An empty structure used as a tag for selecting 8-bit lanes.
each_select32_tag
An empty structure used as a tag for selecting 32-bit lanes.
static i128 LoadA1(uintptr_t p) noexcept
Wraps LoadA1(const void* p).
each_int8_tag
An empty structure used as a tag for 8-bit signed integers.
static i128 LoadA2(intptr_t p) noexcept
Wraps LoadA2(const void* p).
static void StoreA2(intptr_t p, i128arg value) noexcept
Wraps StoreA2(void* p, i128arg value).
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag; a tag for 16-bit unsigned integers.
static void StoreA16(intptr_t p, i128arg value) noexcept
Wraps StoreA16(void* p, i128arg value).
each_uint16_tag
An empty structure used as a tag for 16-bit unsigned integers.
each_uint64_tag
An empty structure used as a tag for 64-bit unsigned integers.
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag; a tag for 64-bit signed integers.
nlib_i128_t i128
A typedef of nlib_i128_t.
static void StoreA1(uintptr_t p, i128arg value) noexcept
Wraps StoreA1(void* p, i128arg value).
static i128 LoadA16(uintptr_t p) noexcept
Wraps LoadA16(const void* p).
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag; a tag for 64-bit unsigned integers.
#define NLIB_CEXPR
Defined as constexpr when it is available; otherwise defined as empty.
static i128 LoadA16(intptr_t p) noexcept
Wraps LoadA16(const void* p).
class I128
A class for integer SIMD operations using the 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ...
each_uint8_tag
An empty structure used as a tag for 8-bit unsigned integers.
each_int16_tag
An empty structure used as a tag for 16-bit signed integers.
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag; a tag for 16-bit signed integers.
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag for 32-bit unsigned integers.
each_select16_tag
An empty structure used as a tag for selecting 16-bit lanes.
static i128 LoadA8(intptr_t p) noexcept
Wraps LoadA8(const void* p).
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag; a tag for selecting 16-bit lanes. ...
static i128 LoadA2(uintptr_t p) noexcept
Wraps LoadA2(const void* p).
static void StoreA8(uintptr_t p, i128arg value) noexcept
Wraps StoreA8(void* p, i128arg value).
class I64
A class for the same integer SIMD operations as I128, but at 64-bit width.
__m128i nlib_i128_t
The type for 128-bit integer SIMD registers.
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag; a tag for selecting 8-bit lanes. ...
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag for 8-bit signed integers.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag for selecting 32-bit lanes. ...
static void StoreA8(intptr_t p, i128arg value) noexcept
Wraps StoreA8(void* p, i128arg value).
each_uint32_tag
An empty structure used as a tag for 32-bit unsigned integers.
nlib_i64_t i64
A typedef of nlib_i64_t.
static void StoreA4(intptr_t p, i128arg value) noexcept
Wraps StoreA4(void* p, i128arg value).
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag for 32-bit signed integers.
static void StoreA4(uintptr_t p, i128arg value) noexcept
Wraps StoreA4(void* p, i128arg value).
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion; uses static_assert when it is available.
static void StoreA1(intptr_t p, i128arg value) noexcept
Wraps StoreA1(void* p, i128arg value).
static void StoreA2(uintptr_t p, i128arg value) noexcept
Wraps StoreA2(void* p, i128arg value).
static i128 LoadA8(uintptr_t p) noexcept
Wraps LoadA8(const void* p).
static i128 LoadA4(uintptr_t p) noexcept
Wraps LoadA4(const void* p).