#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#if defined(NLIB_SSE41)
#elif defined(NLIB_NEON)
#endif

#if !defined(_MSC_VER) && !defined(__vectorcall)
#endif

#if defined(NLIB_SIMD)

#if defined(_MSC_VER) && _MSC_VER < 1800
typedef const i128& i128arg;
#else
typedef const i128 i128arg;
#endif
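// i128arg is how a vector is passed into the I128 helpers: a const reference on
// MSVC older than VS2013 (_MSC_VER < 1800), which cannot pass SIMD types by
// value here, and a plain by-value i128 on every other toolchain.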
#define NLIB_LOAD_REDIRECT(func)                                \
  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT {    \
    return func(reinterpret_cast<void*>(p));                    \
  }                                                             \
  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT {     \
    return func(reinterpret_cast<void*>(p));                    \
  }
NLIB_LOAD_REDIRECT(LoadA16)
NLIB_LOAD_REDIRECT(LoadA8)
NLIB_LOAD_REDIRECT(LoadA4)
NLIB_LOAD_REDIRECT(LoadA2)
NLIB_LOAD_REDIRECT(LoadA1)
NLIB_LOAD_REDIRECT(LoadLoA8)
NLIB_LOAD_REDIRECT(LoadLoA4)
NLIB_LOAD_REDIRECT(LoadLoA2)
NLIB_LOAD_REDIRECT(LoadLoA1)
NLIB_LOAD_REDIRECT(LoadHiA8)
NLIB_LOAD_REDIRECT(LoadHiA4)
NLIB_LOAD_REDIRECT(LoadHiA2)
NLIB_LOAD_REDIRECT(LoadHiA1)
#undef NLIB_LOAD_REDIRECT

static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
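// Usage sketch (assuming an NLIB_SIMD build): copy 16 bytes through a vector
// register. The A<n> suffix appears to encode the alignment the pointer must
// satisfy, so LoadA1/StoreA1 accept arbitrarily aligned data while
// LoadA16/StoreA16 expect 16-byte aligned addresses.
//   i128 v = I128::LoadA1(src);   // src may be unaligned
//   I128::StoreA16(dst, v);       // dst must be 16-byte aligned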
#define NLIB_STORE_REDIRECT(func)                                             \
  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT {   \
    func(reinterpret_cast<void*>(p), value);                                  \
  }                                                                           \
  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT {    \
    func(reinterpret_cast<void*>(p), value);                                  \
  }
NLIB_STORE_REDIRECT(StoreA16)
NLIB_STORE_REDIRECT(StoreA8)
NLIB_STORE_REDIRECT(StoreA4)
NLIB_STORE_REDIRECT(StoreA2)
NLIB_STORE_REDIRECT(StoreA1)
NLIB_STORE_REDIRECT(StoreLoA8)
NLIB_STORE_REDIRECT(StoreLoA4)
NLIB_STORE_REDIRECT(StoreLoA2)
NLIB_STORE_REDIRECT(StoreLoA1)
NLIB_STORE_REDIRECT(StoreHiA8)
NLIB_STORE_REDIRECT(StoreHiA4)
NLIB_STORE_REDIRECT(StoreHiA2)
NLIB_STORE_REDIRECT(StoreHiA1)
#undef NLIB_STORE_REDIRECT

static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
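// In the definitions further down, the lane index is a compile-time constant N
// handed to _mm_extract_*/_mm_insert_* and vgetq_lane_*/vsetq_lane_*, so these
// accessors are presumably invoked with the lane as a template argument, e.g.
// reading and rewriting the third 32-bit lane:
//   uint32_t x = I128::GetUint32FromLane<2>(v);
//   v = I128::SetUint32ToLane<2>(v, x + 1);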
static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
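// Per the definitions below, MultAdd16(a, b, c) computes c + a * b and
// MultSub16(a, b, c) computes c - a * b on each 16-bit lane (SSE:
// _mm_add_epi16(c, _mm_mullo_epi16(a, b)); NEON: vmlaq/vmlsq). The 32-bit
// variants do the same per 32-bit lane.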
static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
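// These run-time forms take the shift count as an int; templated forms defined
// later in the file take the count as a compile-time constant so the
// immediate-operand SSE/NEON instructions (e.g. vshrq_n_u16) can be used.
// Sketch:
//   i128 halved = I128::ShiftRightLogical16(v, 1);  // each u16 lane >> 1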
static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
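// Sketch: the Saturated forms narrow two input vectors into one, clamping to
// the destination range, e.g. sixteen uint8 lanes from two vectors of eight
// uint16 lanes:
//   i128 bytes = I128::ConvertFromUint16ToUint8Saturated(lo, hi);
// The Lo/Hi forms widen instead, zero-extending the low or high half of the
// input into lanes of twice the width.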
template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7, int V8, int V9,
          int V10, int V11, int V12, int V13, int V14, int V15>
static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
template <int V0, int V1, int V2, int V3>
static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
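// Permute8 builds each output byte lane from the pair (a, b): indices 0-15
// select lanes of a, 16-31 select lanes of b. Permute16 uses 0-7 / 8-15 and
// Permute32 uses 0-3 / 4-7 on 16- and 32-bit lanes respectively. Sketch:
//   i128 reversed = I128::Permute32<3, 2, 1, 0>(v, v);  // reverse 32-bit lanes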
static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall SumUint8(i128arg value) NLIB_NOEXCEPT;
static int __vectorcall SumUint16(i128arg value) NLIB_NOEXCEPT;
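// Sketch of a branch-free per-lane choice, assuming Select(mask, a, b) takes a
// where the mask bits are set and b elsewhere, and that MoveMask8 compresses a
// byte mask to one bit per lane in the spirit of _mm_movemask_epi8:
//   i128 m = I128::CmpLtInt32(x, limit);        // lanes where x < limit
//   i128 clamped = I128::Select(m, x, limit);   // per-lane min(x, limit)
//   if (I128::IsZero(m)) { /* no lane of x was below limit */ }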
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

#undef vreinterpret_s8_s8
#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))

#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
494 #elif defined(NLIB_NEON) 495 return vdupq_n_s8(v);
501 #if defined(NLIB_SSE41) 502 return _mm_set1_epi16(v);
503 #elif defined(NLIB_NEON) 504 return vreinterpretq_s8_s16(vdupq_n_s16(v));
510 #if defined(NLIB_SSE41) 511 return _mm_set1_epi32(v);
512 #elif defined(NLIB_NEON) 513 return vreinterpretq_s8_s32(vdupq_n_s32(v));
519 #if defined(NLIB_SSE41) 525 return I128::LoadA16(tmp);
527 return _mm_set1_epi64x(v);
529 #elif defined(NLIB_NEON) 530 return vreinterpretq_s8_s64(vdupq_n_s64(v));
536 #if defined(NLIB_SSE41) 538 return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
539 #elif defined(NLIB_NEON) 540 return vreinterpretq_s8_u8(vdupq_n_u8(v));
546 #if defined(NLIB_SSE41) 547 return _mm_set1_epi16(static_cast<int16_t>(v));
548 #elif defined(NLIB_NEON) 549 return vreinterpretq_s8_u16(vdupq_n_u16(v));
555 #if defined(NLIB_SSE41) 556 return _mm_set1_epi32(static_cast<int32_t>(v));
557 #elif defined(NLIB_NEON) 558 return vreinterpretq_s8_u32(vdupq_n_u32(v));
564 #if defined(NLIB_SSE41) 567 return I128::LoadA16(tmp);
569 return _mm_set1_epi64x(static_cast<int64_t>(v));
571 #elif defined(NLIB_NEON) 572 return vreinterpretq_s8_u64(vdupq_n_u64(v));
576 #if defined(NLIB_SSE41) 581 return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
583 #elif defined(NLIB_NEON) 589 uint32x4_t v = vreinterpretq_u32_s8(value);
590 return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
596 uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
597 return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
602 uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
603 return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
608 uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
609 return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
614 uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
615 return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
620 #if defined(NLIB_SSE41) 626 const int8_t mask[16] = {2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
627 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
628 2 * N, 2 * N + 1, 2 * N, 2 * N + 1};
629 return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
631 #elif defined(NLIB_NEON) 636 uint16x8_t v = vreinterpretq_u16_s8(value);
637 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
639 uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
640 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
648 uint16x8_t v = vreinterpretq_u16_s8(value);
649 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
651 uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
652 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
660 uint16x8_t v = vreinterpretq_u16_s8(value);
661 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
663 uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
664 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
672 uint16x8_t v = vreinterpretq_u16_s8(value);
673 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
675 uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
676 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
684 uint16x8_t v = vreinterpretq_u16_s8(value);
685 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
687 uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
688 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
696 uint16x8_t v = vreinterpretq_u16_s8(value);
697 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
699 uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
700 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
708 uint16x8_t v = vreinterpretq_u16_s8(value);
709 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
711 uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
712 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
720 uint16x8_t v = vreinterpretq_u16_s8(value);
721 return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
723 uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
724 return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
729 #if defined(NLIB_SSE41) 734 NLIB_ALIGNAS(16) const int8_t mask[16] = {N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N};
735 return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
#elif defined(NLIB_NEON)
template <size_t N, bool IsLower>
struct SetValue8Helper {
742 return vdupq_lane_s8(vget_low_s8(value), N);
747 struct SetValue8Helper<N, false> {
749 return vdupq_lane_s8(vget_high_s8(value), N - 8);
760 return vdupq_laneq_s8(value, N);
762 return detail::SetValue8Helper<N, (N < 8)>()(value);
769 #if defined(NLIB_SSE41) 770 return _mm_setzero_si128();
771 #elif defined(NLIB_NEON) 772 return vdupq_n_s8(0);
778 return I128::CmpEq8(dummy, dummy);
783 #if defined(NLIB_SSE41) 784 return _mm_load_si128(static_cast<const __m128i*>(p));
785 #elif defined(NLIB_NEON) 786 uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
787 return vreinterpretq_s8_u64(tmp);
793 #if defined(NLIB_SSE41) 794 return _mm_loadu_si128(static_cast<const __m128i*>(p));
795 #elif defined(NLIB_NEON) 796 uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
797 return vreinterpretq_s8_u64(tmp);
803 #if defined(NLIB_SSE41) 804 return _mm_loadu_si128(static_cast<const __m128i*>(p));
805 #elif defined(NLIB_NEON) 806 uint32x4_t tmp = vld1q_u32(static_cast<const uint32_t*>(p));
807 return vreinterpretq_s8_u32(tmp);
813 #if defined(NLIB_SSE41) 814 return _mm_loadu_si128(static_cast<const __m128i*>(p));
815 #elif defined(NLIB_NEON) 816 uint16x8_t tmp = vld1q_u16(static_cast<const uint16_t*>(p));
817 return vreinterpretq_s8_u16(tmp);
823 #if defined(NLIB_SSE41) 824 return _mm_loadu_si128(static_cast<const __m128i*>(p));
825 #elif defined(NLIB_NEON) 826 return vld1q_s8(static_cast<const int8_t*>(p));
831 #if defined(NLIB_SSE41) 832 return _mm_loadl_epi64(static_cast<const __m128i*>(p));
833 #elif defined(NLIB_NEON) 834 int8x8_t lo = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
835 return vcombine_s8(lo, vdup_n_s8(0));
840 #if defined(NLIB_SSE41) 841 return _mm_loadl_epi64(static_cast<const __m128i*>(p));
842 #elif defined(NLIB_NEON) 843 int8x8_t lo = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
844 return vcombine_s8(lo, vdup_n_s8(0));
849 #if defined(NLIB_SSE41) 850 return _mm_loadl_epi64(static_cast<const __m128i*>(p));
851 #elif defined(NLIB_NEON) 852 int8x8_t lo = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
853 return vcombine_s8(lo, vdup_n_s8(0));
858 #if defined(NLIB_SSE41) 859 return _mm_loadl_epi64(static_cast<const __m128i*>(p));
860 #elif defined(NLIB_NEON) 861 int8x8_t lo = vld1_s8(static_cast<const int8_t*>(p));
862 return vcombine_s8(lo, vdup_n_s8(0));
867 #if defined(NLIB_SSE41) 868 __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
869 return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
870 #elif defined(NLIB_NEON) 871 int8x8_t hi = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
872 return vcombine_s8(vdup_n_s8(0), hi);
877 #if defined(NLIB_SSE41) 878 __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
879 return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
880 #elif defined(NLIB_NEON) 881 int8x8_t hi = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
882 return vcombine_s8(vdup_n_s8(0), hi);
887 #if defined(NLIB_SSE41) 888 __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
889 return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
890 #elif defined(NLIB_NEON) 891 int8x8_t hi = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
892 return vcombine_s8(vdup_n_s8(0), hi);
897 #if defined(NLIB_SSE41) 898 __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
899 return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
900 #elif defined(NLIB_NEON) 901 int8x8_t hi = vld1_s8(static_cast<const int8_t*>(p));
902 return vcombine_s8(vdup_n_s8(0), hi);
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(static_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(static_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(static_cast<int8_t*>(p), value);
#endif
}

NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
  vst1_u64(static_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
  vst1_u32(static_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
  vst1_u16(static_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  int8x8_t x = vget_low_s8(value);
  vst1_s8(static_cast<int8_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
  vst1_u64(static_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
  vst1_u32(static_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
  vst1_u16(static_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int8x8_t x = vget_high_s8(value);
  vst1_s8(static_cast<int8_t*>(p), x);
#endif
}
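// Note: only StoreA16 uses the aligned _mm_store_si128 on SSE; the other
// variants all go through _mm_storeu_si128/_mm_storel_epi64, so on SSE the
// A8/A4/A2/A1 suffixes mostly document the caller's alignment guarantee. On
// NEON the suffix also selects the element type the pointer is cast to
// (uint64_t*/uint32_t*/uint16_t*/int8_t*).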
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
1072 #if defined(NLIB_SSE41) && !defined(NLIB_64BIT) 1077 _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
1084 i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
1085 _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v)
             ? I128::Permute8<N == 0 ? 16 : 0, N == 1 ? 17 : 1, N == 2 ? 18 : 2,
                              N == 3 ? 19 : 3, N == 4 ? 20 : 4, N == 5 ? 21 : 5,
                              N == 6 ? 22 : 6, N == 7 ? 23 : 7, N == 8 ? 24 : 8,
                              N == 9 ? 25 : 9, N == 10 ? 26 : 10, N == 11 ? 27 : 11,
                              N == 12 ? 28 : 12, N == 13 ? 29 : 13, N == 14 ? 30 : 14,
                              N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v)))
             : vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}

#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v)
             ? I128::Permute16<N == 0 ? 8 : 0, N == 1 ? 9 : 1, N == 2 ? 10 : 2,
                               N == 3 ? 11 : 3, N == 4 ? 12 : 4, N == 5 ? 13 : 5,
                               N == 6 ? 14 : 6,
                               N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v)))
             : vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<uint32_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v)
             ? I128::Permute32<N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
                               N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v)))
             : vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}
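// In the NEON branches above, __builtin_constant_p(v) routes compile-time
// constant values through a Permute (a byte table lookup against a splatted
// constant), so the whole lane insert can fold at compile time; run-time
// values fall back to the vsetq_lane_* intrinsics.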
1142 #if defined(NLIB_SSE41) 1144 return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
1150 tmp.i64 =
static_cast<int64_t
>(v);
1152 rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
1153 return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
1155 #elif defined(NLIB_NEON) 1156 return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
1162 #if defined(NLIB_SSE41) 1163 return _mm_add_epi8(a, b);
1164 #elif defined(NLIB_NEON) 1165 return vaddq_s8(a, b);
1171 #if defined(NLIB_SSE41) 1172 return _mm_add_epi16(a, b);
1173 #elif defined(NLIB_NEON) 1174 return NLIB_OP2(vaddq, s16, a, b);
1180 #if defined(NLIB_SSE41) 1181 return _mm_add_epi32(a, b);
1182 #elif defined(NLIB_NEON) 1183 return NLIB_OP2(vaddq, s32, a, b);
1189 #if defined(NLIB_SSE41) 1190 return _mm_add_epi64(a, b);
1191 #elif defined(NLIB_NEON) 1192 return NLIB_OP2(vaddq, s64, a, b);
1198 #if defined(NLIB_SSE41) 1199 return _mm_adds_epi8(a, b);
1200 #elif defined(NLIB_NEON) 1201 return vqaddq_s8(a, b);
1207 #if defined(NLIB_SSE41) 1208 return _mm_adds_epi16(a, b);
1209 #elif defined(NLIB_NEON) 1210 return NLIB_OP2(vqaddq, s16, a, b);
1216 #if defined(NLIB_SSE41) 1217 return _mm_adds_epu8(a, b);
1218 #elif defined(NLIB_NEON) 1219 return NLIB_OP2(vqaddq, u8, a, b);
1225 #if defined(NLIB_SSE41) 1226 return _mm_adds_epu16(a, b);
1227 #elif defined(NLIB_NEON) 1228 return NLIB_OP2(vqaddq, u16, a, b);
1234 #if defined(NLIB_SSE41) 1235 return _mm_sub_epi8(a, b);
1236 #elif defined(NLIB_NEON) 1237 return vsubq_s8(a, b);
1243 #if defined(NLIB_SSE41) 1244 return _mm_sub_epi16(a, b);
1245 #elif defined(NLIB_NEON) 1246 return NLIB_OP2(vsubq, s16, a, b);
1252 #if defined(NLIB_SSE41) 1253 return _mm_sub_epi32(a, b);
1254 #elif defined(NLIB_NEON) 1255 return NLIB_OP2(vsubq, s32, a, b);
1261 #if defined(NLIB_SSE41) 1262 return _mm_sub_epi64(a, b);
1263 #elif defined(NLIB_NEON) 1264 return NLIB_OP2(vsubq, s64, a, b);
1270 #if defined(NLIB_SSE41) 1271 return _mm_subs_epi8(a, b);
1272 #elif defined(NLIB_NEON) 1273 return NLIB_OP2(vqsubq, s8, a, b);
1279 #if defined(NLIB_SSE41) 1280 return _mm_subs_epi16(a, b);
1281 #elif defined(NLIB_NEON) 1282 return NLIB_OP2(vqsubq, s16, a, b);
1288 #if defined(NLIB_SSE41) 1289 return _mm_subs_epu8(a, b);
1290 #elif defined(NLIB_NEON) 1291 return NLIB_OP2(vqsubq, u8, a, b);
1297 #if defined(NLIB_SSE41) 1298 return _mm_subs_epu16(a, b);
1299 #elif defined(NLIB_NEON) 1300 return NLIB_OP2(vqsubq, u16, a, b);
1306 #if defined(NLIB_SSE41) 1307 __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
1308 __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
1309 return I128::NarrowFrom16To8(ax, bx);
1310 #elif defined(NLIB_NEON) 1312 return vpaddq_s8(a, b);
1314 int8x8_t al = vget_low_s8(a);
1315 int8x8_t ah = vget_high_s8(a);
1316 int8x8_t rl = vpadd_s8(al, ah);
1317 int8x8_t bl = vget_low_s8(b);
1318 int8x8_t bh = vget_high_s8(b);
1319 int8x8_t rh = vpadd_s8(bl, bh);
1320 return vcombine_s8(rl, rh);
1327 #if defined(NLIB_SSE41) 1328 return _mm_hadd_epi16(a, b);
1329 #elif defined(NLIB_NEON) 1331 return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
1333 int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
1334 int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
1335 int16x4_t rl = vpadd_s16(al, ah);
1336 int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
1337 int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
1338 int16x4_t rh = vpadd_s16(bl, bh);
1339 return NLIB_CMB(s16, rl, rh);
1346 #if defined(NLIB_SSE41) 1347 return _mm_hadd_epi32(a, b);
1348 #elif defined(NLIB_NEON) 1350 return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
1352 int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
1353 int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
1354 int32x2_t rl = vpadd_s32(al, ah);
1355 int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
1356 int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
1357 int32x2_t rh = vpadd_s32(bl, bh);
1358 return NLIB_CMB(s32, rl, rh);
1365 #if defined(NLIB_SSE41) 1366 return _mm_mullo_epi16(a, b);
1367 #elif defined(NLIB_NEON) 1368 return NLIB_OP2(vmulq, s16, a, b);
1374 #if defined(NLIB_SSE41) 1375 return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
1376 #elif defined(NLIB_NEON) 1377 return NLIB_OP3(vmlaq, s16, c, a, b);
1383 #if defined(NLIB_SSE41) 1384 return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
1385 #elif defined(NLIB_NEON) 1386 return NLIB_OP3(vmlsq, s16, c, a, b);
1392 #if defined(NLIB_SSE41) 1393 return _mm_mullo_epi32(a, b);
1394 #elif defined(NLIB_NEON) 1395 return NLIB_OP2(vmulq, s32, a, b);
1401 #if defined(NLIB_SSE41) 1402 return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
1403 #elif defined(NLIB_NEON) 1404 return NLIB_OP3(vmlaq, s32, c, a, b);
1410 #if defined(NLIB_SSE41) 1411 return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
1412 #elif defined(NLIB_NEON) 1413 return NLIB_OP3(vmlsq, s32, c, a, b);
1419 #if defined(NLIB_SSE41) 1420 return _mm_max_epi8(a, b);
1421 #elif defined(NLIB_NEON) 1422 return NLIB_OP2(vmaxq, s8, a, b);
1428 #if defined(NLIB_SSE41) 1429 return _mm_max_epi16(a, b);
1430 #elif defined(NLIB_NEON) 1431 return NLIB_OP2(vmaxq, s16, a, b);
1437 #if defined(NLIB_SSE41) 1438 return _mm_max_epi32(a, b);
1439 #elif defined(NLIB_NEON) 1440 return NLIB_OP2(vmaxq, s32, a, b);
1446 #if defined(NLIB_SSE41) 1447 return _mm_max_epu8(a, b);
1448 #elif defined(NLIB_NEON) 1449 return NLIB_OP2(vmaxq, u8, a, b);
1455 #if defined(NLIB_SSE41) 1456 return _mm_max_epu16(a, b);
1457 #elif defined(NLIB_NEON) 1458 return NLIB_OP2(vmaxq, u16, a, b);
1464 #if defined(NLIB_SSE41) 1465 return _mm_max_epu32(a, b);
1466 #elif defined(NLIB_NEON) 1467 return NLIB_OP2(vmaxq, u32, a, b);
1473 #if defined(NLIB_SSE41) 1474 return _mm_min_epi8(a, b);
1475 #elif defined(NLIB_NEON) 1476 return NLIB_OP2(vminq, s8, a, b);
1482 #if defined(NLIB_SSE41) 1483 return _mm_min_epi16(a, b);
1484 #elif defined(NLIB_NEON) 1485 return NLIB_OP2(vminq, s16, a, b);
1491 #if defined(NLIB_SSE41) 1492 return _mm_min_epi32(a, b);
1493 #elif defined(NLIB_NEON) 1494 return NLIB_OP2(vminq, s32, a, b);
1500 #if defined(NLIB_SSE41) 1501 return _mm_min_epu8(a, b);
1502 #elif defined(NLIB_NEON) 1503 return NLIB_OP2(vminq, u8, a, b);
1509 #if defined(NLIB_SSE41) 1510 return _mm_min_epu16(a, b);
1511 #elif defined(NLIB_NEON) 1512 return NLIB_OP2(vminq, u16, a, b);
1518 #if defined(NLIB_SSE41) 1519 return _mm_min_epu32(a, b);
1520 #elif defined(NLIB_NEON) 1521 return NLIB_OP2(vminq, u32, a, b);
1527 #if defined(NLIB_SSE41) 1528 return _mm_abs_epi8(value);
1529 #elif defined(NLIB_NEON) 1530 return NLIB_OP1(vabsq, s8, value);
1536 #if defined(NLIB_SSE41) 1537 return _mm_abs_epi16(value);
1538 #elif defined(NLIB_NEON) 1539 return NLIB_OP1(vabsq, s16, value);
1545 #if defined(NLIB_SSE41) 1546 return _mm_abs_epi32(value);
1547 #elif defined(NLIB_NEON) 1548 return NLIB_OP1(vabsq, s32, value);
1554 #if defined(NLIB_SSE41) 1555 return _mm_abs_epi8(_mm_sub_epi8(a, b));
1556 #elif defined(NLIB_NEON) 1557 return NLIB_OP2(vabdq, s8, a, b);
1563 #if defined(NLIB_SSE41) 1564 return _mm_abs_epi16(_mm_sub_epi16(a, b));
1565 #elif defined(NLIB_NEON) 1566 return NLIB_OP2(vabdq, s16, a, b);
1572 #if defined(NLIB_SSE41) 1573 return _mm_abs_epi32(_mm_sub_epi32(a, b));
1574 #elif defined(NLIB_NEON) 1575 return NLIB_OP2(vabdq, s32, a, b);
1581 #if defined(NLIB_SSE41) 1582 return _mm_sub_epi8(_mm_setzero_si128(), value);
1583 #elif defined(NLIB_NEON) 1584 return NLIB_OP1(vnegq, s8, value);
1590 #if defined(NLIB_SSE41) 1591 return _mm_sub_epi16(_mm_setzero_si128(), value);
1592 #elif defined(NLIB_NEON) 1593 return NLIB_OP1(vnegq, s16, value);
1599 #if defined(NLIB_SSE41) 1600 return _mm_sub_epi32(_mm_setzero_si128(), value);
1601 #elif defined(NLIB_NEON) 1602 return NLIB_OP1(vnegq, s32, value);
1608 #if defined(NLIB_SSE41) 1609 return _mm_and_si128(a, b);
1610 #elif defined(NLIB_NEON) 1611 return NLIB_OP2(vandq, s8, a, b);
1617 #if defined(NLIB_SSE41) 1618 return _mm_or_si128(a, b);
1619 #elif defined(NLIB_NEON) 1620 return NLIB_OP2(vorrq, s8, a, b);
1626 #if defined(NLIB_SSE41) 1627 return _mm_xor_si128(a, b);
1628 #elif defined(NLIB_NEON) 1629 return NLIB_OP2(veorq, s8, a, b);
1635 #if defined(NLIB_SSE41) 1636 return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1637 #elif defined(NLIB_NEON) 1638 return NLIB_OP1(vmvnq, s8, a);
1644 #if defined(NLIB_SSE41) 1645 return _mm_andnot_si128(a, b);
1646 #elif defined(NLIB_NEON) 1647 return NLIB_OP2(vbicq, s8, b, a);
1653 #if defined(NLIB_SSE41) 1654 __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1655 return _mm_or_si128(not_a, b);
1656 #elif defined(NLIB_NEON) 1657 return NLIB_OP2(vornq, s8, b, a);
1662 #if defined(NLIB_NEON) 1663 return vtstq_s8(a, b);
1665 return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
1670 #if defined(NLIB_NEON) 1671 return NLIB_OP2(vtstq, s16, a, b);
1673 return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
1678 #if defined(NLIB_NEON) 1679 return NLIB_OP2(vtstq, s32, a, b);
1681 return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
1687 #if defined(NLIB_SSE41) 1688 return _mm_cmpeq_epi8(a, b);
1689 #elif defined(NLIB_NEON) 1690 return NLIB_CMP(vceqq, s8, a, b, u8);
1696 #if defined(NLIB_SSE41) 1697 return _mm_cmpeq_epi16(a, b);
1698 #elif defined(NLIB_NEON) 1699 return NLIB_CMP(vceqq, s16, a, b, u16);
1705 #if defined(NLIB_SSE41) 1706 return _mm_cmpeq_epi32(a, b);
1707 #elif defined(NLIB_NEON) 1708 return NLIB_CMP(vceqq, s32, a, b, u32);
1714 #if defined(NLIB_SSE41) 1715 return _mm_cmpeq_epi64(a, b);
1716 #elif defined(NLIB_NEON) 1718 return NLIB_CMP(vceqq, s64, a, b, u64);
1720 uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
1721 uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
1722 uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
1723 int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1724 return vreinterpretq_s8_s64(result);
1731 #if defined(NLIB_SSE41) 1732 return _mm_cmplt_epi8(a, b);
1733 #elif defined(NLIB_NEON) 1734 return NLIB_CMP(vcltq, s8, a, b, u8);
1740 #if defined(NLIB_SSE41) 1741 return _mm_cmplt_epi16(a, b);
1742 #elif defined(NLIB_NEON) 1743 return NLIB_CMP(vcltq, s16, a, b, u16);
1749 #if defined(NLIB_SSE41) 1750 return _mm_cmplt_epi32(a, b);
1751 #elif defined(NLIB_NEON) 1752 return NLIB_CMP(vcltq, s32, a, b, u32);
1758 #if defined(NLIB_SSE42) 1759 return _mm_cmpgt_epi64(b, a);
1760 #elif defined(NLIB_NEON) 1762 return NLIB_CMP(vcltq, s64, a, b, u64);
1765 vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)), vreinterpret_s32_s8(vget_high_s8(a)));
1767 vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)), vreinterpret_s32_s8(vget_high_s8(b)));
1768 uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
1769 uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
1770 uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1771 uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1772 int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1773 return vreinterpretq_s8_s64(result);
1776 i128 cmp = I128::CmpLtInt32(a, b);
1777 i128 eq = I128::CmpEq32(a, b);
1778 i128 cmp_lt = I128::CmpLtUint32(a, b);
1779 i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1780 i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
1781 i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1782 return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1788 #if defined(NLIB_SSE41) 1789 return _mm_cmpgt_epi8(a, b);
1790 #elif defined(NLIB_NEON) 1791 return NLIB_CMP(vcgtq, s8, a, b, u8);
1797 #if defined(NLIB_SSE41) 1798 return _mm_cmpgt_epi16(a, b);
1799 #elif defined(NLIB_NEON) 1800 return NLIB_CMP(vcgtq, s16, a, b, u16);
1806 #if defined(NLIB_SSE41) 1807 return _mm_cmpgt_epi32(a, b);
1808 #elif defined(NLIB_NEON) 1809 return NLIB_CMP(vcgtq, s32, a, b, u32);
1815 #if defined(NLIB_SSE42) 1816 return _mm_cmpgt_epi64(a, b);
1817 #elif defined(NLIB_NEON) && defined(__aarch64__) 1818 return NLIB_CMP(vcgtq, s64, a, b, u64);
1820 return I128::CmpLtInt64(b, a);
1826 #if defined(NLIB_SSE41) 1828 return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1829 #elif defined(NLIB_NEON) 1830 return NLIB_CMP(vcltq, u8, a, b, u8);
1836 #if defined(NLIB_SSE41) 1838 return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1839 #elif defined(NLIB_NEON) 1840 return NLIB_CMP(vcgtq, u8, a, b, u8);
1846 #if defined(NLIB_SSE41) 1848 return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1849 #elif defined(NLIB_NEON) 1850 return NLIB_CMP(vcltq, u16, a, b, u16);
1856 #if defined(NLIB_SSE41) 1858 return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1859 #elif defined(NLIB_NEON) 1860 return NLIB_CMP(vcgtq, u16, a, b, u16);
1866 #if defined(NLIB_SSE41) 1868 return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1869 #elif defined(NLIB_NEON) 1870 return NLIB_CMP(vcltq, u32, a, b, u32);
1876 #if defined(NLIB_SSE41) 1878 return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1879 #elif defined(NLIB_NEON) 1880 return NLIB_CMP(vcgtq, u32, a, b, u32);
1886 #if defined(NLIB_SSE42) 1888 return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
1889 #elif defined(NLIB_NEON) 1891 return NLIB_CMP(vcltq, u64, a, b, u64);
1893 uint32x2x2_t trn_a =
1894 vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)), vreinterpret_u32_s8(vget_high_s8(a)));
1895 uint32x2x2_t trn_b =
1896 vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)), vreinterpret_u32_s8(vget_high_s8(b)));
1897 uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
1898 uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
1899 uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1900 uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1901 int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1902 return vreinterpretq_s8_s64(result);
1905 i128 cmp = I128::CmpLtUint32(a, b);
1906 i128 eq = I128::CmpEq32(a, b);
1907 i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1908 i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
1909 i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1910 return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1916 #if defined(NLIB_SSE42) 1918 return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
1919 #elif defined(NLIB_NEON) && defined(__aarch64__) 1920 return NLIB_CMP(vcgtq, u64, a, b, u64);
1922 return I128::CmpLtUint64(b, a);
1928 #if defined(NLIB_SSE41) 1929 return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1930 #elif defined(NLIB_NEON) 1931 return NLIB_CMP(vcleq, s8, a, b, u8);
1937 #if defined(NLIB_SSE41) 1938 return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1939 #elif defined(NLIB_NEON) 1940 return NLIB_CMP(vcleq, s16, a, b, u16);
1946 #if defined(NLIB_SSE41) 1947 return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1948 #elif defined(NLIB_NEON) 1949 return NLIB_CMP(vcleq, s32, a, b, u32);
1955 #if defined(NLIB_SSE42) 1956 return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1957 #elif defined(NLIB_NEON) && defined(__aarch64__) 1958 return NLIB_CMP(vcleq, s64, a, b, u64);
1960 return I128::Not(I128::CmpGtInt64(a, b));
1966 #if defined(NLIB_SSE41) 1967 return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1968 #elif defined(NLIB_NEON) 1969 return NLIB_CMP(vcgeq, s8, a, b, u8);
1975 #if defined(NLIB_SSE41) 1976 return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1977 #elif defined(NLIB_NEON) 1978 return NLIB_CMP(vcgeq, s16, a, b, u16);
1984 #if defined(NLIB_SSE41) 1985 return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1986 #elif defined(NLIB_NEON) 1987 return NLIB_CMP(vcgeq, s32, a, b, u32);
1993 #if defined(NLIB_SSE42) 1994 return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
1995 #elif defined(NLIB_NEON) && defined(__aarch64__) 1996 return NLIB_CMP(vcgeq, s64, a, b, u64);
1998 return I128::Not(I128::CmpLtInt64(a, b));
2004 #if defined(NLIB_SSE41) 2005 return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2006 #elif defined(NLIB_NEON) 2007 return NLIB_CMP(vcleq, u8, a, b, u8);
2013 #if defined(NLIB_SSE41) 2014 return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2015 #elif defined(NLIB_NEON) 2016 return NLIB_CMP(vcleq, u16, a, b, u16);
2022 #if defined(NLIB_SSE41) 2023 return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2024 #elif defined(NLIB_NEON) 2025 return NLIB_CMP(vcleq, u32, a, b, u32);
2031 #if defined(NLIB_SSE42) 2033 i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2034 return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2035 #elif defined(NLIB_NEON) && defined(__aarch64__) 2036 return NLIB_CMP(vcleq, u64, a, b, u64);
2038 return I128::Not(I128::CmpGtUint64(a, b));
2044 #if defined(NLIB_SSE41) 2045 return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
2046 #elif defined(NLIB_NEON) 2047 return NLIB_CMP(vcgeq, u8, a, b, u8);
2053 #if defined(NLIB_SSE41) 2054 return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
2055 #elif defined(NLIB_NEON) 2056 return NLIB_CMP(vcgeq, u16, a, b, u16);
2062 #if defined(NLIB_SSE41) 2063 return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
2064 #elif defined(NLIB_NEON) 2065 return NLIB_CMP(vcgeq, u32, a, b, u32);
2071 #if defined(NLIB_SSE42) 2073 i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
2074 return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2075 #elif defined(NLIB_NEON) && defined(__aarch64__) 2076 return NLIB_CMP(vcgeq, u64, a, b, u64);
2078 return I128::Not(I128::CmpLtUint64(a, b));
2083 #if defined(__aarch64__) 2084 return vceqzq_s8(value);
2086 return I128::CmpEq8(value, I128::SetZero());
2091 #if defined(__aarch64__) 2092 return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
2094 return I128::CmpEq16(value, I128::SetZero());
2099 #if defined(__aarch64__) 2100 return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
2102 return I128::CmpEq32(value, I128::SetZero());
2107 #if defined(__aarch64__) 2108 return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
2110 return I128::CmpEq64(value, I128::SetZero());
2116 #if defined(NLIB_SSE41) 2117 __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2118 __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
2119 __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
2120 return I128::NarrowFrom16To8(xl, xh);
2121 #elif defined(NLIB_NEON) 2122 return NLIB_SFT(vshlq, u8, value, count, s8);
2128 #if defined(NLIB_SSE41) 2129 __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2130 __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
2131 __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
2132 return _mm_packus_epi16(xl, xh);
2133 #elif defined(NLIB_NEON) 2134 return NLIB_SFT(vshlq, u8, value, -count, s8);
NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
2140 #if defined(NLIB_SSE41) 2141 __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2142 __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
2143 __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
2144 return _mm_packus_epi16(xl, xh);
2145 #elif defined(NLIB_NEON) 2146 return NLIB_SFT(vshlq, s8, value, -count, s8);
2152 #if defined(NLIB_SSE41) 2153 return _mm_slli_epi16(value, count);
2154 #elif defined(NLIB_NEON) 2155 return NLIB_SFT(vshlq, u16, value, count, s16);
NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2161 #if defined(NLIB_SSE41) 2162 return _mm_srli_epi16(value, count);
2163 #elif defined(NLIB_NEON) 2164 return NLIB_SFT(vshlq, u16, value, -count, s16);
NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
2170 #if defined(NLIB_SSE41) 2171 return _mm_srai_epi16(value, count);
2172 #elif defined(NLIB_NEON) 2173 return NLIB_SFT(vshlq, s16, value, -count, s16);
2179 #if defined(NLIB_SSE41) 2180 return _mm_slli_epi32(value, count);
2181 #elif defined(NLIB_NEON) 2182 return NLIB_SFT(vshlq, u32, value, count, s32);
NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2188 #if defined(NLIB_SSE41) 2189 return _mm_srli_epi32(value, count);
2190 #elif defined(NLIB_NEON) 2191 return NLIB_SFT(vshlq, u32, value, -count, s32);
NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
2197 #if defined(NLIB_SSE41) 2198 return _mm_srai_epi32(value, count);
2199 #elif defined(NLIB_NEON) 2200 return NLIB_SFT(vshlq, s32, value, -count, s32);
2206 #if defined(NLIB_SSE41) 2207 return _mm_slli_epi64(value, count);
2208 #elif defined(NLIB_NEON) 2209 return NLIB_SFT(vshlq, u64, value, count, s64);
NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2215 #if defined(NLIB_SSE41) 2216 return _mm_srli_epi64(value, count);
2217 #elif defined(NLIB_NEON) 2218 return NLIB_SFT(vshlq, u64, value, -count, s64);
2227 return vshlq_n_s8(value, N);
2229 return I128::ShiftLeftLogical8(value, N);
2238 uint8x16_t tmp = vreinterpretq_u8_s8(value);
2239 return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
2241 return I128::ShiftRightLogical8(value, N);
2250 return vshrq_n_s8(value, N);
2252 return I128::ShiftRightArithmetic8(value, N);
2261 uint16x8_t tmp = vreinterpretq_u16_s8(value);
2262 return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
2264 return I128::ShiftLeftLogical16(value, N);
2273 uint16x8_t tmp = vreinterpretq_u16_s8(value);
2274 return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
2276 return I128::ShiftRightLogical16(value, N);
2285 int16x8_t tmp = vreinterpretq_s16_s8(value);
2286 return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
2288 return I128::ShiftRightArithmetic16(value, N);
2297 uint32x4_t tmp = vreinterpretq_u32_s8(value);
2298 return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
2300 return I128::ShiftLeftLogical32(value, N);
2309 uint32x4_t tmp = vreinterpretq_u32_s8(value);
2310 return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
2312 return I128::ShiftRightLogical32(value, N);
2321 int32x4_t tmp = vreinterpretq_s32_s8(value);
2322 return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
2324 return I128::ShiftRightArithmetic32(value, N);
2333 uint64x2_t tmp = vreinterpretq_u64_s8(value);
2334 return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
2336 return I128::ShiftLeftLogical64(value, N);
2345 uint64x2_t tmp = vreinterpretq_u64_s8(value);
2346 return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
2348 return I128::ShiftRightLogical64(value, N);
2357 return I128::SetZero();
2368 return I128::SetZero();
I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
2384 return I128::SetZero();
I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
2400 return I128::SetZero();
2413 #if defined(NLIB_SSE41) 2414 return _mm_slli_si128(value, N);
2415 #elif defined(NLIB_NEON) 2416 return vextq_s8(vdupq_n_s8(0), value, 16 - N);
2424 #if defined(NLIB_SSE41) 2425 return _mm_srli_si128(value, N);
2426 #elif defined(NLIB_NEON) 2427 return vextq_s8(value, vdupq_n_s8(0), N);
2435 #if defined(NLIB_SSE41) 2436 return _mm_alignr_epi8(value, value, N);
2437 #elif defined(NLIB_NEON) 2438 return vextq_s8(value, value, N);
2446 #if defined(NLIB_SSE41) 2447 return _mm_alignr_epi8(a, b, N);
2448 #elif defined(NLIB_NEON) 2449 return vextq_s8(b, a, N);
2455 #if defined(NLIB_SSE41) 2457 __m128i lo_mask = _mm_and_si128(lo, mask);
2458 __m128i hi_mask = _mm_and_si128(hi, mask);
2459 return _mm_packus_epi16(lo_mask, hi_mask);
2460 #elif defined(NLIB_NEON) 2462 uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2463 return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2465 uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2466 uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
2467 return NLIB_CMB(u8, l, h);
2474 #if defined(NLIB_SSE41) 2476 __m128i lo_mask = _mm_and_si128(lo, mask);
2477 __m128i hi_mask = _mm_and_si128(hi, mask);
2478 return _mm_packus_epi32(lo_mask, hi_mask);
2479 #elif defined(NLIB_NEON) 2481 uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2482 return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2484 uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2485 uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
2486 return NLIB_CMB(u16, l, h);
2493 #if defined(NLIB_SSE41) 2494 __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
2495 __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
2496 return _mm_unpacklo_epi64(lo_, hi_);
2497 #elif defined(NLIB_NEON) 2499 uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2500 return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
2502 uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2503 uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
2504 return NLIB_CMB(u32, l, h);
NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2511 #if defined(NLIB_SSE41) 2513 __m128i lotmp = _mm_and_si128(lo, b7FFF);
2514 __m128i hitmp = _mm_and_si128(hi, b7FFF);
2515 return _mm_packus_epi16(lotmp, hitmp);
2516 #elif defined(NLIB_NEON) 2518 uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2519 return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2521 uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2522 uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
2523 return NLIB_CMB(u8, l, h);
NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2530 #if defined(NLIB_SSE41) 2531 return _mm_packs_epi16(lo, hi);
2532 #elif defined(NLIB_NEON) 2534 int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2535 return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
2537 int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2538 int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
2539 return NLIB_CMB(s8, l, h);
NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2546 #if defined(NLIB_SSE41) 2548 __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
2549 __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
2550 return _mm_packus_epi32(lotmp, hitmp);
2551 #elif defined(NLIB_NEON) 2553 uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2554 return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2556 uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2557 uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
2558 return NLIB_CMB(u16, l, h);
NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2565 #if defined(NLIB_SSE41) 2566 return _mm_packs_epi32(lo, hi);
2567 #elif defined(NLIB_NEON) 2569 int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2570 return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
2572 int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2573 int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
2574 return NLIB_CMB(s16, l, h);
2581 #if defined(NLIB_SSE41) 2582 return _mm_cvtepi8_epi16(value);
2583 #elif defined(NLIB_NEON) 2584 return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
2590 #if defined(NLIB_SSE41) 2591 return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2592 #elif defined(NLIB_NEON) 2594 int16x8_t result = vmovl_high_s8(value);
2596 int16x8_t result = vmovl_s8(vget_high_s8(value));
2598 return vreinterpretq_s8_s16(result);
2604 #if defined(NLIB_SSE41) 2605 return _mm_cvtepi16_epi32(value);
2606 #elif defined(NLIB_NEON) 2607 int16x8_t x = vreinterpretq_s16_s8(value);
2608 int32x4_t result = vmovl_s16(vget_low_s16(x));
2609 return vreinterpretq_s8_s32(result);
2615 #if defined(NLIB_SSE41) 2616 return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2617 #elif defined(NLIB_NEON) 2618 int16x8_t x = vreinterpretq_s16_s8(value);
2620 int32x4_t result = vmovl_high_s16(x);
2622 int32x4_t result = vmovl_s16(vget_high_s16(x));
2624 return vreinterpretq_s8_s32(result);
2630 #if defined(NLIB_SSE41) 2631 return _mm_cvtepi32_epi64(value);
2632 #elif defined(NLIB_NEON) 2633 int32x4_t x = vreinterpretq_s32_s8(value);
2634 int64x2_t result = vmovl_s32(vget_low_s32(x));
2635 return vreinterpretq_s8_s64(result);
2641 #if defined(NLIB_SSE41) 2642 return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2643 #elif defined(NLIB_NEON) 2644 int32x4_t x = vreinterpretq_s32_s8(value);
2646 int64x2_t result = vmovl_high_s32(x);
2648 int64x2_t result = vmovl_s32(vget_high_s32(x));
2650 return vreinterpretq_s8_s64(result);
2656 #if defined(NLIB_SSE41) 2657 return _mm_cvtepu8_epi16(value);
2658 #elif defined(NLIB_NEON) 2659 uint8x16_t x = vreinterpretq_u8_s8(value);
2660 uint16x8_t result = vmovl_u8(vget_low_u8(x));
2661 return vreinterpretq_s8_u16(result);
2667 #if defined(NLIB_SSE41) 2668 return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2669 #elif defined(NLIB_NEON) 2670 uint8x16_t x = vreinterpretq_u8_s8(value);
2672 uint16x8_t result = vmovl_high_u8(x);
2674 uint16x8_t result = vmovl_u8(vget_high_u8(x));
2676 return vreinterpretq_s8_u16(result);
2682 #if defined(NLIB_SSE41) 2683 return _mm_cvtepu16_epi32(value);
2684 #elif defined(NLIB_NEON) 2685 uint16x8_t x = vreinterpretq_u16_s8(value);
2686 uint32x4_t result = vmovl_u16(vget_low_u16(x));
2687 return vreinterpretq_s8_u32(result);
2693 #if defined(NLIB_SSE41) 2694 return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2695 #elif defined(NLIB_NEON) 2696 uint16x8_t x = vreinterpretq_u16_s8(value);
2698 uint32x4_t result = vmovl_high_u16(x);
2700 uint32x4_t result = vmovl_u16(vget_high_u16(x));
2702 return vreinterpretq_s8_u32(result);
2708 #if defined(NLIB_SSE41) 2709 return _mm_cvtepu32_epi64(value);
2710 #elif defined(NLIB_NEON) 2711 uint32x4_t x = vreinterpretq_u32_s8(value);
2712 uint64x2_t result = vmovl_u32(vget_low_u32(x));
2713 return vreinterpretq_s8_u64(result);
2719 #if defined(NLIB_SSE41) 2720 return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2721 #elif defined(NLIB_NEON) 2722 uint32x4_t x = vreinterpretq_u32_s8(value);
2724 uint64x2_t result = vmovl_high_u32(x);
2726 uint64x2_t result = vmovl_u32(vget_high_u32(x));
2728 return vreinterpretq_s8_u64(result);
2734 #if defined(NLIB_SSE41) 2735 return _mm_unpacklo_epi8(a, b);
2736 #elif defined(NLIB_NEON) 2738 return vzip1q_s8(a, b);
2740 return vzipq_s8(a, b).val[0];
2747 #if defined(NLIB_SSE41) 2748 return _mm_unpackhi_epi8(a, b);
2749 #elif defined(NLIB_NEON) 2751 return vzip2q_s8(a, b);
2753 return vzipq_s8(a, b).val[1];
2759 #if defined(NLIB_SSE41) 2761 __m128i lo_mask = _mm_and_si128(a, mask);
2762 __m128i hi_mask = _mm_and_si128(b, mask);
2763 return _mm_packus_epi16(lo_mask, hi_mask);
2764 #elif defined(NLIB_NEON) 2766 return vuzp1q_s8(a, b);
2768 return vuzpq_s8(a, b).val[0];
2774 #if defined(NLIB_SSE41) 2776 __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
2777 __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
2778 return _mm_packus_epi16(lo_mask, hi_mask);
2779 #elif defined(NLIB_NEON) 2781 return vuzp2q_s8(a, b);
2783 return vuzpq_s8(a, b).val[1];
2790 #if defined(NLIB_SSE41) 2791 return _mm_unpacklo_epi16(a, b);
2792 #elif defined(NLIB_NEON) 2794 return NLIB_OP2(vzip1q, u16, a, b);
2796 return vreinterpretq_s8_u16(vzipq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2803 #if defined(NLIB_SSE41) 2804 return _mm_unpackhi_epi16(a, b);
2805 #elif defined(NLIB_NEON) 2807 return NLIB_OP2(vzip2q, u16, a, b);
2809 return vreinterpretq_s8_u16(vzipq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2815 #if defined(NLIB_SSE41) 2817 __m128i lo_mask = _mm_and_si128(a, mask);
2818 __m128i hi_mask = _mm_and_si128(b, mask);
2819 return _mm_packus_epi32(lo_mask, hi_mask);
2820 #elif defined(NLIB_NEON) 2822 return NLIB_OP2(vuzp1q, u16, a, b);
2824 return vreinterpretq_s8_u16(vuzpq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2830 #if defined(NLIB_SSE41) 2832 __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
2833 __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
2834 return _mm_packus_epi32(lo_mask, hi_mask);
2835 #elif defined(NLIB_NEON) 2837 return NLIB_OP2(vuzp2q, u16, a, b);
2839 return vreinterpretq_s8_u16(vuzpq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2846 #if defined(NLIB_SSE41) 2847 return _mm_unpacklo_epi32(a, b);
2848 #elif defined(NLIB_NEON) 2850 return NLIB_OP2(vzip1q, u32, a, b);
2852 return vreinterpretq_s8_u32(vzipq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2859 #if defined(NLIB_SSE41) 2860 return _mm_unpackhi_epi32(a, b);
2861 #elif defined(NLIB_NEON) 2863 return NLIB_OP2(vzip2q, u32, a, b);
2865 return vreinterpretq_s8_u32(vzipq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2871 #if defined(NLIB_SSE41) 2872 __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2873 __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
2874 return _mm_blend_epi16(x0, x1, 0xF0);
2875 #elif defined(NLIB_NEON) 2877 return NLIB_OP2(vuzp1q, u32, a, b);
2879 return vreinterpretq_s8_u32(vuzpq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2885 #if defined(NLIB_SSE41) 2886 __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
2887 __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2888 return _mm_blend_epi16(x0, x1, 0xF0);
2889 #elif defined(NLIB_NEON) 2891 return NLIB_OP2(vuzp2q, u32, a, b);
2893 return vreinterpretq_s8_u32(vuzpq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7, int V8, int V9,
          int V10, int V11, int V12, int V13, int V14, int V15>
2902 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON) 2903 return __builtin_shufflevector(a, b, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13,
2905 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41) 2906 return __builtin_shufflevector((__v16qi)a, (__v16qi)b, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9,
2907 V10, V11, V12, V13, V14, V15);
#else
  int8_t mask_a[16] = {(V0 < 0 || V0 > 15) ? -128 : V0, (V1 < 0 || V1 > 15) ? -128 : V1,
                       (V2 < 0 || V2 > 15) ? -128 : V2, (V3 < 0 || V3 > 15) ? -128 : V3,
                       (V4 < 0 || V4 > 15) ? -128 : V4, (V5 < 0 || V5 > 15) ? -128 : V5,
                       (V6 < 0 || V6 > 15) ? -128 : V6, (V7 < 0 || V7 > 15) ? -128 : V7,
                       (V8 < 0 || V8 > 15) ? -128 : V8, (V9 < 0 || V9 > 15) ? -128 : V9,
                       (V10 < 0 || V10 > 15) ? -128 : V10, (V11 < 0 || V11 > 15) ? -128 : V11,
                       (V12 < 0 || V12 > 15) ? -128 : V12, (V13 < 0 || V13 > 15) ? -128 : V13,
                       (V14 < 0 || V14 > 15) ? -128 : V14, (V15 < 0 || V15 > 15) ? -128 : V15};
  int8_t mask_b[16] = {
      V0 < 16 ? -128 : (V0 - 16), V1 < 16 ? -128 : (V1 - 16), V2 < 16 ? -128 : (V2 - 16),
      V3 < 16 ? -128 : (V3 - 16), V4 < 16 ? -128 : (V4 - 16), V5 < 16 ? -128 : (V5 - 16),
      V6 < 16 ? -128 : (V6 - 16), V7 < 16 ? -128 : (V7 - 16), V8 < 16 ? -128 : (V8 - 16),
      V9 < 16 ? -128 : (V9 - 16), V10 < 16 ? -128 : (V10 - 16), V11 < 16 ? -128 : (V11 - 16),
      V12 < 16 ? -128 : (V12 - 16), V13 < 16 ? -128 : (V13 - 16), V14 < 16 ? -128 : (V14 - 16),
      V15 < 16 ? -128 : (V15 - 16)};
  // Out-of-range indices become -128 (0x80), which Shuffle8 maps to zero;
  // the two partial results are then merged with Or.
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
// 8-lane (16-bit) shuffle of (a, b) selected by the compile-time indices V0..V7.
template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u16(__builtin_shufflevector(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b), V0, V1, V2, V3, V4, V5, V6, V7));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v8hi)a, (__v8hi)b, V0, V1, V2, V3, V4, V5, V6, V7);
#else
  int8_t mask_a[16] = {
      (V0 < 0 || V0 > 7) ? -128 : V0 * 2, (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
      (V1 < 0 || V1 > 7) ? -128 : V1 * 2, (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
      (V2 < 0 || V2 > 7) ? -128 : V2 * 2, (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
      (V3 < 0 || V3 > 7) ? -128 : V3 * 2, (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
      (V4 < 0 || V4 > 7) ? -128 : V4 * 2, (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
      (V5 < 0 || V5 > 7) ? -128 : V5 * 2, (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
      (V6 < 0 || V6 > 7) ? -128 : V6 * 2, (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
      (V7 < 0 || V7 > 7) ? -128 : V7 * 2, (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1};
  int8_t mask_b[16] = {V0 < 8 ? -128 : (V0 - 8) * 2, V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
                       V1 < 8 ? -128 : (V1 - 8) * 2, V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
                       V2 < 8 ? -128 : (V2 - 8) * 2, V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
                       V3 < 8 ? -128 : (V3 - 8) * 2, V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
                       V4 < 8 ? -128 : (V4 - 8) * 2, V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
                       V5 < 8 ? -128 : (V5 - 8) * 2, V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
                       V6 < 8 ? -128 : (V6 - 8) * 2, V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
                       V7 < 8 ? -128 : (V7 - 8) * 2, V7 < 8 ? -128 : (V7 - 8) * 2 + 1};
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
// 4-lane (32-bit) shuffle of (a, b) selected by the compile-time indices V0..V3.
template<int V0, int V1, int V2, int V3>
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u32(
      __builtin_shufflevector(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b), V0, V1, V2, V3));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v4si)a, (__v4si)b, V0, V1, V2, V3);
#else
  int8_t mask_a[16] = {
      (V0 < 0 || V0 > 3) ? -128 : V0 * 4, (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
      (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2, (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
      (V1 < 0 || V1 > 3) ? -128 : V1 * 4, (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
      (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2, (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
      (V2 < 0 || V2 > 3) ? -128 : V2 * 4, (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
      (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2, (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
      (V3 < 0 || V3 > 3) ? -128 : V3 * 4, (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
      (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2, (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3};
  int8_t mask_b[16] = {V0 < 4 ? -128 : (V0 - 4) * 4, V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
                       V0 < 4 ? -128 : (V0 - 4) * 4 + 2, V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
                       V1 < 4 ? -128 : (V1 - 4) * 4, V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
                       V1 < 4 ? -128 : (V1 - 4) * 4 + 2, V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
                       V2 < 4 ? -128 : (V2 - 4) * 4, V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
                       V2 < 4 ? -128 : (V2 - 4) * 4 + 2, V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
                       V3 < 4 ? -128 : (V3 - 4) * 4, V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
                       V3 < 4 ? -128 : (V3 - 4) * 4 + 2, V3 < 4 ? -128 : (V3 - 4) * 4 + 3};
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#if defined(NLIB_SSE41)
  const int8_t mask_[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16q, u8, value);
#if defined(NLIB_SSE41)
  const int8_t mask_[16] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32q, u8, value);
#if defined(NLIB_SSE41)
  const int8_t mask_[16] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64q, u8, value);
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value);
#elif defined(NLIB_NEON)
  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
  uint8x16_t a = vandq_u8(value, powers);
  // AArch64: horizontal add of the per-byte bits
  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
  // ARMv7: pairwise adds
  uint8x8_t al = vget_low_u8(a);
  uint8x8_t ah = vget_high_u8(a);
  uint8x8_t tmp = vpadd_u8(al, ah);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  return _mm_movemask_epi8(tmp) & 255;
#elif defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
  return vaddvq_u16(a);  // AArch64
  // ARMv7:
  uint8x8_t tmp = vmovn_u16(a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  tmp = _mm_packs_epi16(tmp, tmp);
  return _mm_movemask_epi8(tmp) & 15;
#elif defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
  return vaddvq_u32(a);  // AArch64
  // ARMv7:
  uint16x4_t tmp = vmovn_u32(a);
  tmp = vpadd_u16(tmp, tmp);
  tmp = vpadd_u16(tmp, tmp);
  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#if defined(NLIB_NEON)
  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
  int8x8_t s1 = vdup_n_s8(mask >> 8);
  return vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1));
#elif defined(NLIB_SSE41)
  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
  return I128::Test8(m, s);
#if defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t s = vdupq_n_u16(mask);
  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
#elif defined(NLIB_SSE41)
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  return I128::Test16(m, s);
#if defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t s = vdupq_n_u32(mask);
  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
#elif defined(NLIB_SSE41)
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  return I128::Test32(m, s);
#if defined(NLIB_SSE41)
  return _mm_testz_si128(value, value) != 0;
#elif defined(NLIB_NEON)
  // AArch64:
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
  // ARMv7:
  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
#if defined(NLIB_SSE41)
  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
#elif defined(NLIB_NEON)
  // AArch64:
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
  // ARMv7:
  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP3(vbslq, u32, mask, a, b);
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(value, shuffle);
#elif defined(NLIB_NEON)
  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));  // AArch64
  // ARMv7: emulate the 16-byte table lookup with two 8-byte lookups
  int8x8x2_t x;
  x.val[0] = vget_low_s8(value);
  x.val[1] = vget_high_s8(value);
  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
  return vcombine_s8(lo, hi);
#if defined(NLIB_NEON)
  // AArch64:
  int8x16_t tmp = vnegq_s8(value);
  return vaddvq_s8(tmp);
  // ARMv7:
  int8x16_t tmp = vnegq_s8(value);
  int8x8_t lo = vget_low_s8(tmp);
  int8x8_t hi = vget_high_s8(tmp);
  lo = vadd_s8(lo, hi);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  return vget_lane_s8(lo, 0);
  // non-NEON path:
  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
  return nlib_clz32(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;

  return nlib_ctz32(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
#if defined(NLIB_NEON) && defined(__aarch64__)
  uint8x16_t x = vreinterpretq_u8_s8(value);
  uint16x8_t lo = vmovl_u8(vget_low_u8(x));
  uint16x8_t hi = vmovl_high_u8(x);
  return vaddvq_u16(lo) + vaddvq_u16(hi);
#else
  i128 lo = I128::ConvertFromUint8ToUint16Lo(value);
  i128 hi = I128::ConvertFromUint8ToUint16Hi(value);
  i128 x = I128::Add16(lo, hi);
  x = I128::Add16(x, I128::ByteShiftRight<8>(x));
  x = I128::Add16(x, I128::ByteShiftRight<4>(x));
  x = I128::Add16(x, I128::ByteShiftRight<2>(x));
  return GetUint16FromLane<0>(x);
#if defined(NLIB_NEON) && defined(__aarch64__)
  uint16x8_t x = vreinterpretq_u16_s8(value);
  uint32x4_t lo = vmovl_u16(vget_low_u16(x));
  uint32x4_t hi = vmovl_high_u16(x);
  return vaddvq_u32(lo) + vaddvq_u32(hi);
#else
  i128 lo = I128::ConvertFromUint16ToUint32Lo(value);
  i128 hi = I128::ConvertFromUint16ToUint32Hi(value);
  i128 x = I128::Add32(lo, hi);
  x = I128::Add32(x, I128::ByteShiftRight<8>(x));
  x = I128::Add32(x, I128::ByteShiftRight<4>(x));
  return GetUint32FromLane<0>(x);
#undef vreinterpretq_s8_s8
#endif  // NLIB_DOXYGEN

#if defined(NLIB_SSE41)
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
    (row0) = _mm_shuffle_epi32((row0), _MM_SHUFFLE(3, 1, 2, 0)); \
    (row1) = _mm_shuffle_epi32((row1), _MM_SHUFFLE(3, 1, 2, 0)); \
    (row2) = _mm_shuffle_epi32((row2), _MM_SHUFFLE(3, 1, 2, 0)); \
    (row3) = _mm_shuffle_epi32((row3), _MM_SHUFFLE(3, 1, 2, 0)); \
    __m128i t0_transpose32_ = _mm_unpacklo_epi32((row0), (row1)); \
    __m128i t1_transpose32_ = _mm_unpackhi_epi32((row0), (row1)); \
    __m128i t2_transpose32_ = _mm_unpacklo_epi32((row2), (row3)); \
    __m128i t3_transpose32_ = _mm_unpackhi_epi32((row2), (row3)); \
    (row0) = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
    (row1) = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
    (row2) = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
    (row3) = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
#elif defined(NLIB_NEON)
// AArch64 variant:
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), vreinterpretq_u32_s8(row3)); \
    uint64x2_t row0_, row1_, row2_, row3_; \
    row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row0 = vreinterpretq_s8_u64(row0_); \
    row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row1 = vreinterpretq_s8_u64(row1_); \
    row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row2 = vreinterpretq_s8_u64(row2_); \
    row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row3 = vreinterpretq_s8_u64(row3_); \
// ARMv7 variant:
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), vreinterpretq_u32_s8(row3)); \
    uint32x4_t row0_, row1_, row2_, row3_; \
    uint32x2_t lo, hi; \
    lo = vget_low_u32(trn_f0_.val[0]); \
    hi = vget_low_u32(trn_f1_.val[0]); \
    row0_ = vcombine_u32(lo, hi); \
    row0 = vreinterpretq_s8_u32(row0_); \
    lo = vget_low_u32(trn_f0_.val[1]); \
    hi = vget_low_u32(trn_f1_.val[1]); \
    row1_ = vcombine_u32(lo, hi); \
    row1 = vreinterpretq_s8_u32(row1_); \
    lo = vget_high_u32(trn_f0_.val[0]); \
    hi = vget_high_u32(trn_f1_.val[0]); \
    row2_ = vcombine_u32(lo, hi); \
    row2 = vreinterpretq_s8_u32(row2_); \
    lo = vget_high_u32(trn_f0_.val[1]); \
    hi = vget_high_u32(trn_f1_.val[1]); \
    row3_ = vcombine_u32(lo, hi); \
    row3 = vreinterpretq_s8_u32(row3_); \
#endif  // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

An empty struct used as a tag to indicate 32-bit signed integers.
An empty struct used as a tag to indicate 64-bit signed integers.
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag, used as a tag to indicate 8-bit unsigned integers.
An empty struct used as a tag to indicate lane selection in 8-bit units.
An empty struct used as a tag to indicate lane selection in 32-bit units.
An empty struct used as a tag to indicate 8-bit signed integers.
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag, used as a tag to indicate 16-bit unsigned integers.
An empty struct used as a tag to indicate 16-bit unsigned integers.
An empty struct used as a tag to indicate 64-bit unsigned integers.
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag, used as a tag to indicate 64-bit signed integers.
nlib_i128_t i128
A typedef of nlib_i128_t.
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag, used as a tag to indicate 64-bit unsigned integers.
A class for performing integer SIMD operations using 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ... A usage sketch appears after this list.
An empty struct used as a tag to indicate 8-bit unsigned integers.
An empty struct used as a tag to indicate 16-bit signed integers.
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag, used as a tag to indicate 16-bit signed integers.
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag, used as a tag to indicate 32-bit unsigned integers.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent for the target environment.
An empty struct used as a tag to indicate lane selection in 16-bit units.
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag, used as a tag to indicate selection of 16-bit lanes. ...
#define NLIB_CEXPR
Defined as constexpr when available; otherwise it expands to nothing.
__m128i nlib_i128_t
The type for a 128-bit integer SIMD register.
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag, used as a tag to indicate selection of 8-bit lanes. ...
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag, used as a tag to indicate 8-bit signed integers.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag, used as a tag to indicate selection of 32-bit lanes. ...
An empty struct used as a tag to indicate 32-bit unsigned integers.
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag, used as a tag to indicate 32-bit signed integers.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion, using static_assert when available. (A portability-macro sketch follows below.)
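
The portability macros above (NLIB_NOEXCEPT, NLIB_CEXPR, NLIB_ALIGNAS, NLIB_STATIC_ASSERT) let the same declarations build on toolchains with and without the corresponding C++11 keywords. The snippet below is a minimal, hypothetical sketch of how they would typically be applied; the names Packet, kLaneCount8, and SumBytes are made up for illustration and are not part of nlib, and the include path is assumed from the include guard shown in the listing.

#include <stdint.h>
#include "nn/nlib/simd/SimdInt.h"       // assumed path, based on the include guard above

NLIB_STATIC_ASSERT(sizeof(nlib_i128_t) == 16);  // expands to static_assert where available

NLIB_CEXPR int kLaneCount8 = 16;                // constexpr when the compiler supports it

struct NLIB_ALIGNAS(16) Packet {                // alignas(16) or a compiler-specific equivalent
  uint8_t bytes[16];
};

int SumBytes(const Packet& p) NLIB_NOEXCEPT;    // noexcept or an equivalent specification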
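The I128 members documented in the source listing are static functions operating on i128 values, and overloads for different lane widths are selected with the tag objects above (each_int8, each_int32, and so on). The following sketch shows one plausible way to combine them with NLIB_I128_TRANSPOSE32. It is illustrative only: the function name Transpose4x4 is made up, the namespace nlib_ns::simd is assumed (adjust to the actual one), and the SetValue overload taking each_int32 is assumed by analogy with the each_int8 overload visible in the listing.

#include <stdint.h>
#include "nn/nlib/simd/SimdInt.h"       // assumed path, based on the include guard above

void Transpose4x4(int32_t* mat /* 16 values, 16-byte aligned */) NLIB_NOEXCEPT {
  using nlib_ns::simd::I128;            // assumed namespace; adjust if it differs
  using nlib_ns::simd::i128;
  using nlib_ns::simd::each_int32;

  // Load the four rows of a 4x4 int32 matrix with 16-byte aligned loads.
  i128 row0 = I128::LoadA16(&mat[0]);
  i128 row1 = I128::LoadA16(&mat[4]);
  i128 row2 = I128::LoadA16(&mat[8]);
  i128 row3 = I128::LoadA16(&mat[12]);

  // Transpose the 32-bit lanes in place; the SSE or NEON variant is chosen by the macro.
  NLIB_I128_TRANSPOSE32(row0, row1, row2, row3);

  // Tag dispatch: each_int32 selects the 32-bit-lane overload of SetValue (assumed).
  i128 ones = I128::SetValue(1, each_int32);
  row0 = I128::Add32(row0, ones);       // lane-wise 32-bit addition, as used in the listing

  // Store the rows back with aligned stores.
  I128::StoreA16(&mat[0], row0);
  I128::StoreA16(&mat[4], row1);
  I128::StoreA16(&mat[8], row2);
  I128::StoreA16(&mat[12], row3);
}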