SimdInt.h
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
5 
6 #include "nn/nlib/Config.h"
7 
8 #if defined(NLIB_SSE41)
9 typedef __m128i nlib_i64_t;
10 typedef __m128i nlib_i128_t;
11 #elif defined(NLIB_NEON)
12 typedef int8x8_t nlib_i64_t;
13 typedef int8x16_t nlib_i128_t;
14 #endif
15 
16 #ifdef NLIB_SSE41
17 extern int nlib_sse42_supported;
18 #endif
19 
20 NLIB_NAMESPACE_BEGIN
21 namespace simd {
22 
23 // use each_int8 for the argument value
24 struct each_int8_tag {};
25 // use each_int16 for the argument value
26 struct each_int16_tag {};
27 // use each_int32 for the argument value
28 struct each_int32_tag {};
29 // use each_int64 for the argument value
30 struct each_int64_tag {};
31 // use each_uint8 for the argument value
32 struct each_uint8_tag {};
33 // use each_uint16 for the argument value
34 struct each_uint16_tag {};
35 // use each_uint32 for the argument value
36 struct each_uint32_tag {};
37 // use each_uint64 for the argument value
38 struct each_uint64_tag {};
39 
48 
49 // use each_select32 for the argument value
50 struct each_select32_tag {};
51 // use each_select16 for the argument value
52 struct each_select16_tag {};
53 // use each_select8 for the argument value
54 struct each_select8_tag {};
55 
59 
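// Usage note (illustrative, not part of the original header): the *_tag types
// above select an overload by tag dispatch, e.g.
//   i64 v = I64::SetValue(static_cast<int16_t>(7), each_int16_tag());  // every s16 lane = 7
// The each_int8, each_int16, ... values referred to in the comments are the
// ready-made tag constants meant to be passed as that second argument.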
60 #if !defined(_MSC_VER) || _MSC_VER < 1800
61 #ifndef __vectorcall
62 #define __vectorcall
63 #endif
64 #endif
65 
66 #if defined(NLIB_SIMD)
67 
68 // __m128i(SSE), int8x8_t(NEON)
69 typedef nlib_i64_t i64;
70 // __m128i(SSE), int8x16_t(NEON)
71 typedef nlib_i128_t i128;
72 
73 #if _MSC_VER < 1800
74 typedef const i64& i64arg;
75 typedef const i128& i128arg;
76 #else
77 typedef const i64 i64arg;
78 typedef const i128 i128arg;
79 #endif
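// i64arg/i128arg are passed by value where __vectorcall can keep SIMD
// arguments in registers (VS2013 and later); other toolchains fall back to
// const references, and __vectorcall itself expands to nothing there.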
80 
81 class I64 {
 public:
82  static i64 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
83  static i64 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
84  static i64 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
85  static i64 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
86  static i64 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
87  static i64 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
88 
89  template <size_t N>
90  static i64 __vectorcall SetValue(i64 value, each_select32_tag) NLIB_NOEXCEPT;
91  template <size_t N>
92  static i64 __vectorcall SetValue(i64 value, each_select16_tag) NLIB_NOEXCEPT;
93  template <size_t N>
94  static i64 __vectorcall SetValue(i64 value, each_select8_tag) NLIB_NOEXCEPT;
95 
96  static i64 __vectorcall SetZero() NLIB_NOEXCEPT;
97  // set all bits to 1
98  static NLIB_ALWAYS_INLINE i64 __vectorcall SetFull(i64arg dummy) NLIB_NOEXCEPT {
99  return I64::CmpEq8(dummy, dummy);
100  }
101 
102  //
103  // Load Operations
104  //
105  static i64 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
106  static i64 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
107  static i64 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
108  static i64 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
109  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT {
110  return LoadA8(reinterpret_cast<void*>(p));
111  } // NOLINT
112  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT {
113  return LoadA4(reinterpret_cast<void*>(p));
114  } // NOLINT
115  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA2(uintptr_t p) NLIB_NOEXCEPT {
116  return LoadA2(reinterpret_cast<void*>(p));
117  } // NOLINT
118  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA1(uintptr_t p) NLIB_NOEXCEPT {
119  return LoadA1(reinterpret_cast<void*>(p));
120  } // NOLINT
121  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT {
122  return LoadA8(reinterpret_cast<void*>(p));
123  } // NOLINT
124  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT {
125  return LoadA4(reinterpret_cast<void*>(p));
126  } // NOLINT
127  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA2(intptr_t p) NLIB_NOEXCEPT {
128  return LoadA2(reinterpret_cast<void*>(p));
129  } // NOLINT
130  static NLIB_ALWAYS_INLINE i64 __vectorcall LoadA1(intptr_t p) NLIB_NOEXCEPT {
131  return LoadA1(reinterpret_cast<void*>(p));
132  } // NOLINT
133 
134  //
135  // Store Operations
136  //
137  static void __vectorcall StoreA8(void* p, i64arg value) NLIB_NOEXCEPT;
138  static void __vectorcall StoreA4(void* p, i64arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreA2(void* p, i64arg value) NLIB_NOEXCEPT;
140  static void __vectorcall StoreA1(void* p, i64arg value) NLIB_NOEXCEPT;
141  static NLIB_ALWAYS_INLINE void __vectorcall StoreA8(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
142  StoreA8(reinterpret_cast<void*>(p), value);
143  } // NOLINT
144  static NLIB_ALWAYS_INLINE void __vectorcall StoreA4(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
145  StoreA4(reinterpret_cast<void*>(p), value);
146  } // NOLINT
147  static NLIB_ALWAYS_INLINE void __vectorcall StoreA2(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
148  StoreA2(reinterpret_cast<void*>(p), value);
149  } // NOLINT
150  static NLIB_ALWAYS_INLINE void __vectorcall StoreA1(uintptr_t p, i64arg value) NLIB_NOEXCEPT {
151  StoreA1(reinterpret_cast<void*>(p), value);
152  } // NOLINT
153  static NLIB_ALWAYS_INLINE void __vectorcall StoreA8(intptr_t p, i64arg value) NLIB_NOEXCEPT {
154  StoreA8(reinterpret_cast<void*>(p), value);
155  } // NOLINT
156  static NLIB_ALWAYS_INLINE void __vectorcall StoreA4(intptr_t p, i64arg value) NLIB_NOEXCEPT {
157  StoreA4(reinterpret_cast<void*>(p), value);
158  } // NOLINT
159  static NLIB_ALWAYS_INLINE void __vectorcall StoreA2(intptr_t p, i64arg value) NLIB_NOEXCEPT {
160  StoreA2(reinterpret_cast<void*>(p), value);
161  } // NOLINT
162  static NLIB_ALWAYS_INLINE void __vectorcall StoreA1(intptr_t p, i64arg value) NLIB_NOEXCEPT {
163  StoreA1(reinterpret_cast<void*>(p), value);
164  } // NOLINT
165 
166  //
167  // Get/Set
168  //
169  template <size_t N>
170  static uint8_t __vectorcall GetUint8FromLane(i64arg value) NLIB_NOEXCEPT;
171  template <size_t N>
172  static uint16_t __vectorcall GetUint16FromLane(i64arg value) NLIB_NOEXCEPT;
173  template <size_t N>
174  static uint32_t __vectorcall GetUint32FromLane(i64arg value) NLIB_NOEXCEPT;
175 
176  template <size_t N>
177  static i64 __vectorcall SetUint8ToLane(i64arg value, uint8_t v) NLIB_NOEXCEPT;
178  template <size_t N>
179  static i64 __vectorcall SetUint16ToLane(i64arg value, uint16_t v) NLIB_NOEXCEPT;
180  template <size_t N>
181  static i64 __vectorcall SetUint32ToLane(i64arg value, uint32_t v) NLIB_NOEXCEPT;
182 
183  //
184  // Arithmetic Operations
185  //
186  static i64 __vectorcall Add8(i64arg a, i64arg b) NLIB_NOEXCEPT;
187  static i64 __vectorcall Add16(i64arg a, i64arg b) NLIB_NOEXCEPT;
188  static i64 __vectorcall Add32(i64arg a, i64arg b) NLIB_NOEXCEPT;
189  static i64 __vectorcall Add64(i64arg a, i64arg b) NLIB_NOEXCEPT;
190 
191  static i64 __vectorcall AddInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
192  static i64 __vectorcall AddInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
193 
194  static i64 __vectorcall AddUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
195  static i64 __vectorcall AddUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
196 
197  static i64 __vectorcall Sub8(i64arg a, i64arg b) NLIB_NOEXCEPT;
198  static i64 __vectorcall Sub16(i64arg a, i64arg b) NLIB_NOEXCEPT;
199  static i64 __vectorcall Sub32(i64arg a, i64arg b) NLIB_NOEXCEPT;
200  static i64 __vectorcall Sub64(i64arg a, i64arg b) NLIB_NOEXCEPT;
201 
202  static i64 __vectorcall SubInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
203  static i64 __vectorcall SubInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
204 
205  static i64 __vectorcall SubUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
206  static i64 __vectorcall SubUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT;
207 
208  static i64 __vectorcall PairwiseAdd8(i128arg value) NLIB_NOEXCEPT;
209  static i64 __vectorcall PairwiseAdd16(i128arg value) NLIB_NOEXCEPT;
210  static i64 __vectorcall PairwiseAdd32(i128arg value) NLIB_NOEXCEPT;
211 
212  static i64 __vectorcall Mult16(i64arg a, i64arg b) NLIB_NOEXCEPT;
213  static i64 __vectorcall MultAdd16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
214  static i64 __vectorcall MultSub16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
215  static i64 __vectorcall Mult32(i64arg a, i64arg b) NLIB_NOEXCEPT;
216  static i64 __vectorcall MultAdd32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
217  static i64 __vectorcall MultSub32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT;
218 
219  static i64 __vectorcall MaxInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
220  static i64 __vectorcall MaxInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
221  static i64 __vectorcall MaxInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
222 
223  static i64 __vectorcall MaxUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
224  static i64 __vectorcall MaxUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
225  static i64 __vectorcall MaxUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
226  static i64 __vectorcall MinInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
227  static i64 __vectorcall MinInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
228  static i64 __vectorcall MinInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
229  static i64 __vectorcall MinUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
230  static i64 __vectorcall MinUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
231  static i64 __vectorcall MinUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
232 
233  static i64 __vectorcall AbsInt8(i64arg value) NLIB_NOEXCEPT;
234  static i64 __vectorcall AbsInt16(i64arg value) NLIB_NOEXCEPT;
235  static i64 __vectorcall AbsInt32(i64arg value) NLIB_NOEXCEPT;
236  static i64 __vectorcall AbsDiffInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
237  static i64 __vectorcall AbsDiffInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
238  static i64 __vectorcall AbsDiffInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
239 
240  static i64 __vectorcall NegateInt8(i64arg value) NLIB_NOEXCEPT;
241  static i64 __vectorcall NegateInt16(i64arg value) NLIB_NOEXCEPT;
242  static i64 __vectorcall NegateInt32(i64arg value) NLIB_NOEXCEPT;
243 
244  //
245  // Logical Operations
246  //
247  static i64 __vectorcall And(i64arg a, i64arg b) NLIB_NOEXCEPT;
248  static i64 __vectorcall Or(i64arg a, i64arg b) NLIB_NOEXCEPT;
249  static i64 __vectorcall Xor(i64arg a, i64arg b) NLIB_NOEXCEPT;
250  static i64 __vectorcall Not(i64arg a) NLIB_NOEXCEPT;
251  static i64 __vectorcall AndNot(i64arg a, i64arg b) NLIB_NOEXCEPT;
252  static i64 __vectorcall OrNot(i64arg a, i64arg b) NLIB_NOEXCEPT;
253 
254  //
255  // Comparison Operations
256  //
257  static i64 __vectorcall CmpEq8(i64arg a, i64arg b) NLIB_NOEXCEPT;
258  static i64 __vectorcall CmpEq16(i64arg a, i64arg b) NLIB_NOEXCEPT;
259  static i64 __vectorcall CmpEq32(i64arg a, i64arg b) NLIB_NOEXCEPT;
260 
261  static i64 __vectorcall CmpLtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
262  static i64 __vectorcall CmpLtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
263  static i64 __vectorcall CmpLtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
264  static i64 __vectorcall CmpGtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
265  static i64 __vectorcall CmpGtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
266  static i64 __vectorcall CmpGtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
267 
268  static i64 __vectorcall CmpLtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
269  static i64 __vectorcall CmpGtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
270  static i64 __vectorcall CmpLtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
271  static i64 __vectorcall CmpGtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
272  static i64 __vectorcall CmpLtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
273  static i64 __vectorcall CmpGtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
274 
275  static i64 __vectorcall CmpLeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
276  static i64 __vectorcall CmpLeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
277  static i64 __vectorcall CmpLeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
278  static i64 __vectorcall CmpGeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT;
279  static i64 __vectorcall CmpGeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT;
280  static i64 __vectorcall CmpGeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT;
281 
282  static i64 __vectorcall CmpLeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
283  static i64 __vectorcall CmpLeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
284  static i64 __vectorcall CmpLeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
285  static i64 __vectorcall CmpGeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT;
286  static i64 __vectorcall CmpGeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT;
287  static i64 __vectorcall CmpGeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT;
288 
289  //
290  // Bit Shift
291  //
292  static i64 __vectorcall
293  ShiftLeftLogical8(i64arg value, int count) NLIB_NOEXCEPT; // NOLINT not implemented
294  static i64 __vectorcall
295  ShiftRightLogical8(i64arg value, int count) NLIB_NOEXCEPT; // NOLINT not implemented
296 
297  static i64 __vectorcall ShiftLeftLogical16(i64arg value, int count) NLIB_NOEXCEPT;
298  static i64 __vectorcall ShiftRightLogical16(i64arg value, int count) NLIB_NOEXCEPT;
299  static i64 __vectorcall ShiftRightArithmetic16(i64arg value, int count) NLIB_NOEXCEPT;
300 
301  static i64 __vectorcall ShiftLeftLogical32(i64arg value, int count) NLIB_NOEXCEPT;
302  static i64 __vectorcall ShiftRightLogical32(i64arg value, int count) NLIB_NOEXCEPT;
303  static i64 __vectorcall ShiftRightArithmetic32(i64arg value, int count) NLIB_NOEXCEPT;
304 
305  //
306  // 64bit wide Byte Shift/Rotate
307  //
308  template <size_t N>
309  static i64 __vectorcall ByteShiftLeft(i64arg value) NLIB_NOEXCEPT;
310  template <size_t N>
311  static i64 __vectorcall ByteShiftRight(i64arg value) NLIB_NOEXCEPT;
312  template <size_t N>
313  static i64 __vectorcall ByteRotateRight(i64arg value) NLIB_NOEXCEPT;
314 
315  //
316  // Conversion
317  //
318  static i64 __vectorcall NarrowFrom16To8(i128arg value) NLIB_NOEXCEPT;
319  static i64 __vectorcall NarrowFrom32To16(i128arg value) NLIB_NOEXCEPT;
320  static i64 __vectorcall NarrowFrom64To32(i128arg value) NLIB_NOEXCEPT;
321 
322  static i64 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg value) NLIB_NOEXCEPT;
323  static i64 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg value) NLIB_NOEXCEPT;
324  static i64 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg value) NLIB_NOEXCEPT;
325  static i64 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg value) NLIB_NOEXCEPT;
326 
327  //
328  // Swap endian
329  //
330  static i64 __vectorcall Reverse16(i64arg value) NLIB_NOEXCEPT;
331  static i64 __vectorcall Reverse32(i64arg value) NLIB_NOEXCEPT;
332  static i64 __vectorcall Reverse64(i64arg value) NLIB_NOEXCEPT;
333 
334  //
335  // Misc
336  //
337  static int __vectorcall MoveMask8(i64arg value) NLIB_NOEXCEPT; // NOLINT
338  static bool __vectorcall IsZero(i64arg value) NLIB_NOEXCEPT; // NOLINT
339  static i64 __vectorcall Select(i64arg mask, i64arg a, i64arg b) NLIB_NOEXCEPT;
340 
341  private:
342  I64(); // forbidden
343 };
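
// Illustrative usage sketch (not part of the original header; buffer names are
// hypothetical): saturating addition of two 8-element uint8 buffers with I64.
//
//   NLIB_ALIGNAS(8) uint8_t src0[8], src1[8], dst[8];
//   // ... fill src0 and src1 ...
//   i64 a = I64::LoadA8(&src0[0]);
//   i64 b = I64::LoadA8(&src1[0]);
//   I64::StoreA8(&dst[0], I64::AddUint8Saturated(a, b));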
344 
345 #ifndef NLIB_DOXYGEN
346 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
347 
348 #ifdef NLIB_NEON
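// The helper macros below let the int8x8_t-based i64 type stand in for any
// lane type: each macro reinterprets its operands to the element type the
// NEON intrinsic expects (##tp), applies the intrinsic, and reinterprets the
// result back to s8. NLIB_CMP does the same for comparisons, whose results
// come back as the corresponding unsigned type (##utp); NLIB_SFT builds the
// vector shift count with vdup_n_##stp; NLIB_CMB joins two 64-bit halves
// into an i128.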
349 #define vreinterpret_s8_s8(a) (a)
350 #define NLIB_OP1(intrin, tp, a) \
351  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a)))
352 #define NLIB_OP2(intrin, tp, a, b) \
353  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
354  vreinterpret_##tp##_s8(b)))
355 #define NLIB_OP3(intrin, tp, a, b, c) \
356  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
357  vreinterpret_##tp##_s8(b), \
358  vreinterpret_##tp##_s8(c)))
359 #define NLIB_CMP(intrin, tp, a, b, utp) \
360  vreinterpret_s8_##utp(intrin##_##tp(vreinterpret_##tp##_s8(a), \
361  vreinterpret_##tp##_s8(b)))
362 #define NLIB_SFT(intrin, tp, a, cnt, stp) \
363  vreinterpret_s8_##tp(intrin##_##tp(vreinterpret_##tp##_s8(a), vdup_n_##stp(cnt)))
364 #define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
365 #endif
366 
367 // r.s8[i] = v for all i
368 NLIB_M(i64) I64::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
369 #if defined(NLIB_SSE41)
370  // faster than _mm_set1_epi8(v)
371  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
372 #elif defined(NLIB_NEON)
373  return vdup_n_s8(v);
374 #endif
375 }
376 
377 // r.s16[i] = v for all i
378 NLIB_M(i64) I64::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
379 #if defined(NLIB_SSE41)
380  return _mm_set1_epi16(v);
381 #elif defined(NLIB_NEON)
382  return vreinterpret_s8_s16(vdup_n_s16(v));
383 #endif
384 }
385 
386 // r.s32[i] = v for all i
387 NLIB_M(i64) I64::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
388 #if defined(NLIB_SSE41)
389  return _mm_set1_epi32(v);
390 #elif defined(NLIB_NEON)
391  return vreinterpret_s8_s32(vdup_n_s32(v));
392 #endif
393 }
394 
395 // r.u8[i] = v for all i
396 NLIB_M(i64) I64::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
397 #if defined(NLIB_SSE41)
398  // faster than _mm_set1_epi8(v)
399  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
400 #elif defined(NLIB_NEON)
401  return vreinterpret_s8_u8(vdup_n_u8(v));
402 #endif
403 }
404 
405 // r.u16[i] = v for all i
406 NLIB_M(i64) I64::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
407 #if defined(NLIB_SSE41)
408  return _mm_set1_epi16(static_cast<int16_t>(v));
409 #elif defined(NLIB_NEON)
410  return vreinterpret_s8_u16(vdup_n_u16(v));
411 #endif
412 }
413 
414 // r.u32[i] = v for all i
415 NLIB_M(i64) I64::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
416 #if defined(NLIB_SSE41)
417  return _mm_set1_epi32(static_cast<int32_t>(v));
418 #elif defined(NLIB_NEON)
419  return vreinterpret_s8_u32(vdup_n_u32(v));
420 #endif
421 }
422 
423 template <size_t N>
424 // r.u32[i] = value.u32[N] for all i
425 NLIB_M(i64) I64::SetValue(i64 value, each_select32_tag) NLIB_NOEXCEPT {
426  NLIB_STATIC_ASSERT(N < 2);
427 #if defined(NLIB_SSE41)
428  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
429 #elif defined(NLIB_NEON)
430  return vreinterpret_s8_u32(vdup_lane_u32(vreinterpret_u32_s8(value), N));
431 #endif
432 }
433 
434 template <size_t N>
435 // r.u16[i] = value.u16[N] for all i
436 NLIB_M(i64) I64::SetValue(i64 value, each_select16_tag) NLIB_NOEXCEPT {
437  NLIB_STATIC_ASSERT(N < 4);
438 #if defined(NLIB_SSE41)
439  return _mm_shufflelo_epi16(value, _MM_SHUFFLE(N, N, N, N));
440 #elif defined(NLIB_NEON)
441  return vreinterpret_s8_u16(vdup_lane_u16(vreinterpret_u16_s8(value), N));
442 #endif
443 }
444 
445 template <size_t N>
446 // r.u8[i] = value.u8[N] for all i
447 NLIB_M(i64) I64::SetValue(i64 value, each_select8_tag) NLIB_NOEXCEPT {
448  NLIB_STATIC_ASSERT(N < 8);
449 #if defined(NLIB_SSE41)
450  NLIB_ALIGNAS(16) const int8_t mask[16] = {
451  N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
452  };
453  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
454 #elif defined(NLIB_NEON)
455  return vdup_lane_s8(value, N);
456 #endif
457 }
458 
459 // set all bits to 0
460 NLIB_M(i64) I64::SetZero() NLIB_NOEXCEPT {
461 #if defined(NLIB_SSE41)
462  return _mm_setzero_si128();
463 #elif defined(NLIB_NEON)
464  return vdup_n_s8(0);
465 #endif
466 }
467 
468 // load 8 bytes from p, p must be 8-byte aligned
469 NLIB_M(i64) I64::LoadA8(const void* p) NLIB_NOEXCEPT {
470 #if defined(NLIB_SSE41)
471  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
472 #elif defined(NLIB_NEON)
473  return vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
474 #endif
475 }
476 
477 // load 8 bytes from p, p must be 4-byte aligned
478 NLIB_M(i64) I64::LoadA4(const void* p) NLIB_NOEXCEPT {
479 #if defined(NLIB_SSE41)
480  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
481 #elif defined(NLIB_NEON)
482  return vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
483 #endif
484 }
485 
486 // load 8 bytes from p, p must be 2-byte aligned
487 NLIB_M(i64) I64::LoadA2(const void* p) NLIB_NOEXCEPT {
488 #if defined(NLIB_SSE41)
489  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
490 #elif defined(NLIB_NEON)
491  return vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
492 #endif
493 }
494 
495 // load 8 bytes from p
496 NLIB_M(i64) I64::LoadA1(const void* p) NLIB_NOEXCEPT {
497 #if defined(NLIB_SSE41)
498  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
499 #elif defined(NLIB_NEON)
500  return vld1_s8(reinterpret_cast<const int8_t*>(p));
501 #endif
502 }
503 
504 // store 8 bytes to p, p must be 8-byte aligned
505 NLIB_M(void) I64::StoreA8(void* p, i64arg value) NLIB_NOEXCEPT {
506 #if defined(NLIB_SSE41)
507  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
508 #elif defined(NLIB_NEON)
509  vst1_u64(reinterpret_cast<uint64_t*>(p), vreinterpret_u64_s8(value));
510 #endif
511 }
512 
513 // store 8 bytes to p, p must be 4-byte aligned
514 NLIB_M(void) I64::StoreA4(void* p, i64arg value) NLIB_NOEXCEPT {
515 #if defined(NLIB_SSE41)
516  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
517 #elif defined(NLIB_NEON)
518  vst1_u32(reinterpret_cast<uint32_t*>(p), vreinterpret_u32_s8(value));
519 #endif
520 }
521 
522 // store 8 bytes to p, p must be 2-byte aligned
523 NLIB_M(void) I64::StoreA2(void* p, i64arg value) NLIB_NOEXCEPT {
524 #if defined(NLIB_SSE41)
525  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
526 #elif defined(NLIB_NEON)
527  vst1_u16(reinterpret_cast<uint16_t*>(p), vreinterpret_u16_s8(value));
528 #endif
529 }
530 
531 // store 8 bytes to p
532 NLIB_M(void) I64::StoreA1(void* p, i64arg value) NLIB_NOEXCEPT {
533 #if defined(NLIB_SSE41)
534  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
535 #elif defined(NLIB_NEON)
536  vst1_s8(reinterpret_cast<int8_t*>(p), value);
537 #endif
538 }
539 
540 template <size_t N>
541 // r = value.u8[N]
542 NLIB_M(uint8_t) I64::GetUint8FromLane(i64arg value) NLIB_NOEXCEPT {
543  NLIB_STATIC_ASSERT(N < 8);
544 #if defined(NLIB_SSE41)
545  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
546 #elif defined(NLIB_NEON)
547  return vget_lane_u8(vreinterpret_u8_s8(value), N);
548 #endif
549 }
550 
551 template <size_t N>
552 // r = value.u16[N]
553 NLIB_M(uint16_t) I64::GetUint16FromLane(i64arg value) NLIB_NOEXCEPT {
554  NLIB_STATIC_ASSERT(N < 4);
555 #if defined(NLIB_SSE41)
556  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
557 #elif defined(NLIB_NEON)
558  return vget_lane_u16(vreinterpret_u16_s8(value), N);
559 #endif
560 }
561 
562 template <size_t N>
563 // r = value.u32[N]
564 NLIB_M(uint32_t) I64::GetUint32FromLane(i64arg value) NLIB_NOEXCEPT {
565  NLIB_STATIC_ASSERT(N < 2);
566 #if defined(NLIB_SSE41)
567  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
568 #elif defined(NLIB_NEON)
569  return vget_lane_u32(vreinterpret_u32_s8(value), N);
570 #endif
571 }
572 
573 template <size_t N>
574 // r = value, r.u8[N] = v
575 NLIB_M(i64) I64::SetUint8ToLane(i64arg value, uint8_t v) NLIB_NOEXCEPT {
576  NLIB_STATIC_ASSERT(N < 8);
577 #if defined(NLIB_SSE41)
578  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
579 #elif defined(NLIB_NEON)
580  return vset_lane_u8(v, vreinterpret_u8_s8(value), N);
581 #endif
582 }
583 
584 template <size_t N>
585 // r = value, r.u16[N] = v
586 NLIB_M(i64) I64::SetUint16ToLane(i64arg value, uint16_t v) NLIB_NOEXCEPT {
587  NLIB_STATIC_ASSERT(N < 4);
588 #if defined(NLIB_SSE41)
589  return _mm_insert_epi16(value, static_cast<uint16_t>(v), N);
590 #elif defined(NLIB_NEON)
591  return vset_lane_u16(v, vreinterpret_u16_s8(value), N);
592 #endif
593 }
594 
595 template <size_t N>
596 // r = value, r.u32[N] = v
597 NLIB_M(i64) I64::SetUint32ToLane(i64arg value, uint32_t v) NLIB_NOEXCEPT {
598  NLIB_STATIC_ASSERT(N < 2);
599 #if defined(NLIB_SSE41)
600  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
601 #elif defined(NLIB_NEON)
602  return vset_lane_u32(v, vreinterpret_u32_s8(value), N);
603 #endif
604 }
605 
606 // r.i8[i] = a.i8[i] + b.i8[i]
607 NLIB_M(i64) I64::Add8(i64arg a, i64arg b) NLIB_NOEXCEPT {
608 #if defined(NLIB_SSE41)
609  return _mm_add_epi8(a, b);
610 #elif defined(NLIB_NEON)
611  return vadd_s8(a, b);
612 #endif
613 }
614 
615 // r.i16[i] = a.i16[i] + b.i16[i]
616 NLIB_M(i64) I64::Add16(i64arg a, i64arg b) NLIB_NOEXCEPT {
617 #if defined(NLIB_SSE41)
618  return _mm_add_epi16(a, b);
619 #elif defined(NLIB_NEON)
620  return NLIB_OP2(vadd, s16, a, b);
621 #endif
622 }
623 
624 // r.i32[i] = a.i32[i] + b.i32[i]
625 NLIB_M(i64) I64::Add32(i64arg a, i64arg b) NLIB_NOEXCEPT {
626 #if defined(NLIB_SSE41)
627  return _mm_add_epi32(a, b);
628 #elif defined(NLIB_NEON)
629  return NLIB_OP2(vadd, s32, a, b);
630 #endif
631 }
632 
633 // r.i64[0] = a.i64[0] + b.i64[0]
634 NLIB_M(i64) I64::Add64(i64arg a, i64arg b) NLIB_NOEXCEPT {
635 #if defined(NLIB_SSE41)
636  return _mm_add_epi64(a, b);
637 #elif defined(NLIB_NEON)
638  return NLIB_OP2(vadd, s64, a, b);
639 #endif
640 }
641 
642 // 127 if a[i] + b[i] > 127, -128 if a[i] + b[i] < -128
643 NLIB_M(i64) I64::AddInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
644 #if defined(NLIB_SSE41)
645  return _mm_adds_epi8(a, b);
646 #elif defined(NLIB_NEON)
647  return vqadd_s8(a, b);
648 #endif
649 }
650 
651 // 32767 if a[i] + b[i] > 32767, -32768 if a[i] + b[i] < -32768
652 NLIB_M(i64) I64::AddInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
653 #if defined(NLIB_SSE41)
654  return _mm_adds_epi16(a, b);
655 #elif defined(NLIB_NEON)
656  return NLIB_OP2(vqadd, s16, a, b);
657 #endif
658 }
659 
660 // 255 if a[i] + b[i] > 255
661 NLIB_M(i64) I64::AddUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
662 #if defined(NLIB_SSE41)
663  return _mm_adds_epu8(a, b);
664 #elif defined(NLIB_NEON)
665  return NLIB_OP2(vqadd, u8, a, b);
666 #endif
667 }
668 
669 // 65535 if a[i] + b[i] > 65535
670 NLIB_M(i64) I64::AddUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
671 #if defined(NLIB_SSE41)
672  return _mm_adds_epu16(a, b);
673 #elif defined(NLIB_NEON)
674  return NLIB_OP2(vqadd, u16, a, b);
675 #endif
676 }
677 
678 // r.i8[i] = a.i8[i] - b.i8[i]
679 NLIB_M(i64) I64::Sub8(i64arg a, i64arg b) NLIB_NOEXCEPT {
680 #if defined(NLIB_SSE41)
681  return _mm_sub_epi8(a, b);
682 #elif defined(NLIB_NEON)
683  return vsub_s8(a, b);
684 #endif
685 }
686 
687 // r.i16[i] = a.i16[i] - b.i16[i]
688 NLIB_M(i64) I64::Sub16(i64arg a, i64arg b) NLIB_NOEXCEPT {
689 #if defined(NLIB_SSE41)
690  return _mm_sub_epi16(a, b);
691 #elif defined(NLIB_NEON)
692  return NLIB_OP2(vsub, s16, a, b);
693 #endif
694 }
695 
696 // r.i32[i] = a.i32[i] - b.i32[i]
697 NLIB_M(i64) I64::Sub32(i64arg a, i64arg b) NLIB_NOEXCEPT {
698 #if defined(NLIB_SSE41)
699  return _mm_sub_epi32(a, b);
700 #elif defined(NLIB_NEON)
701  return NLIB_OP2(vsub, s32, a, b);
702 #endif
703 }
704 
705 // r.i64[0] = a.i64[0] - b.i64[0]
706 NLIB_M(i64) I64::Sub64(i64arg a, i64arg b) NLIB_NOEXCEPT {
707 #if defined(NLIB_SSE41)
708  return _mm_sub_epi64(a, b);
709 #elif defined(NLIB_NEON)
710  return NLIB_OP2(vsub, s64, a, b);
711 #endif
712 }
713 
714 // 127 if a[i] - b[i] > 127, -128 if a[i] - b[i] < -128
715 NLIB_M(i64) I64::SubInt8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
716 #if defined(NLIB_SSE41)
717  return _mm_subs_epi8(a, b);
718 #elif defined(NLIB_NEON)
719  return vqsub_s8(a, b);
720 #endif
721 }
722 
723 // 32767 if a[i] - b[i] > 32767, -32768 if a[i] - b[i] < -32768
724 NLIB_M(i64) I64::SubInt16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
725 #if defined(NLIB_SSE41)
726  return _mm_subs_epi16(a, b);
727 #elif defined(NLIB_NEON)
728  return NLIB_OP2(vqsub, s16, a, b);
729 #endif
730 }
731 
732 // 0 if a.u8[i] - b.u8[i] < 0
733 NLIB_M(i64) I64::SubUint8Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
734 #if defined(NLIB_SSE41)
735  return _mm_subs_epu8(a, b);
736 #elif defined(NLIB_NEON)
737  return NLIB_OP2(vqsub, u8, a, b);
738 #endif
739 }
740 
741 // 0 if a.u16[i] - b.u16[i] < 0
742 NLIB_M(i64) I64::SubUint16Saturated(i64arg a, i64arg b) NLIB_NOEXCEPT {
743 #if defined(NLIB_SSE41)
744  return _mm_subs_epu16(a, b);
745 #elif defined(NLIB_NEON)
746  return NLIB_OP2(vqsub, u16, a, b);
747 #endif
748 }
749 
750 // r.i8[0] = value[0] + value[1], ..., r.i8[7] = value[14] + value[15]
751 NLIB_M(i64) I64::PairwiseAdd8(i128arg value) NLIB_NOEXCEPT {
752 #if defined(NLIB_SSE41)
753  i128 ax = _mm_add_epi8(value, _mm_srli_epi16(value, 8));
754  return I64::NarrowFrom16To8(ax);
755 #elif defined(NLIB_NEON)
756  return vpadd_s8(vget_low_s8(value), vget_high_s8(value));
757 #endif
758 }
759 
760 // r.i16[0] = value[0] + value[1], ..., r.i16[3] = value[6] + value[7]
761 NLIB_M(i64) I64::PairwiseAdd16(i128arg value) NLIB_NOEXCEPT {
762 #if defined(NLIB_SSE41)
763  return _mm_hadd_epi16(value, value);
764 #elif defined(NLIB_NEON)
765  int8x8_t lo = vget_low_s8(value);
766  int8x8_t hi = vget_high_s8(value);
767  return NLIB_OP2(vpadd, s16, lo, hi);
768 #endif
769 }
770 
771 // r.i32[0] = value[0] + value[1], r.i32[1] = value[2] + value[3]
772 NLIB_M(i64) I64::PairwiseAdd32(i128arg value) NLIB_NOEXCEPT {
773 #if defined(NLIB_SSE41)
774  return _mm_hadd_epi32(value, value);
775 #elif defined(NLIB_NEON)
776  int8x8_t lo = vget_low_s8(value);
777  int8x8_t hi = vget_high_s8(value);
778  return NLIB_OP2(vpadd, s32, lo, hi);
779 #endif
780 }
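
// Illustrative example (not part of the original header): PairwiseAdd32 can
// start a horizontal sum of the four s32 lanes of an i128 value x:
//   i64 p = I64::PairwiseAdd32(x);  // { x0 + x1, x2 + x3 }
//   uint32_t sum = I64::GetUint32FromLane<0>(p) + I64::GetUint32FromLane<1>(p);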
781 
782 // r.i16[i] = a.i16[i] * b.i16[i]
783 NLIB_M(i64) I64::Mult16(i64arg a, i64arg b) NLIB_NOEXCEPT {
784 #if defined(NLIB_SSE41)
785  return _mm_mullo_epi16(a, b);
786 #elif defined(NLIB_NEON)
787  return NLIB_OP2(vmul, s16, a, b);
788 #endif
789 }
790 
791 // r = c + a * b
792 NLIB_M(i64) I64::MultAdd16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
793 #if defined(NLIB_SSE41)
794  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
795 #elif defined(NLIB_NEON)
796  return NLIB_OP3(vmla, s16, c, a, b);
797 #endif
798 }
799 
800 // r = c - a * b
801 NLIB_M(i64) I64::MultSub16(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
802 #if defined(NLIB_SSE41)
803  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
804 #elif defined(NLIB_NEON)
805  return NLIB_OP3(vmls, s16, c, a, b);
806 #endif
807 }
808 
809 // r.i32[i] = a.i32[i] * b.i32[i]
810 NLIB_M(i64) I64::Mult32(i64arg a, i64arg b) NLIB_NOEXCEPT {
811 #if defined(NLIB_SSE41)
812  return _mm_mullo_epi32(a, b);
813 #elif defined(NLIB_NEON)
814  return NLIB_OP2(vmul, s32, a, b);
815 #endif
816 }
817 
818 // r = c + a * b
819 NLIB_M(i64) I64::MultAdd32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
820 #if defined(NLIB_SSE41)
821  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
822 #elif defined(NLIB_NEON)
823  return NLIB_OP3(vmla, s32, c, a, b);
824 #endif
825 }
826 
827 // r = c - a * b
828 NLIB_M(i64) I64::MultSub32(i64arg a, i64arg b, i64arg c) NLIB_NOEXCEPT {
829 #if defined(NLIB_SSE41)
830  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
831 #elif defined(NLIB_NEON)
832  return NLIB_OP3(vmls, s32, c, a, b);
833 #endif
834 }
835 
836 // r.s8[i] = max(a.s8[i], b.s8[i])
837 NLIB_M(i64) I64::MaxInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
838 #if defined(NLIB_SSE41)
839  return _mm_max_epi8(a, b);
840 #elif defined(NLIB_NEON)
841  return vmax_s8(a, b);
842 #endif
843 }
844 
845 // r.s16[i] = max(a.s16[i], b.s16[i])
846 NLIB_M(i64) I64::MaxInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
847 #if defined(NLIB_SSE41)
848  return _mm_max_epi16(a, b);
849 #elif defined(NLIB_NEON)
850  return NLIB_OP2(vmax, s16, a, b);
851 #endif
852 }
853 
854 // r.s32[i] = max(a.s32[i], b.s32[i])
855 NLIB_M(i64) I64::MaxInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
856 #if defined(NLIB_SSE41)
857  return _mm_max_epi32(a, b);
858 #elif defined(NLIB_NEON)
859  return NLIB_OP2(vmax, s32, a, b);
860 #endif
861 }
862 
863 // r.u8[i] = max(a.u8[i], b.u8[i])
864 NLIB_M(i64) I64::MaxUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
865 #if defined(NLIB_SSE41)
866  return _mm_max_epu8(a, b);
867 #elif defined(NLIB_NEON)
868  return NLIB_OP2(vmax, u8, a, b);
869 #endif
870 }
871 
872 // r.u16[i] = max(a.u16[i], b.u16[i])
873 NLIB_M(i64) I64::MaxUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
874 #if defined(NLIB_SSE41)
875  return _mm_max_epu16(a, b);
876 #elif defined(NLIB_NEON)
877  return NLIB_OP2(vmax, u16, a, b);
878 #endif
879 }
880 
881 // r.u32[i] = max(a.u32[i], b.u32[i])
882 NLIB_M(i64) I64::MaxUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
883 #if defined(NLIB_SSE41)
884  return _mm_max_epu32(a, b);
885 #elif defined(NLIB_NEON)
886  return NLIB_OP2(vmax, u32, a, b);
887 #endif
888 }
889 
890 // r.s8[i] = min(a.s8[i], b.s8[i])
891 NLIB_M(i64) I64::MinInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
892 #if defined(NLIB_SSE41)
893  return _mm_min_epi8(a, b);
894 #elif defined(NLIB_NEON)
895  return vmin_s8(a, b);
896 #endif
897 }
898 
899 // r.s16[i] = min(a.s16[i], b.s16[i])
900 NLIB_M(i64) I64::MinInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
901 #if defined(NLIB_SSE41)
902  return _mm_min_epi16(a, b);
903 #elif defined(NLIB_NEON)
904  return NLIB_OP2(vmin, s16, a, b);
905 #endif
906 }
907 
908 // r.s32[i] = min(a.s32[i], b.s32[i])
909 NLIB_M(i64) I64::MinInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
910 #if defined(NLIB_SSE41)
911  return _mm_min_epi32(a, b);
912 #elif defined(NLIB_NEON)
913  return NLIB_OP2(vmin, s32, a, b);
914 #endif
915 }
916 
917 // r.u8[i] = min(a.u8[i], b.u8[i])
918 NLIB_M(i64) I64::MinUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
919 #if defined(NLIB_SSE41)
920  return _mm_min_epu8(a, b);
921 #elif defined(NLIB_NEON)
922  return NLIB_OP2(vmin, u8, a, b);
923 #endif
924 }
925 
926 // r.u16[i] = min(a.u16[i], b.u16[i])
927 NLIB_M(i64) I64::MinUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
928 #if defined(NLIB_SSE41)
929  return _mm_min_epu16(a, b);
930 #elif defined(NLIB_NEON)
931  return NLIB_OP2(vmin, u16, a, b);
932 #endif
933 }
934 
935 // r.u32[i] = min(a.u32[i], b.u32[i])
936 NLIB_M(i64) I64::MinUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
937 #if defined(NLIB_SSE41)
938  return _mm_min_epu32(a, b);
939 #elif defined(NLIB_NEON)
940  return NLIB_OP2(vmin, u32, a, b);
941 #endif
942 }
943 
944 // r.s8[i] = abs(value.s8[i])
945 NLIB_M(i64) I64::AbsInt8(i64arg value) NLIB_NOEXCEPT {
946 #if defined(NLIB_SSE41)
947  return _mm_abs_epi8(value);
948 #elif defined(NLIB_NEON)
949  return vabs_s8(value);
950 #endif
951 }
952 
953 // r.s16[i] = abs(value.s16[i])
954 NLIB_M(i64) I64::AbsInt16(i64arg value) NLIB_NOEXCEPT {
955 #if defined(NLIB_SSE41)
956  return _mm_abs_epi16(value);
957 #elif defined(NLIB_NEON)
958  return NLIB_OP1(vabs, s16, value);
959 #endif
960 }
961 
962 // r.s32[i] = abs(value.s32[i])
963 NLIB_M(i64) I64::AbsInt32(i64arg value) NLIB_NOEXCEPT {
964 #if defined(NLIB_SSE41)
965  return _mm_abs_epi32(value);
966 #elif defined(NLIB_NEON)
967  return NLIB_OP1(vabs, s32, value);
968 #endif
969 }
970 
971 // r.s8[i] = abs(a.s8[i] - b.s8[i])
972 NLIB_M(i64) I64::AbsDiffInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
973 #if defined(NLIB_SSE41)
974  return _mm_abs_epi8(_mm_sub_epi8(a, b));
975 #elif defined(NLIB_NEON)
976  return vabd_s8(a, b);
977 #endif
978 }
979 
980 // r.s16[i] = abs(a.s16[i] - b.s16[i])
981 NLIB_M(i64) I64::AbsDiffInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
982 #if defined(NLIB_SSE41)
983  return _mm_abs_epi16(_mm_sub_epi16(a, b));
984 #elif defined(NLIB_NEON)
985  return NLIB_OP2(vabd, s16, a, b);
986 #endif
987 }
988 
989 // r.s32[i] = abs(a.s32[i] - b.s32[i])
990 NLIB_M(i64) I64::AbsDiffInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
991 #if defined(NLIB_SSE41)
992  return _mm_abs_epi32(_mm_sub_epi32(a, b));
993 #elif defined(NLIB_NEON)
994  return NLIB_OP2(vabd, s32, a, b);
995 #endif
996 }
997 
998 // r.s8[i] = -value.s8[i]
999 NLIB_M(i64) I64::NegateInt8(i64arg value) NLIB_NOEXCEPT {
1000 #if defined(NLIB_SSE41)
1001  return _mm_sub_epi8(_mm_setzero_si128(), value);
1002 #elif defined(NLIB_NEON)
1003  return vneg_s8(value);
1004 #endif
1005 }
1006 
1007 // r.s16[i] = -value.s16[i]
1008 NLIB_M(i64) I64::NegateInt16(i64arg value) NLIB_NOEXCEPT {
1009 #if defined(NLIB_SSE41)
1010  return _mm_sub_epi16(_mm_setzero_si128(), value);
1011 #elif defined(NLIB_NEON)
1012  return NLIB_OP1(vneg, s16, value);
1013 #endif
1014 }
1015 
1016 // r.s32[i] = -value.s32[i]
1017 NLIB_M(i64) I64::NegateInt32(i64arg value) NLIB_NOEXCEPT {
1018 #if defined(NLIB_SSE41)
1019  return _mm_sub_epi32(_mm_setzero_si128(), value);
1020 #elif defined(NLIB_NEON)
1021  return NLIB_OP1(vneg, s32, value);
1022 #endif
1023 }
1024 
1025 // r = a & b
1026 NLIB_M(i64) I64::And(i64arg a, i64arg b) NLIB_NOEXCEPT {
1027 #if defined(NLIB_SSE41)
1028  return _mm_and_si128(a, b);
1029 #elif defined(NLIB_NEON)
1030  return vand_s8(a, b);
1031 #endif
1032 }
1033 
1034 // r = a | b
1035 NLIB_M(i64) I64::Or(i64arg a, i64arg b) NLIB_NOEXCEPT {
1036 #if defined(NLIB_SSE41)
1037  return _mm_or_si128(a, b);
1038 #elif defined(NLIB_NEON)
1039  return vorr_s8(a, b);
1040 #endif
1041 }
1042 
1043 // r = a ^ b
1044 NLIB_M(i64) I64::Xor(i64arg a, i64arg b) NLIB_NOEXCEPT {
1045 #if defined(NLIB_SSE41)
1046  return _mm_xor_si128(a, b);
1047 #elif defined(NLIB_NEON)
1048  return veor_s8(a, b);
1049 #endif
1050 }
1051 
1052 // r = ~a
1053 NLIB_M(i64) I64::Not(i64arg a) NLIB_NOEXCEPT {
1054 #if defined(NLIB_SSE41)
1055  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1056 #elif defined(NLIB_NEON)
1057  return vmvn_s8(a);
1058 #endif
1059 }
1060 
1061 // r = ~a & b
1062 NLIB_M(i64) I64::AndNot(i64arg a, i64arg b) NLIB_NOEXCEPT {
1063 #if defined(NLIB_SSE41)
1064  return _mm_andnot_si128(a, b);
1065 #elif defined(NLIB_NEON)
1066  return vbic_s8(b, a);
1067 #endif
1068 }
1069 
1070 // r = ~a | b
1071 NLIB_M(i64) I64::OrNot(i64arg a, i64arg b) NLIB_NOEXCEPT {
1072 #if defined(NLIB_SSE41)
1073  __m128i full = _mm_cmpeq_epi8(a, a);
1074  return _mm_or_si128(_mm_andnot_si128(a, full), b);
1075 #elif defined(NLIB_NEON)
1076  return vorn_s8(b, a);
1077 #endif
1078 }
1079 
1080 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
1081 NLIB_M(i64) I64::CmpEq8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1082 #if defined(NLIB_SSE41)
1083  return _mm_cmpeq_epi8(a, b);
1084 #elif defined(NLIB_NEON)
1085  return vreinterpret_s8_u8(vceq_s8(a, b));
1086 #endif
1087 }
1088 
1089 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
1090 NLIB_M(i64) I64::CmpEq16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1091 #if defined(NLIB_SSE41)
1092  return _mm_cmpeq_epi16(a, b);
1093 #elif defined(NLIB_NEON)
1094  return NLIB_CMP(vceq, s16, a, b, u16);
1095 #endif
1096 }
1097 
1098 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
1099 NLIB_M(i64) I64::CmpEq32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1100 #if defined(NLIB_SSE41)
1101  return _mm_cmpeq_epi32(a, b);
1102 #elif defined(NLIB_NEON)
1103  return NLIB_CMP(vceq, s32, a, b, u32);
1104 #endif
1105 }
1106 
1107 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
1108 NLIB_M(i64) I64::CmpLtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1109 #if defined(NLIB_SSE41)
1110  return _mm_cmplt_epi8(a, b);
1111 #elif defined(NLIB_NEON)
1112  return NLIB_CMP(vclt, s8, a, b, u8);
1113 #endif
1114 }
1115 
1116 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
1117 NLIB_M(i64) I64::CmpLtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1118 #if defined(NLIB_SSE41)
1119  return _mm_cmplt_epi16(a, b);
1120 #elif defined(NLIB_NEON)
1121  return NLIB_CMP(vclt, s16, a, b, u16);
1122 #endif
1123 }
1124 
1125 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
1126 NLIB_M(i64) I64::CmpLtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1127 #if defined(NLIB_SSE41)
1128  return _mm_cmplt_epi32(a, b);
1129 #elif defined(NLIB_NEON)
1130  return NLIB_CMP(vclt, s32, a, b, u32);
1131 #endif
1132 }
1133 
1134 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
1135 NLIB_M(i64) I64::CmpGtInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1136 #if defined(NLIB_SSE41)
1137  return _mm_cmpgt_epi8(a, b);
1138 #elif defined(NLIB_NEON)
1139  return NLIB_CMP(vcgt, s8, a, b, u8);
1140 #endif
1141 }
1142 
1143 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
1144 NLIB_M(i64) I64::CmpGtInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1145 #if defined(NLIB_SSE41)
1146  return _mm_cmpgt_epi16(a, b);
1147 #elif defined(NLIB_NEON)
1148  return NLIB_CMP(vcgt, s16, a, b, u16);
1149 #endif
1150 }
1151 
1152 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
1153 NLIB_M(i64) I64::CmpGtInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1154 #if defined(NLIB_SSE41)
1155  return _mm_cmpgt_epi32(a, b);
1156 #elif defined(NLIB_NEON)
1157  return NLIB_CMP(vcgt, s32, a, b, u32);
1158 #endif
1159 }
1160 
1161 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
1162 NLIB_M(i64) I64::CmpLtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1163 #if defined(NLIB_SSE41)
1164  __m128i ofs = _mm_shuffle_epi8(_mm_cvtsi32_si128(0x80), _mm_setzero_si128());
1165  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1166 #elif defined(NLIB_NEON)
1167  return NLIB_CMP(vclt, u8, a, b, u8);
1168 #endif
1169 }
1170 
1171 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
1172 NLIB_M(i64) I64::CmpGtUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1173 #if defined(NLIB_SSE41)
1174  __m128i ofs = _mm_shuffle_epi8(_mm_cvtsi32_si128(0x80), _mm_setzero_si128());
1175  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1176 #elif defined(NLIB_NEON)
1177  return NLIB_CMP(vcgt, u8, a, b, u8);
1178 #endif
1179 }
1180 
1181 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
1182 NLIB_M(i64) I64::CmpLtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1183 #if defined(NLIB_SSE41)
1184  __m128i ofs = _mm_set1_epi16(INT16_MIN);
1185  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1186 #elif defined(NLIB_NEON)
1187  return NLIB_CMP(vclt, u16, a, b, u16);
1188 #endif
1189 }
1190 
1191 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
1192 NLIB_M(i64) I64::CmpGtUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1193 #if defined(NLIB_SSE41)
1194  __m128i ofs = _mm_set1_epi16(INT16_MIN);
1195  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1196 #elif defined(NLIB_NEON)
1197  return NLIB_CMP(vcgt, u16, a, b, u16);
1198 #endif
1199 }
1200 
1201 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
1202 NLIB_M(i64) I64::CmpLtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1203 #if defined(NLIB_SSE41)
1204  __m128i ofs = _mm_set1_epi32(INT32_MIN);
1205  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1206 #elif defined(NLIB_NEON)
1207  return NLIB_CMP(vclt, u32, a, b, u32);
1208 #endif
1209 }
1210 
1211 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
1212 NLIB_M(i64) I64::CmpGtUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1213 #if defined(NLIB_SSE41)
1214  __m128i ofs = _mm_set1_epi32(INT32_MIN);
1215  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1216 #elif defined(NLIB_NEON)
1217  return NLIB_CMP(vcgt, u32, a, b, u32);
1218 #endif
1219 }
1220 
1221 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
1222 NLIB_M(i64) I64::CmpLeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1223 #if defined(NLIB_SSE41)
1224  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1225 #elif defined(NLIB_NEON)
1226  return NLIB_CMP(vcle, s8, a, b, u8);
1227 #endif
1228 }
1229 
1230 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
1231 NLIB_M(i64) I64::CmpLeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1232 #if defined(NLIB_SSE41)
1233  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1234 #elif defined(NLIB_NEON)
1235  return NLIB_CMP(vcle, s16, a, b, u16);
1236 #endif
1237 }
1238 
1239 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
1240 NLIB_M(i64) I64::CmpLeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1241 #if defined(NLIB_SSE41)
1242  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1243 #elif defined(NLIB_NEON)
1244  return NLIB_CMP(vcle, s32, a, b, u32);
1245 #endif
1246 }
1247 
1248 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
1249 NLIB_M(i64) I64::CmpGeInt8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1250 #if defined(NLIB_SSE41)
1251  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1252 #elif defined(NLIB_NEON)
1253  return NLIB_CMP(vcge, s8, a, b, u8);
1254 #endif
1255 }
1256 
1257 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
1258 NLIB_M(i64) I64::CmpGeInt16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1259 #if defined(NLIB_SSE41)
1260  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1261 #elif defined(NLIB_NEON)
1262  return NLIB_CMP(vcge, s16, a, b, u16);
1263 #endif
1264 }
1265 
1266 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
1267 NLIB_M(i64) I64::CmpGeInt32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1268 #if defined(NLIB_SSE41)
1269  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1270 #elif defined(NLIB_NEON)
1271  return NLIB_CMP(vcge, s32, a, b, u32);
1272 #endif
1273 }
1274 
1275 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
1276 NLIB_M(i64) I64::CmpLeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1277 #if defined(NLIB_SSE41)
1278  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
1279 #elif defined(NLIB_NEON)
1280  return NLIB_CMP(vcle, u8, a, b, u8);
1281 #endif
1282 }
1283 
1284 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
1285 NLIB_M(i64) I64::CmpLeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1286 #if defined(NLIB_SSE41)
1287  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
1288 #elif defined(NLIB_NEON)
1289  return NLIB_CMP(vcle, u16, a, b, u16);
1290 #endif
1291 }
1292 
1293 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
1294 NLIB_M(i64) I64::CmpLeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1295 #if defined(NLIB_SSE41)
1296  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
1297 #elif defined(NLIB_NEON)
1298  return NLIB_CMP(vcle, u32, a, b, u32);
1299 #endif
1300 }
1301 
1302 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
1303 NLIB_M(i64) I64::CmpGeUint8(i64arg a, i64arg b) NLIB_NOEXCEPT {
1304 #if defined(NLIB_SSE41)
1305  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
1306 #elif defined(NLIB_NEON)
1307  return NLIB_CMP(vcge, u8, a, b, u8);
1308 #endif
1309 }
1310 
1311 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
1312 NLIB_M(i64) I64::CmpGeUint16(i64arg a, i64arg b) NLIB_NOEXCEPT {
1313 #if defined(NLIB_SSE41)
1314  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
1315 #elif defined(NLIB_NEON)
1316  return NLIB_CMP(vcge, u16, a, b, u16);
1317 #endif
1318 }
1319 
1320 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
1321 NLIB_M(i64) I64::CmpGeUint32(i64arg a, i64arg b) NLIB_NOEXCEPT {
1322 #if defined(NLIB_SSE41)
1323  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
1324 #elif defined(NLIB_NEON)
1325  return NLIB_CMP(vcge, u32, a, b, u32);
1326 #endif
1327 }
1328 
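// Note on the NEON paths below: the shift count is a runtime value, so
// NLIB_SFT broadcasts it with vdup_n and uses vshl; a right shift is
// expressed as a left shift by -count.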
1329 // r.u16[i] = value.u16[i] << count
1330 NLIB_M(i64) I64::ShiftLeftLogical16(i64arg value, int count) NLIB_NOEXCEPT {
1331 #if defined(NLIB_SSE41)
1332  return _mm_slli_epi16(value, count);
1333 #elif defined(NLIB_NEON)
1334  return NLIB_SFT(vshl, u16, value, count, s16);
1335 #endif
1336 }
1337 
1338 // r.u16[i] = value.u16[i] >> count
1339 NLIB_M(i64) I64::ShiftRightLogical16(i64arg value, int count) NLIB_NOEXCEPT {
1340 #if defined(NLIB_SSE41)
1341  return _mm_srli_epi16(value, count);
1342 #elif defined(NLIB_NEON)
1343  return NLIB_SFT(vshl, u16, value, -count, s16);
1344 #endif
1345 }
1346 
1347 // r.s16[i] = value.s16[i] >> count
1348 NLIB_M(i64) I64::ShiftRightArithmetic16(i64arg value, int count) NLIB_NOEXCEPT {
1349 #if defined(NLIB_SSE41)
1350  return _mm_srai_epi16(value, count);
1351 #elif defined(NLIB_NEON)
1352  return NLIB_SFT(vshl, s16, value, -count, s16);
1353 #endif
1354 }
1355 
1356 // r.u32[i] = value.u32[i] << count
1357 NLIB_M(i64) I64::ShiftLeftLogical32(i64arg value, int count) NLIB_NOEXCEPT {
1358 #if defined(NLIB_SSE41)
1359  return _mm_slli_epi32(value, count);
1360 #elif defined(NLIB_NEON)
1361  return NLIB_SFT(vshl, u32, value, count, s32);
1362 #endif
1363 }
1364 
1365 // r.u32[i] = value.u32[i] >> count
1366 NLIB_M(i64) I64::ShiftRightLogical32(i64arg value, int count) NLIB_NOEXCEPT {
1367 #if defined(NLIB_SSE41)
1368  return _mm_srli_epi32(value, count);
1369 #elif defined(NLIB_NEON)
1370  return NLIB_SFT(vshl, u32, value, -count, s32);
1371 #endif
1372 }
1373 
1374 // r.s32[i] = value.s32[i] >> count
1375 NLIB_M(i64) I64::ShiftRightArithmetic32(i64arg value, int count) NLIB_NOEXCEPT {
1376 #if defined(NLIB_SSE41)
1377  return _mm_srai_epi32(value, count);
1378 #elif defined(NLIB_NEON)
1379  return NLIB_SFT(vshl, s32, value, -count, s32);
1380 #endif
1381 }
1382 
1383 template <size_t N>
1384 // r.i8[i + N] = value.i8[i], fill with zero
1385 NLIB_M(i64) I64::ByteShiftLeft(i64arg value) NLIB_NOEXCEPT {
1386  NLIB_STATIC_ASSERT(N < 8);
1387 #if defined(NLIB_SSE41)
1388  return _mm_slli_epi64(value, N * 8);
1389 #elif defined(NLIB_NEON)
1390  return vreinterpret_s8_u64(vshl_n_u64(vreinterpret_u64_s8(value), N * 8));
1391 #endif
1392 }
1393 
1394 template <size_t N>
1395 // r.i8[i - N] = value.i8[i], fill with zero
1396 NLIB_M(i64) I64::ByteShiftRight(i64arg value) NLIB_NOEXCEPT {
1397  NLIB_STATIC_ASSERT(N < 8);
1398 #if defined(NLIB_SSE41)
1399  return _mm_srli_epi64(value, N * 8);
1400 #elif defined(NLIB_NEON)
1401  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(value), N * 8));
1402 #endif
1403 }
1404 
1405 template <size_t N>
1406 // r.i8[i] = value.i8[(i + N) % 8] (rotate the 8 bytes right by N)
1407 NLIB_M(i64) I64::ByteRotateRight(i64arg value) NLIB_NOEXCEPT {
1408  NLIB_STATIC_ASSERT(N < 8);
1409 #if defined(NLIB_SSE41)
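 // duplicating the low 64 bits into both halves makes _mm_alignr_epi8 rotate
 // the eight bytes instead of shifting in zeros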
1410  i64 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 1, 0));
1411  return _mm_alignr_epi8(tmp, tmp, N);
1412 #elif defined(NLIB_NEON)
1413  return vext_s8(value, value, N);
1414 #endif
1415 }
1416 
1417 // r.u8[i] = value.u8[i * 2]
1418 NLIB_M(i64) I64::NarrowFrom16To8(i128arg value) NLIB_NOEXCEPT {
1419 #if defined(NLIB_SSE41)
1420  NLIB_ALIGNAS(16) const int8_t mask[16] = {
1421  0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1
1422  };
1423  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
1424 #elif defined(NLIB_NEON)
1425  return vreinterpret_s8_u8(vmovn_u16(vreinterpretq_u16_s8(value)));
1426 #endif
1427 }
1428 
1429 // r.u16[i] = value.u16[i * 2]
1430 NLIB_M(i64) I64::NarrowFrom32To16(i128arg value) NLIB_NOEXCEPT {
1431 #if defined(NLIB_SSE41)
1432  NLIB_ALIGNAS(16) const int8_t mask[16] = {
1433  0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1
1434  };
1435  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
1436 #elif defined(NLIB_NEON)
1437  return vreinterpret_s8_u16(vmovn_u32(vreinterpretq_u32_s8(value)));
1438 #endif
1439 }
1440 
1441 // r.u32[i] = value.u32[i * 2]
1442 NLIB_M(i64) I64::NarrowFrom64To32(i128arg value) NLIB_NOEXCEPT {
1443 #if defined(NLIB_SSE41)
1444  return _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 1, 2, 0));
1445 #elif defined(NLIB_NEON)
1446  return vreinterpret_s8_u32(vmovn_u64(vreinterpretq_u64_s8(value)));
1447 #endif
1448 }
1449 
1450 // r.u8[i] = 255 if value.u16[i] > 255
1451 NLIB_M(i64) I64::ConvertFromUint16ToUint8Saturated(i128arg value) NLIB_NOEXCEPT {
1452 #if defined(NLIB_SSE41)
1453  i64 clamped = _mm_min_epu16(value, _mm_set1_epi16(0x00FF));
1454  return _mm_packus_epi16(clamped, clamped);
1455 #elif defined(NLIB_NEON)
1456  return vreinterpret_s8_u8(vqmovn_u16(vreinterpretq_u16_s8(value)));
1457 #endif
1458 }
1459 
1460 // r.s8[i] = 127 if value.s16[i] > 127, -128 if value.s16[i] < -128
1461 NLIB_M(i64) I64::ConvertFromInt16ToInt8Saturated(i128arg value) NLIB_NOEXCEPT {
1462 #if defined(NLIB_SSE41)
1463  return _mm_packs_epi16(value, value);
1464 #elif defined(NLIB_NEON)
1465  return vqmovn_s16(vreinterpretq_s16_s8(value));
1466 #endif
1467 }
1468 
1469 // r.u16[i] = 65535 if value.u32[i] > 65535
1470 NLIB_M(i64) I64::ConvertFromUint32ToUint16Saturated(i128arg value) NLIB_NOEXCEPT {
1471 #if defined(NLIB_SSE41)
1472  i64 clamped = _mm_min_epu32(value, _mm_set1_epi32(0xFFFF));
1473  return _mm_packus_epi32(clamped, clamped);
1474 #elif defined(NLIB_NEON)
1475  return vreinterpret_s8_u16(vqmovn_u32(vreinterpretq_u32_s8(value)));
1476 #endif
1477 }
1478 
1479 // r.s16[i] = 32767 if value.s32[i] > 32767, -32768 if value.s32[i] < -32768
1480 NLIB_M(i64) I64::ConvertFromInt32ToInt16Saturated(i128arg value) NLIB_NOEXCEPT {
1481 #if defined(NLIB_SSE41)
1482  return _mm_packs_epi32(value, value);
1483 #elif defined(NLIB_NEON)
1484  return vreinterpret_s8_s16(vqmovn_s32(vreinterpretq_s32_s8(value)));
1485 #endif
1486 }
1487 
1488 // ab cd ef gh -> ba dc fe hg
1489 NLIB_M(i64) I64::Reverse16(i64arg value) NLIB_NOEXCEPT {
1490 #if defined(NLIB_SSE41)
1491  NLIB_ALIGNAS(8) const int8_t mask_[8] = {
1492  1, 0, 3, 2, 5, 4, 7, 6
1493  };
1494  i128 mask = I64::LoadA8(&mask_[0]);
1495  return _mm_shuffle_epi8(value, mask);
1496 #elif defined(NLIB_NEON)
1497  return NLIB_OP1(vrev16, u8, value);
1498 #endif
1499 }
1500 
1501 // abcd efgh -> dcba hgfe
1502 NLIB_M(i64) I64::Reverse32(i64arg value) NLIB_NOEXCEPT {
1503 #if defined(NLIB_SSE41)
1504  NLIB_ALIGNAS(8) const int8_t mask_[8] = {
1505  3, 2, 1, 0, 7, 6, 5, 4
1506  };
1507  i128 mask = I64::LoadA8(&mask_[0]);
1508  return _mm_shuffle_epi8(value, mask);
1509 #elif defined(NLIB_NEON)
1510  return NLIB_OP1(vrev32, u8, value);
1511 #endif
1512 }
1513 
1514 // abcdefgh -> hgfedcba
1515 NLIB_M(i64) I64::Reverse64(i64arg value) NLIB_NOEXCEPT {
1516 #if defined(NLIB_SSE41)
1517  NLIB_ALIGNAS(8) const int8_t mask_[8] = {
1518  7, 6, 5, 4, 3, 2, 1, 0
1519  };
1520  i128 mask = I64::LoadA8(&mask_[0]);
1521  return _mm_shuffle_epi8(value, mask);
1522 #elif defined(NLIB_NEON)
1523  return NLIB_OP1(vrev64, u8, value);
1524 #endif
1525 }
1526 
1527 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
1528 NLIB_M(int) I64::MoveMask8(i64arg value) NLIB_NOEXCEPT { // NOLINT
1529 #if defined(NLIB_SSE41)
1530  return _mm_movemask_epi8(value) & 0xFF;
1531 #elif defined(NLIB_NEON)
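 // for a comparison-mask input (each byte 0x00 or 0xFF), ANDing with the bit
 // weights {1, 2, 4, ..., 128} leaves each lane's weight or 0; three pairwise
 // adds then fold the eight lanes into the single result byte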
1532  NLIB_ALIGNAS(16) static const uint64_t powers_[1] = {0x8040201008040201ULL};
1533  uint8x8_t powers = vreinterpret_u8_u64(vld1_u64(powers_));
1534  uint8x8_t a = vand_u8(vreinterpret_u8_s8(value), powers);
1535  uint8x8_t tmp = vpadd_u8(a, a);
1536  tmp = vpadd_u8(tmp, tmp);
1537  tmp = vpadd_u8(tmp, tmp);
1538  return vget_lane_u8(tmp, 0);
1539 #endif
1540 }
1541 
1542 // true if value == 0
1543 NLIB_M(bool) I64::IsZero(i64arg value) NLIB_NOEXCEPT { // NOLINT
1544 #if defined(NLIB_SSE41)
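 // only the low 64 bits of an i64 are significant, so copy them into the
 // upper half before the 128-bit zero test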
1545  i64 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 1, 0));
1546  return _mm_testz_si128(tmp, tmp) != 0;
1547 #elif defined(NLIB_NEON)
1548  return vget_lane_u64(vreinterpret_u64_s8(value), 0) == 0;
1549 #endif
1550 }
1551 
1552 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
1553 NLIB_M(i64) I64::Select(i64arg mask, i64arg a, i64arg b) NLIB_NOEXCEPT {
1554 #if defined(NLIB_SSE41)
1555  return _mm_blendv_epi8(b, a, mask);
1556 #elif defined(NLIB_NEON)
1557  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
1558 #endif
1559 }
1560 
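// Illustrative note (not part of the original header): the comparison results
// above are full-lane masks, which combine directly with Select. For example,
// given i64 values a and b, a branch-free pick of the larger unsigned byte is:
//   i64 mask = I64::CmpGtUint8(a, b);  // 0xFF where a.u8[i] > b.u8[i]
//   i64 r = I64::Select(mask, a, b);   // a where the mask is set, b elsewhere
// (MaxUint8 computes this directly; the point is the mask-and-Select pattern.)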
1561 #undef NLIB_M
1562 #endif // NLIB_DOXYGEN
1563 
1564 class I128 {
 public:
1565  static i128 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
1566  static i128 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
1567  static i128 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
1568  static i128 __vectorcall SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT;
1569  static i128 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
1570  static i128 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
1571  static i128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
1572  static i128 __vectorcall SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT;
1573 
1574  template <size_t N>
1575  static i128 __vectorcall SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT;
1576  template <size_t N>
1577  static i128 __vectorcall SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT;
1578  template <size_t N>
1579  static i128 __vectorcall SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT;
1580 
1581  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
1582  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
1583 
1584  static i64 __vectorcall GetLo(i128 value) NLIB_NOEXCEPT;
1585  static i64 __vectorcall GetHi(i128 value) NLIB_NOEXCEPT;
1586 
1587  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
1588  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
1589  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
1590  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
1591  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
1592  static i128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT {
1593  return LoadA16(reinterpret_cast<void*>(p));
1594  } // NOLINT
1595  static i128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT {
1596  return LoadA8(reinterpret_cast<void*>(p));
1597  } // NOLINT
1598  static i128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT {
1599  return LoadA4(reinterpret_cast<void*>(p));
1600  } // NOLINT
1601  static i128 __vectorcall LoadA2(uintptr_t p) NLIB_NOEXCEPT {
1602  return LoadA2(reinterpret_cast<void*>(p));
1603  } // NOLINT
1604  static i128 __vectorcall LoadA1(uintptr_t p) NLIB_NOEXCEPT {
1605  return LoadA1(reinterpret_cast<void*>(p));
1606  } // NOLINT
1607  static i128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT {
1608  return LoadA16(reinterpret_cast<void*>(p));
1609  } // NOLINT
1610  static i128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT {
1611  return LoadA8(reinterpret_cast<void*>(p));
1612  } // NOLINT
1613  static i128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT {
1614  return LoadA4(reinterpret_cast<void*>(p));
1615  } // NOLINT
1616  static i128 __vectorcall LoadA2(intptr_t p) NLIB_NOEXCEPT {
1617  return LoadA2(reinterpret_cast<void*>(p));
1618  } // NOLINT
1619  static i128 __vectorcall LoadA1(intptr_t p) NLIB_NOEXCEPT {
1620  return LoadA1(reinterpret_cast<void*>(p));
1621  } // NOLINT
1622 
1623  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
1624  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
1625  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
1626  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
1627  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
1628  static void __vectorcall StoreA16(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
1629  StoreA16(reinterpret_cast<void*>(p), value);
1630  } // NOLINT
1631  static void __vectorcall StoreA8(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
1632  StoreA8(reinterpret_cast<void*>(p), value);
1633  } // NOLINT
1634  static void __vectorcall StoreA4(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
1635  StoreA4(reinterpret_cast<void*>(p), value);
1636  } // NOLINT
1637  static void __vectorcall StoreA2(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
1638  StoreA2(reinterpret_cast<void*>(p), value);
1639  } // NOLINT
1640  static void __vectorcall StoreA1(uintptr_t p, i128arg value) NLIB_NOEXCEPT {
1641  StoreA1(reinterpret_cast<void*>(p), value);
1642  } // NOLINT
1643  static void __vectorcall StoreA16(intptr_t p, i128arg value) NLIB_NOEXCEPT {
1644  StoreA16(reinterpret_cast<void*>(p), value);
1645  } // NOLINT
1646  static void __vectorcall StoreA8(intptr_t p, i128arg value) NLIB_NOEXCEPT {
1647  StoreA8(reinterpret_cast<void*>(p), value);
1648  } // NOLINT
1649  static void __vectorcall StoreA4(intptr_t p, i128arg value) NLIB_NOEXCEPT {
1650  StoreA4(reinterpret_cast<void*>(p), value);
1651  } // NOLINT
1652  static void __vectorcall StoreA2(intptr_t p, i128arg value) NLIB_NOEXCEPT {
1653  StoreA2(reinterpret_cast<void*>(p), value);
1654  } // NOLINT
1655  static void __vectorcall StoreA1(intptr_t p, i128arg value) NLIB_NOEXCEPT {
1656  StoreA1(reinterpret_cast<void*>(p), value);
1657  } // NOLINT
1658 
1659  //
1660  // Get/Set
1661  //
1662  template <size_t N>
1663  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
1664  template <size_t N>
1665  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
1666  template <size_t N>
1667  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
1668  template <size_t N>
1669  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
1670  template <size_t N>
1671  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
1672  template <size_t N>
1673  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
1674  template <size_t N>
1675  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
1676  template <size_t N>
1677  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
1678 
1679  //
1680  // Arithmetic Operations
1681  //
1682  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1683  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1684  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1685  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
1686 
1687  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1688  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1689 
1690  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1691  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1692 
1693  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1694  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1695  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1696  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
1697 
1698  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1699  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1700 
1701  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1702  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
1703 
1704  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1705  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1706  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1707  // PairwiseMaxInt8, PairwiseMaxInt16, PairwiseMaxInt32
1708  // PairwiseMaxUint8, PairwiseMaxUint16, PairwiseMaxUint32
1709  // PairwiseMinInt8, PairwiseMinInt16, PairwiseMinInt32
1710  // PairwiseMinUint8, PairwiseMinUint16, PairwiseMinUint32
1711 
1712  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1713  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
1714  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
1715  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1716  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
1717  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
1718 
1719  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
1720  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
1721  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
1722 
1723  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1724  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1725  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1726  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1727  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1728  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1729  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1730  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1731  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1732  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1733  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1734  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1735 
1736  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
1737  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
1738  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
1739  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1740  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1741  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1742 
1743  //
1744  // Logical Operations
1745  //
1746  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
1747  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
1748  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
1749  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
1750  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
1751  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
1752 
1753  //
1754  // Comparison Operations
1755  //
1756  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1757  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1758  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1759 
1760  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1761  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1762  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1763 
1764  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1765  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1766  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1767 
1768  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1769  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1770  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1771 
1772  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1773  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1774  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1775 
1776  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1777  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1778  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1779 
1780  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1781  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1782  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1783 
1784  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1785  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1786  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1787 
1788  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
1789  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
1790  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
1791 
1792  //
1793  // Bit Shift
1794  //
1795  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
1796  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
1797 
1798  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
1799  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
1800  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
1801 
1802  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
1803  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
1804  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
1805 
1806  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
1807  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
1808 
1809  //
1810  // 128bit wide Byte Shift/Rotate
1811  //
1812  template <size_t N>
1813  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
1814  template <size_t N>
1815  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
1816  template <size_t N>
1817  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
1818  template <size_t N>
1819  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
1820 
1821  //
1822  // Conversion
1823  //
1824  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
1825  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
1826  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
1827 
1828  static i128 __vectorcall
1829  ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
1830  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
1831  static i128 __vectorcall
1832  ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
1833  static i128 __vectorcall
1834  ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
1835 
1836  static i128 __vectorcall ConvertFromInt8ToInt16(i64arg value) NLIB_NOEXCEPT;
1837  static i128 __vectorcall ConvertFromInt16ToInt32(i64arg value) NLIB_NOEXCEPT;
1838  static i128 __vectorcall ConvertFromInt32ToInt64(i64arg value) NLIB_NOEXCEPT;
1839 
1840  static i128 __vectorcall ConvertFromUint8ToUint16(i64arg value) NLIB_NOEXCEPT;
1841  static i128 __vectorcall ConvertFromUint16ToUint32(i64arg value) NLIB_NOEXCEPT;
1842  static i128 __vectorcall ConvertFromUint32ToUint64(i64arg value) NLIB_NOEXCEPT;
1843 
1844  static i128 __vectorcall Zip8(i64arg a, i64arg b) NLIB_NOEXCEPT;
1845  static i128 __vectorcall Unzip8(i64arg a, i64arg b) NLIB_NOEXCEPT;
1846  static i128 __vectorcall Zip16(i64arg a, i64arg b) NLIB_NOEXCEPT;
1847  static i128 __vectorcall Unzip16(i64arg a, i64arg b) NLIB_NOEXCEPT;
1848  static i128 __vectorcall Zip32(i64arg a, i64arg b) NLIB_NOEXCEPT;
1849  static i128 __vectorcall Unzip32(i64arg a, i64arg b) NLIB_NOEXCEPT;
1850 
1851  //
1852  // Swap endian
1853  //
1854  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
1855  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
1856  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
1857 
1858  //
1859  // Misc
1860  //
1861  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
1862  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
1863  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
1864  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
1865  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
1866  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
1867  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
1868 
1869  private:
1870  I128(); // forbidden
1871 };
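// Usage sketch (illustrative addition, not in the original header): the
// usual load / compute / store pattern with the I128 class declared above.
// AddArraysA16 is a hypothetical helper; it assumes both buffers are
// 16-byte aligned and n is a multiple of four uint32_t elements.
inline void AddArraysA16(uint32_t* dst, const uint32_t* x, const uint32_t* y,
                         size_t n) NLIB_NOEXCEPT {
  for (size_t i = 0; i < n; i += 4) {
    i128 a = I128::LoadA16(x + i);               // aligned 16-byte load
    i128 b = I128::LoadA16(y + i);
    I128::StoreA16(dst + i, I128::Add32(a, b));  // per-lane 32-bit add
  }
}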
1872 
1873 #ifndef NLIB_DOXYGEN
1874 
1875 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
1876 #define NLIB_M2(tp) inline tp __vectorcall
1877 
1878 #ifdef NLIB_NEON
1879 # undef vreinterpret_s8_s8
1880 # undef NLIB_OP1
1881 # undef NLIB_OP2
1882 # undef NLIB_OP3
1883 # undef NLIB_CMP
1884 # undef NLIB_SFT
1885 # undef NLIB_CMB
1886 
1887 #define vreinterpretq_s8_s8(a) (a)
1888 #define NLIB_OP1(intrin, tp, a) \
1889  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
1890 #define NLIB_OP2(intrin, tp, a, b) \
1891  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
1892  vreinterpretq_##tp##_s8(b)))
1893 #define NLIB_OP3(intrin, tp, a, b, c) \
1894  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
1895  vreinterpretq_##tp##_s8(b), \
1896  vreinterpretq_##tp##_s8(c)))
1897 #define NLIB_CMP(intrin, tp, a, b, utp) \
1898  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
1899  vreinterpretq_##tp##_s8(b)))
1900 #define NLIB_SFT(intrin, tp, a, cnt, stp) \
1901  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
1902 #define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
1903 #endif
1904 
1905 // r.s8[i] = v for all i
1906 NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
1907 #if defined(NLIB_SSE41)
1908  // faster than _mm_set1_epi8(v)
1909  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
1910 #elif defined(NLIB_NEON)
1911  return vdupq_n_s8(v);
1912 #endif
1913 }
1914 
1915 // r.s16[i] = v for all i
1916 NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
1917 #if defined(NLIB_SSE41)
1918  return _mm_set1_epi16(v);
1919 #elif defined(NLIB_NEON)
1920  return vreinterpretq_s8_s16(vdupq_n_s16(v));
1921 #endif
1922 }
1923 
1924 // r.s32[i] = v for all i
1925 NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
1926 #if defined(NLIB_SSE41)
1927  return _mm_set1_epi32(v);
1928 #elif defined(NLIB_NEON)
1929  return vreinterpretq_s8_s32(vdupq_n_s32(v));
1930 #endif
1931 }
1932 
1933 // r.s64[i] = v for all i
1934 NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
1935 #if defined(NLIB_SSE41)
1936 #ifdef _MSC_VER
1937  // conflicts with ffloor()?
1938  // Do not use MMX anyway
1939  // return _mm_set1_epi64(*reinterpret_cast<__m64*>(&x));
1940  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
1941  return I128::LoadA16(tmp);
1942 #else
1943  return _mm_set1_epi64x(v);
1944 #endif
1945 #elif defined(NLIB_NEON)
1946  return vreinterpretq_s8_s64(vdupq_n_s64(v));
1947 #endif
1948 }
1949 
1950 // r.u8[i] = v for all i
1951 NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
1952 #if defined(NLIB_SSE41)
1953  // faster than _mm_set1_epi8(v)
1954  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
1955 #elif defined(NLIB_NEON)
1956  return vreinterpretq_s8_u8(vdupq_n_u8(v));
1957 #endif
1958 }
1959 
1960 // r.u16[i] = v for all i
1961 NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
1962 #if defined(NLIB_SSE41)
1963  return _mm_set1_epi16(static_cast<int16_t>(v));
1964 #elif defined(NLIB_NEON)
1965  return vreinterpretq_s8_u16(vdupq_n_u16(v));
1966 #endif
1967 }
1968 
1969 // r.u32[i] = v for all i
1970 NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
1971 #if defined(NLIB_SSE41)
1972  return _mm_set1_epi32(static_cast<int32_t>(v));
1973 #elif defined(NLIB_NEON)
1974  return vreinterpretq_s8_u32(vdupq_n_u32(v));
1975 #endif
1976 }
1977 
1978 // r.u64[i] = v for all i
1979 NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
1980 #if defined(NLIB_SSE41)
1981 #ifdef _MSC_VER
1982  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
1983  return I128::LoadA16(tmp);
1984 #else
1985  return _mm_set1_epi64x(static_cast<int64_t>(v));
1986 #endif
1987 #elif defined(NLIB_NEON)
1988  return vreinterpretq_s8_u64(vdupq_n_u64(v));
1989 #endif
1990 }
1991 
1992 #if defined(NLIB_SSE41)
1993 template <size_t N>
1994 // r.u32[i] = value.u32[N] for all i
1995 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
1996  NLIB_STATIC_ASSERT(N < 4);
1997  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
1998 }
1999 #elif defined(NLIB_NEON)
2000 # ifdef __aarch64__
2001 template <size_t N>
2002 // r.u32[i] = value.u32[N] for all i
2003 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
2004  NLIB_STATIC_ASSERT(N < 4);
2005  uint32x4_t v = vreinterpretq_u32_s8(value);
2006  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
2007 }
2008 # else
2009 template <>
2010 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
2011  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
2012  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
2013 }
2014 template <>
2015 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
2016  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
2017  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
2018 }
2019 template <>
2020 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
2021  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
2022  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
2023 }
2024 template <>
2025 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
2026  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
2027  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
2028 }
2029 # endif
2030 #endif
2031 
2032 #if defined(NLIB_SSE41)
2033 template <size_t N>
2034 // r.u16[i] = value.u16[N] for all i
2035 NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2036  NLIB_STATIC_ASSERT(N < 8);
2037  NLIB_ALIGNAS(16) const int8_t mask[16] = {
2038  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
2039  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
2040  };
2041  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
2042 }
2043 #elif defined(NLIB_NEON)
2044 template <>
2045 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2046 # ifdef __aarch64__
2047  uint16x8_t v = vreinterpretq_u16_s8(value);
2048  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
2049 # else
2050  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
2051  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
2052 # endif
2053 }
2054 
2055 template <>
2056 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2057 # ifdef __aarch64__
2058  uint16x8_t v = vreinterpretq_u16_s8(value);
2059  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
2060 # else
2061  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
2062  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
2063 # endif
2064 }
2065 
2066 template <>
2067 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2068 # ifdef __aarch64__
2069  uint16x8_t v = vreinterpretq_u16_s8(value);
2070  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
2071 # else
2072  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
2073  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
2074 # endif
2075 }
2076 
2077 template <>
2078 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2079 # ifdef __aarch64__
2080  uint16x8_t v = vreinterpretq_u16_s8(value);
2081  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
2082 # else
2083  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
2084  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
2085 # endif
2086 }
2087 
2088 template <>
2089 NLIB_M(i128) I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2090 # ifdef __aarch64__
2091  uint16x8_t v = vreinterpretq_u16_s8(value);
2092  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
2093 # else
2094  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
2095  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
2096 # endif
2097 }
2098 
2099 template <>
2100 NLIB_M(i128) I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2101 # ifdef __aarch64__
2102  uint16x8_t v = vreinterpretq_u16_s8(value);
2103  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
2104 # else
2105  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
2106  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
2107 # endif
2108 }
2109 
2110 template <>
2111 NLIB_M(i128) I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2112 # ifdef __aarch64__
2113  uint16x8_t v = vreinterpretq_u16_s8(value);
2114  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
2115 # else
2116  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
2117  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
2118 # endif
2119 }
2120 
2121 template <>
2122 NLIB_M(i128) I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
2123 # ifdef __aarch64__
2124  uint16x8_t v = vreinterpretq_u16_s8(value);
2125  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
2126 # else
2127  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
2128  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
2129 # endif
2130 }
2131 #endif
2132 
2133 #if defined(NLIB_SSE41)
2134 template <size_t N>
2135 // r.u8[i] = value.u8[N] for all i
2136 NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
2137  NLIB_STATIC_ASSERT(N < 16);
2138  NLIB_ALIGNAS(16) const int8_t mask[16] = {
2139  N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
2140  };
2141  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
2142 }
2143 #elif defined(NLIB_NEON)
2144 namespace detail {
2145 template <size_t N, bool IsLower>
2146 struct SetValue8Helper {
2147  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
2148  return vdupq_lane_s8(vget_low_s8(value), N);
2149  }
2150 };
2151 
2152 template <size_t N>
2153 struct SetValue8Helper<N, false> {
2154  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
2155  return vdupq_lane_s8(vget_high_s8(value), N - 8);
2156  }
2157 };
2158 
2159 } // namespace detail
2160 
2161 template <size_t N>
2162 NLIB_M(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
2163  NLIB_STATIC_ASSERT(N < 16);
2164 # ifdef __aarch64__
2165  return vdupq_laneq_s8(value, N);
2166 # else
2167  return detail::SetValue8Helper<N, (N < 8)>()(value);
2168 # endif
2169 }
2170 #endif
2171 
2172 // set 0 to all bits
2173 NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
2174 #if defined(NLIB_SSE41)
2175  return _mm_setzero_si128();
2176 #elif defined(NLIB_NEON)
2177  return vdupq_n_s8(0);
2178 #endif
2179 }
2180 
2181 // set 1 to all bits
2182 NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
2183 
2184 // r = value.u8[0-7]
2185 NLIB_M(i64) I128::GetLo(i128 value) NLIB_NOEXCEPT {
2186 #if defined(NLIB_SSE41)
2187  return value;
2188 #elif defined(NLIB_NEON)
2189  return vget_low_s8(value);
2190 #endif
2191 }
2192 
2193 // r = value.u8[8-15]
2194 NLIB_M(i64) I128::GetHi(i128 value) NLIB_NOEXCEPT {
2195 #if defined(NLIB_SSE41)
2196  return _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2197 #elif defined(NLIB_NEON)
2198  return vget_high_s8(value);
2199 #endif
2200 }
2201 
2202 // r[i] = p[i], p is 16 bytes aligned
2203 NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
2204 #if defined(NLIB_SSE41)
2205  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
2206 #elif defined(NLIB_NEON)
2207  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
2208  return vreinterpretq_s8_u64(tmp);
2209 #endif
2210 }
2211 
2212 // r[i] = p[i], p is 8 bytes aligned
2213 NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
2214 #if defined(NLIB_SSE41)
2215  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
2216 #elif defined(NLIB_NEON)
2217  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
2218  return vreinterpretq_s8_u64(tmp);
2219 #endif
2220 }
2221 
2222 // r[i] = p[i], p is 4 bytes aligned
2223 NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
2224 #if defined(NLIB_SSE41)
2225  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
2226 #elif defined(NLIB_NEON)
2227  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
2228  return vreinterpretq_s8_u32(tmp);
2229 #endif
2230 }
2231 
2232 // r[i] = p[i], p is 2 bytes aligned
2233 NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
2234 #if defined(NLIB_SSE41)
2235  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
2236 #elif defined(NLIB_NEON)
2237  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
2238  return vreinterpretq_s8_u16(tmp);
2239 #endif
2240 }
2241 
2242 // r[i] = p[i]
2243 NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
2244 #if defined(NLIB_SSE41)
2245  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
2246 #elif defined(NLIB_NEON)
2247  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
2248 #endif
2249 }
2250 
2251 // p[i] = value[i], p is 16 bytes aligned
2252 NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
2253 #if defined(NLIB_SSE41)
2254  _mm_store_si128(reinterpret_cast<i128*>(p), value);
2255 #elif defined(NLIB_NEON)
2256  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
2257 #endif
2258 }
2259 
2260 // p[i] = value[i], p is 8 bytes aligned
2261 NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
2262 #if defined(NLIB_SSE41)
2263  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
2264 #elif defined(NLIB_NEON)
2265  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
2266 #endif
2267 }
2268 
2269 // p[i] = value[i], p is 4 bytes aligned
2270 NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
2271 #if defined(NLIB_SSE41)
2272  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
2273 #elif defined(NLIB_NEON)
2274  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
2275 #endif
2276 }
2277 
2278 // p[i] = value[i], p is 2 bytes aligned
2279 NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
2280 #if defined(NLIB_SSE41)
2281  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
2282 #elif defined(NLIB_NEON)
2283  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
2284 #endif
2285 }
2286 
2287 // p[i] = value[i]
2288 NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
2289 #if defined(NLIB_SSE41)
2290  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
2291 #elif defined(NLIB_NEON)
2292  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
2293 #endif
2294 }
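// Usage sketch (illustrative addition): the A<n> suffix of the load/store
// variants states the alignment guaranteed for p; when nothing is known
// about the pointer, the A1 forms are the safe choice. CopyBytes16 is a
// hypothetical helper.
inline void CopyBytes16(void* dst, const void* src) NLIB_NOEXCEPT {
  I128::StoreA1(dst, I128::LoadA1(src));  // no alignment assumed on either side
}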
2295 
2296 template <size_t N>
2297 // r = value.u8[N]
2298 NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
2299  NLIB_STATIC_ASSERT(N < 16);
2300 #if defined(NLIB_SSE41)
2301  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
2302 #elif defined(NLIB_NEON)
2303  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
2304 #endif
2305 }
2306 
2307 template <size_t N>
2308 // r = value.u16[N]
2309 NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
2310  NLIB_STATIC_ASSERT(N < 8);
2311 #if defined(NLIB_SSE41)
2312  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
2313 #elif defined(NLIB_NEON)
2314  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
2315 #endif
2316 }
2317 
2318 template <size_t N>
2319 // r = value.u32[N]
2320 NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
2321  NLIB_STATIC_ASSERT(N < 4);
2322 #if defined(NLIB_SSE41)
2323  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
2324 #elif defined(NLIB_NEON)
2325  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
2326 #endif
2327 }
2328 
2329 template <size_t N>
2330 // r = value.u64[N]
2331 NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
2332  NLIB_STATIC_ASSERT(N < 2);
2333 #if defined(NLIB_SSE41)
2334 #ifdef NLIB_64BIT
2335  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
2336 #else
2337  NLIB_UNUSED(value);
2338  return 0; // dummy
2339 #endif
2340 #elif defined(NLIB_NEON)
2341  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
2342 #endif
2343 }
2344 
2345 #if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
2346 template <>
2347 NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
2348  uint64_t rval;
2349  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
2350  return rval;
2351 }
2352 template <>
2353 NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
2354  uint64_t rval;
2355  i128 tmp = I128::GetHi(value);
2356  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
2357  return rval;
2358 }
2359 #endif
2360 
2361 template <size_t N>
2362 // r = value, r.u8[N] = v
2363 NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
2364  NLIB_STATIC_ASSERT(N < 16);
2365 #if defined(NLIB_SSE41)
2366  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
2367 #elif defined(NLIB_NEON)
2368  return vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
2369 #endif
2370 }
2371 
2372 template <size_t N>
2373 // r = value, r.u16[N] = v
2374 NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
2375  NLIB_STATIC_ASSERT(N < 8);
2376 #if defined(NLIB_SSE41)
2377  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
2378 #elif defined(NLIB_NEON)
2379  return vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
2380 #endif
2381 }
2382 
2383 template <size_t N>
2384 // r = value, r.u32[N] = v
2385 NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
2386  NLIB_STATIC_ASSERT(N < 4);
2387 #if defined(NLIB_SSE41)
2388  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
2389 #elif defined(NLIB_NEON)
2390  return vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
2391 #endif
2392 }
2393 
2394 template <size_t N>
2395 // r = value, r.u64[N] = v
2396 NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
2397  NLIB_STATIC_ASSERT(N < 2);
2398 #if defined(NLIB_SSE41)
2399 #ifdef NLIB_64BIT
2400  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
2401 #else
2402  union {
2403  int32_t i32[2];
2404  int64_t i64;
2405  } tmp;
2406  tmp.i64 = static_cast<int64_t>(v);
2407  __m128i rval;
2408  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
2409  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
2410 #endif
2411 #elif defined(NLIB_NEON)
2412  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
2413 #endif
2414 }
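// Usage sketch (illustrative addition): lane accessors take the lane index
// as a compile-time template argument. IncrementLane0 is a hypothetical
// helper.
inline i128 IncrementLane0(i128arg value) NLIB_NOEXCEPT {
  uint32_t lane0 = I128::GetUint32FromLane<0>(value);
  return I128::SetUint32ToLane<0>(value, lane0 + 1);  // other lanes unchanged
}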
2415 
2416 // r.i8[i] = a.i8[i] + b.i8[i]
2417 NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2418 #if defined(NLIB_SSE41)
2419  return _mm_add_epi8(a, b);
2420 #elif defined(NLIB_NEON)
2421  return vaddq_s8(a, b);
2422 #endif
2423 }
2424 
2425 // r.i16[i] = a.i16[i] + b.i16[i]
2426 NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2427 #if defined(NLIB_SSE41)
2428  return _mm_add_epi16(a, b);
2429 #elif defined(NLIB_NEON)
2430  return NLIB_OP2(vaddq, s16, a, b);
2431 #endif
2432 }
2433 
2434 // r.i32[i] = a.i32[i] + b.i32[i]
2435 NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2436 #if defined(NLIB_SSE41)
2437  return _mm_add_epi32(a, b);
2438 #elif defined(NLIB_NEON)
2439  return NLIB_OP2(vaddq, s32, a, b);
2440 #endif
2441 }
2442 
2443 // r.i64[i] = a.i64[i] + b.i64[i]
2444 NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2445 #if defined(NLIB_SSE41)
2446  return _mm_add_epi64(a, b);
2447 #elif defined(NLIB_NEON)
2448  return NLIB_OP2(vaddq, s64, a, b);
2449 #endif
2450 }
2451 
2452 // 127 if a[i] + b[i] > 127, -128 if a[i] + b[i] < -128
2453 NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2454 #if defined(NLIB_SSE41)
2455  return _mm_adds_epi8(a, b);
2456 #elif defined(NLIB_NEON)
2457  return vqaddq_s8(a, b);
2458 #endif
2459 }
2460 
2461 // 32767 if a[i] + b[i] > 32767, -32768 if a[i] + b[i] < -32768
2462 NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2463 #if defined(NLIB_SSE41)
2464  return _mm_adds_epi16(a, b);
2465 #elif defined(NLIB_NEON)
2466  return NLIB_OP2(vqaddq, s16, a, b);
2467 #endif
2468 }
2469 
2470 // 255 if a[i] + b[i] > 255
2471 NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2472 #if defined(NLIB_SSE41)
2473  return _mm_adds_epu8(a, b);
2474 #elif defined(NLIB_NEON)
2475  return NLIB_OP2(vqaddq, u8, a, b);
2476 #endif
2477 }
2478 
2479 // 65535 if a[i] + b[i] > 65535
2480 NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2481 #if defined(NLIB_SSE41)
2482  return _mm_adds_epu16(a, b);
2483 #elif defined(NLIB_NEON)
2484  return NLIB_OP2(vqaddq, u16, a, b);
2485 #endif
2486 }
2487 
2488 // r.i8[i] = a.i8[i] - b.i8[i]
2489 NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2490 #if defined(NLIB_SSE41)
2491  return _mm_sub_epi8(a, b);
2492 #elif defined(NLIB_NEON)
2493  return vsubq_s8(a, b);
2494 #endif
2495 }
2496 
2497 // r.i16[i] = a.i16[i] - b.i16[i]
2498 NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2499 #if defined(NLIB_SSE41)
2500  return _mm_sub_epi16(a, b);
2501 #elif defined(NLIB_NEON)
2502  return NLIB_OP2(vsubq, s16, a, b);
2503 #endif
2504 }
2505 
2506 // r.i32[i] = a.i32[i] - b.i32[i]
2507 NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2508 #if defined(NLIB_SSE41)
2509  return _mm_sub_epi32(a, b);
2510 #elif defined(NLIB_NEON)
2511  return NLIB_OP2(vsubq, s32, a, b);
2512 #endif
2513 }
2514 
2515 // r.i64[i] = a.i64[i] - b.i64[i]
2516 NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2517 #if defined(NLIB_SSE41)
2518  return _mm_sub_epi64(a, b);
2519 #elif defined(NLIB_NEON)
2520  return NLIB_OP2(vsubq, s64, a, b);
2521 #endif
2522 }
2523 
2524 // 127 if a[i] - b[i] > 127, -128 if a[i] - b[i] < -128
2525 NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2526 #if defined(NLIB_SSE41)
2527  return _mm_subs_epi8(a, b);
2528 #elif defined(NLIB_NEON)
2529  return NLIB_OP2(vqsubq, s8, a, b);
2530 #endif
2531 }
2532 
2533 // 32767 if a[i] - b[i] > 32767, -32768 if a[i] - b[i] < -32768
2534 NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2535 #if defined(NLIB_SSE41)
2536  return _mm_subs_epi16(a, b);
2537 #elif defined(NLIB_NEON)
2538  return NLIB_OP2(vqsubq, s16, a, b);
2539 #endif
2540 }
2541 
2542 // 0 if a.u8[i] - b.u8[i] < 0
2543 NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2544 #if defined(NLIB_SSE41)
2545  return _mm_subs_epu8(a, b);
2546 #elif defined(NLIB_NEON)
2547  return NLIB_OP2(vqsubq, u8, a, b);
2548 #endif
2549 }
2550 
2551 // 0 if a.u16[i] - b.u16[i] < 0
2552 NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
2553 #if defined(NLIB_SSE41)
2554  return _mm_subs_epu16(a, b);
2555 #elif defined(NLIB_NEON)
2556  return NLIB_OP2(vqsubq, u16, a, b);
2557 #endif
2558 }
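// Usage sketch (illustrative addition): saturated adds clamp instead of
// wrapping, which is convenient for 8-bit pixel math. AddBrightness is a
// hypothetical helper; each_uint8 is the tag value used elsewhere in this
// header.
inline i128 AddBrightness(i128arg pixels, uint8_t delta) NLIB_NOEXCEPT {
  i128 d = I128::SetValue(delta, each_uint8);     // broadcast delta to 16 bytes
  return I128::AddUint8Saturated(pixels, d);      // per-byte add, clamped at 255
}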
2559 
2560 // r.i8[0] = a.i8[0] + a.i8[1], ..., r.i8[7] = a.i8[14] + a.i8[15], r.i8[8] = b.i8[0] + b.i8[1], ..., r.i8[15] = b.i8[14] + b.i8[15]
2561 NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2562 #if defined(NLIB_SSE41)
2563  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
2564  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
2565  return I128::NarrowFrom16To8(ax, bx);
2566 #elif defined(NLIB_NEON)
2567 # ifdef __aarch64__
2568  return vpaddq_s8(a, b);
2569 # else
2570  int8x8_t al = vget_low_s8(a);
2571  int8x8_t ah = vget_high_s8(a);
2572  int8x8_t rl = vpadd_s8(al, ah);
2573  int8x8_t bl = vget_low_s8(b);
2574  int8x8_t bh = vget_high_s8(b);
2575  int8x8_t rh = vpadd_s8(bl, bh);
2576  return vcombine_s8(rl, rh);
2577 # endif
2578 #endif
2579 }
2580 
2581 // r.i16[0] = a.i16[0] + a.i16[1], ..., r.i16[3] = a.i16[6] + a.i16[7], r.i16[4] = b.i16[0] + b.i16[1], ..., r.i16[7] = b.i16[6] + b.i16[7]
2582 NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2583 #if defined(NLIB_SSE41)
2584  return _mm_hadd_epi16(a, b);
2585 #elif defined(NLIB_NEON)
2586 # ifdef __aarch64__
2587  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
2588 # else
2589  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
2590  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
2591  int16x4_t rl = vpadd_s16(al, ah);
2592  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
2593  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
2594  int16x4_t rh = vpadd_s16(bl, bh);
2595  return NLIB_CMB(s16, rl, rh);
2596 # endif
2597 #endif
2598 }
2599 
2600 // r.i32[0] = a.i32[0] + a.i32[1], r.i32[1] = a.i32[2] + a.i32[3], r.i32[2] = b.i32[0] + b.i32[1], r.i32[3] = b.i32[2] + b.i32[3]
2601 NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2602 #if defined(NLIB_SSE41)
2603  return _mm_hadd_epi32(a, b);
2604 #elif defined(NLIB_NEON)
2605 # ifdef __aarch64__
2606  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
2607 # else
2608  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
2609  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
2610  int32x2_t rl = vpadd_s32(al, ah);
2611  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
2612  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
2613  int32x2_t rh = vpadd_s32(bl, bh);
2614  return NLIB_CMB(s32, rl, rh);
2615 # endif
2616 #endif
2617 }
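// Usage sketch (illustrative addition): applying PairwiseAdd32 twice folds
// the four 32-bit lanes into a single total, which ends up in lane 0 (and,
// with the arguments below, in every other lane as well). HorizontalSum32
// is a hypothetical helper.
inline uint32_t HorizontalSum32(i128arg value) NLIB_NOEXCEPT {
  i128 t = I128::PairwiseAdd32(value, value);  // [v0+v1, v2+v3, v0+v1, v2+v3]
  t = I128::PairwiseAdd32(t, t);               // every lane = v0+v1+v2+v3
  return I128::GetUint32FromLane<0>(t);
}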
2618 
2619 // r.i16[i] = a.i16[i] * b.i16[i]
2620 NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2621 #if defined(NLIB_SSE41)
2622  return _mm_mullo_epi16(a, b);
2623 #elif defined(NLIB_NEON)
2624  return NLIB_OP2(vmulq, s16, a, b);
2625 #endif
2626 }
2627 
2628 // r = c + a * b
2629 NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
2630 #if defined(NLIB_SSE41)
2631  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
2632 #elif defined(NLIB_NEON)
2633  return NLIB_OP3(vmlaq, s16, c, a, b);
2634 #endif
2635 }
2636 
2637 // r = c - a * b
2638 NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
2639 #if defined(NLIB_SSE41)
2640  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
2641 #elif defined(NLIB_NEON)
2642  return NLIB_OP3(vmlsq, s16, c, a, b);
2643 #endif
2644 }
2645 
2646 // r.i32[i] = a.i32[i] * b.i32[i]
2647 NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2648 #if defined(NLIB_SSE41)
2649  return _mm_mullo_epi32(a, b);
2650 #elif defined(NLIB_NEON)
2651  return NLIB_OP2(vmulq, s32, a, b);
2652 #endif
2653 }
2654 
2655 // r = c + a * b
2656 NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
2657 #if defined(NLIB_SSE41)
2658  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
2659 #elif defined(NLIB_NEON)
2660  return NLIB_OP3(vmlaq, s32, c, a, b);
2661 #endif
2662 }
2663 
2664 // r = c - a * b
2665 NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
2666 #if defined(NLIB_SSE41)
2667  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
2668 #elif defined(NLIB_NEON)
2669  return NLIB_OP3(vmlsq, s32, c, a, b);
2670 #endif
2671 }
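// Usage sketch (illustrative addition): MultAdd32 evaluates c + a * b in a
// single call, the usual building block for fixed-point accumulation.
// Axpy32 is a hypothetical helper computing y + a * x per 32-bit lane.
inline i128 Axpy32(i128arg a, i128arg x, i128arg y) NLIB_NOEXCEPT {
  return I128::MultAdd32(a, x, y);  // y + a * x (low 32 bits of each product)
}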
2672 
2673 // r.s8[i] = max(a.s8[i], b.s8[i])
2674 NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2675 #if defined(NLIB_SSE41)
2676  return _mm_max_epi8(a, b);
2677 #elif defined(NLIB_NEON)
2678  return NLIB_OP2(vmaxq, s8, a, b);
2679 #endif
2680 }
2681 
2682 // r.s16[i] = max(a.s16[i], b.s16[i])
2683 NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2684 #if defined(NLIB_SSE41)
2685  return _mm_max_epi16(a, b);
2686 #elif defined(NLIB_NEON)
2687  return NLIB_OP2(vmaxq, s16, a, b);
2688 #endif
2689 }
2690 
2691 // r.s32[i] = max(a.s32[i], b.s32[i])
2692 NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2693 #if defined(NLIB_SSE41)
2694  return _mm_max_epi32(a, b);
2695 #elif defined(NLIB_NEON)
2696  return NLIB_OP2(vmaxq, s32, a, b);
2697 #endif
2698 }
2699 
2700 // r.u8[i] = max(a.u8[i], b.u8[i])
2701 NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2702 #if defined(NLIB_SSE41)
2703  return _mm_max_epu8(a, b);
2704 #elif defined(NLIB_NEON)
2705  return NLIB_OP2(vmaxq, u8, a, b);
2706 #endif
2707 }
2708 
2709 // r.u16[i] = max(a.u16[i], b.u16[i])
2710 NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2711 #if defined(NLIB_SSE41)
2712  return _mm_max_epu16(a, b);
2713 #elif defined(NLIB_NEON)
2714  return NLIB_OP2(vmaxq, u16, a, b);
2715 #endif
2716 }
2717 
2718 // r.u32[i] = max(a.u32[i], b.u32[i])
2719 NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2720 #if defined(NLIB_SSE41)
2721  return _mm_max_epu32(a, b);
2722 #elif defined(NLIB_NEON)
2723  return NLIB_OP2(vmaxq, u32, a, b);
2724 #endif
2725 }
2726 
2727 // r.s8[i] = min(a.s8[i], b.s8[i])
2728 NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2729 #if defined(NLIB_SSE41)
2730  return _mm_min_epi8(a, b);
2731 #elif defined(NLIB_NEON)
2732  return NLIB_OP2(vminq, s8, a, b);
2733 #endif
2734 }
2735 
2736 // r.s16[i] = min(a.s16[i], b.s16[i])
2737 NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2738 #if defined(NLIB_SSE41)
2739  return _mm_min_epi16(a, b);
2740 #elif defined(NLIB_NEON)
2741  return NLIB_OP2(vminq, s16, a, b);
2742 #endif
2743 }
2744 
2745 // r.s32[i] = min(a.s32[i], b.s32[i])
2746 NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2747 #if defined(NLIB_SSE41)
2748  return _mm_min_epi32(a, b);
2749 #elif defined(NLIB_NEON)
2750  return NLIB_OP2(vminq, s32, a, b);
2751 #endif
2752 }
2753 
2754 // r.u8[i] = min(a.u8[i], b.u8[i])
2755 NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2756 #if defined(NLIB_SSE41)
2757  return _mm_min_epu8(a, b);
2758 #elif defined(NLIB_NEON)
2759  return NLIB_OP2(vminq, u8, a, b);
2760 #endif
2761 }
2762 
2763 // r.u16[i] = min(a.u16[i], b.u16[i])
2764 NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2765 #if defined(NLIB_SSE41)
2766  return _mm_min_epu16(a, b);
2767 #elif defined(NLIB_NEON)
2768  return NLIB_OP2(vminq, u16, a, b);
2769 #endif
2770 }
2771 
2772 // r.u32[i] = min(a.u32[i], b.u32[i])
2773 NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2774 #if defined(NLIB_SSE41)
2775  return _mm_min_epu32(a, b);
2776 #elif defined(NLIB_NEON)
2777  return NLIB_OP2(vminq, u32, a, b);
2778 #endif
2779 }
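// Usage sketch (illustrative addition): Min/Max compose into a per-lane
// clamp. ClampUint16 is a hypothetical helper; it assumes lo.u16[i] <=
// hi.u16[i] for every lane.
inline i128 ClampUint16(i128arg value, i128arg lo, i128arg hi) NLIB_NOEXCEPT {
  return I128::MinUint16(I128::MaxUint16(value, lo), hi);  // lo <= r <= hi
}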
2780 
2781 // r.s8[i] = abs(value.s8[i])
2782 NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
2783 #if defined(NLIB_SSE41)
2784  return _mm_abs_epi8(value);
2785 #elif defined(NLIB_NEON)
2786  return NLIB_OP1(vabsq, s8, value);
2787 #endif
2788 }
2789 
2790 // r.s16[i] = abs(value.s16[i])
2791 NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
2792 #if defined(NLIB_SSE41)
2793  return _mm_abs_epi16(value);
2794 #elif defined(NLIB_NEON)
2795  return NLIB_OP1(vabsq, s16, value);
2796 #endif
2797 }
2798 
2799 // r.s32[i] = abs(value.s32[i])
2800 NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
2801 #if defined(NLIB_SSE41)
2802  return _mm_abs_epi32(value);
2803 #elif defined(NLIB_NEON)
2804  return NLIB_OP1(vabsq, s32, value);
2805 #endif
2806 }
2807 
2808 // r.s8[i] = abs(a.s8[i] - b.s8[i])
2809 NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2810 #if defined(NLIB_SSE41)
2811  return _mm_abs_epi8(_mm_sub_epi8(a, b));
2812 #elif defined(NLIB_NEON)
2813  return NLIB_OP2(vabdq, s8, a, b);
2814 #endif
2815 }
2816 
2817 // r.s16[i] = abs(a.s16[i] - b.s16[i])
2818 NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2819 #if defined(NLIB_SSE41)
2820  return _mm_abs_epi16(_mm_sub_epi16(a, b));
2821 #elif defined(NLIB_NEON)
2822  return NLIB_OP2(vabdq, s16, a, b);
2823 #endif
2824 }
2825 
2826 // r.s32[i] = abs(a.s32[i] - b.s32[i])
2827 NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2828 #if defined(NLIB_SSE41)
2829  return _mm_abs_epi32(_mm_sub_epi32(a, b));
2830 #elif defined(NLIB_NEON)
2831  return NLIB_OP2(vabdq, s32, a, b);
2832 #endif
2833 }
2834 
2835 // r.s8[i] = -value.s8[i]
2836 NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
2837 #if defined(NLIB_SSE41)
2838  return _mm_sub_epi8(_mm_setzero_si128(), value);
2839 #elif defined(NLIB_NEON)
2840  return NLIB_OP1(vnegq, s8, value);
2841 #endif
2842 }
2843 
2844 // r.s16[i] = -value.s16[i]
2845 NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
2846 #if defined(NLIB_SSE41)
2847  return _mm_sub_epi16(_mm_setzero_si128(), value);
2848 #elif defined(NLIB_NEON)
2849  return NLIB_OP1(vnegq, s16, value);
2850 #endif
2851 }
2852 
2853 // r.s32[i] = -value.s32[i]
2854 NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
2855 #if defined(NLIB_SSE41)
2856  return _mm_sub_epi32(_mm_setzero_si128(), value);
2857 #elif defined(NLIB_NEON)
2858  return NLIB_OP1(vnegq, s32, value);
2859 #endif
2860 }
2861 
2862 // r = a & b
2863 NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
2864 #if defined(NLIB_SSE41)
2865  return _mm_and_si128(a, b);
2866 #elif defined(NLIB_NEON)
2867  return NLIB_OP2(vandq, s8, a, b);
2868 #endif
2869 }
2870 
2871 // r = a | b
2872 NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
2873 #if defined(NLIB_SSE41)
2874  return _mm_or_si128(a, b);
2875 #elif defined(NLIB_NEON)
2876  return NLIB_OP2(vorrq, s8, a, b);
2877 #endif
2878 }
2879 
2880 // r = a ^ b
2881 NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
2882 #if defined(NLIB_SSE41)
2883  return _mm_xor_si128(a, b);
2884 #elif defined(NLIB_NEON)
2885  return NLIB_OP2(veorq, s8, a, b);
2886 #endif
2887 }
2888 
2889 // r = ~a
2890 NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
2891 #if defined(NLIB_SSE41)
2892  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
2893 #elif defined(NLIB_NEON)
2894  return NLIB_OP1(vmvnq, s8, a);
2895 #endif
2896 }
2897 
2898 // r = ~a & b
2899 NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
2900 #if defined(NLIB_SSE41)
2901  return _mm_andnot_si128(a, b);
2902 #elif defined(NLIB_NEON)
2903  return NLIB_OP2(vbicq, s8, b, a);
2904 #endif
2905 }
2906 
2907 // r = ~a | b
2908 NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
2909 #if defined(NLIB_SSE41)
2910  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
2911  return _mm_or_si128(not_a, b);
2912 #elif defined(NLIB_NEON)
2913  return NLIB_OP2(vornq, s8, b, a);
2914 #endif
2915 }
2916 
2917 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
2918 NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2919 #if defined(NLIB_SSE41)
2920  return _mm_cmpeq_epi8(a, b);
2921 #elif defined(NLIB_NEON)
2922  return NLIB_CMP(vceqq, s8, a, b, u8);
2923 #endif
2924 }
2925 
2926 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
2927 NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2928 #if defined(NLIB_SSE41)
2929  return _mm_cmpeq_epi16(a, b);
2930 #elif defined(NLIB_NEON)
2931  return NLIB_CMP(vceqq, s16, a, b, u16);
2932 #endif
2933 }
2934 
2935 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
2936 NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2937 #if defined(NLIB_SSE41)
2938  return _mm_cmpeq_epi32(a, b);
2939 #elif defined(NLIB_NEON)
2940  return NLIB_CMP(vceqq, s32, a, b, u32);
2941 #endif
2942 }
2943 
2944 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
2945 NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2946 #if defined(NLIB_SSE41)
2947  return _mm_cmplt_epi8(a, b);
2948 #elif defined(NLIB_NEON)
2949  return NLIB_CMP(vcltq, s8, a, b, u8);
2950 #endif
2951 }
2952 
2953 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
2954 NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2955 #if defined(NLIB_SSE41)
2956  return _mm_cmplt_epi16(a, b);
2957 #elif defined(NLIB_NEON)
2958  return NLIB_CMP(vcltq, s16, a, b, u16);
2959 #endif
2960 }
2961 
2962 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
2963 NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2964 #if defined(NLIB_SSE41)
2965  return _mm_cmplt_epi32(a, b);
2966 #elif defined(NLIB_NEON)
2967  return NLIB_CMP(vcltq, s32, a, b, u32);
2968 #endif
2969 }
2970 
2971 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
2972 NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2973 #if defined(NLIB_SSE41)
2974  return _mm_cmpgt_epi8(a, b);
2975 #elif defined(NLIB_NEON)
2976  return NLIB_CMP(vcgtq, s8, a, b, u8);
2977 #endif
2978 }
2979 
2980 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
2981 NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2982 #if defined(NLIB_SSE41)
2983  return _mm_cmpgt_epi16(a, b);
2984 #elif defined(NLIB_NEON)
2985  return NLIB_CMP(vcgtq, s16, a, b, u16);
2986 #endif
2987 }
2988 
2989 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
2990 NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2991 #if defined(NLIB_SSE41)
2992  return _mm_cmpgt_epi32(a, b);
2993 #elif defined(NLIB_NEON)
2994  return NLIB_CMP(vcgtq, s32, a, b, u32);
2995 #endif
2996 }
2997 
2998 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
2999 NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3000 #if defined(NLIB_SSE41)
3001  i128 ofs = I128::SetValue(0x80, each_uint8);
3002  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
3003 #elif defined(NLIB_NEON)
3004  return NLIB_CMP(vcltq, u8, a, b, u8);
3005 #endif
3006 }
3007 
3008 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
3009 NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3010 #if defined(NLIB_SSE41)
3011  i128 ofs = I128::SetValue(0x80, each_uint8);
3012  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
3013 #elif defined(NLIB_NEON)
3014  return NLIB_CMP(vcgtq, u8, a, b, u8);
3015 #endif
3016 }
3017 
3018 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
3019 NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3020 #if defined(NLIB_SSE41)
3021  i128 ofs = I128::SetValue(0x8000U, each_uint16);
3022  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
3023 #elif defined(NLIB_NEON)
3024  return NLIB_CMP(vcltq, u16, a, b, u16);
3025 #endif
3026 }
3027 
3028 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
3029 NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3030 #if defined(NLIB_SSE41)
3031  i128 ofs = I128::SetValue(0x8000U, each_uint16);
3032  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
3033 #elif defined(NLIB_NEON)
3034  return NLIB_CMP(vcgtq, u16, a, b, u16);
3035 #endif
3036 }
3037 
3038 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
3039 NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3040 #if defined(NLIB_SSE41)
3041  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
3042  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
3043 #elif defined(NLIB_NEON)
3044  return NLIB_CMP(vcltq, u32, a, b, u32);
3045 #endif
3046 }
3047 
3048 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
3049 NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3050 #if defined(NLIB_SSE41)
3051  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
3052  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
3053 #elif defined(NLIB_NEON)
3054  return NLIB_CMP(vcgtq, u32, a, b, u32);
3055 #endif
3056 }
3057 
3058 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
3059 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3060 #if defined(NLIB_SSE41)
3061  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
3062 #elif defined(NLIB_NEON)
3063  return NLIB_CMP(vcleq, s8, a, b, u8);
3064 #endif
3065 }
3066 
3067 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
3068 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3069 #if defined(NLIB_SSE41)
3070  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
3071 #elif defined(NLIB_NEON)
3072  return NLIB_CMP(vcleq, s16, a, b, u16);
3073 #endif
3074 }
3075 
3076 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
3077 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3078 #if defined(NLIB_SSE41)
3079  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
3080 #elif defined(NLIB_NEON)
3081  return NLIB_CMP(vcleq, s32, a, b, u32);
3082 #endif
3083 }
3084 
3085 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
3086 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3087 #if defined(NLIB_SSE41)
3088  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
3089 #elif defined(NLIB_NEON)
3090  return NLIB_CMP(vcgeq, s8, a, b, u8);
3091 #endif
3092 }
3093 
3094 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
3095 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3096 #if defined(NLIB_SSE41)
3097  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
3098 #elif defined(NLIB_NEON)
3099  return NLIB_CMP(vcgeq, s16, a, b, u16);
3100 #endif
3101 }
3102 
3103 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
3104 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3105 #if defined(NLIB_SSE41)
3106  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
3107 #elif defined(NLIB_NEON)
3108  return NLIB_CMP(vcgeq, s32, a, b, u32);
3109 #endif
3110 }
3111 
3112 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
3113 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3114 #if defined(NLIB_SSE41)
3115  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
3116 #elif defined(NLIB_NEON)
3117  return NLIB_CMP(vcleq, u8, a, b, u8);
3118 #endif
3119 }
3120 
3121 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
3122 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3123 #if defined(NLIB_SSE41)
3124  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
3125 #elif defined(NLIB_NEON)
3126  return NLIB_CMP(vcleq, u16, a, b, u16);
3127 #endif
3128 }
3129 
3130 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
3131 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3132 #if defined(NLIB_SSE41)
3133  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
3134 #elif defined(NLIB_NEON)
3135  return NLIB_CMP(vcleq, u32, a, b, u32);
3136 #endif
3137 }
3138 
3139 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
3140 NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
3141 #if defined(NLIB_SSE41)
3142  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
3143 #elif defined(NLIB_NEON)
3144  return NLIB_CMP(vcgeq, u8, a, b, u8);
3145 #endif
3146 }
3147 
3148 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
3149 NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
3150 #if defined(NLIB_SSE41)
3151  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
3152 #elif defined(NLIB_NEON)
3153  return NLIB_CMP(vcgeq, u16, a, b, u16);
3154 #endif
3155 }
3156 
3157 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
3158 NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3159 #if defined(NLIB_SSE41)
3160  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
3161 #elif defined(NLIB_NEON)
3162  return NLIB_CMP(vcgeq, u32, a, b, u32);
3163 #endif
3164 }
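
The unsigned comparisons above have no direct SSE4.1 intrinsic, so they are synthesized: a <= b is tested as min(a, b) == a, a >= b as max(a, b) == a, and the unsigned greater-than variant near the top of this listing biases both operands by 0x80000000 so that a signed compare yields the unsigned ordering. A per-lane scalar sketch of the min/max identities, shown only for illustration and not part of SimdInt.h:

#include <algorithm>
#include <cstdint>
// Scalar model of the SSE4.1 unsigned-compare idioms used above (illustrative only).
inline bool le_u8(uint8_t a, uint8_t b) { return std::min(a, b) == a; }  // CmpLeUint8
inline bool ge_u8(uint8_t a, uint8_t b) { return std::max(a, b) == a; }  // CmpGeUint8
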
3165 
3166 // r.u8[i] = value.u8[i] << count
3167 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
3168 #if defined(NLIB_SSE41)
3169  __m128i hi = I128::GetHi(value);
3170  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
3171  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
3172  return I128::NarrowFrom16To8(xl, xh);
3173 #elif defined(NLIB_NEON)
3174  return NLIB_SFT(vshlq, u8, value, count, s8);
3175 #endif
3176 }
3177 
3178 // r.u8[i] = value.u8[i] >> count
3179 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
3180 #if defined(NLIB_SSE41)
3181  __m128i hi = I128::GetHi(value);
3182  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
3183  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
3184  return _mm_packus_epi16(xl, xh);
3185 #elif defined(NLIB_NEON)
3186  return NLIB_SFT(vshlq, u8, value, -count, s8);
3187 #endif
3188 }
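
SSE4.1 has no 8-bit shift instruction, so the two functions above zero-extend each byte to 16 bits with _mm_cvtepu8_epi16, shift in 16-bit lanes, and narrow back to bytes. A single-lane scalar sketch of the same idea (illustrative only, not part of the header):

#include <cstdint>
// One lane of ShiftRightLogical8 as emulated above: widen, shift, keep the low byte.
inline uint8_t shift_right_logical8(uint8_t v, int count) {
  uint16_t widened = v;                                // _mm_cvtepu8_epi16
  widened = static_cast<uint16_t>(widened >> count);   // _mm_srli_epi16
  return static_cast<uint8_t>(widened);                // pack the low byte back
}
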
3189 
3190 // r.u16[i] = value.u16[i] << count
3191 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
3192 #if defined(NLIB_SSE41)
3193  return _mm_slli_epi16(value, count);
3194 #elif defined(NLIB_NEON)
3195  return NLIB_SFT(vshlq, u16, value, count, s16);
3196 #endif
3197 }
3198 
3199 // r.u16[i] = value.u16[i] >> count
3200 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
3201 #if defined(NLIB_SSE41)
3202  return _mm_srli_epi16(value, count);
3203 #elif defined(NLIB_NEON)
3204  return NLIB_SFT(vshlq, u16, value, -count, s16);
3205 #endif
3206 }
3207 
3208 // r.s16[i] = value.s16[i] >> count
3209 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
3210 #if defined(NLIB_SSE41)
3211  return _mm_srai_epi16(value, count);
3212 #elif defined(NLIB_NEON)
3213  return NLIB_SFT(vshlq, s16, value, -count, s16);
3214 #endif
3215 }
3216 
3217 // r.u32[i] = value.u32[i] << count
3218 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
3219 #if defined(NLIB_SSE41)
3220  return _mm_slli_epi32(value, count);
3221 #elif defined(NLIB_NEON)
3222  return NLIB_SFT(vshlq, u32, value, count, s32);
3223 #endif
3224 }
3225 
3226 // r.u32[i] = value.u32[i] >> count
3227 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
3228 #if defined(NLIB_SSE41)
3229  return _mm_srli_epi32(value, count);
3230 #elif defined(NLIB_NEON)
3231  return NLIB_SFT(vshlq, u32, value, -count, s32);
3232 #endif
3233 }
3234 
3235 // r.s32[i] = value.s32[i] >> count
3236 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
3237 #if defined(NLIB_SSE41)
3238  return _mm_srai_epi32(value, count);
3239 #elif defined(NLIB_NEON)
3240  return NLIB_SFT(vshlq, s32, value, -count, s32);
3241 #endif
3242 }
3243 
3244 // r.u64[i] = value.u64[i] << count
3245 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
3246 #if defined(NLIB_SSE41)
3247  return _mm_slli_epi64(value, count);
3248 #elif defined(NLIB_NEON)
3249  return NLIB_SFT(vshlq, u64, value, count, s64);
3250 #endif
3251 }
3252 
3253 // r.u64[i] = value.u64[i] >> count
3254 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
3255 #if defined(NLIB_SSE41)
3256  return _mm_srli_epi64(value, count);
3257 #elif defined(NLIB_NEON)
3258  return NLIB_SFT(vshlq, u64, value, -count, s64);
3259 #endif
3260 }
3261 
3262 template <size_t N>
3263 // r.i8[i + N] = value.i8[i], fill zero
3264 NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
3265  NLIB_STATIC_ASSERT(N < 16);
3266 #if defined(NLIB_SSE41)
3267  return _mm_slli_si128(value, N);
3268 #elif defined(NLIB_NEON)
3269  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
3270 #endif
3271 }
3272 
3273 template <size_t N>
3274 // r.i8[i - N] = value.i8[i], fill zero
3275 NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
3276  NLIB_STATIC_ASSERT(N < 16);
3277 #if defined(NLIB_SSE41)
3278  return _mm_srli_si128(value, N);
3279 #elif defined(NLIB_NEON)
3280  return vextq_s8(value, vdupq_n_s8(0), N);
3281 #endif
3282 }
3283 
3284 template <size_t N>
3285 // left <- fedcba9876543210 -> right
3286 NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
3287  NLIB_STATIC_ASSERT(N < 16);
3288 #if defined(NLIB_SSE41)
3289  return _mm_alignr_epi8(value, value, N);
3290 #elif defined(NLIB_NEON)
3291  return vextq_s8(value, value, N);
3292 #endif
3293 }
3294 
3295 template <size_t N>
3296 // _mm_alignr_epi8(a, b, N), which returns ((a << 128) | b) >> (N * 8)
3297 NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
3298  NLIB_STATIC_ASSERT(N < 16);
3299 #if defined(NLIB_SSE41)
3300  return _mm_alignr_epi8(a, b, N);
3301 #elif defined(NLIB_NEON)
3302  return vextq_s8(b, a, N);
3303 #endif
3304 }
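
AlignR<N> is the usual building block for sliding a 16-byte window across a longer buffer: it concatenates b (low half) and a (high half) and extracts the 16 bytes starting at byte N. A hypothetical usage sketch, assuming SimdInt.h is included and the code sits inside the nlib simd namespace; buf is an assumed 32-byte, 16-byte-aligned input:

// Illustrative only: read bytes 4..19 of buf without an unaligned load.
NLIB_ALIGNAS(16) const uint8_t buf[32] = {};     // zero-filled for brevity
i128 prev = I128::LoadA16(&buf[0]);              // bytes 0..15
i128 next = I128::LoadA16(&buf[16]);             // bytes 16..31
i128 window = I128::AlignR<4>(next, prev);       // ((next << 128) | prev) >> 32 = bytes 4..19
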
3305 
3306 // r.u8[i] = lo.u16[i] & 0xFF, r.u8[i + 8] = hi.u16[i] & 0xFF
3307 NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3308 #if defined(NLIB_SSE41)
3309  i128 mask = I128::SetValue(0x00FFU, each_uint16);
3310  __m128i lo_mask = _mm_and_si128(lo, mask);
3311  __m128i hi_mask = _mm_and_si128(hi, mask);
3312  return _mm_packus_epi16(lo_mask, hi_mask);
3313 #elif defined(NLIB_NEON)
3314 # ifdef __aarch64__
3315  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
3316  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
3317 # else
3318  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
3319  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
3320  return NLIB_CMB(u8, l, h);
3321 # endif
3322 #endif
3323 }
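
_mm_packus_epi16 is a saturating pack, so the SSE path above first masks every 16-bit lane down to 0x00FF; with no lane above 255 the pack reduces to a plain truncating narrow, matching the vmovn-based NEON path. One lane of that behavior as a scalar sketch (illustrative only):

#include <cstdint>
// One lane of NarrowFrom16To8: truncation, never saturation.
inline uint8_t narrow_16_to_8(uint16_t v) {
  uint16_t masked = v & 0x00FF;          // _mm_and_si128 with the 0x00FF pattern
  return static_cast<uint8_t>(masked);   // _mm_packus_epi16 cannot saturate a value <= 255
}
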
3324 
3325 // r.u16[i] = lo.u32[i] & 0xFFFF, r.u16[i + 4] = hi.u32[i] & 0xFFFF
3326 NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3327 #if defined(NLIB_SSE41)
3328  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
3329  __m128i lo_mask = _mm_and_si128(lo, mask);
3330  __m128i hi_mask = _mm_and_si128(hi, mask);
3331  return _mm_packus_epi32(lo_mask, hi_mask);
3332 #elif defined(NLIB_NEON)
3333 # ifdef __aarch64__
3334  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
3335  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
3336 # else
3337  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
3338  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
3339  return NLIB_CMB(u16, l, h);
3340 # endif
3341 #endif
3342 }
3343 
3344 // r.u32[i] = lo.u64[i] & 0xFFFFFFFF, r.u32[i + 2] = hi.u64[i] & 0xFFFFFFFF
3345 NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3346 #if defined(NLIB_SSE41)
3347  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
3348  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
3349  return _mm_unpacklo_epi64(lo_, hi_);
3350 #elif defined(NLIB_NEON)
3351 # ifdef __aarch64__
3352  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
3353  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
3354 # else
3355  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
3356  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
3357  return NLIB_CMB(u32, l, h);
3358 # endif
3359 #endif
3360 }
3361 
3362 // saturate to uint8: r.u8[i] = min(lo.u16[i], 255), r.u8[i + 8] = min(hi.u16[i], 255)
3363 NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3364 #if defined(NLIB_SSE41)
3365  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
3366  __m128i lotmp = _mm_and_si128(lo, b7FFF);
3367  __m128i hitmp = _mm_and_si128(hi, b7FFF);
3368  return _mm_packus_epi16(lotmp, hitmp);
3369 #elif defined(NLIB_NEON)
3370 # ifdef __aarch64__
3371  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
3372  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
3373 # else
3374  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
3375  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
3376  return NLIB_CMB(u8, l, h);
3377 # endif
3378 #endif
3379 }
3380 
3381 // saturate to int8: r.s8[i] = clamp(lo.s16[i], -128, 127), r.s8[i + 8] = clamp(hi.s16[i], -128, 127)
3382 NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3383 #if defined(NLIB_SSE41)
3384  return _mm_packs_epi16(lo, hi);
3385 #elif defined(NLIB_NEON)
3386 # ifdef __aarch64__
3387  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
3388  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
3389 # else
3390  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
3391  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
3392  return NLIB_CMB(s8, l, h);
3393 # endif
3394 #endif
3395 }
3396 
3397 // saturate to uint16: r.u16[i] = min(lo.u32[i], 65535), r.u16[i + 4] = min(hi.u32[i], 65535)
3398 NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3399 #if defined(NLIB_SSE41)
3400  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
3401  __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
3402  __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
3403  return _mm_packus_epi32(lotmp, hitmp);
3404 #elif defined(NLIB_NEON)
3405 # ifdef __aarch64__
3406  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
3407  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
3408 # else
3409  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
3410  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
3411  return NLIB_CMB(u16, l, h);
3412 # endif
3413 #endif
3414 }
3415 
3416 // saturate to int16: r.s16[i] = clamp(lo.s32[i], -32768, 32767), r.s16[i + 4] = clamp(hi.s32[i], -32768, 32767)
3417 NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
3418 #if defined(NLIB_SSE41)
3419  return _mm_packs_epi32(lo, hi);
3420 #elif defined(NLIB_NEON)
3421 # ifdef __aarch64__
3422  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
3423  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
3424 # else
3425  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
3426  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
3427  return NLIB_CMB(s16, l, h);
3428 # endif
3429 #endif
3430 }
3431 
3432 // r.s8[2 * i] = value.s8[i], r.u8[2 * i + 1] = value.s8[i] < 0 ? 0xFF : 0
3433 NLIB_M(i128) I128::ConvertFromInt8ToInt16(i64arg value) NLIB_NOEXCEPT {
3434 #if defined(NLIB_SSE41)
3435  return _mm_cvtepi8_epi16(value);
3436 #elif defined(NLIB_NEON)
3437  return vreinterpretq_s8_s16(vmovl_s8(value));
3438 #endif
3439 }
3440 
3441 // r.s16[2 * i] = value.s16[i], r.u16[2 * i + 1] = value.s16[i] < 0 ? 0xFFFF : 0
3442 NLIB_M(i128) I128::ConvertFromInt16ToInt32(i64arg value) NLIB_NOEXCEPT {
3443 #if defined(NLIB_SSE41)
3444  return _mm_cvtepi16_epi32(value);
3445 #elif defined(NLIB_NEON)
3446  return vreinterpretq_s8_s32(vmovl_s16(vreinterpret_s16_s8(value)));
3447 #endif
3448 }
3449 
3450 // r.s32[2 * i] = value.s32[i], r.u32[2 * i + 1] = value.s32[i] < 0 ? 0xFFFFFFFF : 0
3451 NLIB_M(i128) I128::ConvertFromInt32ToInt64(i64arg value) NLIB_NOEXCEPT {
3452 #if defined(NLIB_SSE41)
3453  return _mm_cvtepi32_epi64(value);
3454 #elif defined(NLIB_NEON)
3455  return vreinterpretq_s8_s64(vmovl_s32(vreinterpret_s32_s8(value)));
3456 #endif
3457 }
3458 
3459 // r.u16[i] = value.u8[i]
3460 NLIB_M(i128) I128::ConvertFromUint8ToUint16(i64arg value) NLIB_NOEXCEPT {
3461 #if defined(NLIB_SSE41)
3462  return _mm_cvtepu8_epi16(value);
3463 #elif defined(NLIB_NEON)
3464  return vreinterpretq_s8_u16(vmovl_u8(vreinterpret_u8_s8(value)));
3465 #endif
3466 }
3467 
3468 // r.u32[i] = value.u16[i]
3469 NLIB_M(i128) I128::ConvertFromUint16ToUint32(i64arg value) NLIB_NOEXCEPT {
3470 #if defined(NLIB_SSE41)
3471  return _mm_cvtepu16_epi32(value);
3472 #elif defined(NLIB_NEON)
3473  return vreinterpretq_s8_u32(vmovl_u16(vreinterpret_u16_s8(value)));
3474 #endif
3475 }
3476 
3477 // r.u64[i] = value.u32[i]
3478 NLIB_M(i128) I128::ConvertFromUint32ToUint64(i64arg value) NLIB_NOEXCEPT {
3479 #if defined(NLIB_SSE41)
3480  return _mm_cvtepu32_epi64(value);
3481 #elif defined(NLIB_NEON)
3482  return vreinterpretq_s8_u64(vmovl_u32(vreinterpret_u32_s8(value)));
3483 #endif
3484 }
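
The six ConvertFrom*To* overloads above widen a 64-bit register (eight, four, or two narrow lanes) into a full i128 by sign or zero extension. A hypothetical usage sketch, assuming SimdInt.h is included and the code sits inside the nlib simd namespace:

// Illustrative only: widen eight uint8 pixels to uint16 so later sums cannot overflow.
NLIB_ALIGNAS(8) const uint8_t src[8] = {0, 1, 2, 3, 250, 251, 252, 253};
i64 pixels = I64::LoadA8(src);                        // eight uint8 lanes
i128 wide = I128::ConvertFromUint8ToUint16(pixels);   // r.u16[i] = src[i]
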
3485 
3486 // 01234567 abcdefgh -> 0a1b2c3d4e5f6g7h
3487 NLIB_M(i128) I128::Zip8(i64arg a, i64arg b) NLIB_NOEXCEPT {
3488 #if defined(NLIB_SSE41)
3489  return _mm_unpacklo_epi8(a, b);
3490 #elif defined(NLIB_NEON)
3491  int8x8x2_t tmp = vzip_s8(a, b);
3492  return vcombine_s8(tmp.val[0], tmp.val[1]);
3493 #endif
3494 }
3495 
3496 // 01234567 abcdefgh -> 0246aceg1357bdfh
3497 NLIB_M(i128) I128::Unzip8(i64arg a, i64arg b) NLIB_NOEXCEPT {
3498 #if defined(NLIB_SSE41)
3499 // 16-bit interleave -> a0,a1,b0,b1, a2,a3,b2,b3, ..., a6,a7,b6,b7
3500  __m128i tmp = _mm_unpacklo_epi16(a, b);
3501  // a0,a2,a4,a6,b0,b2,b4,b6,a1,a3,a5,a7,b1,b3,b5,b7
3502  NLIB_ALIGNAS(16) static const int8_t mask[16] = {0, 4, 8, 12, 2, 6, 10, 14,
3503  1, 5, 9, 13, 3, 7, 11, 15};
3504  return _mm_shuffle_epi8(tmp, *reinterpret_cast<const __m128i*>(&mask[0]));
3505 #elif defined(NLIB_NEON)
3506  int8x8x2_t tmp = vuzp_s8(a, b);
3507  return vcombine_s8(tmp.val[0], tmp.val[1]);
3508 #endif
3509 }
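
Zip8 interleaves two 8-byte registers and Unzip8 reverses it, which is the standard way to merge or split interleaved streams such as stereo sample bytes. A hypothetical sketch under the same assumptions as the earlier ones (SimdInt.h included, nlib simd namespace):

// Illustrative only: interleave eight left/right sample bytes into L0 R0 L1 R1 ... L7 R7.
NLIB_ALIGNAS(8) const uint8_t left_samples[8]  = {10, 11, 12, 13, 14, 15, 16, 17};
NLIB_ALIGNAS(8) const uint8_t right_samples[8] = {20, 21, 22, 23, 24, 25, 26, 27};
i64 left  = I64::LoadA8(left_samples);
i64 right = I64::LoadA8(right_samples);
i128 interleaved = I128::Zip8(left, right);     // 10, 20, 11, 21, ..., 17, 27
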
3510 
3511 // 0123 abcd -> 0a1b2c3d
3512 NLIB_M(i128) I128::Zip16(i64arg a, i64arg b) NLIB_NOEXCEPT {
3513 #if defined(NLIB_SSE41)
3514  return _mm_unpacklo_epi16(a, b);
3515 #elif defined(NLIB_NEON)
3516  uint16x4x2_t tmp = vzip_u16(vreinterpret_u16_s8(a), vreinterpret_u16_s8(b));
3517  return NLIB_CMB(u16, tmp.val[0], tmp.val[1]);
3518 #endif
3519 }
3520 
3521 // 0123 abcd -> 02ac13bd
3522 NLIB_M(i128) I128::Unzip16(i64arg a, i64arg b) NLIB_NOEXCEPT {
3523 #if defined(NLIB_SSE41)
3524  // a0,a1,a2,a3,b0,b1,b2,b3 -> a0,b0,a1,b1,a2,b2,a3,b3
3525  __m128i tmp = _mm_unpacklo_epi16(a, b);
3526 // a0,a2,b0,b2,a1,a3,b1,b3
3527  NLIB_ALIGNAS(16) static const int8_t mask[16] = {0, 1, 8, 9, 2, 3, 10, 11,
3528  4, 5, 12, 13, 6, 7, 14, 15};
3529  return _mm_shuffle_epi8(tmp, *reinterpret_cast<const __m128i*>(&mask[0]));
3530 #elif defined(NLIB_NEON)
3531  uint16x4x2_t tmp = vuzp_u16(vreinterpret_u16_s8(a), vreinterpret_u16_s8(b));
3532  return NLIB_CMB(u16, tmp.val[0], tmp.val[1]);
3533 #endif
3534 }
3535 
3536 // 01 ab -> 0a1b
3537 NLIB_M(i128) I128::Zip32(i64arg a, i64arg b) NLIB_NOEXCEPT {
3538 #if defined(NLIB_SSE41)
3539  return _mm_unpacklo_epi32(a, b);
3540 #elif defined(NLIB_NEON)
3541  uint32x2x2_t tmp = vuzp_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
3542  return NLIB_CMB(u32, tmp.val[0], tmp.val[1]);
3543 #endif
3544 }
3545 
3546 // 01 ab -> 0a1b
3547 NLIB_M(i128) I128::Unzip32(i64arg a, i64arg b) NLIB_NOEXCEPT {
3548 #if defined(NLIB_SSE41)
3549  return _mm_unpacklo_epi32(a, b);
3550 #elif defined(NLIB_NEON)
3551  uint32x2x2_t tmp = vuzp_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
3552  return NLIB_CMB(u32, tmp.val[0], tmp.val[1]);
3553 #endif
3554 }
3555 
3556 // 0123456789abcdef -> 1032547698badcfe
3557 NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
3558 #if defined(NLIB_SSE41)
3559  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3560  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
3561  };
3562  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3563 #elif defined(NLIB_NEON)
3564  return NLIB_OP1(vrev16q, u8, value);
3565 #endif
3566 }
3567 
3568 // 0123456789abcdef -> 32107654ba98fedc
3569 NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
3570 #if defined(NLIB_SSE41)
3571  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3572  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
3573  };
3574  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3575 #elif defined(NLIB_NEON)
3576  return NLIB_OP1(vrev32q, u8, value);
3577 #endif
3578 }
3579 
3580 // 0123456789abcdef -> 76543210fedcba98
3581 NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
3582 #if defined(NLIB_SSE41)
3583  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3584  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
3585  };
3586  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3587 #elif defined(NLIB_NEON)
3588  return NLIB_OP1(vrev64q, u8, value);
3589 #endif
3590 }
3591 
3592 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
3593 NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT { // NOLINT
3594 #if defined(NLIB_SSE41)
3595  return _mm_movemask_epi8(value);
3596 #elif defined(NLIB_NEON)
3597 # ifdef __aarch64__
3598  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x8040201008040201ULL,
3599  0x8040201008040201ULL};
3600  uint64x2_t pwrtmp = vld1q_u64(powers_);
3601  uint8x16_t powers = vreinterpretq_u8_u64(pwrtmp);
3602  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
3603  uint8x16_t tmp = vpaddq_u8(a, a);
3604  tmp = vpaddq_u8(tmp, tmp);
3605  tmp = vpaddq_u8(tmp, tmp);
3606  return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
3607 # else
3608  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x8040201008040201ULL,
3609  0x8040201008040201ULL};
3610  uint64x2_t pwrtmp = vld1q_u64(powers_); // for NEONvsSSE
3611  uint8x16_t powers = vreinterpretq_u8_u64(pwrtmp);
3612  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
3613  uint8x8_t al = vget_low_u8(a);
3614  uint8x8_t ah = vget_high_u8(a);
3615  uint8x8_t tmp = vpadd_u8(al, ah);
3616  tmp = vpadd_u8(tmp, tmp);
3617  tmp = vpadd_u8(tmp, tmp);
3618  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
3619 # endif
3620 #endif
3621 }
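
NEON has no counterpart of _mm_movemask_epi8, so the fallback above ANDs each byte of the mask with a per-lane power of two (0x01 through 0x80, repeated for the upper half) and folds the result with pairwise adds until the low-half and high-half masks sit in adjacent bytes, read out as one 16-bit lane. A scalar model of that reduction for all-ones / all-zero byte masks, as produced by the comparison functions above (illustrative only):

#include <cstdint>
// Scalar model of the NEON MoveMask8 fallback (assumes each lane is 0xFF or 0x00).
inline int move_mask8(const uint8_t lanes[16]) {
  int lo = 0, hi = 0;
  for (int i = 0; i < 8; ++i) {
    if (lanes[i])     lo |= 1 << i;   // AND with powers_, then pairwise-summed per half
    if (lanes[i + 8]) hi |= 1 << i;
  }
  return lo | (hi << 8);              // the final 16-bit lane read combines both halves
}
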
3622 
3623 // r = Or(ismasked(value.u16[i]) ? (1 << i) : 0)
3624 NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT { // NOLINT
3625 #if defined(NLIB_SSE41)
3626  __m128i tmp = _mm_packs_epi16(value, value);
3627  return _mm_movemask_epi8(tmp) & 255;
3628 #elif defined(NLIB_NEON)
3629  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x0008000400020001ULL,
3630  0x0080004000200010ULL};
3631  uint64x2_t pwrtmp = vld1q_u64(powers_); // for NEONvsSSE
3632  uint16x8_t powers = vreinterpretq_u16_u64(pwrtmp);
3633  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
3634  uint8x8_t tmp = vmovn_u16(a);
3635  tmp = vpadd_u8(tmp, tmp);
3636  tmp = vpadd_u8(tmp, tmp);
3637  tmp = vpadd_u8(tmp, tmp);
3638  return vget_lane_u8(tmp, 0);
3639 #endif
3640 }
3641 
3642 // r = Or(ismasked(value.u32[i]) ? (1 << i) : 0)
3643 NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT { // NOLINT
3644 #if defined(NLIB_SSE41)
3645  __m128i tmp = _mm_packs_epi16(value, value);
3646  tmp = _mm_packs_epi16(tmp, tmp);
3647  return _mm_movemask_epi8(tmp) & 15;
3648 #elif defined(NLIB_NEON)
3649  NLIB_ALIGNAS(16) static const uint64_t powers_[2] = {0x0000000200000001ULL,
3650  0x0000000800000004ULL};
3651  uint64x2_t pwrtmp = vld1q_u64(powers_); // for NEONvsSSE
3652  uint32x4_t powers = vreinterpretq_u32_u64(pwrtmp);
3653  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
3654  uint16x4_t tmp = vmovn_u32(a);
3655  tmp = vpadd_u16(tmp, tmp);
3656  tmp = vpadd_u16(tmp, tmp);
3657  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
3658 #endif
3659 }
3660 
3661 // true if value == 0
3662 NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT { // NOLINT
3663 #if defined(NLIB_SSE41)
3664  return _mm_testz_si128(value, value) != 0;
3665 #elif defined(NLIB_NEON)
3666  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
3667  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
3668 #endif
3669 }
3670 
3671 // true if all bits on
3672 NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT { // NOLINT
3673 #if defined(NLIB_SSE41)
3674  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
3675 #elif defined(NLIB_NEON)
3676  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
3677  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
3678 #endif
3679 }
3680 
3681 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
3682 NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
3683 #if defined(NLIB_SSE41)
3684  return _mm_blendv_epi8(b, a, mask);
3685 #elif defined(NLIB_NEON)
3686  return NLIB_OP3(vbslq, u32, mask, a, b);
3687 #endif
3688 }
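
Select is the branchless form of mask ? a : b; combined with one of the comparison functions above it yields per-lane selection such as an unsigned byte maximum. A hypothetical sketch under the same assumptions as the earlier ones:

// Illustrative only: per-lane unsigned byte maximum from CmpGeUint8 + Select.
i128 a = I128::SetValue(0xC8C8C8C8U, each_uint32);   // every byte 200
i128 b = I128::SetValue(0x37373737U, each_uint32);   // every byte 55
i128 mask  = I128::CmpGeUint8(a, b);                 // 0xFF where a.u8[i] >= b.u8[i]
i128 maxab = I128::Select(mask, a, b);               // picks a in the masked lanes
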
3689 
3690 // r[i] = 0 if shuffle.u8[i] == 0x80, r[i] = value[shuffle.u8[i]] if 0 <= shuffle.u8[i] < 16
3691 NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
3692 #if defined(NLIB_SSE41)
3693  return _mm_shuffle_epi8(value, shuffle);
3694 #elif defined(NLIB_NEON)
3695 # ifdef __aarch64__
3696  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
3697 # else
3698  int8x8x2_t x;
3699  x.val[0] = vget_low_s8(value);
3700  x.val[1] = vget_high_s8(value);
3701  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
3702  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
3703  return vcombine_s8(lo, hi);
3704 # endif
3705 #endif
3706 }
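
Shuffle8 is a general byte permutation; the Reverse16/32/64 functions above are just fixed shuffles of this kind. A hypothetical sketch that reverses all 16 bytes, under the same assumptions as the earlier ones:

// Illustrative only: full byte reversal, r.u8[i] = value.u8[15 - i].
NLIB_ALIGNAS(16) static const int8_t rev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                7, 6, 5, 4, 3, 2, 1, 0};
i128 value = I128::SetValue(0x03020100U, each_uint32);
i128 ctrl = I128::LoadA16(rev);
i128 reversed = I128::Shuffle8(value, ctrl);
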
3707 
3708 #ifdef NLIB_NEON
3709 # undef vreinterpretq_s8_s8
3710 # undef NLIB_OP1
3711 # undef NLIB_OP2
3712 # undef NLIB_OP3
3713 # undef NLIB_CMP
3714 # undef NLIB_SFT
3715 # undef NLIB_CMB
3716 #endif
3717 
3718 #endif // NLIB_DOXYGEN
3719 
3720 #undef NLIB_M
3721 #undef NLIB_M2
3722 
3723 #if defined(NLIB_SSE41)
3724 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3725  { \
3726  row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
3727  row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
3728  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
3729  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
3730  __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
3731  __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
3732  __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
3733  __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
3734  row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
3735  row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
3736  row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
3737  row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
3738 }
3739 #elif defined(NLIB_NEON)
3740 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3741  { \
3742  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3743  vreinterpretq_u32_s8(row1)); \
3744  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3745  vreinterpretq_u32_s8(row3)); \
3746  uint32x4_t row0_, row1_, row2_, row3_; \
3747  uint32x2_t lo, hi; \
3748  lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
3749  row0_ = vcombine_u32(lo, hi); \
3750  row0 = vreinterpretq_s8_u32(row0_); \
3751  lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
3752  row1_ = vcombine_u32(lo, hi); \
3753  row1 = vreinterpretq_s8_u32(row1_); \
3754  lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
3755  row2_ = vcombine_u32(lo, hi); \
3756  row2 = vreinterpretq_s8_u32(row2_); \
3757  lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
3758  row3_ = vcombine_u32(lo, hi); \
3759  row3 = vreinterpretq_s8_u32(row3_); \
3760  }
3761 #endif
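
NLIB_I128_TRANSPOSE32 treats four i128 registers as a 4x4 matrix of 32-bit lanes and transposes them in place, so row0 ends up holding the former first column. A hypothetical usage sketch, assuming SimdInt.h is included and the code sits inside the nlib simd namespace:

// Illustrative only: 4x4 transpose of 32-bit values.
NLIB_ALIGNAS(16) uint32_t m[4][4] = {{0, 1, 2, 3},
                                     {4, 5, 6, 7},
                                     {8, 9, 10, 11},
                                     {12, 13, 14, 15}};
i128 r0 = I128::LoadA16(&m[0][0]);
i128 r1 = I128::LoadA16(&m[1][0]);
i128 r2 = I128::LoadA16(&m[2][0]);
i128 r3 = I128::LoadA16(&m[3][0]);
NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);   // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13}, ...
I128::StoreA16(&m[0][0], r0);
I128::StoreA16(&m[1][0], r1);
I128::StoreA16(&m[2][0], r2);
I128::StoreA16(&m[3][0], r3);
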
3762 
3763 #endif // NLIB_SIMD
3764 
3765 } // namespace simd
3766 NLIB_NAMESPACE_END
3767 
3768 #endif // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
The tag for representing a signed 32-bit integer with an empty structure.
Definition: SimdInt.h:28
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Platform.h:2151
static i128 LoadA1(intptr_t p) noexcept
Wraps LoadA1(const void* p).
Definition: SimdInt.h:1619
The tag for representing a signed 64-bit integer with an empty structure.
Definition: SimdInt.h:30
__m128i nlib_i64_t
The type for a SIMD register for 64-bit integers.
Definition: SimdInt.h:9
static i128 LoadA4(intptr_t p) noexcept
Wraps LoadA4(const void* p).
Definition: SimdInt.h:1613
constexpr const each_uint8_tag each_uint8
The tag for representing an unsigned 8-bit integer with an each_uint8_tag-type constant object...
Definition: SimdInt.h:44
int nlib_sse42_supported
Set to a nonzero value if the x86-series architecture supports SSE 4.2.
static void StoreA16(uintptr_t p, i128arg value) noexcept
Wraps StoreA16(void* p, i128arg value).
Definition: SimdInt.h:1628
The tag for representing the selection of a lane divided into 8-bit units with an empty structure...
Definition: SimdInt.h:54
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:50
The tag for representing the selection of a lane divided into 32-bit units with an empty structure...
Definition: SimdInt.h:50
static i128 LoadA1(uintptr_t p) noexcept
Wraps LoadA1(const void* p).
Definition: SimdInt.h:1604
The tag for representing a signed 8-bit integer with an empty structure.
Definition: SimdInt.h:24
static i128 LoadA2(intptr_t p) noexcept
Wraps LoadA2(const void* p).
Definition: SimdInt.h:1616
static void StoreA2(intptr_t p, i128arg value) noexcept
Wraps StoreA2(void* p, i128arg value).
Definition: SimdInt.h:1652
constexpr const each_uint16_tag each_uint16
The tag for representing an unsigned 16-bit integer with an each_uint16_tag-type constant object...
Definition: SimdInt.h:45
static void StoreA16(intptr_t p, i128arg value) noexcept
Wraps StoreA16(void* p, i128arg value).
Definition: SimdInt.h:1643
The tag for representing an unsigned 16-bit integer with an empty structure.
Definition: SimdInt.h:34
The tag for representing an unsigned 64-bit integer with an empty structure.
Definition: SimdInt.h:38
constexpr const each_int64_tag each_int64
The tag for representing a signed 64-bit integer with an each_int64_tag-type constant object...
Definition: SimdInt.h:43
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:71
static void StoreA1(uintptr_t p, i128arg value) noexcept
Wraps StoreA1(void* p, i128arg value).
Definition: SimdInt.h:1640
static i128 LoadA16(uintptr_t p) noexcept
Wraps LoadA16(const void* p).
Definition: SimdInt.h:1592
constexpr const each_uint64_tag each_uint64
The tag for representing an unsigned 64-bit integer with an each_uint64_tag-type constant object...
Definition: SimdInt.h:47
#define NLIB_CEXPR
Defines constexpr if it is available for use. If not, holds an empty string.
static i128 LoadA16(intptr_t p) noexcept
Wraps LoadA16(const void* p).
Definition: SimdInt.h:1607
The class for integer SIMD computations using 128-bit registers (XMM0-XMM15 for SSE, and Q0-Q15 for NEON).
Definition: SimdInt.h:1564
The tag for representing an unsigned 8-bit integer with an empty structure.
Definition: SimdInt.h:32
The tag for representing a signed 16-bit integer with an empty structure.
Definition: SimdInt.h:26
constexpr const each_int16_tag each_int16
The tag for representing a signed 16-bit integer with an each_int16_tag-type constant object...
Definition: SimdInt.h:41
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
Definition: SimdInt.h:46
The tag for representing the selection of a lane divided into 16-bit units with an empty structure...
Definition: SimdInt.h:52
static i128 LoadA8(intptr_t p) noexcept
Wraps LoadA8(const void* p).
Definition: SimdInt.h:1610
constexpr const each_select16_tag each_select16
The tag for representing the selection of a 16-bit lane with an each_select16_tag-type constant objec...
Definition: SimdInt.h:57
static i128 LoadA2(uintptr_t p) noexcept
Wraps LoadA2(const void* p).
Definition: SimdInt.h:1601
A file that contains the configuration information for each development environment.
static void StoreA8(uintptr_t p, i128arg value) noexcept
Wraps StoreA8(void* p, i128arg value).
Definition: SimdInt.h:1631
The class for 64-bit wide integer SIMD computations that is otherwise the same as the I128 class...
Definition: SimdInt.h:81
__m128i nlib_i128_t
The type for a SIMD register for 128-bit integers.
Definition: SimdInt.h:10
constexpr const each_select8_tag each_select8
The tag for representing the selection of an 8-bit lane with an each_select8_tag-type constant object...
Definition: SimdInt.h:58
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:209
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
Definition: SimdInt.h:40
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
Definition: SimdInt.h:56
static void StoreA8(intptr_t p, i128arg value) noexcept
Wraps StoreA8(void* p, i128arg value).
Definition: SimdInt.h:1646
The tag for representing an unsigned 32-bit integer with an empty structure.
Definition: SimdInt.h:36
nlib_i64_t i64
nlib_i64_t is defined using typedef.
Definition: SimdInt.h:69
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:59
static void StoreA4(intptr_t p, i128arg value) noexcept
Wraps StoreA4(void* p, i128arg value).
Definition: SimdInt.h:1649
constexpr const each_int32_tag each_int32
The tag for representing a signed 32-bit integer with an each_int32_tag-type constant object...
Definition: SimdInt.h:42
static void StoreA4(uintptr_t p, i128arg value) noexcept
Wraps StoreA4(void* p, i128arg value).
Definition: SimdInt.h:1634
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:117
static void StoreA1(intptr_t p, i128arg value) noexcept
Wraps StoreA1(void* p, i128arg value).
Definition: SimdInt.h:1655
static void StoreA2(uintptr_t p, i128arg value) noexcept
Wraps StoreA2(void* p, i128arg value).
Definition: SimdInt.h:1637
static i128 LoadA8(uintptr_t p) noexcept
Wraps LoadA8(const void* p).
Definition: SimdInt.h:1595
static i128 LoadA4(uintptr_t p) noexcept
Wraps LoadA4(const void* p).
Definition: SimdInt.h:1598