SimdInt.h
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
5 
6 #include "nn/nlib/Config.h"
7 
8 #if defined(NLIB_SSE41)
9 typedef __m128i nlib_i128_t;
10 #elif defined(NLIB_NEON)
11 typedef int8x16_t nlib_i128_t;
12 #endif
13 
14 NLIB_NAMESPACE_BEGIN
15 namespace simd {
16 
17 // use each_int8 for the argument value
18 struct each_int8_tag {};
19 // use each_int16 for the argument value
20 struct each_int16_tag {};
21 // use each_int32 for the argument value
22 struct each_int32_tag {};
23 // use each_int64 for the argument value
24 struct each_int64_tag {};
25 // use each_uint8 for the argument value
26 struct each_uint8_tag {};
27 // use each_uint16 for the argument value
28 struct each_uint16_tag {};
29 // use each_uint32 for the argument value
30 struct each_uint32_tag {};
31 // use each_uint64 for the argument value
32 struct each_uint64_tag {};
33 
42 
43 // use each_select32 for the argument value
44 struct each_select32_tag {};
45 // use each_select16 for the argument value
46 struct each_select16_tag {};
47 // use each_select8 for the argument value
48 struct each_select8_tag {};
49 
53 
54 #if !defined(_MSC_VER) || _MSC_VER < 1800
55 #ifndef __vectorcall
56 #define __vectorcall
57 #endif
58 #endif
59 
60 #if defined(NLIB_SIMD)
61 
62 // __m128i(SSE), int8x16_t(NEON)
63 typedef nlib_i128_t i128;
64 
65 #if _MSC_VER < 1800
66 typedef const i128& i128arg;
67 #else
68 typedef const i128 i128arg;
69 #endif
70 
71 class I128 {
 public:
72  static i128 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
73  static i128 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
74  static i128 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
75  static i128 __vectorcall SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT;
76  static i128 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
77  static i128 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
78  static i128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
79  static i128 __vectorcall SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT;
80 
81  template <size_t N>
82  static i128 __vectorcall SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT;
83  template <size_t N>
84  static i128 __vectorcall SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT;
85  template <size_t N>
86  static i128 __vectorcall SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT;
87 
88  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
89  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
90 
91  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
92  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
93  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
94  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
95  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
96  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
97  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
98  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
99  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
100  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
101  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
102  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
103  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
104 
105 #define NLIB_LOAD_REDIRECT(func) \
106  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
107  return func(reinterpret_cast<void*>(p)); \
108  } \
109  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
110  return func(reinterpret_cast<void*>(p)); \
111  }
112  NLIB_LOAD_REDIRECT(LoadA16)
113  NLIB_LOAD_REDIRECT(LoadA8)
114  NLIB_LOAD_REDIRECT(LoadA4)
115  NLIB_LOAD_REDIRECT(LoadA2)
116  NLIB_LOAD_REDIRECT(LoadA1)
117  NLIB_LOAD_REDIRECT(LoadLoA8)
118  NLIB_LOAD_REDIRECT(LoadLoA4)
119  NLIB_LOAD_REDIRECT(LoadLoA2)
120  NLIB_LOAD_REDIRECT(LoadLoA1)
121  NLIB_LOAD_REDIRECT(LoadHiA8)
122  NLIB_LOAD_REDIRECT(LoadHiA4)
123  NLIB_LOAD_REDIRECT(LoadHiA2)
124  NLIB_LOAD_REDIRECT(LoadHiA1)
125 #undef NLIB_LOAD_REDIRECT
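 // A minimal usage sketch of what the redirect overloads above add: the same loads can
 // be called with an integer address instead of a pointer. buf and addr are hypothetical.
 //
 //   NLIB_ALIGNAS(16) uint8_t buf[16];
 //   uintptr_t addr = reinterpret_cast<uintptr_t>(&buf[0]);
 //   i128 v = I128::LoadA16(addr);   // equivalent to I128::LoadA16(&buf[0])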
126 
127  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
128  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
129  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
130  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
131  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
132  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
133  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
134  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
135  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
136  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
137  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
138  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
140 
141 #define NLIB_STORE_REDIRECT(func) \
142  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
143  func(reinterpret_cast<void*>(p), value); \
144  } \
145  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
146  func(reinterpret_cast<void*>(p), value); \
147  }
148  NLIB_STORE_REDIRECT(StoreA16)
149  NLIB_STORE_REDIRECT(StoreA8)
150  NLIB_STORE_REDIRECT(StoreA4)
151  NLIB_STORE_REDIRECT(StoreA2)
152  NLIB_STORE_REDIRECT(StoreA1)
153  NLIB_STORE_REDIRECT(StoreLoA8)
154  NLIB_STORE_REDIRECT(StoreLoA4)
155  NLIB_STORE_REDIRECT(StoreLoA2)
156  NLIB_STORE_REDIRECT(StoreLoA1)
157  NLIB_STORE_REDIRECT(StoreHiA8)
158  NLIB_STORE_REDIRECT(StoreHiA4)
159  NLIB_STORE_REDIRECT(StoreHiA2)
160  NLIB_STORE_REDIRECT(StoreHiA1)
161 #undef NLIB_STORE_REDIRECT
162 
163  //
164  // Get/Set
165  //
166  template <size_t N>
167  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
168  template <size_t N>
169  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
170  template <size_t N>
171  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
172  template <size_t N>
173  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
174  template <size_t N>
175  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
176  template <size_t N>
177  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
178  template <size_t N>
179  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
180  template <size_t N>
181  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
182 
183  //
184  // Arithmetic Operations
185  //
186  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
187  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
188  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
189  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
190 
191  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
192  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
193 
194  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
195  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
196 
197  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
198  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
199  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
200  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
201 
202  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
203  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
204 
205  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
206  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
207 
208  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
209  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
210  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
211  // PairwiseMaxInt8, PairwiseMaxInt16, PairwiseMaxInt32
212  // PairwiseMaxUint8, PairwiseMaxUint16, PairwiseMaxUint32
213  // PairwiseMinInt8, PairwiseMinInt16, PairwiseMinInt32
214  // PairwiseMinUint8, PairwiseMinUint16, PairwiseMinUint32
215 
216  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
217  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
218  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
219  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
220  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
221  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
222 
223  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
224  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
225  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
226 
227  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
228  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
229  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
230  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
231  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
232  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
233  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
234  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
235  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
236  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
237  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
238  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
239 
240  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
241  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
242  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
243  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
244  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
245  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
246 
247  //
248  // Logical Operations
249  //
250  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
251  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
252  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
253  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
254  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
255  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
256  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
257  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
258  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
259 
260  //
261  // Comparison Operations
262  //
263  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
264  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
265  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
266  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
267 
268  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
269  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
270  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
271  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
272 
273  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
274  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
275  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
276  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
277 
278  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
279  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
280  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
281  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
282 
283  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
284  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
285  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
286  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
287 
288  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
289  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
290  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
291  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
292 
293  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
294  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
295  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
296  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
297 
298  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
299  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
300  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
301  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
302 
303  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
304  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
305  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
306  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
307 
308  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
309  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
310  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
311  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
312 
313  //
314  // Bit Shift
315  //
316  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
317  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
318  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
319 
320  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
321  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
322  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
323 
324  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
325  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
326  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
327 
328  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
329  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
330 
331  //
332  // Bit Shift(Constant)
333  //
334  template <size_t N>
335  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
336  template <size_t N>
337  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
338  template <size_t N>
339  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
340 
341  template <size_t N>
342  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
343  template <size_t N>
344  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
345  template <size_t N>
346  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
347 
348  template <size_t N>
349  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
350  template <size_t N>
351  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
352  template <size_t N>
353  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
354 
355  template <size_t N>
356  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
357  template <size_t N>
358  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;
359 
360  //
361  // 128bit wide Byte Shift/Rotate
362  //
363  template <size_t N>
364  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
365  template <size_t N>
366  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
367  template <size_t N>
368  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
369  template <size_t N>
370  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
371 
372  //
373  // Conversion
374  //
375  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
376  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
377  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
378 
379  static i128 __vectorcall
380  ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
381  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
382  static i128 __vectorcall
383  ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
384  static i128 __vectorcall
385  ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
386 
387  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
388  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
389  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
390  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
391  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
392  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
393  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
394  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
395  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
396  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
397  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
398  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
399 
400  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
401  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
402  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
403  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
404  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
405  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
406  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
407  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
408  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
409  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
410  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
411  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
412 
413  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
414  int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
415  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
416  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
417  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
418  template<int V0, int V1, int V2, int V3>
419  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
420 
421  //
422  // Swap endian
423  //
424  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
425  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
426  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
427 
428  //
429  // Misc
430  //
431  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
432  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
433  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
434  static i128 __vectorcall SetMask8(int mask) NLIB_NOEXCEPT;
435  static i128 __vectorcall SetMask16(int mask) NLIB_NOEXCEPT;
436  static i128 __vectorcall SetMask32(int mask) NLIB_NOEXCEPT;
437  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
438  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
439  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
440  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
441  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
442  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
443  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
444 
445  private:
446  I128(); // forbidden
447 };
448 
449 #ifndef NLIB_DOXYGEN
450 
451 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
452 #define NLIB_M2(tp) inline tp __vectorcall
453 
454 #ifdef NLIB_NEON
455 # undef vreinterpretq_s8_s8
456 # undef NLIB_OP1
457 # undef NLIB_OP2
458 # undef NLIB_OP3
459 # undef NLIB_CMP
460 # undef NLIB_SFT
461 # undef NLIB_CMB
462 
463 #define vreinterpretq_s8_s8(a) (a)
464 #define NLIB_OP1(intrin, tp, a) \
465  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
466 #define NLIB_OP2(intrin, tp, a, b) \
467  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
468  vreinterpretq_##tp##_s8(b)))
469 #define NLIB_OP3(intrin, tp, a, b, c) \
470  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
471  vreinterpretq_##tp##_s8(b), \
472  vreinterpretq_##tp##_s8(c)))
473 #define NLIB_CMP(intrin, tp, a, b, utp) \
474  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
475  vreinterpretq_##tp##_s8(b)))
476 #define NLIB_SFT(intrin, tp, a, cnt, stp) \
477  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
478 #define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
479 #endif
480 
481 // r.s8[i] = v for all i
482 NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
483 #if defined(NLIB_SSE41)
484  // faster than _mm_set1_epi8(v)
485  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
486 #elif defined(NLIB_NEON)
487  return vdupq_n_s8(v);
488 #endif
489 }
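// Usage sketch (illustrative; assumes each_int8 is the tag constant mentioned beside
// each_int8_tag above):
//
//   NLIB_ALIGNAS(16) int8_t out[16];
//   i128 v = I128::SetValue(static_cast<int8_t>(-1), each_int8);
//   I128::StoreA16(out, v);   // out[0..15] are all -1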
490 
491 // r.s16[i] = v for all i
492 NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
493 #if defined(NLIB_SSE41)
494  return _mm_set1_epi16(v);
495 #elif defined(NLIB_NEON)
496  return vreinterpretq_s8_s16(vdupq_n_s16(v));
497 #endif
498 }
499 
500 // r.s32[i] = v for all i
501 NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
502 #if defined(NLIB_SSE41)
503  return _mm_set1_epi32(v);
504 #elif defined(NLIB_NEON)
505  return vreinterpretq_s8_s32(vdupq_n_s32(v));
506 #endif
507 }
508 
509 // r.s64[i] = v for all i
510 NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
511 #if defined(NLIB_SSE41)
512 #ifdef _MSC_VER
513  // conflicts with ffloor()?
514  // Do not use MMX anyway
515  // return _mm_set1_epi64(*reinterpret_cast<__m64*>(&x));
516  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
517  return I128::LoadA16(tmp);
518 #else
519  return _mm_set1_epi64x(v);
520 #endif
521 #elif defined(NLIB_NEON)
522  return vreinterpretq_s8_s64(vdupq_n_s64(v));
523 #endif
524 }
525 
526 // r.u8[i] = v for all i
527 NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
528 #if defined(NLIB_SSE41)
529  // faster than _mm_set1_epi8(v)
530  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
531 #elif defined(NLIB_NEON)
532  return vreinterpretq_s8_u8(vdupq_n_u8(v));
533 #endif
534 }
535 
536 // r.u16[i] = v for all i
537 NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
538 #if defined(NLIB_SSE41)
539  return _mm_set1_epi16(static_cast<int16_t>(v));
540 #elif defined(NLIB_NEON)
541  return vreinterpretq_s8_u16(vdupq_n_u16(v));
542 #endif
543 }
544 
545 // r.u32[i] = v for all i
546 NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
547 #if defined(NLIB_SSE41)
548  return _mm_set1_epi32(static_cast<int32_t>(v));
549 #elif defined(NLIB_NEON)
550  return vreinterpretq_s8_u32(vdupq_n_u32(v));
551 #endif
552 }
553 
554 // r.u64[i] = v for all i
555 NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
556 #if defined(NLIB_SSE41)
557 #ifdef _MSC_VER
558  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
559  return I128::LoadA16(tmp);
560 #else
561  return _mm_set1_epi64x(static_cast<int64_t>(v));
562 #endif
563 #elif defined(NLIB_NEON)
564  return vreinterpretq_s8_u64(vdupq_n_u64(v));
565 #endif
566 }
567 
568 #if defined(NLIB_SSE41)
569 template <size_t N>
570 // r.u32[i] = value.u32[N] for all i
571 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
572  NLIB_STATIC_ASSERT(N < 4);
573  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
574 }
575 #elif defined(NLIB_NEON)
576 # ifdef __aarch64__
577 template <size_t N>
578 // r.u32[i] = value.u32[N] for all i
579 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
580  NLIB_STATIC_ASSERT(N < 4);
581  uint32x4_t v = vreinterpretq_u32_s8(value);
582  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
583 }
584 # else
585 template <>
586 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
587  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
588  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
589 }
590 template <>
591 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
592  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
593  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
594 }
595 template <>
596 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
597  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
598  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
599 }
600 template <>
601 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
602  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
603  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
604 }
605 # endif
606 #endif
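// Broadcast sketch (illustrative; assumes each_select32 is the tag constant referred
// to near each_select32_tag above):
//
//   i128 y = I128::SetValue<2>(x, each_select32);   // y.u32[i] == x.u32[2] for all i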
607 
608 #if defined(NLIB_SSE41)
609 template <size_t N>
610 // r.u16[i] = value.u16[N] for all i
611 NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
612  NLIB_STATIC_ASSERT(N < 8);
613  NLIB_ALIGNAS(16) const int8_t mask[16] = {
614  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
615  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
616  };
617  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
618 }
619 #elif defined(NLIB_NEON)
620 template <>
621 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
622 # ifdef __aarch64__
623  uint16x8_t v = vreinterpretq_u16_s8(value);
624  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
625 # else
626  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
627  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
628 # endif
629 }
630 
631 template <>
632 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
633 # ifdef __aarch64__
634  uint16x8_t v = vreinterpretq_u16_s8(value);
635  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
636 # else
637  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
638  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
639 # endif
640 }
641 
642 template <>
643 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
644 # ifdef __aarch64__
645  uint16x8_t v = vreinterpretq_u16_s8(value);
646  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
647 # else
648  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
649  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
650 # endif
651 }
652 
653 template <>
654 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
655 # ifdef __aarch64__
656  uint16x8_t v = vreinterpretq_u16_s8(value);
657  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
658 # else
659  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
660  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
661 # endif
662 }
663 
664 template <>
665 NLIB_M(i128) I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
666 # ifdef __aarch64__
667  uint16x8_t v = vreinterpretq_u16_s8(value);
668  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
669 # else
670  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
671  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
672 # endif
673 }
674 
675 template <>
676 NLIB_M(i128) I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
677 # ifdef __aarch64__
678  uint16x8_t v = vreinterpretq_u16_s8(value);
679  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
680 # else
681  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
682  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
683 # endif
684 }
685 
686 template <>
687 NLIB_M(i128) I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
688 # ifdef __aarch64__
689  uint16x8_t v = vreinterpretq_u16_s8(value);
690  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
691 # else
692  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
693  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
694 # endif
695 }
696 
697 template <>
698 NLIB_M(i128) I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
699 # ifdef __aarch64__
700  uint16x8_t v = vreinterpretq_u16_s8(value);
701  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
702 # else
703  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
704  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
705 # endif
706 }
707 #endif
708 
709 #if defined(NLIB_SSE41)
710 template <size_t N>
711 // r.u8[i] = value.u8[N] for all i
712 NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
713  NLIB_STATIC_ASSERT(N < 16);
714  NLIB_ALIGNAS(16) const int8_t mask[16] = {
715  N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
716  };
717  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
718 }
719 #elif defined(NLIB_NEON)
720 namespace detail {
721 template <size_t N, bool IsLower>
722 struct SetValue8Helper {
723  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
724  return vdupq_lane_s8(vget_low_s8(value), N);
725  }
726 };
727 
728 template <size_t N>
729 struct SetValue8Helper<N, false> {
730  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
731  return vdupq_lane_s8(vget_high_s8(value), N - 8);
732  }
733 };
734 
735 } // namespace detail
736 
737 template <size_t N>
738 NLIB_M(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
739  NLIB_STATIC_ASSERT(N < 16);
740 # ifdef __aarch64__
741  return vdupq_laneq_s8(value, N);
742 # else
743  return detail::SetValue8Helper<N, (N < 8)>()(value);
744 # endif
745 }
746 #endif
747 
748 // set all bits to 0
749 NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
750 #if defined(NLIB_SSE41)
751  return _mm_setzero_si128();
752 #elif defined(NLIB_NEON)
753  return vdupq_n_s8(0);
754 #endif
755 }
756 
757 // set all bits to 1
758 NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
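// Quick sketch (illustrative) of the zero/full helpers together with IsZero/IsFull,
// which are declared above and defined later:
//
//   i128 z = I128::SetZero();
//   i128 f = I128::SetFull(z);    // any register serves as the dummy argument
//   bool az = I128::IsZero(z);    // true
//   bool af = I128::IsFull(f);    // true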
759 
760 // r[i] = p[i], p is 16 bytes aligned
761 NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
762 #if defined(NLIB_SSE41)
763  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
764 #elif defined(NLIB_NEON)
765  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
766  return vreinterpretq_s8_u64(tmp);
767 #endif
768 }
769 
770 // r[i] = p[i], p is 8 bytes aligned
771 NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
772 #if defined(NLIB_SSE41)
773  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
774 #elif defined(NLIB_NEON)
775  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
776  return vreinterpretq_s8_u64(tmp);
777 #endif
778 }
779 
780 // r[i] = p[i], p is 4 bytes aligned
781 NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
782 #if defined(NLIB_SSE41)
783  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
784 #elif defined(NLIB_NEON)
785  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
786  return vreinterpretq_s8_u32(tmp);
787 #endif
788 }
789 
790 // r[i] = p[i], p is 2 bytes aligned
791 NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
792 #if defined(NLIB_SSE41)
793  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
794 #elif defined(NLIB_NEON)
795  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
796  return vreinterpretq_s8_u16(tmp);
797 #endif
798 }
799 
800 // r[i] = p[i]
801 NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
802 #if defined(NLIB_SSE41)
803  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
804 #elif defined(NLIB_NEON)
805  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
806 #endif
807 }
808 
809 NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
810 #if defined(NLIB_SSE41)
811  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
812 #elif defined(NLIB_NEON)
813  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
814  return vcombine_s8(lo, vdup_n_s8(0));
815 #endif
816 }
817 
818 NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
819 #if defined(NLIB_SSE41)
820  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
821 #elif defined(NLIB_NEON)
822  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
823  return vcombine_s8(lo, vdup_n_s8(0));
824 #endif
825 }
826 
827 NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
828 #if defined(NLIB_SSE41)
829  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
830 #elif defined(NLIB_NEON)
831  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
832  return vcombine_s8(lo, vdup_n_s8(0));
833 #endif
834 }
835 
836 NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
837 #if defined(NLIB_SSE41)
838  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
839 #elif defined(NLIB_NEON)
840  int8x8_t lo = vld1_s8(reinterpret_cast<const int8_t*>(p));
841  return vcombine_s8(lo, vdup_n_s8(0));
842 #endif
843 }
844 
845 NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
846 #if defined(NLIB_SSE41)
847  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
848  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
849 #elif defined(NLIB_NEON)
850  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
851  return vcombine_s8(vdup_n_s8(0), hi);
852 #endif
853 }
854 
855 NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
856 #if defined(NLIB_SSE41)
857  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
858  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
859 #elif defined(NLIB_NEON)
860  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
861  return vcombine_s8(vdup_n_s8(0), hi);
862 #endif
863 }
864 
865 NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
866 #if defined(NLIB_SSE41)
867  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
868  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
869 #elif defined(NLIB_NEON)
870  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
871  return vcombine_s8(vdup_n_s8(0), hi);
872 #endif
873 }
874 
875 NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
876 #if defined(NLIB_SSE41)
877  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
878  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
879 #elif defined(NLIB_NEON)
880  int8x8_t hi = vld1_s8(reinterpret_cast<const int8_t*>(p));
881  return vcombine_s8(vdup_n_s8(0), hi);
882 #endif
883 }
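// Sketch of the Lo/Hi load pattern (illustrative; p0 and p1 are hypothetical 8-byte
// aligned pointers): build one register from two 64-bit halves.
//
//   i128 lo = I128::LoadLoA8(p0);   // bytes 0-7 from p0, upper half zero
//   i128 hi = I128::LoadHiA8(p1);   // bytes 8-15 from p1, lower half zero
//   i128 v  = I128::Or(lo, hi);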
884 
885 // p[i] = value[i], p is 16 bytes aligned
886 NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
887 #if defined(NLIB_SSE41)
888  _mm_store_si128(reinterpret_cast<i128*>(p), value);
889 #elif defined(NLIB_NEON)
890  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
891 #endif
892 }
893 
894 // p[i] = value[i], p is 8 bytes aligned
895 NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
896 #if defined(NLIB_SSE41)
897  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
898 #elif defined(NLIB_NEON)
899  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
900 #endif
901 }
902 
903 // p[i] = value[i], p is 4 bytes aligned
904 NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
905 #if defined(NLIB_SSE41)
906  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
907 #elif defined(NLIB_NEON)
908  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
909 #endif
910 }
911 
912 // p[i] = value[i], p is 2 bytes aligned
913 NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
914 #if defined(NLIB_SSE41)
915  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
916 #elif defined(NLIB_NEON)
917  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
918 #endif
919 }
920 
921 // p[i] = value[i]
922 NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
923 #if defined(NLIB_SSE41)
924  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
925 #elif defined(NLIB_NEON)
926  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
927 #endif
928 }
929 
930 NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
931 #if defined(NLIB_SSE41)
932  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
933 #elif defined(NLIB_NEON)
934  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
935  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
936 #endif
937 }
938 
939 NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
940 #if defined(NLIB_SSE41)
941  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
942 #elif defined(NLIB_NEON)
943  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
944  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
945 #endif
946 }
947 
948 NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
949 #if defined(NLIB_SSE41)
950  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
951 #elif defined(NLIB_NEON)
952  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
953  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
954 #endif
955 }
956 
957 NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
958 #if defined(NLIB_SSE41)
959  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
960 #elif defined(NLIB_NEON)
961  int8x8_t x = vget_low_s8(value);
962  vst1_s8(reinterpret_cast<int8_t*>(p), x);
963 #endif
964 }
965 
966 NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
967 #if defined(NLIB_SSE41)
968  _mm_storel_epi64(reinterpret_cast<i128*>(p),
969  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
970 #elif defined(NLIB_NEON)
971  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
972  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
973 #endif
974 }
975 
976 NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
977 #if defined(NLIB_SSE41)
978  _mm_storel_epi64(reinterpret_cast<i128*>(p),
979  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
980 #elif defined(NLIB_NEON)
981  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
982  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
983 #endif
984 }
985 
986 NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
987 #if defined(NLIB_SSE41)
988  _mm_storel_epi64(reinterpret_cast<i128*>(p),
989  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
990 #elif defined(NLIB_NEON)
991  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
992  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
993 #endif
994 }
995 
996 NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
997 #if defined(NLIB_SSE41)
998  _mm_storel_epi64(reinterpret_cast<i128*>(p),
999  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
1000 #elif defined(NLIB_NEON)
1001  int8x8_t x = vget_high_s8(value);
1002  vst1_s8(reinterpret_cast<int8_t*>(p), x);
1003 #endif
1004 }
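// Mirror sketch for the stores (illustrative; dst0 and dst1 are hypothetical 8-byte
// aligned pointers): split one register into two 64-bit halves.
//
//   I128::StoreLoA8(dst0, v);   // writes v's bytes 0-7
//   I128::StoreHiA8(dst1, v);   // writes v's bytes 8-15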
1005 
1006 template <size_t N>
1007 // r = value.u8[N]
1008 NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
1009  NLIB_STATIC_ASSERT(N < 16);
1010 #if defined(NLIB_SSE41)
1011  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
1012 #elif defined(NLIB_NEON)
1013  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
1014 #endif
1015 }
1016 
1017 template <size_t N>
1018 // r = value.u16[N]
1019 NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
1020  NLIB_STATIC_ASSERT(N < 8);
1021 #if defined(NLIB_SSE41)
1022  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
1023 #elif defined(NLIB_NEON)
1024  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
1025 #endif
1026 }
1027 
1028 template <size_t N>
1029 // r = value.u32[N]
1030 NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
1031  NLIB_STATIC_ASSERT(N < 4);
1032 #if defined(NLIB_SSE41)
1033  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
1034 #elif defined(NLIB_NEON)
1035  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
1036 #endif
1037 }
1038 
1039 template <size_t N>
1040 // r = value.u64[N]
1041 NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
1042  NLIB_STATIC_ASSERT(N < 2);
1043 #if defined(NLIB_SSE41)
1044 #ifdef NLIB_64BIT
1045  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
1046 #else
1047  NLIB_UNUSED(value);
1048  return 0; // dummy
1049 #endif
1050 #elif defined(NLIB_NEON)
1051  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
1052 #endif
1053 }
1054 
1055 #if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
1056 template <>
1057 NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
1058  uint64_t rval;
1059  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
1060  return rval;
1061 }
1062 template <>
1063 NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
1064  uint64_t rval;
1065  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
1066  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
1067  return rval;
1068 }
1069 #endif
1070 
1071 template <size_t N>
1072 // r = value, r.u8[N] = v
1073 NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
1074  NLIB_STATIC_ASSERT(N < 16);
1075 #if defined(NLIB_SSE41)
1076  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
1077 #elif defined(NLIB_NEON)
1078  return __builtin_constant_p(v) ?
1079  I128::Permute8<
1080  N == 0 ? 16 : 0,
1081  N == 1 ? 17 : 1,
1082  N == 2 ? 18 : 2,
1083  N == 3 ? 19 : 3,
1084  N == 4 ? 20 : 4,
1085  N == 5 ? 21 : 5,
1086  N == 6 ? 22 : 6,
1087  N == 7 ? 23 : 7,
1088  N == 8 ? 24 : 8,
1089  N == 9 ? 25 : 9,
1090  N == 10 ? 26 : 10,
1091  N == 11 ? 27 : 11,
1092  N == 12 ? 28 : 12,
1093  N == 13 ? 29 : 13,
1094  N == 14 ? 30 : 14,
1095  N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
1096  vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
1097 #endif
1098 }
1099 
1100 template <size_t N>
1101 // r = value, r.u16[N] = v
1102 NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
1103  NLIB_STATIC_ASSERT(N < 8);
1104 #if defined(NLIB_SSE41)
1105  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
1106 #elif defined(NLIB_NEON)
1107  return __builtin_constant_p(v) ?
1108  I128::Permute16<
1109  N == 0 ? 8 : 0,
1110  N == 1 ? 9 : 1,
1111  N == 2 ? 10 : 2,
1112  N == 3 ? 11 : 3,
1113  N == 4 ? 12 : 4,
1114  N == 5 ? 13 : 5,
1115  N == 6 ? 14 : 6,
1116  N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
1117  vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
1118 #endif
1119 }
1120 
1121 template <size_t N>
1122 // r = value, r.u32[N] = v
1123 NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
1124  NLIB_STATIC_ASSERT(N < 4);
1125 #if defined(NLIB_SSE41)
1126  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
1127 #elif defined(NLIB_NEON)
1128  return __builtin_constant_p(v) ?
1129  I128::Permute32<N == 0 ? 4 : 0,
1130  N == 1 ? 5 : 1,
1131  N == 2 ? 6 : 2,
1132  N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
1133  vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
1134 #endif
1135 }
1136 
1137 template <size_t N>
1138 // r = value, r.u64[N] = v
1139 NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
1140  NLIB_STATIC_ASSERT(N < 2);
1141 #if defined(NLIB_SSE41)
1142 #ifdef NLIB_64BIT
1143  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
1144 #else
1145  union {
1146  int32_t i32[2];
1147  int64_t i64;
1148  } tmp;
1149  tmp.i64 = static_cast<int64_t>(v);
1150  __m128i rval;
1151  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
1152  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
1153 #endif
1154 #elif defined(NLIB_NEON)
1155  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
1156 #endif
1157 }
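// Lane access sketch (illustrative):
//
//   i128 v = I128::SetZero();
//   v = I128::SetUint32ToLane<3>(v, 0xDEADBEEFu);
//   uint32_t x = I128::GetUint32FromLane<3>(v);   // x == 0xDEADBEEF, other lanes stay 0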
1158 
1159 // r.i8[i] = a.i8[i] + b.i8[i]
1160 NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1161 #if defined(NLIB_SSE41)
1162  return _mm_add_epi8(a, b);
1163 #elif defined(NLIB_NEON)
1164  return vaddq_s8(a, b);
1165 #endif
1166 }
1167 
1168 // r.i16[i] = a.i16[i] + b.i16[i]
1169 NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1170 #if defined(NLIB_SSE41)
1171  return _mm_add_epi16(a, b);
1172 #elif defined(NLIB_NEON)
1173  return NLIB_OP2(vaddq, s16, a, b);
1174 #endif
1175 }
1176 
1177 // r.i32[i] = a.i32[i] + b.i32[i]
1178 NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1179 #if defined(NLIB_SSE41)
1180  return _mm_add_epi32(a, b);
1181 #elif defined(NLIB_NEON)
1182  return NLIB_OP2(vaddq, s32, a, b);
1183 #endif
1184 }
1185 
1186 // r.i64[i] = a.i64[i] + b.i64[i]
1187 NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1188 #if defined(NLIB_SSE41)
1189  return _mm_add_epi64(a, b);
1190 #elif defined(NLIB_NEON)
1191  return NLIB_OP2(vaddq, s64, a, b);
1192 #endif
1193 }
1194 
1195 // 127 if a[i] + b[i] > 127, -128 if a[i] + b[i] < -128
1196 NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1197 #if defined(NLIB_SSE41)
1198  return _mm_adds_epi8(a, b);
1199 #elif defined(NLIB_NEON)
1200  return vqaddq_s8(a, b);
1201 #endif
1202 }
1203 
1204 // 32767 if a[i] + b[i] > 32767, -32768 if a[i] + b[i] < -32768
1205 NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1206 #if defined(NLIB_SSE41)
1207  return _mm_adds_epi16(a, b);
1208 #elif defined(NLIB_NEON)
1209  return NLIB_OP2(vqaddq, s16, a, b);
1210 #endif
1211 }
1212 
1213 // 255 if a[i] + b[i] > 255
1214 NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1215 #if defined(NLIB_SSE41)
1216  return _mm_adds_epu8(a, b);
1217 #elif defined(NLIB_NEON)
1218  return NLIB_OP2(vqaddq, u8, a, b);
1219 #endif
1220 }
1221 
1222 // 65535 if a[i] + b[i] > 65535
1223 NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1224 #if defined(NLIB_SSE41)
1225  return _mm_adds_epu16(a, b);
1226 #elif defined(NLIB_NEON)
1227  return NLIB_OP2(vqaddq, u16, a, b);
1228 #endif
1229 }
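// Saturation sketch (illustrative; assumes each_uint8 is the tag constant mentioned
// beside each_uint8_tag above): 200 + 100 clamps to 255 instead of wrapping to 44.
//
//   i128 a = I128::SetValue(static_cast<uint8_t>(200), each_uint8);
//   i128 b = I128::SetValue(static_cast<uint8_t>(100), each_uint8);
//   i128 r = I128::AddUint8Saturated(a, b);   // r.u8[i] == 255 for all i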
1230 
1231 // r.i8[i] = a.i8[i] - b.i8[i]
1232 NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1233 #if defined(NLIB_SSE41)
1234  return _mm_sub_epi8(a, b);
1235 #elif defined(NLIB_NEON)
1236  return vsubq_s8(a, b);
1237 #endif
1238 }
1239 
1240 // r.i16[i] = a.i16[i] - b.i16[i]
1241 NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1242 #if defined(NLIB_SSE41)
1243  return _mm_sub_epi16(a, b);
1244 #elif defined(NLIB_NEON)
1245  return NLIB_OP2(vsubq, s16, a, b);
1246 #endif
1247 }
1248 
1249 // r.i32[i] = a.i32[i] - b.i32[i]
1250 NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1251 #if defined(NLIB_SSE41)
1252  return _mm_sub_epi32(a, b);
1253 #elif defined(NLIB_NEON)
1254  return NLIB_OP2(vsubq, s32, a, b);
1255 #endif
1256 }
1257 
1258 // r.i64[i] = a.i64[i] - b.i64[i]
1259 NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1260 #if defined(NLIB_SSE41)
1261  return _mm_sub_epi64(a, b);
1262 #elif defined(NLIB_NEON)
1263  return NLIB_OP2(vsubq, s64, a, b);
1264 #endif
1265 }
1266 
1267 // 127 if a[i] - b[i] > 127, -128 if a[i] - b[i] < -128
1268 NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1269 #if defined(NLIB_SSE41)
1270  return _mm_subs_epi8(a, b);
1271 #elif defined(NLIB_NEON)
1272  return NLIB_OP2(vqsubq, s8, a, b);
1273 #endif
1274 }
1275 
1276 // 32767 if a[i] - b[i] > 32767, -32768 if a[i] - b[i] < -32768
1277 NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1278 #if defined(NLIB_SSE41)
1279  return _mm_subs_epi16(a, b);
1280 #elif defined(NLIB_NEON)
1281  return NLIB_OP2(vqsubq, s16, a, b);
1282 #endif
1283 }
1284 
1285 // 0 if a.u8[i] - b.u8[i] < 0
1286 NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1287 #if defined(NLIB_SSE41)
1288  return _mm_subs_epu8(a, b);
1289 #elif defined(NLIB_NEON)
1290  return NLIB_OP2(vqsubq, u8, a, b);
1291 #endif
1292 }
1293 
1294 // 0 if a.u16[i] - b.u16[i] < 0
1295 NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1296 #if defined(NLIB_SSE41)
1297  return _mm_subs_epu16(a, b);
1298 #elif defined(NLIB_NEON)
1299  return NLIB_OP2(vqsubq, u16, a, b);
1300 #endif
1301 }
1302 
1303 // r.i8[0] = a.i8[0] + a.i8[1], ..., r.i8[7] = a.i8[14] + a.i8[15], r.i8[8..15] likewise from b
1304 NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1305 #if defined(NLIB_SSE41)
1306  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
1307  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
1308  return I128::NarrowFrom16To8(ax, bx);
1309 #elif defined(NLIB_NEON)
1310 # ifdef __aarch64__
1311  return vpaddq_s8(a, b);
1312 # else
1313  int8x8_t al = vget_low_s8(a);
1314  int8x8_t ah = vget_high_s8(a);
1315  int8x8_t rl = vpadd_s8(al, ah);
1316  int8x8_t bl = vget_low_s8(b);
1317  int8x8_t bh = vget_high_s8(b);
1318  int8x8_t rh = vpadd_s8(bl, bh);
1319  return vcombine_s8(rl, rh);
1320 # endif
1321 #endif
1322 }
1323 
1324 // r.i16[0] = a.i16[0] + a.i16[1], ..., r.i16[3] = a.i16[6] + a.i16[7], r.i16[4..7] likewise from b
1325 NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1326 #if defined(NLIB_SSE41)
1327  return _mm_hadd_epi16(a, b);
1328 #elif defined(NLIB_NEON)
1329 # ifdef __aarch64__
1330  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
1331 # else
1332  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
1333  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
1334  int16x4_t rl = vpadd_s16(al, ah);
1335  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
1336  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
1337  int16x4_t rh = vpadd_s16(bl, bh);
1338  return NLIB_CMB(s16, rl, rh);
1339 # endif
1340 #endif
1341 }
1342 
1343 // r.i32[0] = a.i32[0] + a.i32[1], r.i32[1] = a.i32[2] + a.i32[3], r.i32[2..3] likewise from b
1344 NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1345 #if defined(NLIB_SSE41)
1346  return _mm_hadd_epi32(a, b);
1347 #elif defined(NLIB_NEON)
1348 # ifdef __aarch64__
1349  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
1350 # else
1351  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
1352  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
1353  int32x2_t rl = vpadd_s32(al, ah);
1354  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
1355  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
1356  int32x2_t rh = vpadd_s32(bl, bh);
1357  return NLIB_CMB(s32, rl, rh);
1358 # endif
1359 #endif
1360 }
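// Worked example (illustrative): if a.i32 = {1, 2, 3, 4} and b.i32 = {5, 6, 7, 8},
// PairwiseAdd32(a, b) yields {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}.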
1361 
1362 // r.i16[i] = a.i16[i] * b.i16[i]
1363 NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1364 #if defined(NLIB_SSE41)
1365  return _mm_mullo_epi16(a, b);
1366 #elif defined(NLIB_NEON)
1367  return NLIB_OP2(vmulq, s16, a, b);
1368 #endif
1369 }
1370 
1371 // r = c + a * b
1372 NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1373 #if defined(NLIB_SSE41)
1374  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
1375 #elif defined(NLIB_NEON)
1376  return NLIB_OP3(vmlaq, s16, c, a, b);
1377 #endif
1378 }
1379 
1380 // r = c - a * b
1381 NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1382 #if defined(NLIB_SSE41)
1383  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
1384 #elif defined(NLIB_NEON)
1385  return NLIB_OP3(vmlsq, s16, c, a, b);
1386 #endif
1387 }
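// Multiply-accumulate sketch (illustrative; x, y, acc are hypothetical i128 values
// holding int16 lanes). Note the argument order: MultAdd16(a, b, c) computes c + a * b
// using the low 16 bits of each product.
//
//   acc = I128::MultAdd16(x, y, acc);   // acc.i16[i] += x.i16[i] * y.i16[i]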
1388 
1389 // r.i32[i] = a.i32[i] * b.i32[i]
1390 NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1391 #if defined(NLIB_SSE41)
1392  return _mm_mullo_epi32(a, b);
1393 #elif defined(NLIB_NEON)
1394  return NLIB_OP2(vmulq, s32, a, b);
1395 #endif
1396 }
1397 
1398 // r = c + a * b
1399 NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1400 #if defined(NLIB_SSE41)
1401  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
1402 #elif defined(NLIB_NEON)
1403  return NLIB_OP3(vmlaq, s32, c, a, b);
1404 #endif
1405 }
1406 
1407 // r = c - a * b
1408 NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1409 #if defined(NLIB_SSE41)
1410  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
1411 #elif defined(NLIB_NEON)
1412  return NLIB_OP3(vmlsq, s32, c, a, b);
1413 #endif
1414 }
1415 
1416 // r.s8[i] = max(a.s8[i], b.s8[i])
1417 NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1418 #if defined(NLIB_SSE41)
1419  return _mm_max_epi8(a, b);
1420 #elif defined(NLIB_NEON)
1421  return NLIB_OP2(vmaxq, s8, a, b);
1422 #endif
1423 }
1424 
1425 // r.s16[i] = max(a.s16[i], b.s16[i])
1426 NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1427 #if defined(NLIB_SSE41)
1428  return _mm_max_epi16(a, b);
1429 #elif defined(NLIB_NEON)
1430  return NLIB_OP2(vmaxq, s16, a, b);
1431 #endif
1432 }
1433 
1434 // r.s32[i] = max(a.s32[i], b.s32[i])
1435 NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1436 #if defined(NLIB_SSE41)
1437  return _mm_max_epi32(a, b);
1438 #elif defined(NLIB_NEON)
1439  return NLIB_OP2(vmaxq, s32, a, b);
1440 #endif
1441 }
1442 
1443 // r.u8[i] = max(a.u8[i], b.u8[i])
1444 NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1445 #if defined(NLIB_SSE41)
1446  return _mm_max_epu8(a, b);
1447 #elif defined(NLIB_NEON)
1448  return NLIB_OP2(vmaxq, u8, a, b);
1449 #endif
1450 }
1451 
1452 // r.u16[i] = max(a.u16[i], b.u16[i])
1453 NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1454 #if defined(NLIB_SSE41)
1455  return _mm_max_epu16(a, b);
1456 #elif defined(NLIB_NEON)
1457  return NLIB_OP2(vmaxq, u16, a, b);
1458 #endif
1459 }
1460 
1461 // r.u32[i] = max(a.u32[i], b.u32[i])
1462 NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1463 #if defined(NLIB_SSE41)
1464  return _mm_max_epu32(a, b);
1465 #elif defined(NLIB_NEON)
1466  return NLIB_OP2(vmaxq, u32, a, b);
1467 #endif
1468 }
1469 
1470 // r.s8[i] = min(a.s8[i], b.s8[i])
1471 NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1472 #if defined(NLIB_SSE41)
1473  return _mm_min_epi8(a, b);
1474 #elif defined(NLIB_NEON)
1475  return NLIB_OP2(vminq, s8, a, b);
1476 #endif
1477 }
1478 
1479 // r.s16[i] = min(a.s16[i], b.s16[i])
1480 NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1481 #if defined(NLIB_SSE41)
1482  return _mm_min_epi16(a, b);
1483 #elif defined(NLIB_NEON)
1484  return NLIB_OP2(vminq, s16, a, b);
1485 #endif
1486 }
1487 
1488 // r.s32[i] = min(a.s32[i], b.s32[i])
1489 NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1490 #if defined(NLIB_SSE41)
1491  return _mm_min_epi32(a, b);
1492 #elif defined(NLIB_NEON)
1493  return NLIB_OP2(vminq, s32, a, b);
1494 #endif
1495 }
1496 
1497 // r.u8[i] = min(a.u8[i], b.u8[i])
1498 NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1499 #if defined(NLIB_SSE41)
1500  return _mm_min_epu8(a, b);
1501 #elif defined(NLIB_NEON)
1502  return NLIB_OP2(vminq, u8, a, b);
1503 #endif
1504 }
1505 
1506 // r.u16[i] = min(a.u16[i], b.u16[i])
1507 NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1508 #if defined(NLIB_SSE41)
1509  return _mm_min_epu16(a, b);
1510 #elif defined(NLIB_NEON)
1511  return NLIB_OP2(vminq, u16, a, b);
1512 #endif
1513 }
1514 
1515 // r.u32[i] = min(a.u32[i], b.u32[i])
1516 NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1517 #if defined(NLIB_SSE41)
1518  return _mm_min_epu32(a, b);
1519 #elif defined(NLIB_NEON)
1520  return NLIB_OP2(vminq, u32, a, b);
1521 #endif
1522 }
1523 
1524 // r.s8[i] = abs(value.s8[i])
1525 NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
1526 #if defined(NLIB_SSE41)
1527  return _mm_abs_epi8(value);
1528 #elif defined(NLIB_NEON)
1529  return NLIB_OP1(vabsq, s8, value);
1530 #endif
1531 }
1532 
1533 // r.s16[i] = abs(value.s16[i])
1534 NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
1535 #if defined(NLIB_SSE41)
1536  return _mm_abs_epi16(value);
1537 #elif defined(NLIB_NEON)
1538  return NLIB_OP1(vabsq, s16, value);
1539 #endif
1540 }
1541 
1542 // r.s32[i] = abs(value.s32[i])
1543 NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
1544 #if defined(NLIB_SSE41)
1545  return _mm_abs_epi32(value);
1546 #elif defined(NLIB_NEON)
1547  return NLIB_OP1(vabsq, s32, value);
1548 #endif
1549 }
1550 
1551 // r.s8[i] = abs(a.s8[i] - b.s8[i])
1552 NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1553 #if defined(NLIB_SSE41)
1554  return _mm_abs_epi8(_mm_sub_epi8(a, b));
1555 #elif defined(NLIB_NEON)
1556  return NLIB_OP2(vabdq, s8, a, b);
1557 #endif
1558 }
1559 
1560 // r.s16[i] = abs(a.s16[i] - b.s16[i])
1561 NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1562 #if defined(NLIB_SSE41)
1563  return _mm_abs_epi16(_mm_sub_epi16(a, b));
1564 #elif defined(NLIB_NEON)
1565  return NLIB_OP2(vabdq, s16, a, b);
1566 #endif
1567 }
1568 
1569 // r.s32[i] = abs(a.s32[i] - b.s32[i])
1570 NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1571 #if defined(NLIB_SSE41)
1572  return _mm_abs_epi32(_mm_sub_epi32(a, b));
1573 #elif defined(NLIB_NEON)
1574  return NLIB_OP2(vabdq, s32, a, b);
1575 #endif
1576 }
1577 
1578 // r.s8[i] = -value.s8[i]
1579 NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
1580 #if defined(NLIB_SSE41)
1581  return _mm_sub_epi8(_mm_setzero_si128(), value);
1582 #elif defined(NLIB_NEON)
1583  return NLIB_OP1(vnegq, s8, value);
1584 #endif
1585 }
1586 
1587 // r.s16[i] = -value.s16[i]
1588 NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
1589 #if defined(NLIB_SSE41)
1590  return _mm_sub_epi16(_mm_setzero_si128(), value);
1591 #elif defined(NLIB_NEON)
1592  return NLIB_OP1(vnegq, s16, value);
1593 #endif
1594 }
1595 
1596 // r.s32[i] = -value.s32[i]
1597 NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
1598 #if defined(NLIB_SSE41)
1599  return _mm_sub_epi32(_mm_setzero_si128(), value);
1600 #elif defined(NLIB_NEON)
1601  return NLIB_OP1(vnegq, s32, value);
1602 #endif
1603 }
1604 
1605 // r = a & b
1606 NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
1607 #if defined(NLIB_SSE41)
1608  return _mm_and_si128(a, b);
1609 #elif defined(NLIB_NEON)
1610  return NLIB_OP2(vandq, s8, a, b);
1611 #endif
1612 }
1613 
1614 // r = a | b
1615 NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
1616 #if defined(NLIB_SSE41)
1617  return _mm_or_si128(a, b);
1618 #elif defined(NLIB_NEON)
1619  return NLIB_OP2(vorrq, s8, a, b);
1620 #endif
1621 }
1622 
1623 // r = a ^ b
1624 NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
1625 #if defined(NLIB_SSE41)
1626  return _mm_xor_si128(a, b);
1627 #elif defined(NLIB_NEON)
1628  return NLIB_OP2(veorq, s8, a, b);
1629 #endif
1630 }
1631 
1632 // r = ~a
1633 NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
1634 #if defined(NLIB_SSE41)
1635  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1636 #elif defined(NLIB_NEON)
1637  return NLIB_OP1(vmvnq, s8, a);
1638 #endif
1639 }
1640 
1641 // r = ~a & b
1642 NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1643 #if defined(NLIB_SSE41)
1644  return _mm_andnot_si128(a, b);
1645 #elif defined(NLIB_NEON)
1646  return NLIB_OP2(vbicq, s8, b, a);
1647 #endif
1648 }
1649 
1650 // r = ~a | b
1651 NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1652 #if defined(NLIB_SSE41)
1653  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1654  return _mm_or_si128(not_a, b);
1655 #elif defined(NLIB_NEON)
1656  return NLIB_OP2(vornq, s8, b, a);
1657 #endif
1658 }
1659 
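// Test8/16/32: r.iN[i] = (a.iN[i] & b.iN[i]) != 0 ? all ones : 0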
1660 NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1661 #if defined(NLIB_NEON)
1662  return vtstq_s8(a, b);
1663 #else
1664  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
1665 #endif
1666 }
1667 
1668 NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1669 #if defined(NLIB_NEON)
1670  return NLIB_OP2(vtstq, s16, a, b);
1671 #else
1672  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
1673 #endif
1674 }
1675 
1676 NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1677 #if defined(NLIB_NEON)
1678  return NLIB_OP2(vtstq, s32, a, b);
1679 #else
1680  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
1681 #endif
1682 }
1683 
1684 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
1685 NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1686 #if defined(NLIB_SSE41)
1687  return _mm_cmpeq_epi8(a, b);
1688 #elif defined(NLIB_NEON)
1689  return NLIB_CMP(vceqq, s8, a, b, u8);
1690 #endif
1691 }
1692 
1693 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
1694 NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1695 #if defined(NLIB_SSE41)
1696  return _mm_cmpeq_epi16(a, b);
1697 #elif defined(NLIB_NEON)
1698  return NLIB_CMP(vceqq, s16, a, b, u16);
1699 #endif
1700 }
1701 
1702 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
1703 NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1704 #if defined(NLIB_SSE41)
1705  return _mm_cmpeq_epi32(a, b);
1706 #elif defined(NLIB_NEON)
1707  return NLIB_CMP(vceqq, s32, a, b, u32);
1708 #endif
1709 }
1710 
1711 // r.i64[i] = a.i64[i] == b.i64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1712 NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1713 #if defined(NLIB_SSE41)
1714  return _mm_cmpeq_epi64(a, b);
1715 #elif defined(NLIB_NEON)
1716 #ifdef __aarch64__
1717  return NLIB_CMP(vceqq, s64, a, b, u64);
1718 #else
1719  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
1720  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
1721  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
1722  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1723  return vreinterpretq_s8_s64(result);
1724 #endif
1725 #endif
1726 }
1727 
1728 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
1729 NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1730 #if defined(NLIB_SSE41)
1731  return _mm_cmplt_epi8(a, b);
1732 #elif defined(NLIB_NEON)
1733  return NLIB_CMP(vcltq, s8, a, b, u8);
1734 #endif
1735 }
1736 
1737 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
1738 NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1739 #if defined(NLIB_SSE41)
1740  return _mm_cmplt_epi16(a, b);
1741 #elif defined(NLIB_NEON)
1742  return NLIB_CMP(vcltq, s16, a, b, u16);
1743 #endif
1744 }
1745 
1746 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
1747 NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1748 #if defined(NLIB_SSE41)
1749  return _mm_cmplt_epi32(a, b);
1750 #elif defined(NLIB_NEON)
1751  return NLIB_CMP(vcltq, s32, a, b, u32);
1752 #endif
1753 }
1754 
1755 // r.s64[i] = a.s64[i] < b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1756 NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1757 #if defined(NLIB_SSE42)
1758  return _mm_cmpgt_epi64(b, a);
1759 #elif defined(NLIB_NEON)
1760 #ifdef __aarch64__
1761  return NLIB_CMP(vcltq, s64, a, b, u64);
1762 #else
1763  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
1764  vreinterpret_s32_s8(vget_high_s8(a)));
1765  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
1766  vreinterpret_s32_s8(vget_high_s8(b)));
1767  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
1768  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
1769  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1770  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1771  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1772  return vreinterpretq_s8_s64(result);
1773 #endif
1774 #else
1775  i128 cmp = I128::CmpLtInt32(a, b);
1776  i128 eq = I128::CmpEq32(a, b);
1777  i128 cmp_lt = I128::CmpLtUint32(a, b);
1778  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1779  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
1780  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1781  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1782 #endif
1783 }
1784 
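// The fallbacks above compose a signed 64-bit less-than from 32-bit pieces:
// a < b exactly when the upper halves compare less as signed values, or the
// upper halves are equal and the lower halves compare less as unsigned values.
// A scalar sketch of the same rule (illustrative only, not part of SimdInt.h):
inline bool CmpLtInt64ScalarSketch(int64_t a, int64_t b) NLIB_NOEXCEPT {
  int32_t a_hi = static_cast<int32_t>(a >> 32);
  int32_t b_hi = static_cast<int32_t>(b >> 32);
  uint32_t a_lo = static_cast<uint32_t>(a);
  uint32_t b_lo = static_cast<uint32_t>(b);
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}
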
1785 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
1786 NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1787 #if defined(NLIB_SSE41)
1788  return _mm_cmpgt_epi8(a, b);
1789 #elif defined(NLIB_NEON)
1790  return NLIB_CMP(vcgtq, s8, a, b, u8);
1791 #endif
1792 }
1793 
1794 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
1795 NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1796 #if defined(NLIB_SSE41)
1797  return _mm_cmpgt_epi16(a, b);
1798 #elif defined(NLIB_NEON)
1799  return NLIB_CMP(vcgtq, s16, a, b, u16);
1800 #endif
1801 }
1802 
1803 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
1804 NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1805 #if defined(NLIB_SSE41)
1806  return _mm_cmpgt_epi32(a, b);
1807 #elif defined(NLIB_NEON)
1808  return NLIB_CMP(vcgtq, s32, a, b, u32);
1809 #endif
1810 }
1811 
1812 // r.s64[i] = a.s64[i] > b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1813 NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1814 #if defined(NLIB_SSE42)
1815  return _mm_cmpgt_epi64(a, b);
1816 #elif defined(NLIB_NEON) && defined(__aarch64__)
1817  return NLIB_CMP(vcgtq, s64, a, b, u64);
1818 #else
1819  return I128::CmpLtInt64(b, a);
1820 #endif
1821 }
1822 
1823 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
1824 NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1825 #if defined(NLIB_SSE41)
1826  i128 ofs = I128::SetValue(0x80, each_uint8);
1827  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1828 #elif defined(NLIB_NEON)
1829  return NLIB_CMP(vcltq, u8, a, b, u8);
1830 #endif
1831 }
1832 
1833 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
1834 NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1835 #if defined(NLIB_SSE41)
1836  i128 ofs = I128::SetValue(0x80, each_uint8);
1837  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1838 #elif defined(NLIB_NEON)
1839  return NLIB_CMP(vcgtq, u8, a, b, u8);
1840 #endif
1841 }
1842 
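// The SSE paths above rely on a bias trick: SSE4.1 only has signed byte
// compares, so both operands are offset by 0x80, which maps unsigned order
// onto signed order. A scalar sketch of the idea (illustrative only, not part
// of SimdInt.h):
inline bool LtUint8ViaSignedSketch(uint8_t a, uint8_t b) NLIB_NOEXCEPT {
  int8_t sa = static_cast<int8_t>(a ^ 0x80);  // adding 0x80 mod 256 == flipping the top bit
  int8_t sb = static_cast<int8_t>(b ^ 0x80);
  return sa < sb;  // same truth value as (a < b) on the original unsigned bytes
}
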
1843 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
1844 NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1845 #if defined(NLIB_SSE41)
1846  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1847  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1848 #elif defined(NLIB_NEON)
1849  return NLIB_CMP(vcltq, u16, a, b, u16);
1850 #endif
1851 }
1852 
1853 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
1854 NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1855 #if defined(NLIB_SSE41)
1856  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1857  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1858 #elif defined(NLIB_NEON)
1859  return NLIB_CMP(vcgtq, u16, a, b, u16);
1860 #endif
1861 }
1862 
1863 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
1864 NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1865 #if defined(NLIB_SSE41)
1866  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1867  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1868 #elif defined(NLIB_NEON)
1869  return NLIB_CMP(vcltq, u32, a, b, u32);
1870 #endif
1871 }
1872 
1873 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
1874 NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1875 #if defined(NLIB_SSE41)
1876  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1877  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1878 #elif defined(NLIB_NEON)
1879  return NLIB_CMP(vcgtq, u32, a, b, u32);
1880 #endif
1881 }
1882 
1883 // r.u64[i] = a.u64[i] < b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1884 NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1885 #if defined(NLIB_SSE42)
1886  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1887  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
1888 #elif defined(NLIB_NEON)
1889 #ifdef __aarch64__
1890  return NLIB_CMP(vcltq, u64, a, b, u64);
1891 #else
1892  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
1893  vreinterpret_u32_s8(vget_high_s8(a)));
1894  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
1895  vreinterpret_u32_s8(vget_high_s8(b)));
1896  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
1897  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
1898  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1899  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1900  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1901  return vreinterpretq_s8_s64(result);
1902 #endif
1903 #else
1904  i128 cmp = I128::CmpLtUint32(a, b);
1905  i128 eq = I128::CmpEq32(a, b);
1906  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1907  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
1908  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1909  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1910 #endif
1911 }
1912 
1913 // r.u64[i] = a.u64[i] > b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1914 NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1915 #if defined(NLIB_SSE42)
1916  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1917  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
1918 #elif defined(NLIB_NEON) && defined(__aarch64__)
1919  return NLIB_CMP(vcgtq, u64, a, b, u64);
1920 #else
1921  return I128::CmpLtUint64(b, a);
1922 #endif
1923 }
1924 
1925 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
1926 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1927 #if defined(NLIB_SSE41)
1928  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1929 #elif defined(NLIB_NEON)
1930  return NLIB_CMP(vcleq, s8, a, b, u8);
1931 #endif
1932 }
1933 
1934 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
1935 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1936 #if defined(NLIB_SSE41)
1937  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1938 #elif defined(NLIB_NEON)
1939  return NLIB_CMP(vcleq, s16, a, b, u16);
1940 #endif
1941 }
1942 
1943 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
1944 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1945 #if defined(NLIB_SSE41)
1946  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1947 #elif defined(NLIB_NEON)
1948  return NLIB_CMP(vcleq, s32, a, b, u32);
1949 #endif
1950 }
1951 
1952 // r.s64[i] = a.s64[i] <= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1953 NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1954 #if defined(NLIB_SSE42)
1955  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1956 #elif defined(NLIB_NEON) && defined(__aarch64__)
1957  return NLIB_CMP(vcleq, s64, a, b, u64);
1958 #else
1959  return I128::Not(I128::CmpGtInt64(a, b));
1960 #endif
1961 }
1962 
1963 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
1964 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1965 #if defined(NLIB_SSE41)
1966  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1967 #elif defined(NLIB_NEON)
1968  return NLIB_CMP(vcgeq, s8, a, b, u8);
1969 #endif
1970 }
1971 
1972 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
1973 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1974 #if defined(NLIB_SSE41)
1975  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1976 #elif defined(NLIB_NEON)
1977  return NLIB_CMP(vcgeq, s16, a, b, u16);
1978 #endif
1979 }
1980 
1981 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
1982 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1983 #if defined(NLIB_SSE41)
1984  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1985 #elif defined(NLIB_NEON)
1986  return NLIB_CMP(vcgeq, s32, a, b, u32);
1987 #endif
1988 }
1989 
1990 // r.s64[i] = a.s64[i] >= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1991 NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1992 #if defined(NLIB_SSE42)
1993  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
1994 #elif defined(NLIB_NEON) && defined(__aarch64__)
1995  return NLIB_CMP(vcgeq, s64, a, b, u64);
1996 #else
1997  return I128::Not(I128::CmpLtInt64(a, b));
1998 #endif
1999 }
2000 
2001 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
2002 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2003 #if defined(NLIB_SSE41)
2004  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2005 #elif defined(NLIB_NEON)
2006  return NLIB_CMP(vcleq, u8, a, b, u8);
2007 #endif
2008 }
2009 
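// The SSE paths here and in the Le/Ge variants below use the identities
// a <= b (unsigned) <=> min(a, b) == a and a >= b (unsigned) <=> max(a, b) == a,
// which sidesteps the 0x80 bias trick wherever an unsigned min/max
// instruction is available.
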
2010 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
2011 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2012 #if defined(NLIB_SSE41)
2013  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2014 #elif defined(NLIB_NEON)
2015  return NLIB_CMP(vcleq, u16, a, b, u16);
2016 #endif
2017 }
2018 
2019 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
2020 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2021 #if defined(NLIB_SSE41)
2022  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2023 #elif defined(NLIB_NEON)
2024  return NLIB_CMP(vcleq, u32, a, b, u32);
2025 #endif
2026 }
2027 
2028 // r.u64[i] = a.u64[i] <= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2029 NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2030 #if defined(NLIB_SSE42)
2031  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2032  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2033  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2034 #elif defined(NLIB_NEON) && defined(__aarch64__)
2035  return NLIB_CMP(vcleq, u64, a, b, u64);
2036 #else
2037  return I128::Not(I128::CmpGtUint64(a, b));
2038 #endif
2039 }
2040 
2041 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
2042 NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2043 #if defined(NLIB_SSE41)
2044  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
2045 #elif defined(NLIB_NEON)
2046  return NLIB_CMP(vcgeq, u8, a, b, u8);
2047 #endif
2048 }
2049 
2050 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
2051 NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2052 #if defined(NLIB_SSE41)
2053  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
2054 #elif defined(NLIB_NEON)
2055  return NLIB_CMP(vcgeq, u16, a, b, u16);
2056 #endif
2057 }
2058 
2059 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
2060 NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2061 #if defined(NLIB_SSE41)
2062  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
2063 #elif defined(NLIB_NEON)
2064  return NLIB_CMP(vcgeq, u32, a, b, u32);
2065 #endif
2066 }
2067 
2068 // r.u64[i] = a.u64[i] >= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2069 NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2070 #if defined(NLIB_SSE42)
2071  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2072  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
2073  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2074 #elif defined(NLIB_NEON) && defined(__aarch64__)
2075  return NLIB_CMP(vcgeq, u64, a, b, u64);
2076 #else
2077  return I128::Not(I128::CmpLtUint64(a, b));
2078 #endif
2079 }
2080 
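// CmpEqZero8/16/32/64: r.iN[i] = value.iN[i] == 0 ? all ones : 0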
2081 NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
2082 #if defined(__aarch64__)
2083  return vceqzq_s8(value);
2084 #else
2085  return I128::CmpEq8(value, I128::SetZero());
2086 #endif
2087 }
2088 
2089 NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
2090 #if defined(__aarch64__)
2091  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
2092 #else
2093  return I128::CmpEq16(value, I128::SetZero());
2094 #endif
2095 }
2096 
2097 NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
2098 #if defined(__aarch64__)
2099  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
2100 #else
2101  return I128::CmpEq32(value, I128::SetZero());
2102 #endif
2103 }
2104 
2105 NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
2106 #if defined(__aarch64__)
2107  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
2108 #else
2109  return I128::CmpEq64(value, I128::SetZero());
2110 #endif
2111 }
2112 
2113 // r.u8[i] = value.u8[i] << count
2114 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2115 #if defined(NLIB_SSE41)
2116  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2117  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
2118  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
2119  return I128::NarrowFrom16To8(xl, xh);
2120 #elif defined(NLIB_NEON)
2121  return NLIB_SFT(vshlq, u8, value, count, s8);
2122 #endif
2123 }
2124 
2125 // r.u8[i] = value.u8[i] >> count
2126 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2127 #if defined(NLIB_SSE41)
2128  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2129  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
2130  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
2131  return _mm_packus_epi16(xl, xh);
2132 #elif defined(NLIB_NEON)
2133  return NLIB_SFT(vshlq, u8, value, -count, s8);
2134 #endif
2135 }
2136 
2137 // r.s8[i] = value.s8[i] >> count
2138 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
2139 #if defined(NLIB_SSE41)
2140  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2141  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
2142  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
2143  return _mm_packs_epi16(xl, xh);  // signed saturation keeps negative lanes intact
2144 #elif defined(NLIB_NEON)
2145  return NLIB_SFT(vshlq, s8, value, -count, s8);
2146 #endif
2147 }
2148 
2149 // r.u16[i] = value.u16[i] << count
2150 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2151 #if defined(NLIB_SSE41)
2152  return _mm_slli_epi16(value, count);
2153 #elif defined(NLIB_NEON)
2154  return NLIB_SFT(vshlq, u16, value, count, s16);
2155 #endif
2156 }
2157 
2158 // r.u16[i] = value.u16[i] >> count
2159 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2160 #if defined(NLIB_SSE41)
2161  return _mm_srli_epi16(value, count);
2162 #elif defined(NLIB_NEON)
2163  return NLIB_SFT(vshlq, u16, value, -count, s16);
2164 #endif
2165 }
2166 
2167 // r.s16[i] = value.s16[i] >> count
2168 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
2169 #if defined(NLIB_SSE41)
2170  return _mm_srai_epi16(value, count);
2171 #elif defined(NLIB_NEON)
2172  return NLIB_SFT(vshlq, s16, value, -count, s16);
2173 #endif
2174 }
2175 
2176 // r.u32[i] = value.u32[i] << count
2177 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2178 #if defined(NLIB_SSE41)
2179  return _mm_slli_epi32(value, count);
2180 #elif defined(NLIB_NEON)
2181  return NLIB_SFT(vshlq, u32, value, count, s32);
2182 #endif
2183 }
2184 
2185 // r.u32[i] = value.u32[i] >> count
2186 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2187 #if defined(NLIB_SSE41)
2188  return _mm_srli_epi32(value, count);
2189 #elif defined(NLIB_NEON)
2190  return NLIB_SFT(vshlq, u32, value, -count, s32);
2191 #endif
2192 }
2193 
2194 // r.s32[i] = value.s32[i] >> count
2195 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
2196 #if defined(NLIB_SSE41)
2197  return _mm_srai_epi32(value, count);
2198 #elif defined(NLIB_NEON)
2199  return NLIB_SFT(vshlq, s32, value, -count, s32);
2200 #endif
2201 }
2202 
2203 // r.u64[i] = value.u64[i] << count
2204 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2205 #if defined(NLIB_SSE41)
2206  return _mm_slli_epi64(value, count);
2207 #elif defined(NLIB_NEON)
2208  return NLIB_SFT(vshlq, u64, value, count, s64);
2209 #endif
2210 }
2211 
2212 // r.u64[i] = value.u64[i] >> count
2213 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2214 #if defined(NLIB_SSE41)
2215  return _mm_srli_epi64(value, count);
2216 #elif defined(NLIB_NEON)
2217  return NLIB_SFT(vshlq, u64, value, -count, s64);
2218 #endif
2219 }
2220 
2221 template<size_t N>
2222 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
2223  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2224 #ifdef NLIB_NEON
2225  return vshlq_n_s8(value, N);
2226 #else
2227  return I128::ShiftLeftLogical8(value, N);
2228 #endif
2229 }
2230 
2231 template <size_t N>
2232 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
2233  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2234 #ifdef NLIB_NEON
2235  uint8x16_t tmp = vreinterpretq_u8_s8(value);
2236  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
2237 #else
2238  return I128::ShiftRightLogical8(value, N);
2239 #endif
2240 }
2241 
2242 template <size_t N>
2243 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
2244  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2245 #ifdef NLIB_NEON
2246  return vshrq_n_s8(value, N);
2247 #else
2248  return I128::ShiftRightArithmetic8(value, N);
2249 #endif
2250 }
2251 
2252 template <size_t N>
2253 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
2254  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2255 #ifdef NLIB_NEON
2256  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2257  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
2258 #else
2259  return I128::ShiftLeftLogical16(value, N);
2260 #endif
2261 }
2262 
2263 template <size_t N>
2264 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
2265  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2266 #ifdef NLIB_NEON
2267  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2268  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
2269 #else
2270  return I128::ShiftRightLogical16(value, N);
2271 #endif
2272 }
2273 
2274 template <size_t N>
2275 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
2276  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2277 #ifdef NLIB_NEON
2278  int16x8_t tmp = vreinterpretq_s16_s8(value);
2279  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
2280 #else
2281  return I128::ShiftRightArithmetic16(value, N);
2282 #endif
2283 }
2284 
2285 template <size_t N>
2286 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
2287  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2288 #ifdef NLIB_NEON
2289  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2290  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
2291 #else
2292  return I128::ShiftLeftLogical32(value, N);
2293 #endif
2294 }
2295 
2296 template <size_t N>
2297 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
2298  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2299 #ifdef NLIB_NEON
2300  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2301  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
2302 #else
2303  return I128::ShiftRightLogical32(value, N);
2304 #endif
2305 }
2306 
2307 template <size_t N>
2308 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
2309  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2310 #ifdef NLIB_NEON
2311  int32x4_t tmp = vreinterpretq_s32_s8(value);
2312  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
2313 #else
2314  return I128::ShiftRightArithmetic32(value, N);
2315 #endif
2316 }
2317 
2318 template <size_t N>
2319 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
2320  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2321 #ifdef NLIB_NEON
2322  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2323  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
2324 #else
2325  return I128::ShiftLeftLogical64(value, N);
2326 #endif
2327 }
2328 
2329 template <size_t N>
2330 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
2331  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2332 #ifdef NLIB_NEON
2333  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2334  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
2335 #else
2336  return I128::ShiftRightLogical64(value, N);
2337 #endif
2338 }
2339 
2340 #ifdef NLIB_NEON
2341 template<>
2342 NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
2343  NLIB_UNUSED(value);
2344  return I128::SetZero();
2345 }
2346 template<>
2347 NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
2348  return value;
2349 }
2350 template<>
2351 NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
2352  NLIB_UNUSED(value);
2353  return I128::SetZero();
2354 }
2355 template<>
2356 NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
2357  return value;
2358 }
2359 template<>
2360 NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
2361  return value;
2362 }
2363 template<>
2364 NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
2365  NLIB_UNUSED(value);
2366  return I128::SetZero();
2367 }
2368 template<>
2369 NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
2370  return value;
2371 }
2372 template<>
2373 NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
2374  return value;
2375 }
2376 template<>
2377 NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
2378  NLIB_UNUSED(value);
2379  return I128::SetZero();
2380 }
2381 template<>
2382 NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
2383  return value;
2384 }
2385 #endif
2386 
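// The specializations above handle boundary shift counts that the NEON
// vshlq_n_* / vshrq_n_* immediates cannot encode (left shifts by the full
// element width and right shifts by zero); other counts go through the
// generic templates.
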
2387 template <size_t N>
2388 // r.i8[i + N] = value.i8[i], fill zero
2389 NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
2390  NLIB_STATIC_ASSERT(N < 16);
2391 #if defined(NLIB_SSE41)
2392  return _mm_slli_si128(value, N);
2393 #elif defined(NLIB_NEON)
2394  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
2395 #endif
2396 }
2397 
2398 template <size_t N>
2399 // r.i8[i - N] = value.i8[i], fill zero
2400 NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
2401  NLIB_STATIC_ASSERT(N < 16);
2402 #if defined(NLIB_SSE41)
2403  return _mm_srli_si128(value, N);
2404 #elif defined(NLIB_NEON)
2405  return vextq_s8(value, vdupq_n_s8(0), N);
2406 #endif
2407 }
2408 
2409 template <size_t N>
2410 // left <- fedcba9876543210 -> right
2411 NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
2412  NLIB_STATIC_ASSERT(N < 16);
2413 #if defined(NLIB_SSE41)
2414  return _mm_alignr_epi8(value, value, N);
2415 #elif defined(NLIB_NEON)
2416  return vextq_s8(value, value, N);
2417 #endif
2418 }
2419 
2420 template <size_t N>
2421 // _mm_alignr_epi8(a, b, N), which returns ((a << 128) | b) >> (N * 8)
2422 NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
2423  NLIB_STATIC_ASSERT(N < 16);
2424 #if defined(NLIB_SSE41)
2425  return _mm_alignr_epi8(a, b, N);
2426 #elif defined(NLIB_NEON)
2427  return vextq_s8(b, a, N);
2428 #endif
2429 }
2430 
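// Usage sketch (not part of SimdInt.h): AlignR slides a 16-byte window across
// two consecutive registers, e.g. to read bytes 3..18 of a 32-byte buffer
// without an unaligned load.
inline i128 AlignRWindowExample(const void* p16aligned) NLIB_NOEXCEPT {
  i128 lo = I128::LoadA16(p16aligned);                                    // bytes 0..15
  i128 hi = I128::LoadA16(static_cast<const uint8_t*>(p16aligned) + 16);  // bytes 16..31
  return I128::AlignR<3>(hi, lo);  // ((hi << 128) | lo) >> (3 * 8) = bytes 3..18
}
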
2431 // r.u8[i] = lo.u16[i] & 0xFF, r.u8[i + 8] = hi.u16[i] & 0xFF
2432 NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2433 #if defined(NLIB_SSE41)
2434  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2435  __m128i lo_mask = _mm_and_si128(lo, mask);
2436  __m128i hi_mask = _mm_and_si128(hi, mask);
2437  return _mm_packus_epi16(lo_mask, hi_mask);
2438 #elif defined(NLIB_NEON)
2439 # ifdef __aarch64__
2440  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2441  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2442 # else
2443  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2444  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
2445  return NLIB_CMB(u8, l, h);
2446 # endif
2447 #endif
2448 }
2449 
2450 // r.u16[i] = lo.u32[i] & 0xFFFF, r.u16[i + 4] = hi.u32[i] & 0xFFFF
2451 NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2452 #if defined(NLIB_SSE41)
2453  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2454  __m128i lo_mask = _mm_and_si128(lo, mask);
2455  __m128i hi_mask = _mm_and_si128(hi, mask);
2456  return _mm_packus_epi32(lo_mask, hi_mask);
2457 #elif defined(NLIB_NEON)
2458 # ifdef __aarch64__
2459  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2460  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2461 # else
2462  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2463  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
2464  return NLIB_CMB(u16, l, h);
2465 # endif
2466 #endif
2467 }
2468 
2469 // r.u32[i] = lo.u64[i] & 0xFFFFFFFF, r.u32[i + 2] = hi.u64[i] & 0xFFFFFFFF
2470 NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2471 #if defined(NLIB_SSE41)
2472  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
2473  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
2474  return _mm_unpacklo_epi64(lo_, hi_);
2475 #elif defined(NLIB_NEON)
2476 # ifdef __aarch64__
2477  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2478  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
2479 # else
2480  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2481  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
2482  return NLIB_CMB(u32, l, h);
2483 # endif
2484 #endif
2485 }
2486 
2487 // r.u8[i] = min(255, lo.u16[i]), r.u8[i + 8] = min(255, hi.u16[i])
2488 NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2489 #if defined(NLIB_SSE41)
2490  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
2491  __m128i lotmp = _mm_and_si128(lo, b7FFF);
2492  __m128i hitmp = _mm_and_si128(hi, b7FFF);
2493  return _mm_packus_epi16(lotmp, hitmp);
2494 #elif defined(NLIB_NEON)
2495 # ifdef __aarch64__
2496  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2497  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2498 # else
2499  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2500  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
2501  return NLIB_CMB(u8, l, h);
2502 # endif
2503 #endif
2504 }
2505 
2506 // r.s8[i] = clamp(lo.s16[i], -128, 127), r.s8[i + 8] = clamp(hi.s16[i], -128, 127)
2507 NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2508 #if defined(NLIB_SSE41)
2509  return _mm_packs_epi16(lo, hi);
2510 #elif defined(NLIB_NEON)
2511 # ifdef __aarch64__
2512  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2513  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
2514 # else
2515  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2516  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
2517  return NLIB_CMB(s8, l, h);
2518 # endif
2519 #endif
2520 }
2521 
2522 // r.u16[i] = min(65535, lo.u32[i]), r.u16[i + 4] = min(65535, hi.u32[i])
2523 NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2524 #if defined(NLIB_SSE41)
2525  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
2526  __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
2527  __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
2528  return _mm_packus_epi32(lotmp, hitmp);
2529 #elif defined(NLIB_NEON)
2530 # ifdef __aarch64__
2531  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2532  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2533 # else
2534  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2535  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
2536  return NLIB_CMB(u16, l, h);
2537 # endif
2538 #endif
2539 }
2540 
2541 // r.s16[i] = clamp(lo.s32[i], -32768, 32767), r.s16[i + 4] = clamp(hi.s32[i], -32768, 32767)
2542 NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2543 #if defined(NLIB_SSE41)
2544  return _mm_packs_epi32(lo, hi);
2545 #elif defined(NLIB_NEON)
2546 # ifdef __aarch64__
2547  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2548  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
2549 # else
2550  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2551  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
2552  return NLIB_CMB(s16, l, h);
2553 # endif
2554 #endif
2555 }
2556 
2557 // r.s8[2 * i] = value.s8[i], r.u8[2 * i + 1] = value.s8[i] < 0 ? 0xFF : 0
2558 NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
2559 #if defined(NLIB_SSE41)
2560  return _mm_cvtepi8_epi16(value);
2561 #elif defined(NLIB_NEON)
2562  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
2563 #endif
2564 }
2565 
2566 // r.s8[2 * i] = value.s8[i + 8], r.u8[2 * i + 1] = value.s8[i + 8] < 0 ? 0xFF : 0
2567 NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
2568 #if defined(NLIB_SSE41)
2569  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2570 #elif defined(NLIB_NEON)
2571 # ifdef __aarch64__
2572  int16x8_t result = vmovl_high_s8(value);
2573 # else
2574  int16x8_t result = vmovl_s8(vget_high_s8(value));
2575 # endif
2576  return vreinterpretq_s8_s16(result);
2577 #endif
2578 }
2579 
2580 // r.s16[2 * i] = value.s16[i], r.u16[2 * i + 1] = value.s16[i] < 0 ? 0xFFFF : 0
2581 NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
2582 #if defined(NLIB_SSE41)
2583  return _mm_cvtepi16_epi32(value);
2584 #elif defined(NLIB_NEON)
2585  int16x8_t x = vreinterpretq_s16_s8(value);
2586  int32x4_t result = vmovl_s16(vget_low_s16(x));
2587  return vreinterpretq_s8_s32(result);
2588 #endif
2589 }
2590 
2591 // r.s16[2 * i] = value.s16[i + 4], r.u16[2 * i + 1] = value.s16[i + 4] < 0 ? 0xFFFF : 0
2592 NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
2593 #if defined(NLIB_SSE41)
2594  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2595 #elif defined(NLIB_NEON)
2596  int16x8_t x = vreinterpretq_s16_s8(value);
2597 # ifdef __aarch64__
2598  int32x4_t result = vmovl_high_s16(x);
2599 # else
2600  int32x4_t result = vmovl_s16(vget_high_s16(x));
2601 # endif
2602  return vreinterpretq_s8_s32(result);
2603 #endif
2604 }
2605 
2606 // r.s32[2 * i] = value.s32[i], r.u32[2 * i + 1] = value.s32[i] < 0 ? 0xFFFFFFFF : 0
2607 NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
2608 #if defined(NLIB_SSE41)
2609  return _mm_cvtepi32_epi64(value);
2610 #elif defined(NLIB_NEON)
2611  int32x4_t x = vreinterpretq_s32_s8(value);
2612  int64x2_t result = vmovl_s32(vget_low_s32(x));
2613  return vreinterpretq_s8_s64(result);
2614 #endif
2615 }
2616 
2617 // r.s32[2 * i] = value.s32[i + 2], r.u32[2 * i + 1] = value.s32[i + 2] < 0 ? 0xFFFFFFFF : 0
2618 NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
2619 #if defined(NLIB_SSE41)
2620  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2621 #elif defined(NLIB_NEON)
2622  int32x4_t x = vreinterpretq_s32_s8(value);
2623 # ifdef __aarch64__
2624  int64x2_t result = vmovl_high_s32(x);
2625 # else
2626  int64x2_t result = vmovl_s32(vget_high_s32(x));
2627 # endif
2628  return vreinterpretq_s8_s64(result);
2629 #endif
2630 }
2631 
2632 // r.u16[i] = value.u8[i]
2633 NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
2634 #if defined(NLIB_SSE41)
2635  return _mm_cvtepu8_epi16(value);
2636 #elif defined(NLIB_NEON)
2637  uint8x16_t x = vreinterpretq_u8_s8(value);
2638  uint16x8_t result = vmovl_u8(vget_low_u8(x));
2639  return vreinterpretq_s8_u16(result);
2640 #endif
2641 }
2642 
2643 // r.u16[i] = value.u8[i + 8]
2644 NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
2645 #if defined(NLIB_SSE41)
2646  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2647 #elif defined(NLIB_NEON)
2648  uint8x16_t x = vreinterpretq_u8_s8(value);
2649 # ifdef __aarch64__
2650  uint16x8_t result = vmovl_high_u8(x);
2651 # else
2652  uint16x8_t result = vmovl_u8(vget_high_u8(x));
2653 # endif
2654  return vreinterpretq_s8_u16(result);
2655 #endif
2656 }
2657 
2658 // r.u32[i] = value.u16[i]
2659 NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
2660 #if defined(NLIB_SSE41)
2661  return _mm_cvtepu16_epi32(value);
2662 #elif defined(NLIB_NEON)
2663  uint16x8_t x = vreinterpretq_u16_s8(value);
2664  uint32x4_t result = vmovl_u16(vget_low_u16(x));
2665  return vreinterpretq_s8_u32(result);
2666 #endif
2667 }
2668 
2669 // r.u32[i] = value.u16[i + 4]
2670 NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
2671 #if defined(NLIB_SSE41)
2672  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2673 #elif defined(NLIB_NEON)
2674  uint16x8_t x = vreinterpretq_u16_s8(value);
2675 # ifdef __aarch64__
2676  uint32x4_t result = vmovl_high_u16(x);
2677 # else
2678  uint32x4_t result = vmovl_u16(vget_high_u16(x));
2679 # endif
2680  return vreinterpretq_s8_u32(result);
2681 #endif
2682 }
2683 
2684 // r.u64[i] = value.u32[i]
2685 NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
2686 #if defined(NLIB_SSE41)
2687  return _mm_cvtepu32_epi64(value);
2688 #elif defined(NLIB_NEON)
2689  uint32x4_t x = vreinterpretq_u32_s8(value);
2690  uint64x2_t result = vmovl_u32(vget_low_u32(x));
2691  return vreinterpretq_s8_u64(result);
2692 #endif
2693 }
2694 
2695 // r.u64[i] = value.u32[i + 2]
2696 NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
2697 #if defined(NLIB_SSE41)
2698  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2699 #elif defined(NLIB_NEON)
2700  uint32x4_t x = vreinterpretq_u32_s8(value);
2701 # ifdef __aarch64__
2702  uint64x2_t result = vmovl_high_u32(x);
2703 # else
2704  uint64x2_t result = vmovl_u32(vget_high_u32(x));
2705 # endif
2706  return vreinterpretq_s8_u64(result);
2707 #endif
2708 }
2709 
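// Usage sketch (not part of SimdInt.h): the widening and narrowing helpers are
// typically paired so that 8-bit data is processed at 16-bit precision and
// packed back with saturation, e.g. brightening 16 pixel bytes by a small
// non-negative bias.
inline i128 AddBiasSaturatedExample(i128arg pixels, int16_t bias) NLIB_NOEXCEPT {
  i128 bias16 = I128::SetValue(bias, each_int16);
  i128 one = I128::SetValue(static_cast<int16_t>(1), each_int16);
  // MultAdd16(x, 1, bias16) == bias16 + x, evaluated on eight 16-bit lanes.
  i128 lo = I128::MultAdd16(I128::ConvertFromUint8ToUint16Lo(pixels), one, bias16);
  i128 hi = I128::MultAdd16(I128::ConvertFromUint8ToUint16Hi(pixels), one, bias16);
  return I128::ConvertFromUint16ToUint8Saturated(lo, hi);  // values above 255 clamp to 255
}
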
2710 // 01234567 abcdefgh -> 0a1b2c3d4e5f6g7h
2711 NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2712 #if defined(NLIB_SSE41)
2713  return _mm_unpacklo_epi8(a, b);
2714 #elif defined(NLIB_NEON)
2715 # ifdef __aarch64__
2716  return vzip1q_s8(a, b);
2717 # else
2718  return vzipq_s8(a, b).val[0];
2719 # endif
2720 #endif
2721 }
2722 
2723 // like Zip8Lo, but interleaves the high eight bytes of a and b
2724 NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2725 #if defined(NLIB_SSE41)
2726  return _mm_unpackhi_epi8(a, b);
2727 #elif defined(NLIB_NEON)
2728 # ifdef __aarch64__
2729  return vzip2q_s8(a, b);
2730 # else
2731  return vzipq_s8(a, b).val[1];
2732 # endif
2733 #endif
2734 }
2735 
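// Unzip8Lo keeps the even-indexed bytes of a followed by those of b, Unzip8Hi
// keeps the odd-indexed bytes; the 16- and 32-bit variants below do the same
// on wider lanes.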
2736 NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2737 #if defined(NLIB_SSE41)
2738  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2739  __m128i lo_mask = _mm_and_si128(a, mask);
2740  __m128i hi_mask = _mm_and_si128(b, mask);
2741  return _mm_packus_epi16(lo_mask, hi_mask);
2742 #elif defined(NLIB_NEON)
2743 # ifdef __aarch64__
2744  return vuzp1q_s8(a, b);
2745 # else
2746  return vuzpq_s8(a, b).val[0];
2747 # endif
2748 #endif
2749 }
2750 
2751 NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2752 #if defined(NLIB_SSE41)
2753  i128 mask = I128::SetValue(0xFF00U, each_uint16);
2754  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
2755  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
2756  return _mm_packus_epi16(lo_mask, hi_mask);
2757 #elif defined(NLIB_NEON)
2758 # ifdef __aarch64__
2759  return vuzp2q_s8(a, b);
2760 # else
2761  return vuzpq_s8(a, b).val[1];
2762 # endif
2763 #endif
2764 }
2765 
2766 // 0123 abcd -> 0a1b2c3d
2767 NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2768 #if defined(NLIB_SSE41)
2769  return _mm_unpacklo_epi16(a, b);
2770 #elif defined(NLIB_NEON)
2771 # ifdef __aarch64__
2772  return NLIB_OP2(vzip1q, u16, a, b);
2773 # else
2774  return vreinterpretq_s8_u16(vzipq_u16(
2775  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2776 # endif
2777 #endif
2778 }
2779 
2780 // 4567 efgh -> 4e5f6g7h (high halves)
2781 NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2782 #if defined(NLIB_SSE41)
2783  return _mm_unpackhi_epi16(a, b);
2784 #elif defined(NLIB_NEON)
2785 # ifdef __aarch64__
2786  return NLIB_OP2(vzip2q, u16, a, b);
2787 # else
2788  return vreinterpretq_s8_u16(vzipq_u16(
2789  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2790 # endif
2791 #endif
2792 }
2793 
2794 NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2795 #if defined(NLIB_SSE41)
2796  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2797  __m128i lo_mask = _mm_and_si128(a, mask);
2798  __m128i hi_mask = _mm_and_si128(b, mask);
2799  return _mm_packus_epi32(lo_mask, hi_mask);
2800 #elif defined(NLIB_NEON)
2801 # ifdef __aarch64__
2802  return NLIB_OP2(vuzp1q, u16, a, b);
2803 # else
2804  return vreinterpretq_s8_u16(vuzpq_u16(
2805  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2806 # endif
2807 #endif
2808 }
2809 
2810 NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2811 #if defined(NLIB_SSE41)
2812  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
2813  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
2814  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
2815  return _mm_packus_epi32(lo_mask, hi_mask);
2816 #elif defined(NLIB_NEON)
2817 # ifdef __aarch64__
2818  return NLIB_OP2(vuzp2q, u16, a, b);
2819 # else
2820  return vreinterpretq_s8_u16(vuzpq_u16(
2821  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2822 # endif
2823 #endif
2824 }
2825 
2826 // 01 ab -> 0a1b
2827 NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2828 #if defined(NLIB_SSE41)
2829  return _mm_unpacklo_epi32(a, b);
2830 #elif defined(NLIB_NEON)
2831 # ifdef __aarch64__
2832  return NLIB_OP2(vzip1q, u32, a, b);
2833 # else
2834  return vreinterpretq_s8_u32(vzipq_u32(
2835  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2836 # endif
2837 #endif
2838 }
2839 
2840 // 23 cd -> 2c3d (high halves)
2841 NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2842 #if defined(NLIB_SSE41)
2843  return _mm_unpackhi_epi32(a, b);
2844 #elif defined(NLIB_NEON)
2845 # ifdef __aarch64__
2846  return NLIB_OP2(vzip2q, u32, a, b);
2847 # else
2848  return vreinterpretq_s8_u32(vzipq_u32(
2849  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2850 # endif
2851 #endif
2852 }
2853 
2854 NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2855 #if defined(NLIB_SSE41)
2856  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2857  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
2858  return _mm_blend_epi16(x0, x1, 0xF0);
2859 #elif defined(NLIB_NEON)
2860 # ifdef __aarch64__
2861  return NLIB_OP2(vuzp1q, u32, a, b);
2862 # else
2863  return vreinterpretq_s8_u32(vuzpq_u32(
2864  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2865 # endif
2866 #endif
2867 }
2868 
2869 NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2870 #if defined(NLIB_SSE41)
2871  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
2872  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2873  return _mm_blend_epi16(x0, x1, 0xF0);
2874 #elif defined(NLIB_NEON)
2875 # ifdef __aarch64__
2876  return NLIB_OP2(vuzp2q, u32, a, b);
2877 # else
2878  return vreinterpretq_s8_u32(vuzpq_u32(
2879  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2880 # endif
2881 #endif
2882 }
2883 
2884 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
2885  int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
2886 NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2887 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2888  return __builtin_shufflevector(
2889  a, b,
2890  V0, V1, V2, V3, V4, V5, V6, V7,
2891  V8, V9, V10, V11, V12, V13, V14, V15);
2892 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2893  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
2894  V0, V1, V2, V3, V4, V5, V6, V7,
2895  V8, V9, V10, V11, V12, V13, V14, V15);
2896 #else
2897  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2898  (V0 < 0 || V0 > 15) ? -128 : V0,
2899  (V1 < 0 || V1 > 15) ? -128 : V1,
2900  (V2 < 0 || V2 > 15) ? -128 : V2,
2901  (V3 < 0 || V3 > 15) ? -128 : V3,
2902  (V4 < 0 || V4 > 15) ? -128 : V4,
2903  (V5 < 0 || V5 > 15) ? -128 : V5,
2904  (V6 < 0 || V6 > 15) ? -128 : V6,
2905  (V7 < 0 || V7 > 15) ? -128 : V7,
2906  (V8 < 0 || V8 > 15) ? -128 : V8,
2907  (V9 < 0 || V9 > 15) ? -128 : V9,
2908  (V10 < 0 || V10 > 15) ? -128 : V10,
2909  (V11 < 0 || V11 > 15) ? -128 : V11,
2910  (V12 < 0 || V12 > 15) ? -128 : V12,
2911  (V13 < 0 || V13 > 15) ? -128 : V13,
2912  (V14 < 0 || V14 > 15) ? -128 : V14,
2913  (V15 < 0 || V15 > 15) ? -128 : V15
2914  };
2915  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2916  V0 < 16 ? -128 : (V0 - 16),
2917  V1 < 16 ? -128 : (V1 - 16),
2918  V2 < 16 ? -128 : (V2 - 16),
2919  V3 < 16 ? -128 : (V3 - 16),
2920  V4 < 16 ? -128 : (V4 - 16),
2921  V5 < 16 ? -128 : (V5 - 16),
2922  V6 < 16 ? -128 : (V6 - 16),
2923  V7 < 16 ? -128 : (V7 - 16),
2924  V8 < 16 ? -128 : (V8 - 16),
2925  V9 < 16 ? -128 : (V9 - 16),
2926  V10 < 16 ? -128 : (V10 - 16),
2927  V11 < 16 ? -128 : (V11 - 16),
2928  V12 < 16 ? -128 : (V12 - 16),
2929  V13 < 16 ? -128 : (V13 - 16),
2930  V14 < 16 ? -128 : (V14 - 16),
2931  V15 < 16 ? -128 : (V15 - 16)
2932  };
2933  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2934  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2935  return I128::Or(tmp_a, tmp_b);
2936 #endif
2937 }
2938 
2939 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
2940 NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2941 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2942  return vreinterpretq_s8_u16(__builtin_shufflevector(
2943  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
2944  V0, V1, V2, V3, V4, V5, V6, V7));
2945 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2946  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
2947  V0, V1, V2, V3, V4, V5, V6, V7);
2948 #else
2949  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2950  (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
2951  (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
2952  (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
2953  (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
2954  (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
2955  (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
2956  (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
2957  (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
2958  (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
2959  (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
2960  (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
2961  (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
2962  (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
2963  (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
2964  (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
2965  (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
2966  };
2967  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2968  V0 < 8 ? -128 : (V0 - 8) * 2,
2969  V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
2970  V1 < 8 ? -128 : (V1 - 8) * 2,
2971  V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
2972  V2 < 8 ? -128 : (V2 - 8) * 2,
2973  V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
2974  V3 < 8 ? -128 : (V3 - 8) * 2,
2975  V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
2976  V4 < 8 ? -128 : (V4 - 8) * 2,
2977  V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
2978  V5 < 8 ? -128 : (V5 - 8) * 2,
2979  V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
2980  V6 < 8 ? -128 : (V6 - 8) * 2,
2981  V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
2982  V7 < 8 ? -128 : (V7 - 8) * 2,
2983  V7 < 8 ? -128 : (V7 - 8) * 2 + 1
2984  };
2985  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2986  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2987  return I128::Or(tmp_a, tmp_b);
2988 #endif
2989 }
2990 
2991 template<int V0, int V1, int V2, int V3>
2992 NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2993 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2994  return vreinterpretq_s8_u32(__builtin_shufflevector(
2995  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
2996  V0, V1, V2, V3));
2997 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2998  return __builtin_shufflevector((__v4si)a, (__v4si)b,
2999  V0, V1, V2, V3);
3000 #else
3001  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
3002  (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
3003  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
3004  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
3005  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
3006  (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
3007  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
3008  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
3009  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
3010  (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
3011  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
3012  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
3013  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
3014  (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
3015  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
3016  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
3017  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
3018  };
3019  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
3020  V0 < 4 ? -128 : (V0 - 4) * 4,
3021  V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
3022  V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
3023  V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
3024  V1 < 4 ? -128 : (V1 - 4) * 4,
3025  V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
3026  V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
3027  V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
3028  V2 < 4 ? -128 : (V2 - 4) * 4,
3029  V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
3030  V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
3031  V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
3032  V3 < 4 ? -128 : (V3 - 4) * 4,
3033  V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
3034  V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
3035  V3 < 4 ? -128 : (V3 - 4) * 4 + 3
3036  };
3037  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
3038  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
3039  return I128::Or(tmp_a, tmp_b);
3040 #endif
3041 }
3042 
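// Usage sketch (not part of SimdInt.h): Permute32<1, 1, 3, 3> duplicates the
// odd 32-bit lanes of its first operand, which is how the CmpLtInt64 /
// CmpLtUint64 fallbacks above spread the upper-half comparison result across
// the whole 64-bit lane.
inline i128 SpreadUpper32Example(i128arg v) NLIB_NOEXCEPT {
  return I128::Permute32<1, 1, 3, 3>(v, v);  // r.u32 = { v.u32[1], v.u32[1], v.u32[3], v.u32[3] }
}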
3043 
3044 // 0123456789abcdef -> 1032547698badcfe
3045 NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
3046 #if defined(NLIB_SSE41)
3047  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3048  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
3049  };
3050  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3051 #elif defined(NLIB_NEON)
3052  return NLIB_OP1(vrev16q, u8, value);
3053 #endif
3054 }
3055 
3056 // 0123456789abcdef -> 32107654ba98fedc
3057 NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
3058 #if defined(NLIB_SSE41)
3059  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3060  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
3061  };
3062  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3063 #elif defined(NLIB_NEON)
3064  return NLIB_OP1(vrev32q, u8, value);
3065 #endif
3066 }
3067 
3068 // 0123456789abcdef -> 76543210fedcba98
3069 NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
3070 #if defined(NLIB_SSE41)
3071  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3072  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
3073  };
3074  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3075 #elif defined(NLIB_NEON)
3076  return NLIB_OP1(vrev64q, u8, value);
3077 #endif
3078 }
3079 
3080 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
3081 NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT { // NOLINT
3082 #if defined(NLIB_SSE41)
3083  return _mm_movemask_epi8(value);
3084 #elif defined(NLIB_NEON)
3085  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
3086  uint8x16_t a = vandq_u8(value, powers);
3087 # ifdef __aarch64__
3088  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
3089 # else
3090  uint8x8_t al = vget_low_u8(a);
3091  uint8x8_t ah = vget_high_u8(a);
3092  uint8x8_t tmp = vpadd_u8(al, ah);
3093  tmp = vpadd_u8(tmp, tmp);
3094  tmp = vpadd_u8(tmp, tmp);
3095  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
3096 # endif
3097 #endif
3098 }
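// Editor's note (illustration): the NEON path reproduces the SSE movemask bit layout by weighting
// byte i of each 64-bit half with (1 << i) (the 0x8040201008040201 constant) and then summing the
// two halves. For instance, a mask with only bytes 0 and 9 set to 0xFF yields 1 in the low half
// and 2 in the high half, i.e. MoveMask8 == 0x0201 (bits 0 and 9), matching _mm_movemask_epi8.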
3099 
3100 // r = Or(ismasked(value.u16[i]) ? (1 << i) : 0)
3101 NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT { // NOLINT
3102 #if defined(NLIB_SSE41)
3103  __m128i tmp = _mm_packs_epi16(value, value);
3104  return _mm_movemask_epi8(tmp) & 255;
3105 #elif defined(NLIB_NEON)
3106  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3107  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3108  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3109  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
3110 # ifdef __aarch64__
3111  return vaddvq_u16(a);
3112 # else
3113  uint8x8_t tmp = vmovn_u16(a);
3114  tmp = vpadd_u8(tmp, tmp);
3115  tmp = vpadd_u8(tmp, tmp);
3116  tmp = vpadd_u8(tmp, tmp);
3117  return vget_lane_u8(tmp, 0);
3118 # endif
3119 #endif
3120 }
3121 
3122 // r = Or(ismasked(value.u32[i]) ? (1 << i) : 0)
3123 NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT { // NOLINT
3124 #if defined(NLIB_SSE41)
3125  __m128i tmp = _mm_packs_epi16(value, value);
3126  tmp = _mm_packs_epi16(tmp, tmp);
3127  return _mm_movemask_epi8(tmp) & 15;
3128 #elif defined(NLIB_NEON)
3129  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3130  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3131  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3132  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
3133 # ifdef __aarch64__
3134  return vaddvq_u32(a);
3135 # else
3136  uint16x4_t tmp = vmovn_u32(a);
3137  tmp = vpadd_u16(tmp, tmp);
3138  tmp = vpadd_u16(tmp, tmp);
3139  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
3140 # endif
3141 #endif
3142 }
3143 
3144 NLIB_M(i128) I128::SetMask8(int mask) NLIB_NOEXCEPT {
3145 #if defined(NLIB_NEON)
3146  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
3147  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
3148  int8x8_t s1 = vdup_n_s8(mask >> 8);
3149  return vreinterpretq_s8_u8(vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1)));
3150 #elif defined(NLIB_SSE41)
3151  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
3152  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
3153  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
3154  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
3155  return I128::Test8(m, s);
3156 #endif
3157 }
3158 
3159 NLIB_M(i128) I128::SetMask16(int mask) NLIB_NOEXCEPT {
3160 #if defined(NLIB_NEON)
3161  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3162  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3163  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3164  uint16x8_t s = vdupq_n_u16(mask);
3165  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
3166 #elif defined(NLIB_SSE41)
3167  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
3168  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
3169  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3170  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
3171  return I128::Test16(m, s);
3172 #endif
3173 }
3174 
3175 NLIB_M(i128) I128::SetMask32(int mask) NLIB_NOEXCEPT {
3176 #if defined(NLIB_NEON)
3177  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3178  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3179  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3180  uint32x4_t s = vdupq_n_u32(mask);
3181  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
3182 #elif defined(NLIB_SSE41)
3183  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
3184  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
3185  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3186  i128 s = I128::SetValue(mask, each_int32);
3187  return I128::Test32(m, s);
3188 #endif
3189 }
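// Editor's note (usage sketch): SetMask8/16/32 invert MoveMask8/16/32 for canonical masks whose
// lanes are either all-zero or all-one bits; e.g. I128::SetMask32(0x5) yields the lanes
// { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 }, and I128::SetMask8(I128::MoveMask8(v)) reproduces such a v.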
3190 
3191 // true if value == 0
3192 NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT { // NOLINT
3193 #if defined(NLIB_SSE41)
3194  return _mm_testz_si128(value, value) != 0;
3195 #elif defined(NLIB_NEON)
3196 # ifdef __aarch64__
3197  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
3198  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3199 # else
3200  // NOTE: there is a better way
3201  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics // NOLINT
3202  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
3203  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
3204 # endif
3205 #endif
3206 }
3207 
3208 // true if all bits are set
3209 NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT { // NOLINT
3210 #if defined(NLIB_SSE41)
3211  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
3212 #elif defined(NLIB_NEON)
3213 # ifdef __aarch64__
3214  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
3215  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3216 # else
3217  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
3218  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
3219 # endif
3220 #endif
3221 }
3222 
3223 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
3224 NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
3225 #if defined(NLIB_SSE41)
3226  return _mm_blendv_epi8(b, a, mask);
3227 #elif defined(NLIB_NEON)
3228  return NLIB_OP3(vbslq, u32, mask, a, b);
3229 #endif
3230 }
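// Editor's note: the SSE path (_mm_blendv_epi8) selects each byte by the most significant bit of
// the corresponding mask byte, while the NEON path (vbslq) selects individual bits, so callers
// should pass a canonical all-zero/all-one mask (for example one produced by SetMask8/16/32 above)
// to get identical results on both backends.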
3231 
3232 // r[i] = 0 if shuffle.u8[i] == 0x80, r[i] = value[shuffle.u8[i]] if 0 <= shuffle.u8[i] < 16
3233 NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
3234 #if defined(NLIB_SSE41)
3235  return _mm_shuffle_epi8(value, shuffle);
3236 #elif defined(NLIB_NEON)
3237 # ifdef __aarch64__
3238  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
3239 # else
3240  int8x8x2_t x;
3241  x.val[0] = vget_low_s8(value);
3242  x.val[1] = vget_high_s8(value);
3243  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
3244  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
3245  return vcombine_s8(lo, hi);
3246 # endif
3247 #endif
3248 }
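// Editor's note (illustration): a shuffle vector of
//   { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
// reverses all 16 bytes of value, and any shuffle lane set to 0x80 (-128) forces the corresponding
// output byte to 0, as in the 32-bit shuffle fallback earlier in this file.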
3249 
3250 // popcount of the mask (equivalent to nlib_popcnt16(I128::MoveMask8(value)))
3251 NLIB_ALWAYS_INLINE int __vectorcall I128::PopCntMask8(i128arg value) NLIB_NOEXCEPT {
3252 #if defined(NLIB_NEON)
3253 # ifdef __aarch64__
3254  int8x16_t tmp = vnegq_s8(value);
3255  return vaddvq_s8(tmp);
3256 # else
3257  int8x16_t tmp = vnegq_s8(value);
3258  int8x8_t lo = vget_low_s8(tmp);
3259  int8x8_t hi = vget_high_s8(tmp);
3260  lo = vadd_s8(lo, hi);
3261  lo = vpadd_s8(lo, lo);
3262  lo = vpadd_s8(lo, lo);
3263  lo = vpadd_s8(lo, lo);
3264  return vget_lane_s8(lo, 0);
3265 # endif
3266 #else
3267  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
3268 #endif
3269 }
3270 
3271 NLIB_ALWAYS_INLINE int __vectorcall I128::ClzMask8(i128arg value) NLIB_NOEXCEPT {
3272  return nlib_clz(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;
3273 }
3274 
3275 NLIB_ALWAYS_INLINE int __vectorcall I128::CtzMask8(i128arg value) NLIB_NOEXCEPT {
3276  return nlib_ctz(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
3277 }
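// Editor's note (illustration): both helpers operate on the 16-bit result of MoveMask8.
// ClzMask8 subtracts the 16 leading zeros that a 16-bit value always has inside a uint32_t, and
// CtzMask8 ORs in bit 16 so that an all-zero mask returns 16 rather than an undefined count.
// Example: if only byte 3 of the mask is set, MoveMask8 == 0x0008, so CtzMask8 == 3 and
// ClzMask8 == 12.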
3278 
3279 #ifdef NLIB_NEON
3280 # undef vreinterpretq_s8_s8
3281 # undef NLIB_OP1
3282 # undef NLIB_OP2
3283 # undef NLIB_OP3
3284 # undef NLIB_CMP
3285 # undef NLIB_SFT
3286 # undef NLIB_CMB
3287 #endif
3288 
3289 #endif // NLIB_DOXYGEN
3290 
3291 #undef NLIB_M
3292 #undef NLIB_M2
3293 
3294 #if defined(NLIB_SSE41)
3295 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3296  { \
3297  row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
3298  row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
3299  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
3300  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
3301  __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
3302  __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
3303  __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
3304  __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
3305  row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
3306  row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
3307  row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
3308  row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
3309 }
3310 #elif defined(NLIB_NEON)
3311 # ifdef __aarch64__
3312 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3313  { \
3314  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3315  vreinterpretq_u32_s8(row1)); \
3316  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3317  vreinterpretq_u32_s8(row3)); \
3318  uint64x2_t row0_, row1_, row2_, row3_; \
3319  row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3320  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3321  row0 = vreinterpretq_s8_u64(row0_); \
3322  row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3323  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3324  row1 = vreinterpretq_s8_u64(row1_); \
3325  row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3326  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3327  row2 = vreinterpretq_s8_u64(row2_); \
3328  row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3329  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3330  row3 = vreinterpretq_s8_u64(row3_); \
3331  }
3332 # else
3333 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3334  { \
3335  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3336  vreinterpretq_u32_s8(row1)); \
3337  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3338  vreinterpretq_u32_s8(row3)); \
3339  uint32x4_t row0_, row1_, row2_, row3_; \
3340  uint32x2_t lo, hi; \
3341  lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
3342  row0_ = vcombine_u32(lo, hi); \
3343  row0 = vreinterpretq_s8_u32(row0_); \
3344  lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
3345  row1_ = vcombine_u32(lo, hi); \
3346  row1 = vreinterpretq_s8_u32(row1_); \
3347  lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
3348  row2_ = vcombine_u32(lo, hi); \
3349  row2 = vreinterpretq_s8_u32(row2_); \
3350  lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
3351  row3_ = vcombine_u32(lo, hi); \
3352  row3 = vreinterpretq_s8_u32(row3_); \
3353  }
3354 # endif
3355 #endif
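// Editor's note (usage sketch, assuming a 16-byte aligned int32_t m[16]): the macro transposes
// four rows of 32-bit lanes in place.
//   i128 r0 = I128::LoadA16(&m[0]), r1 = I128::LoadA16(&m[4]);
//   i128 r2 = I128::LoadA16(&m[8]), r3 = I128::LoadA16(&m[12]);
//   NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);  // r0 now holds column 0 of the 4x4 matrix, and so on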
3356 
3357 #endif // NLIB_SIMD
3358 
3359 } // namespace simd
3360 NLIB_NAMESPACE_END
3361 
3362 #endif // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_