SimdInt.h
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
18 
19 #include "nn/nlib/Config.h"
20 
21 #if defined(NLIB_SSE41)
22 typedef __m128i nlib_i128_t;
23 #elif defined(NLIB_NEON)
24 typedef int8x16_t nlib_i128_t;
25 #endif
26 
27 NLIB_NAMESPACE_BEGIN
28 namespace simd {
29 
30 // use each_int8 for the argument value
31 struct each_int8_tag {};
32 // use each_int16 for the argument value
33 struct each_int16_tag {};
34 // use each_int32 for the argument value
35 struct each_int32_tag {};
36 // use each_int64 for the argument value
37 struct each_int64_tag {};
38 // use each_uint8 for the argument value
39 struct each_uint8_tag {};
40 // use each_uint16 for the argument value
41 struct each_uint16_tag {};
42 // use each_uint32 for the argument value
43 struct each_uint32_tag {};
44 // use each_uint64 for the argument value
45 struct each_uint64_tag {};
46 
55 
56 // use each_select32 for the argument value
57 struct each_select32_tag {};
58 // use each_select16 for the argument value
59 struct each_select16_tag {};
60 // use each_select8 for the argument value
61 struct each_select8_tag {};
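// Illustrative usage, added for clarity (not part of the original header): the tag
// structs above exist only to pick an overload. Given the SetValue declarations below,
// a call could look like
//   i128 v = I128::SetValue(int16_t(3), each_int16_tag());  // every 16-bit lane = 3
// The comments suggest constant objects such as each_int16 are provided for this
// purpose, but their definitions are not shown in this listing.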
62 
66 
67 #if !defined(_MSC_VER) && !defined(__vectorcall)
68 #define __vectorcall
69 #endif
70 
71 #if defined(NLIB_SIMD)
72 
73 // __m128i(SSE), int8x16_t(NEON)
74 typedef nlib_i128_t i128;
75 
76 #if defined(_MSC_VER) && _MSC_VER < 1800
77 typedef const i128& i128arg;
78 #else
79 typedef const i128 i128arg;
80 #endif
81 
83  static i128 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
84  static i128 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
85  static i128 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
86  static i128 __vectorcall SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT;
87  static i128 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
88  static i128 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
89  static i128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
90  static i128 __vectorcall SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT;
91 
92  template<size_t N>
93  static i128 __vectorcall SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT;
94  template<size_t N>
95  static i128 __vectorcall SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT;
96  template<size_t N>
97  static i128 __vectorcall SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT;
98 
99  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
100  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
101 
102  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
103  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
104  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
105  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
106  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
107  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
108  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
109  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
110  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
111  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
112  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
113  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
114  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
115 
116 #define NLIB_LOAD_REDIRECT(func) \
117  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
118  return func(reinterpret_cast<void*>(p)); \
119  } \
120  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
121  return func(reinterpret_cast<void*>(p)); \
122  }
123  NLIB_LOAD_REDIRECT(LoadA16)
124  NLIB_LOAD_REDIRECT(LoadA8)
125  NLIB_LOAD_REDIRECT(LoadA4)
126  NLIB_LOAD_REDIRECT(LoadA2)
127  NLIB_LOAD_REDIRECT(LoadA1)
128  NLIB_LOAD_REDIRECT(LoadLoA8)
129  NLIB_LOAD_REDIRECT(LoadLoA4)
130  NLIB_LOAD_REDIRECT(LoadLoA2)
131  NLIB_LOAD_REDIRECT(LoadLoA1)
132  NLIB_LOAD_REDIRECT(LoadHiA8)
133  NLIB_LOAD_REDIRECT(LoadHiA4)
134  NLIB_LOAD_REDIRECT(LoadHiA2)
135  NLIB_LOAD_REDIRECT(LoadHiA1)
136 #undef NLIB_LOAD_REDIRECT
137 
138  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
140  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
141  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
142  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
143  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
144  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
145  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
146  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
147  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
148  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
149  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
150  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
151 
152 #define NLIB_STORE_REDIRECT(func) \
153  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
154  func(reinterpret_cast<void*>(p), value); \
155  } \
156  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
157  func(reinterpret_cast<void*>(p), value); \
158  }
159  NLIB_STORE_REDIRECT(StoreA16)
160  NLIB_STORE_REDIRECT(StoreA8)
161  NLIB_STORE_REDIRECT(StoreA4)
162  NLIB_STORE_REDIRECT(StoreA2)
163  NLIB_STORE_REDIRECT(StoreA1)
164  NLIB_STORE_REDIRECT(StoreLoA8)
165  NLIB_STORE_REDIRECT(StoreLoA4)
166  NLIB_STORE_REDIRECT(StoreLoA2)
167  NLIB_STORE_REDIRECT(StoreLoA1)
168  NLIB_STORE_REDIRECT(StoreHiA8)
169  NLIB_STORE_REDIRECT(StoreHiA4)
170  NLIB_STORE_REDIRECT(StoreHiA2)
171  NLIB_STORE_REDIRECT(StoreHiA1)
172 #undef NLIB_STORE_REDIRECT
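// Illustrative pairing of the load/store variants above, added for clarity (not part of
// the original header). The A<n> suffix names the alignment the caller guarantees for p:
//   NLIB_ALIGNAS(16) uint32_t buf[4] = {1, 2, 3, 4};
//   i128 v = I128::LoadA16(buf);   // buf is 16-byte aligned, so LoadA16 is allowed
//   I128::StoreA4(buf, v);         // StoreA4 only requires 4-byte alignment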
173 
174  //
175  // Get/Set
176  //
177  template<size_t N>
178  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
179  template<size_t N>
180  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
181  template<size_t N>
182  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
183  template<size_t N>
184  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
185  template<size_t N>
186  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
187  template<size_t N>
188  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
189  template<size_t N>
190  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
191  template<size_t N>
192  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
193 
194  //
195  // Arithmetic Operations
196  //
197  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
198  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
199  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
200  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
201 
202  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
203  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
204 
205  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
206  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
207 
208  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
209  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
210  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
211  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
212 
213  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
214  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
215 
216  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
217  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
218 
219  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
220  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
221  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
222  // PairwiseMaxInt8, PairwiseMaxInt16, PairwiseMaxInt32
223  // PairwiseMaxUint8, PairwiseMaxUint16, PairwiseMaxUint32
224  // PairwiseMinInt8, PairwiseMinInt16, PairwiseMinInt32
225  // PairwiseMinUint8, PairwiseMinUint16, PairwiseMinUint32
226 
227  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
228  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
229  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
230  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
231  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
232  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
233 
234  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
235  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
236  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
237 
238  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
239  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
240  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
241  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
242  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
243  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
244  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
245  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
246  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
247  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
248  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
249  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
250 
251  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
252  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
253  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
254  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
255  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
256  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
257 
258  //
259  // Logical Operations
260  //
261  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
262  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
263  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
264  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
265  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
266  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
267  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
268  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
269  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
270 
271  //
272  // Comparison Operations
273  //
274  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
275  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
276  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
277  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
278 
279  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
280  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
281  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
282  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
283 
284  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
285  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
286  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
287  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
288 
289  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
290  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
291  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
292  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
293 
294  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
295  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
296  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
297  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
298 
299  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
300  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
301  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
302  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
303 
304  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
305  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
306  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
307  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
308 
309  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
310  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
311  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
312  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
313 
314  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
315  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
316  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
317  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
318 
319  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
320  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
321  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
322  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
323 
324  //
325  // Bit Shift
326  //
327  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
328  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
329  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
330 
331  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
332  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
333  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
334 
335  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
336  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
337  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
338 
339  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
340  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
341 
342  //
343  // Bit Shift(Constant)
344  //
345  template<size_t N>
346  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
347  template<size_t N>
348  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
349  template<size_t N>
350  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
351 
352  template<size_t N>
353  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
354  template<size_t N>
355  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
356  template<size_t N>
357  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
358 
359  template<size_t N>
360  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
361  template<size_t N>
362  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
363  template<size_t N>
364  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
365 
366  template<size_t N>
367  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
368  template<size_t N>
369  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;
370 
371  //
372  // 128bit wide Byte Shift/Rotate
373  //
374  template<size_t N>
375  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
376  template<size_t N>
377  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
378  template<size_t N>
379  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
380  template<size_t N>
381  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
382 
383  //
384  // Conversion
385  //
386  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
387  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
388  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
389 
390  static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo,
391  i128arg hi) NLIB_NOEXCEPT;
392  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
393  static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo,
394  i128arg hi) NLIB_NOEXCEPT;
395  static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
396 
397  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
398  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
399  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
400  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
401  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
402  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
403  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
404  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
405  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
406  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
407  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
408  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
409 
410  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
411  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
412  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
413  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
414  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
415  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
416  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
417  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
418  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
419  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
420  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
421  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
422 
423  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7, int V8, int V9,
424  int V10, int V11, int V12, int V13, int V14, int V15>
425  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
426  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
427  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
428  template<int V0, int V1, int V2, int V3>
429  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
430 
431  //
432  // Swap endian
433  //
434  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
435  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
436  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
437 
438  //
439  // Misc
440  //
441  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
442  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
443  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
444  static i128 __vectorcall SetMask8(int mask) NLIB_NOEXCEPT;
445  static i128 __vectorcall SetMask16(int mask) NLIB_NOEXCEPT;
446  static i128 __vectorcall SetMask32(int mask) NLIB_NOEXCEPT;
447  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
448  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
449  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
450  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
451  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
452  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
453  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
454  static int __vectorcall SumUint8(i128arg value) NLIB_NOEXCEPT;
455  static int __vectorcall SumUint16(i128arg value) NLIB_NOEXCEPT;
456 
457  private:
458  I128(); // forbidden
459 };
460 
461 #ifndef NLIB_DOXYGEN
462 
463 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
464 #define NLIB_M2(tp) inline tp __vectorcall
465 
466 #ifdef NLIB_NEON
467 #undef vreinterpretq_s8_s8
468 #undef NLIB_OP1
469 #undef NLIB_OP2
470 #undef NLIB_OP3
471 #undef NLIB_CMP
472 #undef NLIB_SFT
473 #undef NLIB_CMB
474 
475 #define vreinterpretq_s8_s8(a) (a)
476 #define NLIB_OP1(intrin, tp, a) vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
477 #define NLIB_OP2(intrin, tp, a, b) \
478  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b)))
479 #define NLIB_OP3(intrin, tp, a, b, c) \
480  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b), \
481  vreinterpretq_##tp##_s8(c)))
482 #define NLIB_CMP(intrin, tp, a, b, utp) \
483  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vreinterpretq_##tp##_s8(b)))
484 #define NLIB_SFT(intrin, tp, a, cnt, stp) \
485  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
486 #define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
487 #endif
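// Expansion sketch, added for clarity (not in the original): with the macros above, a
// call such as NLIB_OP2(vaddq, s16, a, b), as used by Add16() below, expands to
//   vreinterpretq_s8_s16(vaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))
// i.e. the i128 (int8x16_t) values are reinterpreted to the element type, the NEON
// intrinsic is applied, and the result is cast back to int8x16_t.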
488 
489 // r.s8[i] = v for all i
490 NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
491 #if defined(NLIB_SSE41)
492  // faster than _mm_set1_epi8(v)
493  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
494 #elif defined(NLIB_NEON)
495  return vdupq_n_s8(v);
496 #endif
497 }
498 
499 // r.s16[i] = v for all i
500 NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
501 #if defined(NLIB_SSE41)
502  return _mm_set1_epi16(v);
503 #elif defined(NLIB_NEON)
504  return vreinterpretq_s8_s16(vdupq_n_s16(v));
505 #endif
506 }
507 
508 // r.s32[i] = v for all i
509 NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
510 #if defined(NLIB_SSE41)
511  return _mm_set1_epi32(v);
512 #elif defined(NLIB_NEON)
513  return vreinterpretq_s8_s32(vdupq_n_s32(v));
514 #endif
515 }
516 
517 // r.s64[i] = v for all i
518 NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
519 #if defined(NLIB_SSE41)
520 #ifdef _MSC_VER
521  // conflicts with ffloor()?
522  // Do not use MMX anyway
523  // return _mm_set1_epi64(*reinterpret_cast<__m64*>(&x));
524  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
525  return I128::LoadA16(tmp);
526 #else
527  return _mm_set1_epi64x(v);
528 #endif
529 #elif defined(NLIB_NEON)
530  return vreinterpretq_s8_s64(vdupq_n_s64(v));
531 #endif
532 }
533 
534 // r.u8[i] = v for all i
535 NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
536 #if defined(NLIB_SSE41)
537  // faster than _mm_set1_epi8(v)
538  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
539 #elif defined(NLIB_NEON)
540  return vreinterpretq_s8_u8(vdupq_n_u8(v));
541 #endif
542 }
543 
544 // r.u16[i] = v for all i
545 NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
546 #if defined(NLIB_SSE41)
547  return _mm_set1_epi16(static_cast<int16_t>(v));
548 #elif defined(NLIB_NEON)
549  return vreinterpretq_s8_u16(vdupq_n_u16(v));
550 #endif
551 }
552 
553 // r.u32[i] = v for all i
554 NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
555 #if defined(NLIB_SSE41)
556  return _mm_set1_epi32(static_cast<int32_t>(v));
557 #elif defined(NLIB_NEON)
558  return vreinterpretq_s8_u32(vdupq_n_u32(v));
559 #endif
560 }
561 
562 // r.u64[i] = v for all i
563 NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
564 #if defined(NLIB_SSE41)
565 #ifdef _MSC_VER
566  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
567  return I128::LoadA16(tmp);
568 #else
569  return _mm_set1_epi64x(static_cast<int64_t>(v));
570 #endif
571 #elif defined(NLIB_NEON)
572  return vreinterpretq_s8_u64(vdupq_n_u64(v));
573 #endif
574 }
575 
576 #if defined(NLIB_SSE41)
577 template<size_t N>
578 // r.u32[i] = value.u32[N] for all i
579 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
580  NLIB_STATIC_ASSERT(N < 4);
581  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
582 }
583 #elif defined(NLIB_NEON)
584 #ifdef __aarch64__
585 template<size_t N>
586 // r.u32[i] = value.u32[N] for all i
587 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
588  NLIB_STATIC_ASSERT(N < 4);
589  uint32x4_t v = vreinterpretq_u32_s8(value);
590  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
591 }
592 #else
593 template<>
594 NLIB_M(i128)
595 I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
596  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
597  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
598 }
599 template<>
600 NLIB_M(i128)
601 I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
602  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
603  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
604 }
605 template<>
606 NLIB_M(i128)
607 I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
608  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
609  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
610 }
611 template<>
612 NLIB_M(i128)
613 I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
614  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
615  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
616 }
617 #endif
618 #endif
619 
620 #if defined(NLIB_SSE41)
621 template<size_t N>
622 // r.u16[i] = value.u16[N] for all i
623 NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
624  NLIB_STATIC_ASSERT(N < 8);
625  NLIB_ALIGNAS(16)
626  const int8_t mask[16] = {2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
627  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
628  2 * N, 2 * N + 1, 2 * N, 2 * N + 1};
629  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
630 }
631 #elif defined(NLIB_NEON)
632 template<>
633 NLIB_M(i128)
634 I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
635 #ifdef __aarch64__
636  uint16x8_t v = vreinterpretq_u16_s8(value);
637  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
638 #else
639  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
640  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
641 #endif
642 }
643 
644 template<>
645 NLIB_M(i128)
646 I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
647 #ifdef __aarch64__
648  uint16x8_t v = vreinterpretq_u16_s8(value);
649  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
650 #else
651  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
652  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
653 #endif
654 }
655 
656 template<>
657 NLIB_M(i128)
658 I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
659 #ifdef __aarch64__
660  uint16x8_t v = vreinterpretq_u16_s8(value);
661  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
662 #else
663  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
664  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
665 #endif
666 }
667 
668 template<>
669 NLIB_M(i128)
670 I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
671 #ifdef __aarch64__
672  uint16x8_t v = vreinterpretq_u16_s8(value);
673  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
674 #else
675  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
676  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
677 #endif
678 }
679 
680 template<>
681 NLIB_M(i128)
682 I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
683 #ifdef __aarch64__
684  uint16x8_t v = vreinterpretq_u16_s8(value);
685  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
686 #else
687  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
688  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
689 #endif
690 }
691 
692 template<>
693 NLIB_M(i128)
694 I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
695 #ifdef __aarch64__
696  uint16x8_t v = vreinterpretq_u16_s8(value);
697  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
698 #else
699  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
700  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
701 #endif
702 }
703 
704 template<>
705 NLIB_M(i128)
706 I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
707 #ifdef __aarch64__
708  uint16x8_t v = vreinterpretq_u16_s8(value);
709  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
710 #else
711  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
712  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
713 #endif
714 }
715 
716 template<>
717 NLIB_M(i128)
718 I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
719 #ifdef __aarch64__
720  uint16x8_t v = vreinterpretq_u16_s8(value);
721  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
722 #else
723  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
724  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
725 #endif
726 }
727 #endif
728 
729 #if defined(NLIB_SSE41)
730 template<size_t N>
731 // r.u8[i] = value.u8[N] for all i
732 NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
733  NLIB_STATIC_ASSERT(N < 16);
734  NLIB_ALIGNAS(16) const int8_t mask[16] = {N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N};
735  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
736 }
737 #elif defined(NLIB_NEON)
738 namespace detail {
739 template<size_t N, bool IsLower>
740 struct SetValue8Helper {
741  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
742  return vdupq_lane_s8(vget_low_s8(value), N);
743  }
744 };
745 
746 template<size_t N>
747 struct SetValue8Helper<N, false> {
748  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
749  return vdupq_lane_s8(vget_high_s8(value), N - 8);
750  }
751 };
752 
753 } // namespace detail
754 
755 template<size_t N>
756 NLIB_M(i128)
757 I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
758  NLIB_STATIC_ASSERT(N < 16);
759 #ifdef __aarch64__
760  return vdupq_laneq_s8(value, N);
761 #else
762  return detail::SetValue8Helper<N, (N < 8)>()(value);
763 #endif
764 }
765 #endif
766 
767 // set all bits to 0
768 NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
769 #if defined(NLIB_SSE41)
770  return _mm_setzero_si128();
771 #elif defined(NLIB_NEON)
772  return vdupq_n_s8(0);
773 #endif
774 }
775 
776 // set all bits to 1
777 NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT {
778  return I128::CmpEq8(dummy, dummy);
779 }
780 
781 // r[i] = p[i], p must be 16-byte aligned
782 NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
783 #if defined(NLIB_SSE41)
784  return _mm_load_si128(static_cast<const __m128i*>(p));
785 #elif defined(NLIB_NEON)
786  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
787  return vreinterpretq_s8_u64(tmp);
788 #endif
789 }
790 
791 // r[i] = p[i], p must be 8-byte aligned
792 NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
793 #if defined(NLIB_SSE41)
794  return _mm_loadu_si128(static_cast<const __m128i*>(p));
795 #elif defined(NLIB_NEON)
796  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
797  return vreinterpretq_s8_u64(tmp);
798 #endif
799 }
800 
801 // r[i] = p[i], p must be 4-byte aligned
802 NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
803 #if defined(NLIB_SSE41)
804  return _mm_loadu_si128(static_cast<const __m128i*>(p));
805 #elif defined(NLIB_NEON)
806  uint32x4_t tmp = vld1q_u32(static_cast<const uint32_t*>(p));
807  return vreinterpretq_s8_u32(tmp);
808 #endif
809 }
810 
811 // r[i] = p[i], p must be 2-byte aligned
812 NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
813 #if defined(NLIB_SSE41)
814  return _mm_loadu_si128(static_cast<const __m128i*>(p));
815 #elif defined(NLIB_NEON)
816  uint16x8_t tmp = vld1q_u16(static_cast<const uint16_t*>(p));
817  return vreinterpretq_s8_u16(tmp);
818 #endif
819 }
820 
821 // r[i] = p[i]
822 NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
823 #if defined(NLIB_SSE41)
824  return _mm_loadu_si128(static_cast<const __m128i*>(p));
825 #elif defined(NLIB_NEON)
826  return vld1q_s8(static_cast<const int8_t*>(p));
827 #endif
828 }
829 
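// Note added for clarity (not in the original): LoadLoA* read 8 bytes into the low half
// of the register and zero the upper half; LoadHiA* read 8 bytes into the upper half and
// zero the low half. StoreLoA*/StoreHiA* below write out the corresponding 8-byte half.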
830 NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
831 #if defined(NLIB_SSE41)
832  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
833 #elif defined(NLIB_NEON)
834  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
835  return vcombine_s8(lo, vdup_n_s8(0));
836 #endif
837 }
838 
839 NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
840 #if defined(NLIB_SSE41)
841  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
842 #elif defined(NLIB_NEON)
843  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
844  return vcombine_s8(lo, vdup_n_s8(0));
845 #endif
846 }
847 
848 NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
849 #if defined(NLIB_SSE41)
850  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
851 #elif defined(NLIB_NEON)
852  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
853  return vcombine_s8(lo, vdup_n_s8(0));
854 #endif
855 }
856 
857 NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
858 #if defined(NLIB_SSE41)
859  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
860 #elif defined(NLIB_NEON)
861  int8x8_t lo = vld1_s8(static_cast<const int8_t*>(p));
862  return vcombine_s8(lo, vdup_n_s8(0));
863 #endif
864 }
865 
866 NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
867 #if defined(NLIB_SSE41)
868  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
869  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
870 #elif defined(NLIB_NEON)
871  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
872  return vcombine_s8(vdup_n_s8(0), hi);
873 #endif
874 }
875 
876 NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
877 #if defined(NLIB_SSE41)
878  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
879  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
880 #elif defined(NLIB_NEON)
881  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
882  return vcombine_s8(vdup_n_s8(0), hi);
883 #endif
884 }
885 
886 NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
887 #if defined(NLIB_SSE41)
888  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
889  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
890 #elif defined(NLIB_NEON)
891  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
892  return vcombine_s8(vdup_n_s8(0), hi);
893 #endif
894 }
895 
896 NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
897 #if defined(NLIB_SSE41)
898  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
899  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
900 #elif defined(NLIB_NEON)
901  int8x8_t hi = vld1_s8(static_cast<const int8_t*>(p));
902  return vcombine_s8(vdup_n_s8(0), hi);
903 #endif
904 }
905 
906 // p[i] = value[i], p must be 16-byte aligned
907 NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
908 #if defined(NLIB_SSE41)
909  _mm_store_si128(static_cast<i128*>(p), value);
910 #elif defined(NLIB_NEON)
911  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
912 #endif
913 }
914 
915 // p[i] = value[i], p must be 8-byte aligned
916 NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
917 #if defined(NLIB_SSE41)
918  _mm_storeu_si128(static_cast<i128*>(p), value);
919 #elif defined(NLIB_NEON)
920  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
921 #endif
922 }
923 
924 // p[i] = value[i], p must be 4-byte aligned
925 NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
926 #if defined(NLIB_SSE41)
927  _mm_storeu_si128(static_cast<i128*>(p), value);
928 #elif defined(NLIB_NEON)
929  vst1q_u32(static_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
930 #endif
931 }
932 
933 // p[i] = value[i], p must be 2-byte aligned
934 NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
935 #if defined(NLIB_SSE41)
936  _mm_storeu_si128(static_cast<i128*>(p), value);
937 #elif defined(NLIB_NEON)
938  vst1q_u16(static_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
939 #endif
940 }
941 
942 // p[i] = value[i]
943 NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
944 #if defined(NLIB_SSE41)
945  _mm_storeu_si128(static_cast<i128*>(p), value);
946 #elif defined(NLIB_NEON)
947  vst1q_s8(static_cast<int8_t*>(p), value);
948 #endif
949 }
950 
951 NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
952 #if defined(NLIB_SSE41)
953  _mm_storel_epi64(static_cast<i128*>(p), value);
954 #elif defined(NLIB_NEON)
955  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
956  vst1_u64(static_cast<uint64_t*>(p), x);
957 #endif
958 }
959 
960 NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
961 #if defined(NLIB_SSE41)
962  _mm_storel_epi64(static_cast<i128*>(p), value);
963 #elif defined(NLIB_NEON)
964  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
965  vst1_u32(static_cast<uint32_t*>(p), x);
966 #endif
967 }
968 
969 NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
970 #if defined(NLIB_SSE41)
971  _mm_storel_epi64(static_cast<i128*>(p), value);
972 #elif defined(NLIB_NEON)
973  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
974  vst1_u16(static_cast<uint16_t*>(p), x);
975 #endif
976 }
977 
978 NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
979 #if defined(NLIB_SSE41)
980  _mm_storel_epi64(static_cast<i128*>(p), value);
981 #elif defined(NLIB_NEON)
982  int8x8_t x = vget_low_s8(value);
983  vst1_s8(static_cast<int8_t*>(p), x);
984 #endif
985 }
986 
987 NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
988 #if defined(NLIB_SSE41)
989  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
990 #elif defined(NLIB_NEON)
991  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
992  vst1_u64(static_cast<uint64_t*>(p), x);
993 #endif
994 }
995 
996 NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
997 #if defined(NLIB_SSE41)
998  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
999 #elif defined(NLIB_NEON)
1000  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
1001  vst1_u32(static_cast<uint32_t*>(p), x);
1002 #endif
1003 }
1004 
1005 NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
1006 #if defined(NLIB_SSE41)
1007  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
1008 #elif defined(NLIB_NEON)
1009  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
1010  vst1_u16(static_cast<uint16_t*>(p), x);
1011 #endif
1012 }
1013 
1014 NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
1015 #if defined(NLIB_SSE41)
1016  _mm_storel_epi64(static_cast<i128*>(p), _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
1017 #elif defined(NLIB_NEON)
1018  int8x8_t x = vget_high_s8(value);
1019  vst1_s8(static_cast<int8_t*>(p), x);
1020 #endif
1021 }
1022 
1023 template<size_t N>
1024 // r = value.u8[N]
1025 NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
1026  NLIB_STATIC_ASSERT(N < 16);
1027 #if defined(NLIB_SSE41)
1028  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
1029 #elif defined(NLIB_NEON)
1030  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
1031 #endif
1032 }
1033 
1034 template<size_t N>
1035 // r = value.u16[N]
1036 NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
1037  NLIB_STATIC_ASSERT(N < 8);
1038 #if defined(NLIB_SSE41)
1039  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
1040 #elif defined(NLIB_NEON)
1041  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
1042 #endif
1043 }
1044 
1045 template<size_t N>
1046 // r = value.u32[N]
1047 NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
1048  NLIB_STATIC_ASSERT(N < 4);
1049 #if defined(NLIB_SSE41)
1050  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
1051 #elif defined(NLIB_NEON)
1052  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
1053 #endif
1054 }
1055 
1056 template<size_t N>
1057 // r = value.u64[N]
1058 NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
1059  NLIB_STATIC_ASSERT(N < 2);
1060 #if defined(NLIB_SSE41)
1061 #ifdef NLIB_64BIT
1062  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
1063 #else
1064  NLIB_UNUSED(value);
1065  return 0; // dummy
1066 #endif
1067 #elif defined(NLIB_NEON)
1068  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
1069 #endif
1070 }
1071 
1072 #if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
1073 template<>
1074 NLIB_M(uint64_t)
1075 I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
1076  uint64_t rval;
1077  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
1078  return rval;
1079 }
1080 template<>
1081 NLIB_M(uint64_t)
1082 I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
1083  uint64_t rval;
1084  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
1085  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
1086  return rval;
1087 }
1088 #endif
1089 
1090 template<size_t N>
1091 // r = value, r.u8[N] = v
1092 NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
1093  NLIB_STATIC_ASSERT(N < 16);
1094 #if defined(NLIB_SSE41)
1095  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
1096 #elif defined(NLIB_NEON)
1097  return __builtin_constant_p(v)
1098  ? I128::Permute8 < N == 0 ? 16 : 0,
1099  N == 1 ? 17 : 1, N == 2 ? 18 : 2, N == 3 ? 19 : 3, N == 4 ? 20 : 4, N == 5 ? 21 : 5,
1100  N == 6 ? 22 : 6, N == 7 ? 23 : 7, N == 8 ? 24 : 8, N == 9 ? 25 : 9, N == 10 ? 26 : 10,
1101  N == 11 ? 27 : 11, N == 12 ? 28 : 12, N == 13 ? 29 : 13, N == 14 ? 30 : 14,
1102  N == 15 ? 31 : 15 > (value, vreinterpretq_s8_u8(vdupq_n_u8(v)))
1103  : vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
1104 #endif
1105 }
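// Note added for clarity (not in the original): on NEON, __builtin_constant_p(v) is a
// compile-time constant, so only one arm of the ternary above survives after folding.
// When v is a literal, the lane insert is expressed as a Permute8 of value with
// vdupq_n_u8(v); otherwise a plain vsetq_lane_u8 is used.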
1106 
1107 template<size_t N>
1108 // r = value, r.u16[N] = v
1109 NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
1110  NLIB_STATIC_ASSERT(N < 8);
1111 #if defined(NLIB_SSE41)
1112  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
1113 #elif defined(NLIB_NEON)
1114  return __builtin_constant_p(v)
1115  ? I128::Permute16 < N == 0 ? 8 : 0,
1116  N == 1 ? 9 : 1, N == 2 ? 10 : 2, N == 3 ? 11 : 3, N == 4 ? 12 : 4, N == 5 ? 13 : 5,
1117  N == 6 ? 14 : 6,
1118  N == 7 ? 15 : 7 > (value, vreinterpretq_s8_u16(vdupq_n_u16(v)))
1119  : vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
1120 #endif
1121 }
1122 
1123 template<size_t N>
1124 // r = value, r.u32[N] = v
1125 NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
1126  NLIB_STATIC_ASSERT(N < 4);
1127 #if defined(NLIB_SSE41)
1128  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
1129 #elif defined(NLIB_NEON)
1130  return __builtin_constant_p(v)
1131  ? I128::Permute32 < N == 0 ? 4 : 0,
1132  N == 1 ? 5 : 1, N == 2 ? 6 : 2,
1133  N == 3 ? 7 : 3 > (value, vreinterpretq_s8_u32(vdupq_n_u32(v)))
1134  : vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
1135 #endif
1136 }
1137 
1138 template<size_t N>
1139 // r = value, r.u64[N] = v
1140 NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
1141  NLIB_STATIC_ASSERT(N < 2);
1142 #if defined(NLIB_SSE41)
1143 #ifdef NLIB_64BIT
1144  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
1145 #else
1146  union {
1147  int32_t i32[2];
1148  int64_t i64;
1149  } tmp;
1150  tmp.i64 = static_cast<int64_t>(v);
1151  __m128i rval;
1152  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
1153  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
1154 #endif
1155 #elif defined(NLIB_NEON)
1156  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
1157 #endif
1158 }
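// Illustrative round trip using the lane accessors above, added for clarity (not part of
// the original header):
//   i128 v = I128::SetZero();
//   v = I128::SetUint32ToLane<2>(v, 0xDEADBEEFu);   // write lane 2
//   uint32_t x = I128::GetUint32FromLane<2>(v);     // x == 0xDEADBEEFu
//   uint32_t y = I128::GetUint32FromLane<0>(v);     // y == 0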
1159 
1160 // r.i8[i] = a.i8[i] + b.i8[i]
1161 NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1162 #if defined(NLIB_SSE41)
1163  return _mm_add_epi8(a, b);
1164 #elif defined(NLIB_NEON)
1165  return vaddq_s8(a, b);
1166 #endif
1167 }
1168 
1169 // r.i16[i] = a.i16[i] + b.i16[i]
1170 NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1171 #if defined(NLIB_SSE41)
1172  return _mm_add_epi16(a, b);
1173 #elif defined(NLIB_NEON)
1174  return NLIB_OP2(vaddq, s16, a, b);
1175 #endif
1176 }
1177 
1178 // r.i32[i] = a.i32[i] + b.i32[i]
1179 NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1180 #if defined(NLIB_SSE41)
1181  return _mm_add_epi32(a, b);
1182 #elif defined(NLIB_NEON)
1183  return NLIB_OP2(vaddq, s32, a, b);
1184 #endif
1185 }
1186 
1187 // r.i64[i] = a.i64[i] + b.i64[i]
1188 NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1189 #if defined(NLIB_SSE41)
1190  return _mm_add_epi64(a, b);
1191 #elif defined(NLIB_NEON)
1192  return NLIB_OP2(vaddq, s64, a, b);
1193 #endif
1194 }
1195 
1196 // 127 if a[i] + b[i] > 127, -128 if a[i] + b[i] < -128
1197 NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1198 #if defined(NLIB_SSE41)
1199  return _mm_adds_epi8(a, b);
1200 #elif defined(NLIB_NEON)
1201  return vqaddq_s8(a, b);
1202 #endif
1203 }
1204 
1205 // 32767 if a[i] + b[i] > 32767, -32768 if a[i] + b[i] < -32768
1206 NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1207 #if defined(NLIB_SSE41)
1208  return _mm_adds_epi16(a, b);
1209 #elif defined(NLIB_NEON)
1210  return NLIB_OP2(vqaddq, s16, a, b);
1211 #endif
1212 }
1213 
1214 // 255 if a[i] + b[i] > 255
1215 NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1216 #if defined(NLIB_SSE41)
1217  return _mm_adds_epu8(a, b);
1218 #elif defined(NLIB_NEON)
1219  return NLIB_OP2(vqaddq, u8, a, b);
1220 #endif
1221 }
1222 
1223 // 65535 if a[i] + b[i] > 65535
1224 NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1225 #if defined(NLIB_SSE41)
1226  return _mm_adds_epu16(a, b);
1227 #elif defined(NLIB_NEON)
1228  return NLIB_OP2(vqaddq, u16, a, b);
1229 #endif
1230 }
1231 
1232 // r.i8[i] = a.i8[i] - b.i8[i]
1233 NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1234 #if defined(NLIB_SSE41)
1235  return _mm_sub_epi8(a, b);
1236 #elif defined(NLIB_NEON)
1237  return vsubq_s8(a, b);
1238 #endif
1239 }
1240 
1241 // r.i16[i] = a.i16[i] - b.i16[i]
1242 NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1243 #if defined(NLIB_SSE41)
1244  return _mm_sub_epi16(a, b);
1245 #elif defined(NLIB_NEON)
1246  return NLIB_OP2(vsubq, s16, a, b);
1247 #endif
1248 }
1249 
1250 // r.i32[i] = a.i32[i] - b.i32[i]
1251 NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1252 #if defined(NLIB_SSE41)
1253  return _mm_sub_epi32(a, b);
1254 #elif defined(NLIB_NEON)
1255  return NLIB_OP2(vsubq, s32, a, b);
1256 #endif
1257 }
1258 
1259 // r.i64[i] = a.i64[i] - b.i64[i]
1260 NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1261 #if defined(NLIB_SSE41)
1262  return _mm_sub_epi64(a, b);
1263 #elif defined(NLIB_NEON)
1264  return NLIB_OP2(vsubq, s64, a, b);
1265 #endif
1266 }
1267 
1268 // 127 if a[i] - b[i] > 127, -128 if a[i] - b[i] < -128
1269 NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1270 #if defined(NLIB_SSE41)
1271  return _mm_subs_epi8(a, b);
1272 #elif defined(NLIB_NEON)
1273  return NLIB_OP2(vqsubq, s8, a, b);
1274 #endif
1275 }
1276 
1277 // 32767 if a[i] - b[i] > 32767, -32768 if a[i] - b[i] < -32768
1278 NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1279 #if defined(NLIB_SSE41)
1280  return _mm_subs_epi16(a, b);
1281 #elif defined(NLIB_NEON)
1282  return NLIB_OP2(vqsubq, s16, a, b);
1283 #endif
1284 }
1285 
1286 // 0 if a.u8[i] - b.u8[i] < 0
1287 NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1288 #if defined(NLIB_SSE41)
1289  return _mm_subs_epu8(a, b);
1290 #elif defined(NLIB_NEON)
1291  return NLIB_OP2(vqsubq, u8, a, b);
1292 #endif
1293 }
1294 
1295 // 0 if a.u16[i] - b.u16[i] < 0
1296 NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1297 #if defined(NLIB_SSE41)
1298  return _mm_subs_epu16(a, b);
1299 #elif defined(NLIB_NEON)
1300  return NLIB_OP2(vqsubq, u16, a, b);
1301 #endif
1302 }
1303 
1304 // r.i8[i] = a.i8[2i] + a.i8[2i+1] (i = 0..7), r.i8[8+i] = b.i8[2i] + b.i8[2i+1] (i = 0..7)
1305 NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1306 #if defined(NLIB_SSE41)
1307  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
1308  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
1309  return I128::NarrowFrom16To8(ax, bx);
1310 #elif defined(NLIB_NEON)
1311 #ifdef __aarch64__
1312  return vpaddq_s8(a, b);
1313 #else
1314  int8x8_t al = vget_low_s8(a);
1315  int8x8_t ah = vget_high_s8(a);
1316  int8x8_t rl = vpadd_s8(al, ah);
1317  int8x8_t bl = vget_low_s8(b);
1318  int8x8_t bh = vget_high_s8(b);
1319  int8x8_t rh = vpadd_s8(bl, bh);
1320  return vcombine_s8(rl, rh);
1321 #endif
1322 #endif
1323 }
1324 
1325 // r.i16[i] = a.i16[2i] + a.i16[2i+1] (i = 0..3), r.i16[4+i] = b.i16[2i] + b.i16[2i+1] (i = 0..3)
1326 NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1327 #if defined(NLIB_SSE41)
1328  return _mm_hadd_epi16(a, b);
1329 #elif defined(NLIB_NEON)
1330 #ifdef __aarch64__
1331  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
1332 #else
1333  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
1334  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
1335  int16x4_t rl = vpadd_s16(al, ah);
1336  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
1337  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
1338  int16x4_t rh = vpadd_s16(bl, bh);
1339  return NLIB_CMB(s16, rl, rh);
1340 #endif
1341 #endif
1342 }
1343 
1344 // r.i32[0] = a.i32[0] + a.i32[1], r.i32[1] = a.i32[2] + a.i32[3], r.i32[2] = b.i32[0] + b.i32[1], r.i32[3] = b.i32[2] + b.i32[3]
1345 NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1346 #if defined(NLIB_SSE41)
1347  return _mm_hadd_epi32(a, b);
1348 #elif defined(NLIB_NEON)
1349 #ifdef __aarch64__
1350  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
1351 #else
1352  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
1353  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
1354  int32x2_t rl = vpadd_s32(al, ah);
1355  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
1356  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
1357  int32x2_t rh = vpadd_s32(bl, bh);
1358  return NLIB_CMB(s32, rl, rh);
1359 #endif
1360 #endif
1361 }
1362 
1363 // r.i16[i] = a.i16[i] * b.i16[i]
1364 NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1365 #if defined(NLIB_SSE41)
1366  return _mm_mullo_epi16(a, b);
1367 #elif defined(NLIB_NEON)
1368  return NLIB_OP2(vmulq, s16, a, b);
1369 #endif
1370 }
1371 
1372 // r = c + a * b
1373 NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1374 #if defined(NLIB_SSE41)
1375  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
1376 #elif defined(NLIB_NEON)
1377  return NLIB_OP3(vmlaq, s16, c, a, b);
1378 #endif
1379 }
1380 
1381 // r = c - a * b
1382 NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1383 #if defined(NLIB_SSE41)
1384  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
1385 #elif defined(NLIB_NEON)
1386  return NLIB_OP3(vmlsq, s16, c, a, b);
1387 #endif
1388 }
1389 
1390 // r.i32[i] = a.i32[i] * b.i32[i]
1391 NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1392 #if defined(NLIB_SSE41)
1393  return _mm_mullo_epi32(a, b);
1394 #elif defined(NLIB_NEON)
1395  return NLIB_OP2(vmulq, s32, a, b);
1396 #endif
1397 }
1398 
1399 // r = c + a * b
1400 NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1401 #if defined(NLIB_SSE41)
1402  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
1403 #elif defined(NLIB_NEON)
1404  return NLIB_OP3(vmlaq, s32, c, a, b);
1405 #endif
1406 }
1407 
1408 // r = c - a * b
1409 NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1410 #if defined(NLIB_SSE41)
1411  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
1412 #elif defined(NLIB_NEON)
1413  return NLIB_OP3(vmlsq, s32, c, a, b);
1414 #endif
1415 }
1416 
1417 // r.s8[i] = max(a.s8[i], b.s8[i])
1418 NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1419 #if defined(NLIB_SSE41)
1420  return _mm_max_epi8(a, b);
1421 #elif defined(NLIB_NEON)
1422  return NLIB_OP2(vmaxq, s8, a, b);
1423 #endif
1424 }
1425 
1426 // r.s16[i] = max(a.s16[i], b.s16[i])
1427 NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1428 #if defined(NLIB_SSE41)
1429  return _mm_max_epi16(a, b);
1430 #elif defined(NLIB_NEON)
1431  return NLIB_OP2(vmaxq, s16, a, b);
1432 #endif
1433 }
1434 
1435 // r.s32[i] = max(a.s32[i], b.s32[i])
1436 NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1437 #if defined(NLIB_SSE41)
1438  return _mm_max_epi32(a, b);
1439 #elif defined(NLIB_NEON)
1440  return NLIB_OP2(vmaxq, s32, a, b);
1441 #endif
1442 }
1443 
1444 // r.u8[i] = max(a.u8[i], b.u8[i])
1445 NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1446 #if defined(NLIB_SSE41)
1447  return _mm_max_epu8(a, b);
1448 #elif defined(NLIB_NEON)
1449  return NLIB_OP2(vmaxq, u8, a, b);
1450 #endif
1451 }
1452 
1453 // r.u16[i] = max(a.u16[i], b.u16[i])
1454 NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1455 #if defined(NLIB_SSE41)
1456  return _mm_max_epu16(a, b);
1457 #elif defined(NLIB_NEON)
1458  return NLIB_OP2(vmaxq, u16, a, b);
1459 #endif
1460 }
1461 
1462 // r.u32[i] = max(a.u32[i], b.u32[i])
1463 NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1464 #if defined(NLIB_SSE41)
1465  return _mm_max_epu32(a, b);
1466 #elif defined(NLIB_NEON)
1467  return NLIB_OP2(vmaxq, u32, a, b);
1468 #endif
1469 }
1470 
1471 // r.s8[i] = min(a.s8[i], b.s8[i])
1472 NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1473 #if defined(NLIB_SSE41)
1474  return _mm_min_epi8(a, b);
1475 #elif defined(NLIB_NEON)
1476  return NLIB_OP2(vminq, s8, a, b);
1477 #endif
1478 }
1479 
1480 // r.s16[i] = min(a.s16[i], b.s16[i])
1481 NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1482 #if defined(NLIB_SSE41)
1483  return _mm_min_epi16(a, b);
1484 #elif defined(NLIB_NEON)
1485  return NLIB_OP2(vminq, s16, a, b);
1486 #endif
1487 }
1488 
1489 // r.s32[i] = min(a.s32[i], b.s32[i])
1490 NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1491 #if defined(NLIB_SSE41)
1492  return _mm_min_epi32(a, b);
1493 #elif defined(NLIB_NEON)
1494  return NLIB_OP2(vminq, s32, a, b);
1495 #endif
1496 }
1497 
1498 // r.u8[i] = min(a.u8[i], b.u8[i])
1499 NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1500 #if defined(NLIB_SSE41)
1501  return _mm_min_epu8(a, b);
1502 #elif defined(NLIB_NEON)
1503  return NLIB_OP2(vminq, u8, a, b);
1504 #endif
1505 }
1506 
1507 // r.u16[i] = min(a.u16[i], b.u16[i])
1508 NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1509 #if defined(NLIB_SSE41)
1510  return _mm_min_epu16(a, b);
1511 #elif defined(NLIB_NEON)
1512  return NLIB_OP2(vminq, u16, a, b);
1513 #endif
1514 }
1515 
1516 // r.u32[i] = min(a.u32[i], b.u32[i])
1517 NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1518 #if defined(NLIB_SSE41)
1519  return _mm_min_epu32(a, b);
1520 #elif defined(NLIB_NEON)
1521  return NLIB_OP2(vminq, u32, a, b);
1522 #endif
1523 }
1524 
1525 // r.s8[i] = abs(value.s8[i])
1526 NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
1527 #if defined(NLIB_SSE41)
1528  return _mm_abs_epi8(value);
1529 #elif defined(NLIB_NEON)
1530  return NLIB_OP1(vabsq, s8, value);
1531 #endif
1532 }
1533 
1534 // r.s16[i] = abs(value.s16[i])
1535 NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
1536 #if defined(NLIB_SSE41)
1537  return _mm_abs_epi16(value);
1538 #elif defined(NLIB_NEON)
1539  return NLIB_OP1(vabsq, s16, value);
1540 #endif
1541 }
1542 
1543 // r.s32[i] = abs(value.s32[i])
1544 NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
1545 #if defined(NLIB_SSE41)
1546  return _mm_abs_epi32(value);
1547 #elif defined(NLIB_NEON)
1548  return NLIB_OP1(vabsq, s32, value);
1549 #endif
1550 }
1551 
1552 // r.s8[i] = abs(a.s8[i] - b.s8[i])
1553 NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1554 #if defined(NLIB_SSE41)
1555  return _mm_abs_epi8(_mm_sub_epi8(a, b));
1556 #elif defined(NLIB_NEON)
1557  return NLIB_OP2(vabdq, s8, a, b);
1558 #endif
1559 }
1560 
1561 // r.s16[i] = abs(a.s16[i] - b.s16[i])
1562 NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1563 #if defined(NLIB_SSE41)
1564  return _mm_abs_epi16(_mm_sub_epi16(a, b));
1565 #elif defined(NLIB_NEON)
1566  return NLIB_OP2(vabdq, s16, a, b);
1567 #endif
1568 }
1569 
1570 // r.s32[i] = abs(a.s32[i] - b.s32[i])
1571 NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1572 #if defined(NLIB_SSE41)
1573  return _mm_abs_epi32(_mm_sub_epi32(a, b));
1574 #elif defined(NLIB_NEON)
1575  return NLIB_OP2(vabdq, s32, a, b);
1576 #endif
1577 }
1578 
1579 // r.s8[i] = -value.s8[i]
1580 NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
1581 #if defined(NLIB_SSE41)
1582  return _mm_sub_epi8(_mm_setzero_si128(), value);
1583 #elif defined(NLIB_NEON)
1584  return NLIB_OP1(vnegq, s8, value);
1585 #endif
1586 }
1587 
1588 // r.s16[i] = -value.s16[i]
1589 NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
1590 #if defined(NLIB_SSE41)
1591  return _mm_sub_epi16(_mm_setzero_si128(), value);
1592 #elif defined(NLIB_NEON)
1593  return NLIB_OP1(vnegq, s16, value);
1594 #endif
1595 }
1596 
1597 // r.s32[i] = -value.s32[i]
1598 NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
1599 #if defined(NLIB_SSE41)
1600  return _mm_sub_epi32(_mm_setzero_si128(), value);
1601 #elif defined(NLIB_NEON)
1602  return NLIB_OP1(vnegq, s32, value);
1603 #endif
1604 }
1605 
1606 // r = a & b
1607 NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
1608 #if defined(NLIB_SSE41)
1609  return _mm_and_si128(a, b);
1610 #elif defined(NLIB_NEON)
1611  return NLIB_OP2(vandq, s8, a, b);
1612 #endif
1613 }
1614 
1615 // r = a | b
1616 NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
1617 #if defined(NLIB_SSE41)
1618  return _mm_or_si128(a, b);
1619 #elif defined(NLIB_NEON)
1620  return NLIB_OP2(vorrq, s8, a, b);
1621 #endif
1622 }
1623 
1624 // r = a ^ b
1625 NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
1626 #if defined(NLIB_SSE41)
1627  return _mm_xor_si128(a, b);
1628 #elif defined(NLIB_NEON)
1629  return NLIB_OP2(veorq, s8, a, b);
1630 #endif
1631 }
1632 
1633 // r = ~a
1634 NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
1635 #if defined(NLIB_SSE41)
1636  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1637 #elif defined(NLIB_NEON)
1638  return NLIB_OP1(vmvnq, s8, a);
1639 #endif
1640 }
1641 
1642 // r = ~a & b
1643 NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1644 #if defined(NLIB_SSE41)
1645  return _mm_andnot_si128(a, b);
1646 #elif defined(NLIB_NEON)
1647  return NLIB_OP2(vbicq, s8, b, a);
1648 #endif
1649 }
1650 
1651 // r = ~a | b
1652 NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1653 #if defined(NLIB_SSE41)
1654  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1655  return _mm_or_si128(not_a, b);
1656 #elif defined(NLIB_NEON)
1657  return NLIB_OP2(vornq, s8, b, a);
1658 #endif
1659 }
1660 
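// r.i8[i] = (a.i8[i] & b.i8[i]) != 0 ? 0xFF : 0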
1661 NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1662 #if defined(NLIB_NEON)
1663  return vreinterpretq_s8_u8(vtstq_s8(a, b));
1664 #else
1665  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
1666 #endif
1667 }
1668 
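// r.i16[i] = (a.i16[i] & b.i16[i]) != 0 ? 0xFFFF : 0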
1669 NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1670 #if defined(NLIB_NEON)
1671  return NLIB_OP2(vtstq, s16, a, b);
1672 #else
1673  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
1674 #endif
1675 }
1676 
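// r.i32[i] = (a.i32[i] & b.i32[i]) != 0 ? 0xFFFFFFFF : 0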
1677 NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1678 #if defined(NLIB_NEON)
1679  return NLIB_OP2(vtstq, s32, a, b);
1680 #else
1681  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
1682 #endif
1683 }
1684 
1685 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
1686 NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1687 #if defined(NLIB_SSE41)
1688  return _mm_cmpeq_epi8(a, b);
1689 #elif defined(NLIB_NEON)
1690  return NLIB_CMP(vceqq, s8, a, b, u8);
1691 #endif
1692 }
1693 
1694 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
1695 NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1696 #if defined(NLIB_SSE41)
1697  return _mm_cmpeq_epi16(a, b);
1698 #elif defined(NLIB_NEON)
1699  return NLIB_CMP(vceqq, s16, a, b, u16);
1700 #endif
1701 }
1702 
1703 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
1704 NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1705 #if defined(NLIB_SSE41)
1706  return _mm_cmpeq_epi32(a, b);
1707 #elif defined(NLIB_NEON)
1708  return NLIB_CMP(vceqq, s32, a, b, u32);
1709 #endif
1710 }
1711 
1712 // r.i64[i] = a.i64[i] == b.i64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1713 NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1714 #if defined(NLIB_SSE41)
1715  return _mm_cmpeq_epi64(a, b);
1716 #elif defined(NLIB_NEON)
1717 #ifdef __aarch64__
1718  return NLIB_CMP(vceqq, s64, a, b, u64);
1719 #else
1720  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
1721  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
1722  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
1723  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1724  return vreinterpretq_s8_s64(result);
1725 #endif
1726 #endif
1727 }
1728 
1729 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
1730 NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1731 #if defined(NLIB_SSE41)
1732  return _mm_cmplt_epi8(a, b);
1733 #elif defined(NLIB_NEON)
1734  return NLIB_CMP(vcltq, s8, a, b, u8);
1735 #endif
1736 }
1737 
1738 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
1739 NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1740 #if defined(NLIB_SSE41)
1741  return _mm_cmplt_epi16(a, b);
1742 #elif defined(NLIB_NEON)
1743  return NLIB_CMP(vcltq, s16, a, b, u16);
1744 #endif
1745 }
1746 
1747 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
1748 NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1749 #if defined(NLIB_SSE41)
1750  return _mm_cmplt_epi32(a, b);
1751 #elif defined(NLIB_NEON)
1752  return NLIB_CMP(vcltq, s32, a, b, u32);
1753 #endif
1754 }
1755 
1756 // r.s64[i] = a.s64[i] < b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1757 NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1758 #if defined(NLIB_SSE42)
1759  return _mm_cmpgt_epi64(b, a);
1760 #elif defined(NLIB_NEON)
1761 #ifdef __aarch64__
1762  return NLIB_CMP(vcltq, s64, a, b, u64);
1763 #else
1764  int32x2x2_t trn_a =
1765  vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)), vreinterpret_s32_s8(vget_high_s8(a)));
1766  int32x2x2_t trn_b =
1767  vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)), vreinterpret_s32_s8(vget_high_s8(b)));
1768  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
1769  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
1770  uint32x2_t lower_lt = vclt_u32(vreinterpret_u32_s32(trn_a.val[0]), vreinterpret_u32_s32(trn_b.val[0]));
1771  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1772  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1773  return vreinterpretq_s8_s64(result);
1774 #endif
1775 #else
1776  i128 cmp = I128::CmpLtInt32(a, b);
1777  i128 eq = I128::CmpEq32(a, b);
1778  i128 cmp_lt = I128::CmpLtUint32(a, b);
1779  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1780  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
1781  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1782  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1783 #endif
1784 }
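// Note on the non-SSE4.2 / non-AArch64 paths above: the 64-bit signed compare is
// assembled from 32-bit compares as (upper_lt || (upper_eq && lower_lt)), where the
// upper 32-bit halves are compared as signed values, the lower halves as unsigned
// values, and the 32-bit result is broadcast to both halves of each 64-bit lane.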
1785 
1786 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
1787 NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1788 #if defined(NLIB_SSE41)
1789  return _mm_cmpgt_epi8(a, b);
1790 #elif defined(NLIB_NEON)
1791  return NLIB_CMP(vcgtq, s8, a, b, u8);
1792 #endif
1793 }
1794 
1795 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
1796 NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1797 #if defined(NLIB_SSE41)
1798  return _mm_cmpgt_epi16(a, b);
1799 #elif defined(NLIB_NEON)
1800  return NLIB_CMP(vcgtq, s16, a, b, u16);
1801 #endif
1802 }
1803 
1804 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
1805 NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1806 #if defined(NLIB_SSE41)
1807  return _mm_cmpgt_epi32(a, b);
1808 #elif defined(NLIB_NEON)
1809  return NLIB_CMP(vcgtq, s32, a, b, u32);
1810 #endif
1811 }
1812 
1813 // r.s64[i] = a.s64[i] > b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1814 NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1815 #if defined(NLIB_SSE42)
1816  return _mm_cmpgt_epi64(a, b);
1817 #elif defined(NLIB_NEON) && defined(__aarch64__)
1818  return NLIB_CMP(vcgtq, s64, a, b, u64);
1819 #else
1820  return I128::CmpLtInt64(b, a);
1821 #endif
1822 }
1823 
1824 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
1825 NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1826 #if defined(NLIB_SSE41)
1827  i128 ofs = I128::SetValue(0x80, each_uint8);
1828  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1829 #elif defined(NLIB_NEON)
1830  return NLIB_CMP(vcltq, u8, a, b, u8);
1831 #endif
1832 }
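// Note on the SSE path: SSE4.1 has no unsigned byte compare, so both operands are
// biased by 0x80 (flipping the sign bit) and compared as signed values, which
// preserves the unsigned ordering. For example, a.u8[i] = 0x01 and b.u8[i] = 0xFF
// become 0x81 (-127) and 0x7F (+127), and -127 < +127 matches 1 < 255.
// The same biasing is used for the 16-, 32- and 64-bit unsigned compares below.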
1833 
1834 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
1835 NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1836 #if defined(NLIB_SSE41)
1837  i128 ofs = I128::SetValue(0x80, each_uint8);
1838  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1839 #elif defined(NLIB_NEON)
1840  return NLIB_CMP(vcgtq, u8, a, b, u8);
1841 #endif
1842 }
1843 
1844 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
1845 NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1846 #if defined(NLIB_SSE41)
1847  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1848  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1849 #elif defined(NLIB_NEON)
1850  return NLIB_CMP(vcltq, u16, a, b, u16);
1851 #endif
1852 }
1853 
1854 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
1855 NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1856 #if defined(NLIB_SSE41)
1857  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1858  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1859 #elif defined(NLIB_NEON)
1860  return NLIB_CMP(vcgtq, u16, a, b, u16);
1861 #endif
1862 }
1863 
1864 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
1865 NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1866 #if defined(NLIB_SSE41)
1867  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1868  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1869 #elif defined(NLIB_NEON)
1870  return NLIB_CMP(vcltq, u32, a, b, u32);
1871 #endif
1872 }
1873 
1874 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
1875 NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1876 #if defined(NLIB_SSE41)
1877  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1878  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1879 #elif defined(NLIB_NEON)
1880  return NLIB_CMP(vcgtq, u32, a, b, u32);
1881 #endif
1882 }
1883 
1884 // r.u64[i] = a.u64[i] < b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1885 NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1886 #if defined(NLIB_SSE42)
1887  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1888  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
1889 #elif defined(NLIB_NEON)
1890 #ifdef __aarch64__
1891  return NLIB_CMP(vcltq, u64, a, b, u64);
1892 #else
1893  uint32x2x2_t trn_a =
1894  vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)), vreinterpret_u32_s8(vget_high_s8(a)));
1895  uint32x2x2_t trn_b =
1896  vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)), vreinterpret_u32_s8(vget_high_s8(b)));
1897  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
1898  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
1899  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1900  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1901  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1902  return vreinterpretq_s8_s64(result);
1903 #endif
1904 #else
1905  i128 cmp = I128::CmpLtUint32(a, b);
1906  i128 eq = I128::CmpEq32(a, b);
1907  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1908  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
1909  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1910  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1911 #endif
1912 }
1913 
1914 // r.u64[i] = a.u64[i] > b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1915 NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1916 #if defined(NLIB_SSE42)
1917  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1918  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
1919 #elif defined(NLIB_NEON) && defined(__aarch64__)
1920  return NLIB_CMP(vcgtq, u64, a, b, u64);
1921 #else
1922  return I128::CmpLtUint64(b, a);
1923 #endif
1924 }
1925 
1926 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
1927 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1928 #if defined(NLIB_SSE41)
1929  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1930 #elif defined(NLIB_NEON)
1931  return NLIB_CMP(vcleq, s8, a, b, u8);
1932 #endif
1933 }
1934 
1935 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
1936 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1937 #if defined(NLIB_SSE41)
1938  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1939 #elif defined(NLIB_NEON)
1940  return NLIB_CMP(vcleq, s16, a, b, u16);
1941 #endif
1942 }
1943 
1944 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
1945 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1946 #if defined(NLIB_SSE41)
1947  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1948 #elif defined(NLIB_NEON)
1949  return NLIB_CMP(vcleq, s32, a, b, u32);
1950 #endif
1951 }
1952 
1953 // r.s64[i] = a.s64[i] <= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1954 NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1955 #if defined(NLIB_SSE42)
1956  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1957 #elif defined(NLIB_NEON) && defined(__aarch64__)
1958  return NLIB_CMP(vcleq, s64, a, b, u64);
1959 #else
1960  return I128::Not(I128::CmpGtInt64(a, b));
1961 #endif
1962 }
1963 
1964 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
1965 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1966 #if defined(NLIB_SSE41)
1967  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1968 #elif defined(NLIB_NEON)
1969  return NLIB_CMP(vcgeq, s8, a, b, u8);
1970 #endif
1971 }
1972 
1973 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
1974 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1975 #if defined(NLIB_SSE41)
1976  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1977 #elif defined(NLIB_NEON)
1978  return NLIB_CMP(vcgeq, s16, a, b, u16);
1979 #endif
1980 }
1981 
1982 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
1983 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1984 #if defined(NLIB_SSE41)
1985  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1986 #elif defined(NLIB_NEON)
1987  return NLIB_CMP(vcgeq, s32, a, b, u32);
1988 #endif
1989 }
1990 
1991 // r.s64[i] = a.s64[i] >= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1992 NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1993 #if defined(NLIB_SSE42)
1994  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
1995 #elif defined(NLIB_NEON) && defined(__aarch64__)
1996  return NLIB_CMP(vcgeq, s64, a, b, u64);
1997 #else
1998  return I128::Not(I128::CmpLtInt64(a, b));
1999 #endif
2000 }
2001 
2002 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
2003 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2004 #if defined(NLIB_SSE41)
2005  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2006 #elif defined(NLIB_NEON)
2007  return NLIB_CMP(vcleq, u8, a, b, u8);
2008 #endif
2009 }
2010 
2011 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
2012 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2013 #if defined(NLIB_SSE41)
2014  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2015 #elif defined(NLIB_NEON)
2016  return NLIB_CMP(vcleq, u16, a, b, u16);
2017 #endif
2018 }
2019 
2020 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
2021 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2022 #if defined(NLIB_SSE41)
2023  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2024 #elif defined(NLIB_NEON)
2025  return NLIB_CMP(vcleq, u32, a, b, u32);
2026 #endif
2027 }
2028 
2029 // r.u64[i] = a.u64[i] <= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2030 NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2031 #if defined(NLIB_SSE42)
2032  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2033  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2034  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2035 #elif defined(NLIB_NEON) && defined(__aarch64__)
2036  return NLIB_CMP(vcleq, u64, a, b, u64);
2037 #else
2038  return I128::Not(I128::CmpGtUint64(a, b));
2039 #endif
2040 }
2041 
2042 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
2043 NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2044 #if defined(NLIB_SSE41)
2045  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
2046 #elif defined(NLIB_NEON)
2047  return NLIB_CMP(vcgeq, u8, a, b, u8);
2048 #endif
2049 }
2050 
2051 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
2052 NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2053 #if defined(NLIB_SSE41)
2054  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
2055 #elif defined(NLIB_NEON)
2056  return NLIB_CMP(vcgeq, u16, a, b, u16);
2057 #endif
2058 }
2059 
2060 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
2061 NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2062 #if defined(NLIB_SSE41)
2063  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
2064 #elif defined(NLIB_NEON)
2065  return NLIB_CMP(vcgeq, u32, a, b, u32);
2066 #endif
2067 }
2068 
2069 // r.u64[i] = a.u64[i] >= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2070 NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2071 #if defined(NLIB_SSE42)
2072  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2073  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
2074  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2075 #elif defined(NLIB_NEON) && defined(__aarch64__)
2076  return NLIB_CMP(vcgeq, u64, a, b, u64);
2077 #else
2078  return I128::Not(I128::CmpLtUint64(a, b));
2079 #endif
2080 }
2081 
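// r.i8[i] = value.i8[i] == 0 ? 0xFF : 0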
2082 NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
2083 #if defined(__aarch64__)
2084  return vreinterpretq_s8_u8(vceqzq_s8(value));
2085 #else
2086  return I128::CmpEq8(value, I128::SetZero());
2087 #endif
2088 }
2089 
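// r.i16[i] = value.i16[i] == 0 ? 0xFFFF : 0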
2090 NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
2091 #if defined(__aarch64__)
2092  return vreinterpretq_s8_u16(vceqzq_s16(vreinterpretq_s16_s8(value)));
2093 #else
2094  return I128::CmpEq16(value, I128::SetZero());
2095 #endif
2096 }
2097 
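// r.i32[i] = value.i32[i] == 0 ? 0xFFFFFFFF : 0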
2098 NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
2099 #if defined(__aarch64__)
2100  return vreinterpretq_s8_u32(vceqzq_s32(vreinterpretq_s32_s8(value)));
2101 #else
2102  return I128::CmpEq32(value, I128::SetZero());
2103 #endif
2104 }
2105 
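// r.i64[i] = value.i64[i] == 0 ? 0xFFFFFFFFFFFFFFFF : 0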
2106 NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
2107 #if defined(__aarch64__)
2108  return vreinterpretq_s8_u64(vceqzq_s64(vreinterpretq_s64_s8(value)));
2109 #else
2110  return I128::CmpEq64(value, I128::SetZero());
2111 #endif
2112 }
2113 
2114 // r.u8[i] = value.u8[i] << count
2115 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2116 #if defined(NLIB_SSE41)
2117  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2118  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
2119  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
2120  return I128::NarrowFrom16To8(xl, xh);
2121 #elif defined(NLIB_NEON)
2122  return NLIB_SFT(vshlq, u8, value, count, s8);
2123 #endif
2124 }
2125 
2126 // r.u8[i] = value.u8[i] >> count
2127 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2128 #if defined(NLIB_SSE41)
2129  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2130  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
2131  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
2132  return _mm_packus_epi16(xl, xh);
2133 #elif defined(NLIB_NEON)
2134  return NLIB_SFT(vshlq, u8, value, -count, s8);
2135 #endif
2136 }
2137 
2138 // r.s8[i] = value.s8[i] >> count
2139 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
2140 #if defined(NLIB_SSE41)
2141  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2142  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
2143  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
2144  return _mm_packs_epi16(xl, xh);  // signed pack; packus would clamp negative results to zero
2145 #elif defined(NLIB_NEON)
2146  return NLIB_SFT(vshlq, s8, value, -count, s8);
2147 #endif
2148 }
2149 
2150 // r.u16[i] = value.u16[i] << count
2151 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2152 #if defined(NLIB_SSE41)
2153  return _mm_slli_epi16(value, count);
2154 #elif defined(NLIB_NEON)
2155  return NLIB_SFT(vshlq, u16, value, count, s16);
2156 #endif
2157 }
2158 
2159 // r.u16[i] = value.u16[i] >> count
2160 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2161 #if defined(NLIB_SSE41)
2162  return _mm_srli_epi16(value, count);
2163 #elif defined(NLIB_NEON)
2164  return NLIB_SFT(vshlq, u16, value, -count, s16);
2165 #endif
2166 }
2167 
2168 // r.s16[i] = value.s16[i] >> count
2169 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
2170 #if defined(NLIB_SSE41)
2171  return _mm_srai_epi16(value, count);
2172 #elif defined(NLIB_NEON)
2173  return NLIB_SFT(vshlq, s16, value, -count, s16);
2174 #endif
2175 }
2176 
2177 // r.u32[i] = value.u32[i] << count
2178 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2179 #if defined(NLIB_SSE41)
2180  return _mm_slli_epi32(value, count);
2181 #elif defined(NLIB_NEON)
2182  return NLIB_SFT(vshlq, u32, value, count, s32);
2183 #endif
2184 }
2185 
2186 // r.u32[i] = value.u32[i] >> count
2187 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2188 #if defined(NLIB_SSE41)
2189  return _mm_srli_epi32(value, count);
2190 #elif defined(NLIB_NEON)
2191  return NLIB_SFT(vshlq, u32, value, -count, s32);
2192 #endif
2193 }
2194 
2195 // r.s32[i] = value.s32[i] >> count
2196 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
2197 #if defined(NLIB_SSE41)
2198  return _mm_srai_epi32(value, count);
2199 #elif defined(NLIB_NEON)
2200  return NLIB_SFT(vshlq, s32, value, -count, s32);
2201 #endif
2202 }
2203 
2204 // r.u64[i] = value.u64[i] << count
2205 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2206 #if defined(NLIB_SSE41)
2207  return _mm_slli_epi64(value, count);
2208 #elif defined(NLIB_NEON)
2209  return NLIB_SFT(vshlq, u64, value, count, s64);
2210 #endif
2211 }
2212 
2213 // r.u64[i] = value.u64[i] >> count
2214 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2215 #if defined(NLIB_SSE41)
2216  return _mm_srli_epi64(value, count);
2217 #elif defined(NLIB_NEON)
2218  return NLIB_SFT(vshlq, u64, value, -count, s64);
2219 #endif
2220 }
2221 
2222 template<size_t N>
2223 NLIB_M(i128)
2224 I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
2225  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2226 #ifdef NLIB_NEON
2227  return vshlq_n_s8(value, N);
2228 #else
2229  return I128::ShiftLeftLogical8(value, N);
2230 #endif
2231 }
2232 
2233 template<size_t N>
2234 NLIB_M(i128)
2235 I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
2236  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2237 #ifdef NLIB_NEON
2238  uint8x16_t tmp = vreinterpretq_u8_s8(value);
2239  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
2240 #else
2241  return I128::ShiftRightLogical8(value, N);
2242 #endif
2243 }
2244 
2245 template<size_t N>
2246 NLIB_M(i128)
2247 I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
2248  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2249 #ifdef NLIB_NEON
2250  return vshrq_n_s8(value, N);
2251 #else
2252  return I128::ShiftRightArithmetic8(value, N);
2253 #endif
2254 }
2255 
2256 template<size_t N>
2257 NLIB_M(i128)
2258 I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
2259  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2260 #ifdef NLIB_NEON
2261  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2262  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
2263 #else
2264  return I128::ShiftLeftLogical16(value, N);
2265 #endif
2266 }
2267 
2268 template<size_t N>
2269 NLIB_M(i128)
2270 I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
2271  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2272 #ifdef NLIB_NEON
2273  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2274  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
2275 #else
2276  return I128::ShiftRightLogical16(value, N);
2277 #endif
2278 }
2279 
2280 template<size_t N>
2281 NLIB_M(i128)
2282 I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
2283  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2284 #ifdef NLIB_NEON
2285  int16x8_t tmp = vreinterpretq_s16_s8(value);
2286  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
2287 #else
2288  return I128::ShiftRightArithmetic16(value, N);
2289 #endif
2290 }
2291 
2292 template<size_t N>
2293 NLIB_M(i128)
2294 I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
2295  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2296 #ifdef NLIB_NEON
2297  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2298  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
2299 #else
2300  return I128::ShiftLeftLogical32(value, N);
2301 #endif
2302 }
2303 
2304 template<size_t N>
2305 NLIB_M(i128)
2306 I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
2307  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2308 #ifdef NLIB_NEON
2309  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2310  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
2311 #else
2312  return I128::ShiftRightLogical32(value, N);
2313 #endif
2314 }
2315 
2316 template<size_t N>
2317 NLIB_M(i128)
2318 I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
2319  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2320 #ifdef NLIB_NEON
2321  int32x4_t tmp = vreinterpretq_s32_s8(value);
2322  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
2323 #else
2324  return I128::ShiftRightArithmetic32(value, N);
2325 #endif
2326 }
2327 
2328 template<size_t N>
2329 NLIB_M(i128)
2330 I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
2331  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2332 #ifdef NLIB_NEON
2333  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2334  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
2335 #else
2336  return I128::ShiftLeftLogical64(value, N);
2337 #endif
2338 }
2339 
2340 template<size_t N>
2341 NLIB_M(i128)
2342 I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
2343  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2344 #ifdef NLIB_NEON
2345  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2346  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
2347 #else
2348  return I128::ShiftRightLogical64(value, N);
2349 #endif
2350 }
2351 
2352 #ifdef NLIB_NEON
2353 template<>
2354 NLIB_M(i128)
2355 I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
2356  NLIB_UNUSED(value);
2357  return I128::SetZero();
2358 }
2359 template<>
2360 NLIB_M(i128)
2361 I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
2362  return value;
2363 }
2364 template<>
2365 NLIB_M(i128)
2366 I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
2367  NLIB_UNUSED(value);
2368  return I128::SetZero();
2369 }
2370 template<>
2371 NLIB_M(i128)
2372 I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
2373  return value;
2374 }
2375 template<>
2376 NLIB_M(i128)
2377 I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
2378  return value;
2379 }
2380 template<>
2381 NLIB_M(i128)
2382 I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
2383  NLIB_UNUSED(value);
2384  return I128::SetZero();
2385 }
2386 template<>
2387 NLIB_M(i128)
2388 I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
2389  return value;
2390 }
2391 template<>
2392 NLIB_M(i128)
2393 I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
2394  return value;
2395 }
2396 template<>
2397 NLIB_M(i128)
2398 I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
2399  NLIB_UNUSED(value);
2400  return I128::SetZero();
2401 }
2402 template<>
2403 NLIB_M(i128)
2404 I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
2405  return value;
2406 }
2407 #endif
2408 
2409 template<size_t N>
2410 // r.i8[i + N] = value.i8[i], the lower N bytes are filled with zero
2411 NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
2412  NLIB_STATIC_ASSERT(N < 16);
2413 #if defined(NLIB_SSE41)
2414  return _mm_slli_si128(value, N);
2415 #elif defined(NLIB_NEON)
2416  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
2417 #endif
2418 }
2419 
2420 template<size_t N>
2421 // r.i8[i - N] = value.i8[i], the upper N bytes are filled with zero
2422 NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
2423  NLIB_STATIC_ASSERT(N < 16);
2424 #if defined(NLIB_SSE41)
2425  return _mm_srli_si128(value, N);
2426 #elif defined(NLIB_NEON)
2427  return vextq_s8(value, vdupq_n_s8(0), N);
2428 #endif
2429 }
2430 
2431 template<size_t N>
2432 // rotates the 16 bytes right by N: left <- fedcba9876543210 -> right
2433 NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
2434  NLIB_STATIC_ASSERT(N < 16);
2435 #if defined(NLIB_SSE41)
2436  return _mm_alignr_epi8(value, value, N);
2437 #elif defined(NLIB_NEON)
2438  return vextq_s8(value, value, N);
2439 #endif
2440 }
2441 
2442 template<size_t N>
2443 // _mm_alignr_epi8(a, b, N), which returns ((a << 128) | b) >> (N * 8)
2444 NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
2445  NLIB_STATIC_ASSERT(N < 16);
2446 #if defined(NLIB_SSE41)
2447  return _mm_alignr_epi8(a, b, N);
2448 #elif defined(NLIB_NEON)
2449  return vextq_s8(b, a, N);
2450 #endif
2451 }
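// Usage sketch (illustrative only; p, prev, next and window are not part of this
// header): AlignR can stitch together a window that straddles two consecutive
// 16-byte blocks of a 16-byte aligned buffer pointed to by const uint8_t* p:
//   i128 prev = I128::LoadA16(p);                // bytes [0, 16)
//   i128 next = I128::LoadA16(p + 16);           // bytes [16, 32)
//   i128 window = I128::AlignR<4>(next, prev);   // bytes [4, 20)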
2452 
2453 // r.u8[i] = value.u8[i * 2]
2454 NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2455 #if defined(NLIB_SSE41)
2456  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2457  __m128i lo_mask = _mm_and_si128(lo, mask);
2458  __m128i hi_mask = _mm_and_si128(hi, mask);
2459  return _mm_packus_epi16(lo_mask, hi_mask);
2460 #elif defined(NLIB_NEON)
2461 #ifdef __aarch64__
2462  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2463  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2464 #else
2465  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2466  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
2467  return NLIB_CMB(u8, l, h);
2468 #endif
2469 #endif
2470 }
2471 
2472 // r.u16[i] = value.u16[i * 2]
2473 NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2474 #if defined(NLIB_SSE41)
2475  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2476  __m128i lo_mask = _mm_and_si128(lo, mask);
2477  __m128i hi_mask = _mm_and_si128(hi, mask);
2478  return _mm_packus_epi32(lo_mask, hi_mask);
2479 #elif defined(NLIB_NEON)
2480 #ifdef __aarch64__
2481  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2482  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2483 #else
2484  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2485  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
2486  return NLIB_CMB(u16, l, h);
2487 #endif
2488 #endif
2489 }
2490 
2491 // r.u32[i] = value.u32[i * 2]
2492 NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2493 #if defined(NLIB_SSE41)
2494  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
2495  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
2496  return _mm_unpacklo_epi64(lo_, hi_);
2497 #elif defined(NLIB_NEON)
2498 #ifdef __aarch64__
2499  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2500  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
2501 #else
2502  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2503  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
2504  return NLIB_CMB(u32, l, h);
2505 #endif
2506 #endif
2507 }
2508 
2509 // r.u8[i] = 255 if value.u16[i] > 255
2510 NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2511 #if defined(NLIB_SSE41)
2512  i128 b00FF = I128::SetValue(0x00FFU, each_uint16);  // clamp to 255 so inputs >= 0x8000 also saturate
2513  __m128i lotmp = _mm_min_epu16(lo, b00FF);
2514  __m128i hitmp = _mm_min_epu16(hi, b00FF);
2515  return _mm_packus_epi16(lotmp, hitmp);
2516 #elif defined(NLIB_NEON)
2517 #ifdef __aarch64__
2518  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2519  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2520 #else
2521  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2522  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
2523  return NLIB_CMB(u8, l, h);
2524 #endif
2525 #endif
2526 }
2527 
2528 // r.s8[i] = 127 if value.s16[i] > 127, -128 if value.s16[i] < -128
2529 NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2530 #if defined(NLIB_SSE41)
2531  return _mm_packs_epi16(lo, hi);
2532 #elif defined(NLIB_NEON)
2533 #ifdef __aarch64__
2534  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2535  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
2536 #else
2537  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2538  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
2539  return NLIB_CMB(s8, l, h);
2540 #endif
2541 #endif
2542 }
2543 
2544 // r.u16[i] = 65535 if value.u32[i] > 65535
2545 NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2546 #if defined(NLIB_SSE41)
2547  i128 bFFFF = I128::SetValue(0xFFFFU, each_uint32);  // clamp to 65535 so inputs >= 0x80000000 also saturate
2548  __m128i lotmp = _mm_min_epu32(lo, bFFFF);
2549  __m128i hitmp = _mm_min_epu32(hi, bFFFF);
2550  return _mm_packus_epi32(lotmp, hitmp);
2551 #elif defined(NLIB_NEON)
2552 #ifdef __aarch64__
2553  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2554  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2555 #else
2556  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2557  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
2558  return NLIB_CMB(u16, l, h);
2559 #endif
2560 #endif
2561 }
2562 
2563 // r.s16[i] = 32767 if value.s32[i] > 32767, -32768 if value.s32[i] < -32768
2564 NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2565 #if defined(NLIB_SSE41)
2566  return _mm_packs_epi32(lo, hi);
2567 #elif defined(NLIB_NEON)
2568 #ifdef __aarch64__
2569  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2570  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
2571 #else
2572  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2573  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
2574  return NLIB_CMB(s16, l, h);
2575 #endif
2576 #endif
2577 }
2578 
2579 // r.s8[2 * i] = value.s8[i], r.u8[2 * i + 1] = value.s8[i] < 0 ? 0xFF : 0
2580 NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
2581 #if defined(NLIB_SSE41)
2582  return _mm_cvtepi8_epi16(value);
2583 #elif defined(NLIB_NEON)
2584  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
2585 #endif
2586 }
2587 
2588 // r.s8[2 * i] = value.s8[i + 8], r.u8[2 * i + 1] = value.s8[i + 8] < 0 ? 0xFF : 0
2589 NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
2590 #if defined(NLIB_SSE41)
2591  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2592 #elif defined(NLIB_NEON)
2593 #ifdef __aarch64__
2594  int16x8_t result = vmovl_high_s8(value);
2595 #else
2596  int16x8_t result = vmovl_s8(vget_high_s8(value));
2597 #endif
2598  return vreinterpretq_s8_s16(result);
2599 #endif
2600 }
2601 
2602 // r.s16[2 * i] = value.s16[i], r.u16[2 * i + 1] = value.s16[i] < 0 ? 0xFFFF : 0
2603 NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
2604 #if defined(NLIB_SSE41)
2605  return _mm_cvtepi16_epi32(value);
2606 #elif defined(NLIB_NEON)
2607  int16x8_t x = vreinterpretq_s16_s8(value);
2608  int32x4_t result = vmovl_s16(vget_low_s16(x));
2609  return vreinterpretq_s8_s32(result);
2610 #endif
2611 }
2612 
2613 // r.s16[2 * i] = value.s16[i + 4], r.u16[2 * i + 1] = value.s16[i + 4] < 0 ? 0xFFFF : 0
2614 NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
2615 #if defined(NLIB_SSE41)
2616  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2617 #elif defined(NLIB_NEON)
2618  int16x8_t x = vreinterpretq_s16_s8(value);
2619 #ifdef __aarch64__
2620  int32x4_t result = vmovl_high_s16(x);
2621 #else
2622  int32x4_t result = vmovl_s16(vget_high_s16(x));
2623 #endif
2624  return vreinterpretq_s8_s32(result);
2625 #endif
2626 }
2627 
2628 // r.s32[2 * i] = value.s32[i], r.u32[2 * i + 1] = value.s32[i] < 0 ? 0xFFFFFFFF : 0
2629 NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
2630 #if defined(NLIB_SSE41)
2631  return _mm_cvtepi32_epi64(value);
2632 #elif defined(NLIB_NEON)
2633  int32x4_t x = vreinterpretq_s32_s8(value);
2634  int64x2_t result = vmovl_s32(vget_low_s32(x));
2635  return vreinterpretq_s8_s64(result);
2636 #endif
2637 }
2638 
2639 // r.s32[2 * i] = value.s32[i + 2], r.u32[2 * i + 1] = value.s32[i + 2] < 0 ? 0xFFFFFFFF : 0
2640 NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
2641 #if defined(NLIB_SSE41)
2642  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2643 #elif defined(NLIB_NEON)
2644  int32x4_t x = vreinterpretq_s32_s8(value);
2645 #ifdef __aarch64__
2646  int64x2_t result = vmovl_high_s32(x);
2647 #else
2648  int64x2_t result = vmovl_s32(vget_high_s32(x));
2649 #endif
2650  return vreinterpretq_s8_s64(result);
2651 #endif
2652 }
2653 
2654 // r.u16[i] = value.u8[i]
2655 NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
2656 #if defined(NLIB_SSE41)
2657  return _mm_cvtepu8_epi16(value);
2658 #elif defined(NLIB_NEON)
2659  uint8x16_t x = vreinterpretq_u8_s8(value);
2660  uint16x8_t result = vmovl_u8(vget_low_u8(x));
2661  return vreinterpretq_s8_u16(result);
2662 #endif
2663 }
2664 
2665 // r.u16[i] = value.u8[i + 8]
2666 NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
2667 #if defined(NLIB_SSE41)
2668  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2669 #elif defined(NLIB_NEON)
2670  uint8x16_t x = vreinterpretq_u8_s8(value);
2671 #ifdef __aarch64__
2672  uint16x8_t result = vmovl_high_u8(x);
2673 #else
2674  uint16x8_t result = vmovl_u8(vget_high_u8(x));
2675 #endif
2676  return vreinterpretq_s8_u16(result);
2677 #endif
2678 }
2679 
2680 // r.u32[i] = value.u16[i]
2681 NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
2682 #if defined(NLIB_SSE41)
2683  return _mm_cvtepu16_epi32(value);
2684 #elif defined(NLIB_NEON)
2685  uint16x8_t x = vreinterpretq_u16_s8(value);
2686  uint32x4_t result = vmovl_u16(vget_low_u16(x));
2687  return vreinterpretq_s8_u32(result);
2688 #endif
2689 }
2690 
2691 // r.u32[i] = value.u16[i + 4]
2692 NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
2693 #if defined(NLIB_SSE41)
2694  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2695 #elif defined(NLIB_NEON)
2696  uint16x8_t x = vreinterpretq_u16_s8(value);
2697 #ifdef __aarch64__
2698  uint32x4_t result = vmovl_high_u16(x);
2699 #else
2700  uint32x4_t result = vmovl_u16(vget_high_u16(x));
2701 #endif
2702  return vreinterpretq_s8_u32(result);
2703 #endif
2704 }
2705 
2706 // r.u64[i] = value.u32[i]
2707 NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
2708 #if defined(NLIB_SSE41)
2709  return _mm_cvtepu32_epi64(value);
2710 #elif defined(NLIB_NEON)
2711  uint32x4_t x = vreinterpretq_u32_s8(value);
2712  uint64x2_t result = vmovl_u32(vget_low_u32(x));
2713  return vreinterpretq_s8_u64(result);
2714 #endif
2715 }
2716 
2717 // r.u64[i] = value.u32[i + 2]
2718 NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
2719 #if defined(NLIB_SSE41)
2720  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2721 #elif defined(NLIB_NEON)
2722  uint32x4_t x = vreinterpretq_u32_s8(value);
2723 #ifdef __aarch64__
2724  uint64x2_t result = vmovl_high_u32(x);
2725 #else
2726  uint64x2_t result = vmovl_u32(vget_high_u32(x));
2727 #endif
2728  return vreinterpretq_s8_u64(result);
2729 #endif
2730 }
2731 
2732 // 01234567 abcdefgh -> 0a1b2c3d4e5f6g7h
2733 NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2734 #if defined(NLIB_SSE41)
2735  return _mm_unpacklo_epi8(a, b);
2736 #elif defined(NLIB_NEON)
2737 #ifdef __aarch64__
2738  return vzip1q_s8(a, b);
2739 #else
2740  return vzipq_s8(a, b).val[0];
2741 #endif
2742 #endif
2743 }
2744 
2745 // 89ABCDEF ijklmnop -> 8i9jAkBlCmDnEoFp
2746 NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2747 #if defined(NLIB_SSE41)
2748  return _mm_unpackhi_epi8(a, b);
2749 #elif defined(NLIB_NEON)
2750 #ifdef __aarch64__
2751  return vzip2q_s8(a, b);
2752 #else
2753  return vzipq_s8(a, b).val[1];
2754 #endif
2755 #endif
2756 }
2757 
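// r.i8[i] = a.i8[2 * i] (i < 8), b.i8[2 * (i - 8)] (i >= 8)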
2758 NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2759 #if defined(NLIB_SSE41)
2760  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2761  __m128i lo_mask = _mm_and_si128(a, mask);
2762  __m128i hi_mask = _mm_and_si128(b, mask);
2763  return _mm_packus_epi16(lo_mask, hi_mask);
2764 #elif defined(NLIB_NEON)
2765 #ifdef __aarch64__
2766  return vuzp1q_s8(a, b);
2767 #else
2768  return vuzpq_s8(a, b).val[0];
2769 #endif
2770 #endif
2771 }
2772 
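// r.i8[i] = a.i8[2 * i + 1] (i < 8), b.i8[2 * (i - 8) + 1] (i >= 8)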
2773 NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2774 #if defined(NLIB_SSE41)
2775  i128 mask = I128::SetValue(0xFF00U, each_uint16);
2776  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
2777  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
2778  return _mm_packus_epi16(lo_mask, hi_mask);
2779 #elif defined(NLIB_NEON)
2780 #ifdef __aarch64__
2781  return vuzp2q_s8(a, b);
2782 #else
2783  return vuzpq_s8(a, b).val[1];
2784 #endif
2785 #endif
2786 }
2787 
2788 // 0123 abcd -> 0a1b2c3d
2789 NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2790 #if defined(NLIB_SSE41)
2791  return _mm_unpacklo_epi16(a, b);
2792 #elif defined(NLIB_NEON)
2793 #ifdef __aarch64__
2794  return NLIB_OP2(vzip1q, u16, a, b);
2795 #else
2796  return vreinterpretq_s8_u16(vzipq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2797 #endif
2798 #endif
2799 }
2800 
2801 // 4567 efgh -> 4e5f6g7h
2802 NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2803 #if defined(NLIB_SSE41)
2804  return _mm_unpackhi_epi16(a, b);
2805 #elif defined(NLIB_NEON)
2806 #ifdef __aarch64__
2807  return NLIB_OP2(vzip2q, u16, a, b);
2808 #else
2809  return vreinterpretq_s8_u16(vzipq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2810 #endif
2811 #endif
2812 }
2813 
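// r.i16[i] = a.i16[2 * i] (i < 4), b.i16[2 * (i - 4)] (i >= 4)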
2814 NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2815 #if defined(NLIB_SSE41)
2816  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2817  __m128i lo_mask = _mm_and_si128(a, mask);
2818  __m128i hi_mask = _mm_and_si128(b, mask);
2819  return _mm_packus_epi32(lo_mask, hi_mask);
2820 #elif defined(NLIB_NEON)
2821 #ifdef __aarch64__
2822  return NLIB_OP2(vuzp1q, u16, a, b);
2823 #else
2824  return vreinterpretq_s8_u16(vuzpq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2825 #endif
2826 #endif
2827 }
2828 
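// r.i16[i] = a.i16[2 * i + 1] (i < 4), b.i16[2 * (i - 4) + 1] (i >= 4)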
2829 NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2830 #if defined(NLIB_SSE41)
2831  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
2832  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
2833  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
2834  return _mm_packus_epi32(lo_mask, hi_mask);
2835 #elif defined(NLIB_NEON)
2836 #ifdef __aarch64__
2837  return NLIB_OP2(vuzp2q, u16, a, b);
2838 #else
2839  return vreinterpretq_s8_u16(vuzpq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2840 #endif
2841 #endif
2842 }
2843 
2844 // 01 ab -> 0a1b
2845 NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2846 #if defined(NLIB_SSE41)
2847  return _mm_unpacklo_epi32(a, b);
2848 #elif defined(NLIB_NEON)
2849 #ifdef __aarch64__
2850  return NLIB_OP2(vzip1q, u32, a, b);
2851 #else
2852  return vreinterpretq_s8_u32(vzipq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2853 #endif
2854 #endif
2855 }
2856 
2857 // 23 cd -> 2c3d
2858 NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2859 #if defined(NLIB_SSE41)
2860  return _mm_unpackhi_epi32(a, b);
2861 #elif defined(NLIB_NEON)
2862 #ifdef __aarch64__
2863  return NLIB_OP2(vzip2q, u32, a, b);
2864 #else
2865  return vreinterpretq_s8_u32(vzipq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2866 #endif
2867 #endif
2868 }
2869 
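// r.i32[i] = a.i32[2 * i] (i < 2), b.i32[2 * (i - 2)] (i >= 2)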
2870 NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2871 #if defined(NLIB_SSE41)
2872  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2873  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
2874  return _mm_blend_epi16(x0, x1, 0xF0);
2875 #elif defined(NLIB_NEON)
2876 #ifdef __aarch64__
2877  return NLIB_OP2(vuzp1q, u32, a, b);
2878 #else
2879  return vreinterpretq_s8_u32(vuzpq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2880 #endif
2881 #endif
2882 }
2883 
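// r.i32[i] = a.i32[2 * i + 1] (i < 2), b.i32[2 * (i - 2) + 1] (i >= 2)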
2884 NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2885 #if defined(NLIB_SSE41)
2886  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
2887  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2888  return _mm_blend_epi16(x0, x1, 0xF0);
2889 #elif defined(NLIB_NEON)
2890 #ifdef __aarch64__
2891  return NLIB_OP2(vuzp2q, u32, a, b);
2892 #else
2893  return vreinterpretq_s8_u32(vuzpq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2894 #endif
2895 #endif
2896 }
2897 
2898 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7, int V8, int V9, int V10,
2899  int V11, int V12, int V13, int V14, int V15>
2900 NLIB_M(i128)
2901 I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2902 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2903  return __builtin_shufflevector(a, b, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13,
2904  V14, V15);
2905 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2906  return __builtin_shufflevector((__v16qi)a, (__v16qi)b, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9,
2907  V10, V11, V12, V13, V14, V15);
2908 #else
2909  NLIB_ALIGNAS(16)
2910  int8_t mask_a[16] = {(V0 < 0 || V0 > 15) ? -128 : V0, (V1 < 0 || V1 > 15) ? -128 : V1,
2911  (V2 < 0 || V2 > 15) ? -128 : V2, (V3 < 0 || V3 > 15) ? -128 : V3,
2912  (V4 < 0 || V4 > 15) ? -128 : V4, (V5 < 0 || V5 > 15) ? -128 : V5,
2913  (V6 < 0 || V6 > 15) ? -128 : V6, (V7 < 0 || V7 > 15) ? -128 : V7,
2914  (V8 < 0 || V8 > 15) ? -128 : V8, (V9 < 0 || V9 > 15) ? -128 : V9,
2915  (V10 < 0 || V10 > 15) ? -128 : V10, (V11 < 0 || V11 > 15) ? -128 : V11,
2916  (V12 < 0 || V12 > 15) ? -128 : V12, (V13 < 0 || V13 > 15) ? -128 : V13,
2917  (V14 < 0 || V14 > 15) ? -128 : V14, (V15 < 0 || V15 > 15) ? -128 : V15};
2918  NLIB_ALIGNAS(16)
2919  int8_t mask_b[16] = {
2920  V0 < 16 ? -128 : (V0 - 16), V1 < 16 ? -128 : (V1 - 16), V2 < 16 ? -128 : (V2 - 16),
2921  V3 < 16 ? -128 : (V3 - 16), V4 < 16 ? -128 : (V4 - 16), V5 < 16 ? -128 : (V5 - 16),
2922  V6 < 16 ? -128 : (V6 - 16), V7 < 16 ? -128 : (V7 - 16), V8 < 16 ? -128 : (V8 - 16),
2923  V9 < 16 ? -128 : (V9 - 16), V10 < 16 ? -128 : (V10 - 16), V11 < 16 ? -128 : (V11 - 16),
2924  V12 < 16 ? -128 : (V12 - 16), V13 < 16 ? -128 : (V13 - 16), V14 < 16 ? -128 : (V14 - 16),
2925  V15 < 16 ? -128 : (V15 - 16)};
2926  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2927  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2928  return I128::Or(tmp_a, tmp_b);
2929 #endif
2930 }
2931 
2932 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
2933 NLIB_M(i128)
2934 I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2935 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2936  return vreinterpretq_s8_u16(__builtin_shufflevector(
2937  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b), V0, V1, V2, V3, V4, V5, V6, V7));
2938 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2939  return __builtin_shufflevector((__v8hi)a, (__v8hi)b, V0, V1, V2, V3, V4, V5, V6, V7);
2940 #else
2941  NLIB_ALIGNAS(16)
2942  int8_t mask_a[16] = {
2943  (V0 < 0 || V0 > 7) ? -128 : V0 * 2, (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
2944  (V1 < 0 || V1 > 7) ? -128 : V1 * 2, (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
2945  (V2 < 0 || V2 > 7) ? -128 : V2 * 2, (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
2946  (V3 < 0 || V3 > 7) ? -128 : V3 * 2, (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
2947  (V4 < 0 || V4 > 7) ? -128 : V4 * 2, (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
2948  (V5 < 0 || V5 > 7) ? -128 : V5 * 2, (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
2949  (V6 < 0 || V6 > 7) ? -128 : V6 * 2, (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
2950  (V7 < 0 || V7 > 7) ? -128 : V7 * 2, (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1};
2951  NLIB_ALIGNAS(16)
2952  int8_t mask_b[16] = {V0 < 8 ? -128 : (V0 - 8) * 2, V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
2953  V1 < 8 ? -128 : (V1 - 8) * 2, V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
2954  V2 < 8 ? -128 : (V2 - 8) * 2, V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
2955  V3 < 8 ? -128 : (V3 - 8) * 2, V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
2956  V4 < 8 ? -128 : (V4 - 8) * 2, V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
2957  V5 < 8 ? -128 : (V5 - 8) * 2, V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
2958  V6 < 8 ? -128 : (V6 - 8) * 2, V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
2959  V7 < 8 ? -128 : (V7 - 8) * 2, V7 < 8 ? -128 : (V7 - 8) * 2 + 1};
2960  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2961  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2962  return I128::Or(tmp_a, tmp_b);
2963 #endif
2964 }
2965 
2966 template<int V0, int V1, int V2, int V3>
2967 NLIB_M(i128)
2968 I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2969 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2970  return vreinterpretq_s8_u32(
2971  __builtin_shufflevector(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b), V0, V1, V2, V3));
2972 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2973  return __builtin_shufflevector((__v4si)a, (__v4si)b, V0, V1, V2, V3);
2974 #else
2975  NLIB_ALIGNAS(16)
2976  int8_t mask_a[16] = {
2977  (V0 < 0 || V0 > 3) ? -128 : V0 * 4, (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
2978  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2, (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
2979  (V1 < 0 || V1 > 3) ? -128 : V1 * 4, (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
2980  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2, (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
2981  (V2 < 0 || V2 > 3) ? -128 : V2 * 4, (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
2982  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2, (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
2983  (V3 < 0 || V3 > 3) ? -128 : V3 * 4, (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
2984  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2, (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3};
2985  NLIB_ALIGNAS(16)
2986  int8_t mask_b[16] = {V0 < 4 ? -128 : (V0 - 4) * 4, V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
2987  V0 < 4 ? -128 : (V0 - 4) * 4 + 2, V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
2988  V1 < 4 ? -128 : (V1 - 4) * 4, V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
2989  V1 < 4 ? -128 : (V1 - 4) * 4 + 2, V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
2990  V2 < 4 ? -128 : (V2 - 4) * 4, V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
2991  V2 < 4 ? -128 : (V2 - 4) * 4 + 2, V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
2992  V3 < 4 ? -128 : (V3 - 4) * 4, V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
2993  V3 < 4 ? -128 : (V3 - 4) * 4 + 2, V3 < 4 ? -128 : (V3 - 4) * 4 + 3};
2994  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2995  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2996  return I128::Or(tmp_a, tmp_b);
2997 #endif
2998 }
2999 
3000 // 0123456789abcdef -> 1032547698badcfe
3001 NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
3002 #if defined(NLIB_SSE41)
3003  NLIB_ALIGNAS(16)
3004  const int8_t mask_[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
3005  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3006 #elif defined(NLIB_NEON)
3007  return NLIB_OP1(vrev16q, u8, value);
3008 #endif
3009 }
3010 
3011 // 0123456789abcdef -> 32107654ba98fedc
3012 NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
3013 #if defined(NLIB_SSE41)
3014  NLIB_ALIGNAS(16)
3015  const int8_t mask_[16] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
3016  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3017 #elif defined(NLIB_NEON)
3018  return NLIB_OP1(vrev32q, u8, value);
3019 #endif
3020 }
3021 
3022 // 0123456789abcdef -> 76543210fedcba98
3023 NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
3024 #if defined(NLIB_SSE41)
3025  NLIB_ALIGNAS(16)
3026  const int8_t mask_[16] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
3027  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3028 #elif defined(NLIB_NEON)
3029  return NLIB_OP1(vrev64q, u8, value);
3030 #endif
3031 }
3032 
3033 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
3034 NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT {
3035 #if defined(NLIB_SSE41)
3036  return _mm_movemask_epi8(value);
3037 #elif defined(NLIB_NEON)
3038  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
3039  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
3040 #ifdef __aarch64__
3041  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
3042 #else
3043  uint8x8_t al = vget_low_u8(a);
3044  uint8x8_t ah = vget_high_u8(a);
3045  uint8x8_t tmp = vpadd_u8(al, ah);
3046  tmp = vpadd_u8(tmp, tmp);
3047  tmp = vpadd_u8(tmp, tmp);
3048  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
3049 #endif
3050 #endif
3051 }
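
// Usage sketch (illustrative, not part of the original header): the packed
// bitmask lets ordinary scalar bit tricks answer per-byte questions, for
// example locating the first masked byte of a comparison result.
inline int __vectorcall ExampleFirstMaskedByte(i128arg mask) NLIB_NOEXCEPT {
  int bits = I128::MoveMask8(mask);  // bit i <-> byte lane i
  return bits != 0 ? nlib_ctz32(static_cast<uint32_t>(bits)) : -1;  // lane index, or -1 if none
}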
3052 
3053 // r = Or(ismasked(value.u16[i]) ? (1 << i) : 0)
3054 NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT {
3055 #if defined(NLIB_SSE41)
3056  __m128i tmp = _mm_packs_epi16(value, value);
3057  return _mm_movemask_epi8(tmp) & 255;
3058 #elif defined(NLIB_NEON)
3059  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3060  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3061  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3062  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
3063 #ifdef __aarch64__
3064  return vaddvq_u16(a);
3065 #else
3066  uint8x8_t tmp = vmovn_u16(a);
3067  tmp = vpadd_u8(tmp, tmp);
3068  tmp = vpadd_u8(tmp, tmp);
3069  tmp = vpadd_u8(tmp, tmp);
3070  return vget_lane_u8(tmp, 0);
3071 #endif
3072 #endif
3073 }
3074 
3075 // r = Or(ismasked(value.u32[i]) ? (1 << i) : 0)
3076 NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT {
3077 #if defined(NLIB_SSE41)
3078  __m128i tmp = _mm_packs_epi16(value, value);
3079  tmp = _mm_packs_epi16(tmp, tmp);
3080  return _mm_movemask_epi8(tmp) & 15;
3081 #elif defined(NLIB_NEON)
3082  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3083  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3084  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3085  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
3086 #ifdef __aarch64__
3087  return vaddvq_u32(a);
3088 #else
3089  uint16x4_t tmp = vmovn_u32(a);
3090  tmp = vpadd_u16(tmp, tmp);
3091  tmp = vpadd_u16(tmp, tmp);
3092  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
3093 #endif
3094 #endif
3095 }
3096 
3097 NLIB_M(i128) I128::SetMask8(int mask) NLIB_NOEXCEPT {
3098 #if defined(NLIB_NEON)
3099  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
3100  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
3101  int8x8_t s1 = vdup_n_s8(mask >> 8);
3102  return vreinterpretq_s8_u8(vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1)));
3103 #elif defined(NLIB_SSE41)
3104  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
3105  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
3106  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
3107  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
3108  return I128::Test8(m, s);
3109 #endif
3110 }
3111 
3112 NLIB_M(i128) I128::SetMask16(int mask) NLIB_NOEXCEPT {
3113 #if defined(NLIB_NEON)
3114  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3115  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3116  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3117  uint16x8_t s = vdupq_n_u16(mask);
3118  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
3119 #elif defined(NLIB_SSE41)
3120  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
3121  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
3122  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3123  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
3124  return I128::Test16(m, s);
3125 #endif
3126 }
3127 
3128 NLIB_M(i128) I128::SetMask32(int mask) NLIB_NOEXCEPT {
3129 #if defined(NLIB_NEON)
3130  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3131  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3132  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3133  uint32x4_t s = vdupq_n_u32(mask);
3134  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
3135 #elif defined(NLIB_SSE41)
3136  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
3137  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
3138  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3139  i128 s = I128::SetValue(mask, each_int32);
3140  return I128::Test32(m, s);
3141 #endif
3142 }
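
// Usage sketch (illustrative, not part of the original header): SetMask32 is
// the inverse of MoveMask32, expanding one bit per 32-bit lane back into an
// all-ones/all-zeros lane, which is the form Select expects.
inline i128 __vectorcall ExampleKeepLanes(i128arg value, int lane_bits) NLIB_NOEXCEPT {
  i128 mask = I128::SetMask32(lane_bits & 0xF);       // bit i -> lane i = 0xFFFFFFFF
  return I128::Select(mask, value, I128::SetZero());  // zero the lanes whose bit is clear
}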
3143 
3144 // true if value == 0
3145 NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT {
3146 #if defined(NLIB_SSE41)
3147  return _mm_testz_si128(value, value) != 0;
3148 #elif defined(NLIB_NEON)
3149 #ifdef __aarch64__
3150  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
3151  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3152 #else
3153  // NOTE: there is a better way
3154  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics
3155  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
3156  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
3157 #endif
3158 #endif
3159 }
3160 
3161 // true if all bits are set
3162 NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT {
3163 #if defined(NLIB_SSE41)
3164  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
3165 #elif defined(NLIB_NEON)
3166 #ifdef __aarch64__
3167  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
3168  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3169 #else
3170  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
3171  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
3172 #endif
3173 #endif
3174 }
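
// Usage sketch (illustrative, not part of the original header): a common use
// of IsZero is a whole-register equality test; I128::Xor is assumed here as
// the bitwise counterpart of the I128::Or used above.
inline bool __vectorcall ExampleEqual128(i128arg a, i128arg b) NLIB_NOEXCEPT {
  return I128::IsZero(I128::Xor(a, b));  // equal iff a ^ b has no bits set
}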
3175 
3176 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
3177 NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
3178 #if defined(NLIB_SSE41)
3179  return _mm_blendv_epi8(b, a, mask);
3180 #elif defined(NLIB_NEON)
3181  return NLIB_OP3(vbslq, u32, mask, a, b);
3182 #endif
3183 }
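
// Usage sketch (illustrative, not part of the original header): because the
// mask is applied per byte, Select can splice two registers according to a
// scalar bitmask rebuilt with SetMask8.
inline i128 __vectorcall ExampleBlendBytes(int byte_bits, i128arg a, i128arg b) NLIB_NOEXCEPT {
  return I128::Select(I128::SetMask8(byte_bits), a, b);  // byte i from a if bit i set, else from b
}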
3184 
3185 // r[i] = 0 if shuffle.u8[i] == 0x80, r[i] = value[shuffle.u8[i]] if 0 <= shuffle.u8[i] < 16
3186 NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
3187 #if defined(NLIB_SSE41)
3188  return _mm_shuffle_epi8(value, shuffle);
3189 #elif defined(NLIB_NEON)
3190 #ifdef __aarch64__
3191  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
3192 #else
3193  int8x8x2_t x;
3194  x.val[0] = vget_low_s8(value);
3195  x.val[1] = vget_high_s8(value);
3196  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
3197  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
3198  return vcombine_s8(lo, hi);
3199 #endif
3200 #endif
3201 }
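
// Usage sketch (illustrative, not part of the original header): any byte
// permutation is just a constant index vector for Shuffle8; 0x80 entries
// would produce zero bytes, the same convention the Shuffle32 fallback above
// relies on.
inline i128 __vectorcall ExampleReverseBytes(i128arg value) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16)
  const int8_t idx[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  return I128::Shuffle8(value, I128::LoadA16(idx));  // full 16-byte reversal
}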
3202 
3203 // popcount of the mask (returns nlib_popcnt16(I128::MoveMask8(value)))
3204 NLIB_ALWAYS_INLINE int __vectorcall I128::PopCntMask8(i128arg value) NLIB_NOEXCEPT {
3205 #if defined(NLIB_NEON)
3206 #ifdef __aarch64__
3207  int8x16_t tmp = vnegq_s8(value);
3208  return vaddvq_s8(tmp);
3209 #else
3210  int8x16_t tmp = vnegq_s8(value);
3211  int8x8_t lo = vget_low_s8(tmp);
3212  int8x8_t hi = vget_high_s8(tmp);
3213  lo = vadd_s8(lo, hi);
3214  lo = vpadd_s8(lo, lo);
3215  lo = vpadd_s8(lo, lo);
3216  lo = vpadd_s8(lo, lo);
3217  return vget_lane_s8(lo, 0);
3218 #endif
3219 #else
3220  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
3221 #endif
3222 }
3223 
3224 NLIB_ALWAYS_INLINE int __vectorcall I128::ClzMask8(i128arg value) NLIB_NOEXCEPT {
3225  return nlib_clz32(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;
3226 }
3227 
3228 NLIB_ALWAYS_INLINE int __vectorcall I128::CtzMask8(i128arg value) NLIB_NOEXCEPT {
3229  return nlib_ctz32(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
3230 }
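
// Usage sketch (illustrative, not part of the original header): these helpers
// reduce a byte mask to scalar answers; note that CtzMask8 reports 16 when no
// byte is masked (the OR with 0x10000 above guarantees a set bit).
inline void __vectorcall ExampleScanMask(i128arg mask, int* count, int* first) NLIB_NOEXCEPT {
  *count = I128::PopCntMask8(mask);  // number of masked bytes, 0..16
  *first = I128::CtzMask8(mask);     // index of the first masked byte, or 16 if none
}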
3231 
3232 NLIB_ALWAYS_INLINE int __vectorcall I128::SumUint8(i128arg value) NLIB_NOEXCEPT {
3233 #if defined(NLIB_NEON) && defined(__aarch64__)
3234  uint8x16_t x = vreinterpretq_u8_s8(value);
3235  uint16x8_t lo = vmovl_u8(vget_low_u8(x));
3236  uint16x8_t hi = vmovl_high_u8(x);
3237  return vaddvq_u16(lo) + vaddvq_u16(hi);
3238 #else
3239  i128 lo = I128::ConvertFromUint8ToUint16Lo(value);
3240  i128 hi = I128::ConvertFromUint8ToUint16Hi(value);
3241  i128 x = I128::Add16(lo, hi);
3242  x = I128::Add16(x, I128::ByteShiftRight<8>(x));
3243  x = I128::Add16(x, I128::ByteShiftRight<4>(x));
3244  x = I128::Add16(x, I128::ByteShiftRight<2>(x));
3245  return GetUint16FromLane<0>(x);
3246 #endif
3247 }
3248 
3249 NLIB_ALWAYS_INLINE int __vectorcall I128::SumUint16(i128arg value) NLIB_NOEXCEPT {
3250 #if defined(NLIB_NEON) && defined(__aarch64__)
3251  uint16x8_t x = vreinterpretq_u16_s8(value);
3252  uint32x4_t lo = vmovl_u16(vget_low_u16(x));
3253  uint32x4_t hi = vmovl_high_u16(x);
3254  return vaddvq_u32(lo) + vaddvq_u32(hi);
3255 #else
3256  i128 lo = I128::ConvertFromUint16ToUint32Lo(value);
3257  i128 hi = I128::ConvertFromUint16ToUint32Hi(value);
3258  i128 x = I128::Add32(lo, hi);
3259  x = I128::Add32(x, I128::ByteShiftRight<8>(x));
3260  x = I128::Add32(x, I128::ByteShiftRight<4>(x));
3261  return GetUint32FromLane<0>(x);
3262 #endif
3263 }
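
// Usage sketch (illustrative, not part of the original header): SumUint8 adds
// all sixteen unsigned bytes into one scalar (the widening to 16-bit lanes
// above keeps the total exact, at most 16 * 255 = 4080), which is handy for
// block averages or simple checksums.
inline int __vectorcall ExampleAverageOf16Bytes(const void* p) NLIB_NOEXCEPT {
  return I128::SumUint8(I128::LoadA16(p)) / 16;  // truncated mean of a 16-byte aligned block
}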
3264 
3265 #ifdef NLIB_NEON
3266 #undef vreinterpretq_s8_s8
3267 #undef NLIB_OP1
3268 #undef NLIB_OP2
3269 #undef NLIB_OP3
3270 #undef NLIB_CMP
3271 #undef NLIB_SFT
3272 #undef NLIB_CMB
3273 #endif
3274 
3275 #endif // NLIB_DOXYGEN
3276 
3277 #undef NLIB_M
3278 #undef NLIB_M2
3279 
3280 #if defined(NLIB_SSE41)
3281 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3282  { \
3283  (row0) = _mm_shuffle_epi32((row0), _MM_SHUFFLE(3, 1, 2, 0)); \
3284  (row1) = _mm_shuffle_epi32((row1), _MM_SHUFFLE(3, 1, 2, 0)); \
3285  (row2) = _mm_shuffle_epi32((row2), _MM_SHUFFLE(3, 1, 2, 0)); \
3286  (row3) = _mm_shuffle_epi32((row3), _MM_SHUFFLE(3, 1, 2, 0)); \
3287  __m128i t0_transpose32_ = _mm_unpacklo_epi32((row0), (row1)); \
3288  __m128i t1_transpose32_ = _mm_unpackhi_epi32((row0), (row1)); \
3289  __m128i t2_transpose32_ = _mm_unpacklo_epi32((row2), (row3)); \
3290  __m128i t3_transpose32_ = _mm_unpackhi_epi32((row2), (row3)); \
3291  (row0) = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
3292  (row1) = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
3293  (row2) = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
3294  (row3) = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
3295  }
3296 #elif defined(NLIB_NEON)
3297 #ifdef __aarch64__
3298 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3299  { \
3300  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), vreinterpretq_u32_s8(row1)); \
3301  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), vreinterpretq_u32_s8(row3)); \
3302  uint64x2_t row0_, row1_, row2_, row3_; \
3303  row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3304  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3305  row0 = vreinterpretq_s8_u64(row0_); \
3306  row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3307  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3308  row1 = vreinterpretq_s8_u64(row1_); \
3309  row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3310  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3311  row2 = vreinterpretq_s8_u64(row2_); \
3312  row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3313  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3314  row3 = vreinterpretq_s8_u64(row3_); \
3315  }
3316 #else
3317 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3318  { \
3319  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), vreinterpretq_u32_s8(row1)); \
3320  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), vreinterpretq_u32_s8(row3)); \
3321  uint32x4_t row0_, row1_, row2_, row3_; \
3322  uint32x2_t lo, hi; \
3323  lo = vget_low_u32(trn_f0_.val[0]); \
3324  hi = vget_low_u32(trn_f1_.val[0]); \
3325  row0_ = vcombine_u32(lo, hi); \
3326  row0 = vreinterpretq_s8_u32(row0_); \
3327  lo = vget_low_u32(trn_f0_.val[1]); \
3328  hi = vget_low_u32(trn_f1_.val[1]); \
3329  row1_ = vcombine_u32(lo, hi); \
3330  row1 = vreinterpretq_s8_u32(row1_); \
3331  lo = vget_high_u32(trn_f0_.val[0]); \
3332  hi = vget_high_u32(trn_f1_.val[0]); \
3333  row2_ = vcombine_u32(lo, hi); \
3334  row2 = vreinterpretq_s8_u32(row2_); \
3335  lo = vget_high_u32(trn_f0_.val[1]); \
3336  hi = vget_high_u32(trn_f1_.val[1]); \
3337  row3_ = vcombine_u32(lo, hi); \
3338  row3 = vreinterpretq_s8_u32(row3_); \
3339  }
3340 #endif
3341 #endif
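
// Usage sketch (illustrative, not part of the original header): the macro
// transposes four rows of 32-bit lanes in registers, e.g. to convert an
// array-of-structures {x, y, z, w} block into structure-of-arrays form.
inline void __vectorcall ExampleTranspose4x4(const int32_t* src16aligned,
                                             i128* col0, i128* col1,
                                             i128* col2, i128* col3) NLIB_NOEXCEPT {
  i128 r0 = I128::LoadA16(&src16aligned[0]);
  i128 r1 = I128::LoadA16(&src16aligned[4]);
  i128 r2 = I128::LoadA16(&src16aligned[8]);
  i128 r3 = I128::LoadA16(&src16aligned[12]);
  NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);  // rows become columns
  *col0 = r0; *col1 = r1; *col2 = r2; *col3 = r3;
}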
3342 
3343 #endif // NLIB_SIMD
3344 
3345 } // namespace simd
3346 NLIB_NAMESPACE_END
3347 
3348 #endif // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
An empty struct used as a tag to indicate a 32-bit signed integer.
Definition: SimdInt.h:35
An empty struct used as a tag to indicate a 64-bit signed integer.
Definition: SimdInt.h:37
#define NLIB_ALWAYS_INLINE
Strongly suggests to the compiler that the function be expanded inline.
Definition: Platform_unix.h:95
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag; a tag indicating an 8-bit unsigned integer.
Definition: SimdInt.h:51
An empty struct used as a tag to indicate selection of lanes split into 8-bit units.
Definition: SimdInt.h:61
An empty struct used as a tag to indicate selection of lanes split into 32-bit units.
Definition: SimdInt.h:57
An empty struct used as a tag to indicate an 8-bit signed integer.
Definition: SimdInt.h:31
#define NLIB_VIS_HIDDEN
Keeps symbols such as functions and classes from being exported outside the library.
Definition: Platform_unix.h:86
static int nlib_clz32(uint32_t x)
Returns the number of consecutive zero bits counted from the MSB (most significant bit).
Definition: Platform.h:2690
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag; a tag indicating a 16-bit unsigned integer.
Definition: SimdInt.h:52
An empty struct used as a tag to indicate a 16-bit unsigned integer.
Definition: SimdInt.h:41
An empty struct used as a tag to indicate a 64-bit unsigned integer.
Definition: SimdInt.h:45
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag; a tag indicating a 64-bit signed integer.
Definition: SimdInt.h:50
nlib_i128_t i128
A typedef of nlib_i128_t.
Definition: SimdInt.h:74
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag; a tag indicating a 64-bit unsigned integer.
Definition: SimdInt.h:54
static int nlib_popcnt16(uint16_t x)
Returns the number of bits that are set to 1.
Definition: Platform.h:2554
A class for integer SIMD operations using 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ...
Definition: SimdInt.h:82
static int nlib_ctz32(uint32_t x)
Returns the number of consecutive zero bits counted from the LSB (least significant bit).
Definition: Platform.h:2691
An empty struct used as a tag to indicate an 8-bit unsigned integer.
Definition: SimdInt.h:39
An empty struct used as a tag to indicate a 16-bit signed integer.
Definition: SimdInt.h:33
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag; a tag indicating a 16-bit signed integer.
Definition: SimdInt.h:48
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag indicating a 32-bit unsigned integer.
Definition: SimdInt.h:53
#define NLIB_NOEXCEPT
Defined as noexcept or an equivalent, depending on the environment.
Definition: Config.h:109
An empty struct used as a tag to indicate selection of lanes split into 16-bit units.
Definition: SimdInt.h:59
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag; a tag indicating selection of 16-bit lanes. ...
Definition: SimdInt.h:64
#define NLIB_CEXPR
Defined as constexpr when available; otherwise expands to nothing.
Definition: Config.h:111
The file where per-development-environment settings are written.
__m128i nlib_i128_t
The type for a 128-bit integer SIMD register.
Definition: SimdInt.h:22
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag; a tag indicating selection of 8-bit lanes. ...
Definition: SimdInt.h:65
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
Definition: Config.h:260
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag indicating an 8-bit signed integer.
Definition: SimdInt.h:47
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag indicating selection of 32-bit lanes. ...
Definition: SimdInt.h:63
An empty struct used as a tag to indicate a 32-bit unsigned integer.
Definition: SimdInt.h:43
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag indicating a 32-bit signed integer.
Definition: SimdInt.h:49
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion; uses static_assert when available.
Definition: Config.h:174