SimdInt.h
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
18 
19 #include "nn/nlib/Config.h"
20 
21 #if defined(NLIB_SSE41)
22 typedef __m128i nlib_i128_t;
23 #elif defined(NLIB_NEON)
24 typedef int8x16_t nlib_i128_t;
25 #endif
26 
27 NLIB_NAMESPACE_BEGIN
28 namespace simd {
29 
30 // use each_int8 for the argument value
31 struct each_int8_tag {};
32 // use each_int16 for the argument value
33 struct each_int16_tag {};
34 // use each_int32 for the argument value
35 struct each_int32_tag {};
36 // use each_int64 for the argument value
37 struct each_int64_tag {};
38 // use each_uint8 for the argument value
39 struct each_uint8_tag {};
40 // use each_uint16 for the argument value
41 struct each_uint16_tag {};
42 // use each_uint32 for the argument value
43 struct each_uint32_tag {};
44 // use each_uint64 for the argument value
45 struct each_uint64_tag {};
46 
55 
56 // use each_select32 for the argument value
57 struct each_select32_tag {};
58 // use each_select16 for the argument value
59 struct each_select16_tag {};
60 // use each_select8 for the argument value
61 struct each_select8_tag {};
62 
66 
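// Illustrative note (not part of the original header): the empty *_tag structs above exist
// only to pick a SetValue() overload at compile time. The comments suggest constant
// instances such as each_int8 and each_select32 are declared on lines hidden in this
// listing; constructing a tag directly also works. A minimal sketch, assuming the i128
// typedef and I128 class declared further down:
//
//   i128 a = I128::SetValue(INT32_C(7), each_int32_tag());   // a.s32[i] = 7 for all i
//   i128 b = I128::SetValue<2>(a, each_select32_tag());      // broadcast a.u32[2] to all lanes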
67 #if !defined(_MSC_VER) && !defined(__vectorcall)
68 #define __vectorcall
69 #endif
70 
71 #if defined(NLIB_SIMD)
72 
73 // __m128i(SSE), int8x16_t(NEON)
74 typedef nlib_i128_t i128;
75 
76 #if defined(_MSC_VER) && _MSC_VER < 1800
77 typedef const i128& i128arg;
78 #else
79 typedef const i128 i128arg;
80 #endif
81 
82 class I128 {
 public:
83  static i128 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
84  static i128 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
85  static i128 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
86  static i128 __vectorcall SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT;
87  static i128 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
88  static i128 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
89  static i128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
90  static i128 __vectorcall SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT;
91 
92  template <size_t N>
93  static i128 __vectorcall SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT;
94  template <size_t N>
95  static i128 __vectorcall SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT;
96  template <size_t N>
97  static i128 __vectorcall SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT;
98 
99  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
100  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
101 
102  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
103  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
104  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
105  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
106  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
107  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
108  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
109  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
110  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
111  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
112  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
113  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
114  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
115 
116 #define NLIB_LOAD_REDIRECT(func) \
117  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
118  return func(reinterpret_cast<void*>(p)); \
119  } \
120  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
121  return func(reinterpret_cast<void*>(p)); \
122  }
123  NLIB_LOAD_REDIRECT(LoadA16)
124  NLIB_LOAD_REDIRECT(LoadA8)
125  NLIB_LOAD_REDIRECT(LoadA4)
126  NLIB_LOAD_REDIRECT(LoadA2)
127  NLIB_LOAD_REDIRECT(LoadA1)
128  NLIB_LOAD_REDIRECT(LoadLoA8)
129  NLIB_LOAD_REDIRECT(LoadLoA4)
130  NLIB_LOAD_REDIRECT(LoadLoA2)
131  NLIB_LOAD_REDIRECT(LoadLoA1)
132  NLIB_LOAD_REDIRECT(LoadHiA8)
133  NLIB_LOAD_REDIRECT(LoadHiA4)
134  NLIB_LOAD_REDIRECT(LoadHiA2)
135  NLIB_LOAD_REDIRECT(LoadHiA1)
136 #undef NLIB_LOAD_REDIRECT
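// Illustrative note (not part of the original header): NLIB_LOAD_REDIRECT(func) only adds
// uintptr_t/intptr_t overloads that forward to the const void* overload of the same name,
// so integer-typed addresses can be passed without a cast. For example, the expansion for
// LoadA16 is equivalent to:
//
//   static i128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT {
//     return LoadA16(reinterpret_cast<void*>(p));
//   }
//   static i128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT {
//     return LoadA16(reinterpret_cast<void*>(p));
//   }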
137 
138  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
140  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
141  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
142  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
143  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
144  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
145  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
146  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
147  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
148  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
149  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
150  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
151 
152 #define NLIB_STORE_REDIRECT(func) \
153  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
154  func(reinterpret_cast<void*>(p), value); \
155  } \
156  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
157  func(reinterpret_cast<void*>(p), value); \
158  }
159  NLIB_STORE_REDIRECT(StoreA16)
160  NLIB_STORE_REDIRECT(StoreA8)
161  NLIB_STORE_REDIRECT(StoreA4)
162  NLIB_STORE_REDIRECT(StoreA2)
163  NLIB_STORE_REDIRECT(StoreA1)
164  NLIB_STORE_REDIRECT(StoreLoA8)
165  NLIB_STORE_REDIRECT(StoreLoA4)
166  NLIB_STORE_REDIRECT(StoreLoA2)
167  NLIB_STORE_REDIRECT(StoreLoA1)
168  NLIB_STORE_REDIRECT(StoreHiA8)
169  NLIB_STORE_REDIRECT(StoreHiA4)
170  NLIB_STORE_REDIRECT(StoreHiA2)
171  NLIB_STORE_REDIRECT(StoreHiA1)
172 #undef NLIB_STORE_REDIRECT
173 
174  //
175  // Get/Set
176  //
177  template <size_t N>
178  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
179  template <size_t N>
180  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
181  template <size_t N>
182  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
183  template <size_t N>
184  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
185  template <size_t N>
186  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
187  template <size_t N>
188  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
189  template <size_t N>
190  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
191  template <size_t N>
192  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
193 
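// Illustrative note (not part of the original header): the lane index N of the Get/Set
// accessors above is a compile-time template argument, so it can map to an immediate-operand
// extract/insert instruction. A minimal sketch, assuming the declarations above:
//
//   i128 v = I128::SetValue(UINT32_C(0x11223344), each_uint32_tag());
//   uint32_t x = I128::GetUint32FromLane<1>(v);     // x == 0x11223344
//   v = I128::SetUint32ToLane<3>(v, UINT32_C(0));   // overwrite only lane 3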
194  //
195  // Arithmetic Operations
196  //
197  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
198  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
199  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
200  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
201 
202  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
203  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
204 
205  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
206  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
207 
208  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
209  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
210  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
211  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
212 
213  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
214  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
215 
216  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
217  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
218 
219  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
220  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
221  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
222  // PairwiseMaxInt8, PairwiseMaxInt16, PairwiseMaxInt32
223  // PairwiseMaxUint8, PairwiseMaxUint16, PairwiseMaxUint32
224  // PairwiseMinInt8, PairwiseMinInt16, PairwiseMinInt32
225  // PairwiseMinUint8, PairwiseMinUint16, PairwiseMinUint32
226 
227  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
228  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
229  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
230  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
231  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
232  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
233 
234  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
235  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
236  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
237 
238  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
239  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
240  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
241  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
242  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
243  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
244  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
245  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
246  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
247  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
248  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
249  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
250 
251  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
252  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
253  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
254  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
255  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
256  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
257 
258  //
259  // Logical Operations
260  //
261  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
262  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
263  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
264  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
265  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
266  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
267  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
268  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
269  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
270 
271  //
272  // Comparison Operations
273  //
274  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
275  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
276  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
277  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
278 
279  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
280  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
281  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
282  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
283 
284  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
285  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
286  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
287  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
288 
289  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
290  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
291  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
292  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
293 
294  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
295  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
296  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
297  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
298 
299  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
300  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
301  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
302  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
303 
304  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
305  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
306  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
307  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
308 
309  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
310  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
311  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
312  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
313 
314  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
315  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
316  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
317  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
318 
319  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
320  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
321  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
322  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
323 
324  //
325  // Bit Shift
326  //
327  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
328  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
329  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
330 
331  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
332  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
333  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
334 
335  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
336  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
337  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
338 
339  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
340  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
341 
342  //
343  // Bit Shift(Constant)
344  //
345  template <size_t N>
346  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
347  template <size_t N>
348  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
349  template <size_t N>
350  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
351 
352  template <size_t N>
353  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
354  template <size_t N>
355  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
356  template <size_t N>
357  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
358 
359  template <size_t N>
360  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
361  template <size_t N>
362  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
363  template <size_t N>
364  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
365 
366  template <size_t N>
367  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
368  template <size_t N>
369  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;
370 
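// Illustrative note (not part of the original header): the template forms above take the
// shift count as a compile-time constant, which lets the immediate-shift forms of the
// underlying instructions be used. A minimal sketch:
//
//   i128 x = I128::SetValue(static_cast<uint16_t>(0x00FF), each_uint16_tag());
//   i128 y = I128::ShiftLeftLogical16<4>(x);         // each 16-bit lane becomes 0x0FF0
//   i128 z = I128::ShiftRightArithmetic16<15>(y);    // sign bit of 0x0FF0 is 0, so each lane becomes 0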
371  //
372  // 128bit wide Byte Shift/Rotate
373  //
374  template <size_t N>
375  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
376  template <size_t N>
377  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
378  template <size_t N>
379  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
380  template <size_t N>
381  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
382 
383  //
384  // Conversion
385  //
386  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
387  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
388  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
389 
390  static i128 __vectorcall
391  ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
392  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
393  static i128 __vectorcall
394  ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
395  static i128 __vectorcall
396  ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT; // NOLINT
397 
398  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
399  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
400  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
401  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
402  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
403  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
404  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
405  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
406  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
407  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
408  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
409  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
410 
411  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
412  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
413  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
414  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
415  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
416  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
417  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
418  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
419  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
420  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
421  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
422  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
423 
424  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
425  int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
426  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
427  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
428  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
429  template<int V0, int V1, int V2, int V3>
430  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
431 
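// Illustrative note (not part of the original header): for the Permute templates above, the
// index values select lanes from the concatenation of a and b; e.g. for Permute32, 0-3 name
// a.u32[0..3] and 4-7 name b.u32[0..3] (SetUint32ToLane below relies on this). A minimal sketch:
//
//   // r = { a.u32[0], b.u32[0], a.u32[1], b.u32[1] }, interleaving the low halves
//   i128 r = I128::Permute32<0, 4, 1, 5>(a, b);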
432  //
433  // Swap endian
434  //
435  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
436  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
437  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
438 
439  //
440  // Misc
441  //
442  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
443  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
444  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
445  static i128 __vectorcall SetMask8(int mask) NLIB_NOEXCEPT;
446  static i128 __vectorcall SetMask16(int mask) NLIB_NOEXCEPT;
447  static i128 __vectorcall SetMask32(int mask) NLIB_NOEXCEPT;
448  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
449  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
450  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
451  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
452  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
453  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
454  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
455 
456  private:
457  I128(); // forbidden
458 };
459 
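// Illustrative note (not part of the original header): the comparison and predicate helpers
// compose into ordinary scalar control flow. A minimal sketch, assuming the declarations
// above (Equal16Bytes is a hypothetical helper name):
//
//   inline bool Equal16Bytes(const void* a, const void* b) {
//     i128 va = I128::LoadA1(a);                  // byte-aligned loads
//     i128 vb = I128::LoadA1(b);
//     return I128::IsFull(I128::CmpEq8(va, vb));  // true if every byte lane compared equal
//   }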
460 #ifndef NLIB_DOXYGEN
461 
462 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
463 #define NLIB_M2(tp) inline tp __vectorcall
464 
465 #ifdef NLIB_NEON
 
466 # undef vreinterpretq_s8_s8
467 # undef NLIB_OP1
468 # undef NLIB_OP2
469 # undef NLIB_OP3
470 # undef NLIB_CMP
471 # undef NLIB_SFT
472 # undef NLIB_CMB
473 
474 #define vreinterpretq_s8_s8(a) (a)
475 #define NLIB_OP1(intrin, tp, a) \
476  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
477 #define NLIB_OP2(intrin, tp, a, b) \
478  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
479  vreinterpretq_##tp##_s8(b)))
480 #define NLIB_OP3(intrin, tp, a, b, c) \
481  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
482  vreinterpretq_##tp##_s8(b), \
483  vreinterpretq_##tp##_s8(c)))
484 #define NLIB_CMP(intrin, tp, a, b, utp) \
485  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
486  vreinterpretq_##tp##_s8(b)))
487 #define NLIB_SFT(intrin, tp, a, cnt, stp) \
488  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
489 #define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
490 #endif
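// Illustrative note (not part of the original header): the NLIB_OP*/NLIB_CMP/NLIB_SFT/NLIB_CMB
// helpers wrap a NEON intrinsic of element type `tp` between reinterpret casts so that every
// I128 function can keep int8x16_t as the common storage type. For example,
// NLIB_OP2(vaddq, s16, a, b) expands to:
//
//   vreinterpretq_s8_s16(vaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))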
491 
492 // r.s8[i] = v for all i
493 NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
494 #if defined(NLIB_SSE41)
495  // faster than _mm_set1_epi8(v)
496  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
497 #elif defined(NLIB_NEON)
498  return vdupq_n_s8(v);
499 #endif
500 }
501 
502 // r.s16[i] = v for all i
503 NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
504 #if defined(NLIB_SSE41)
505  return _mm_set1_epi16(v);
506 #elif defined(NLIB_NEON)
507  return vreinterpretq_s8_s16(vdupq_n_s16(v));
508 #endif
509 }
510 
511 // r.s32[i] = v for all i
512 NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
513 #if defined(NLIB_SSE41)
514  return _mm_set1_epi32(v);
515 #elif defined(NLIB_NEON)
516  return vreinterpretq_s8_s32(vdupq_n_s32(v));
517 #endif
518 }
519 
520 // r.s64[i] = v for all i
521 NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
522 #if defined(NLIB_SSE41)
523 #ifdef _MSC_VER
524  // conflicts with ffloor()?
525  // Do not use MMX anyway
526  // return _mm_set1_epi64(*reinterpret_cast<__m64*>(&x));
527  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
528  return I128::LoadA16(tmp);
529 #else
530  return _mm_set1_epi64x(v);
531 #endif
532 #elif defined(NLIB_NEON)
533  return vreinterpretq_s8_s64(vdupq_n_s64(v));
534 #endif
535 }
536 
537 // r.u8[i] = v for all i
538 NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
539 #if defined(NLIB_SSE41)
540  // faster than _mm_set1_epi8(v)
541  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
542 #elif defined(NLIB_NEON)
543  return vreinterpretq_s8_u8(vdupq_n_u8(v));
544 #endif
545 }
546 
547 // r.u16[i] = v for all i
548 NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
549 #if defined(NLIB_SSE41)
550  return _mm_set1_epi16(static_cast<int16_t>(v));
551 #elif defined(NLIB_NEON)
552  return vreinterpretq_s8_u16(vdupq_n_u16(v));
553 #endif
554 }
555 
556 // r.u32[i] = v for all i
557 NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
558 #if defined(NLIB_SSE41)
559  return _mm_set1_epi32(static_cast<int32_t>(v));
560 #elif defined(NLIB_NEON)
561  return vreinterpretq_s8_u32(vdupq_n_u32(v));
562 #endif
563 }
564 
565 // r.u64[i] = v for all i
566 NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
567 #if defined(NLIB_SSE41)
568 #ifdef _MSC_VER
569  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
570  return I128::LoadA16(tmp);
571 #else
572  return _mm_set1_epi64x(static_cast<int64_t>(v));
573 #endif
574 #elif defined(NLIB_NEON)
575  return vreinterpretq_s8_u64(vdupq_n_u64(v));
576 #endif
577 }
578 
579 #if defined(NLIB_SSE41)
580 template <size_t N>
581 // r.u32[i] = value.u32[N] for all i
582 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
583  NLIB_STATIC_ASSERT(N < 4);
584  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
585 }
586 #elif defined(NLIB_NEON)
587 # ifdef __aarch64__
588 template <size_t N>
589 // r.u32[i] = value.u32[N] for all i
590 NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
591  NLIB_STATIC_ASSERT(N < 4);
592  uint32x4_t v = vreinterpretq_u32_s8(value);
593  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
594 }
595 # else
596 template <>
597 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
598  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
599  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
600 }
601 template <>
602 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
603  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
604  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
605 }
606 template <>
607 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
608  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
609  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
610 }
611 template <>
612 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
613  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
614  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
615 }
616 # endif
617 #endif
618 
619 #if defined(NLIB_SSE41)
620 template <size_t N>
621 // r.u16[i] = value.u16[N] for all i
622 NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
623  NLIB_STATIC_ASSERT(N < 8);
624  NLIB_ALIGNAS(16) const int8_t mask[16] = {
625  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
626  2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
627  };
628  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
629 }
630 #elif defined(NLIB_NEON)
631 template <>
632 NLIB_M(i128) I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
633 # ifdef __aarch64__
634  uint16x8_t v = vreinterpretq_u16_s8(value);
635  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
636 # else
637  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
638  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
639 # endif
640 }
641 
642 template <>
643 NLIB_M(i128) I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
644 # ifdef __aarch64__
645  uint16x8_t v = vreinterpretq_u16_s8(value);
646  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
647 # else
648  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
649  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
650 # endif
651 }
652 
653 template <>
654 NLIB_M(i128) I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
655 # ifdef __aarch64__
656  uint16x8_t v = vreinterpretq_u16_s8(value);
657  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
658 # else
659  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
660  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
661 # endif
662 }
663 
664 template <>
665 NLIB_M(i128) I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
666 # ifdef __aarch64__
667  uint16x8_t v = vreinterpretq_u16_s8(value);
668  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
669 # else
670  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
671  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
672 # endif
673 }
674 
675 template <>
676 NLIB_M(i128) I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
677 # ifdef __aarch64__
678  uint16x8_t v = vreinterpretq_u16_s8(value);
679  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
680 # else
681  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
682  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
683 # endif
684 }
685 
686 template <>
687 NLIB_M(i128) I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
688 # ifdef __aarch64__
689  uint16x8_t v = vreinterpretq_u16_s8(value);
690  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
691 # else
692  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
693  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
694 # endif
695 }
696 
697 template <>
698 NLIB_M(i128) I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
699 # ifdef __aarch64__
700  uint16x8_t v = vreinterpretq_u16_s8(value);
701  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
702 # else
703  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
704  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
705 # endif
706 }
707 
708 template <>
709 NLIB_M(i128) I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
710 # ifdef __aarch64__
711  uint16x8_t v = vreinterpretq_u16_s8(value);
712  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
713 # else
714  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
715  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
716 # endif
717 }
718 #endif
719 
720 #if defined(NLIB_SSE41)
721 template <size_t N>
722 // r.u8[i] = value.u8[N] for all i
723 NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
724  NLIB_STATIC_ASSERT(N < 16);
725  NLIB_ALIGNAS(16) const int8_t mask[16] = {
726  N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
727  };
728  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
729 }
730 #elif defined(NLIB_NEON)
731 namespace detail {
732 template <size_t N, bool IsLower>
733 struct SetValue8Helper {
734  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
735  return vdupq_lane_s8(vget_low_s8(value), N);
736  }
737 };
738 
739 template <size_t N>
740 struct SetValue8Helper<N, false> {
741  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
742  return vdupq_lane_s8(vget_high_s8(value), N - 8);
743  }
744 };
745 
746 } // namespace detail
747 
748 template <size_t N>
749 NLIB_M(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
750  NLIB_STATIC_ASSERT(N < 16);
751 # ifdef __aarch64__
752  return vdupq_laneq_s8(value, N);
753 # else
754  return detail::SetValue8Helper<N, (N < 8)>()(value);
755 # endif
756 }
757 #endif
758 
759 // set all bits to 0
760 NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
761 #if defined(NLIB_SSE41)
762  return _mm_setzero_si128();
763 #elif defined(NLIB_NEON)
764  return vdupq_n_s8(0);
765 #endif
766 }
767 
768 // set all bits to 1
769 NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
770 
771 // r[i] = p[i], p is 16 bytes aligned
772 NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
773 #if defined(NLIB_SSE41)
774  return _mm_load_si128(static_cast<const __m128i*>(p));
775 #elif defined(NLIB_NEON)
776  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
777  return vreinterpretq_s8_u64(tmp);
778 #endif
779 }
780 
781 // r[i] = p[i], p is 8 bytes aligned
782 NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
783 #if defined(NLIB_SSE41)
784  return _mm_loadu_si128(static_cast<const __m128i*>(p));
785 #elif defined(NLIB_NEON)
786  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
787  return vreinterpretq_s8_u64(tmp);
788 #endif
789 }
790 
791 // r[i] = p[i], p is 4 bytes aligned
792 NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
793 #if defined(NLIB_SSE41)
794  return _mm_loadu_si128(static_cast<const __m128i*>(p));
795 #elif defined(NLIB_NEON)
796  uint32x4_t tmp = vld1q_u32(static_cast<const uint32_t*>(p));
797  return vreinterpretq_s8_u32(tmp);
798 #endif
799 }
800 
801 // r[i] = p[i], p is 2 bytes aligned
802 NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
803 #if defined(NLIB_SSE41)
804  return _mm_loadu_si128(static_cast<const __m128i*>(p));
805 #elif defined(NLIB_NEON)
806  uint16x8_t tmp = vld1q_u16(static_cast<const uint16_t*>(p));
807  return vreinterpretq_s8_u16(tmp);
808 #endif
809 }
810 
811 // r[i] = p[i]
812 NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
813 #if defined(NLIB_SSE41)
814  return _mm_loadu_si128(static_cast<const __m128i*>(p));
815 #elif defined(NLIB_NEON)
816  return vld1q_s8(static_cast<const int8_t*>(p));
817 #endif
818 }
819 
820 NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
821 #if defined(NLIB_SSE41)
822  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
823 #elif defined(NLIB_NEON)
824  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
825  return vcombine_s8(lo, vdup_n_s8(0));
826 #endif
827 }
828 
829 NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
830 #if defined(NLIB_SSE41)
831  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
832 #elif defined(NLIB_NEON)
833  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
834  return vcombine_s8(lo, vdup_n_s8(0));
835 #endif
836 }
837 
838 NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
839 #if defined(NLIB_SSE41)
840  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
841 #elif defined(NLIB_NEON)
842  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
843  return vcombine_s8(lo, vdup_n_s8(0));
844 #endif
845 }
846 
847 NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
848 #if defined(NLIB_SSE41)
849  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
850 #elif defined(NLIB_NEON)
851  int8x8_t lo = vld1_s8(static_cast<const int8_t*>(p));
852  return vcombine_s8(lo, vdup_n_s8(0));
853 #endif
854 }
855 
856 NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
857 #if defined(NLIB_SSE41)
858  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
859  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
860 #elif defined(NLIB_NEON)
861  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
862  return vcombine_s8(vdup_n_s8(0), hi);
863 #endif
864 }
865 
866 NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
867 #if defined(NLIB_SSE41)
868  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
869  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
870 #elif defined(NLIB_NEON)
871  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
872  return vcombine_s8(vdup_n_s8(0), hi);
873 #endif
874 }
875 
876 NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
877 #if defined(NLIB_SSE41)
878  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
879  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
880 #elif defined(NLIB_NEON)
881  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
882  return vcombine_s8(vdup_n_s8(0), hi);
883 #endif
884 }
885 
886 NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
887 #if defined(NLIB_SSE41)
888  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
889  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
890 #elif defined(NLIB_NEON)
891  int8x8_t hi = vld1_s8(static_cast<const int8_t*>(p));
892  return vcombine_s8(vdup_n_s8(0), hi);
893 #endif
894 }
895 
896 // p[i] = value[i], p is 16 bytes aligned
897 NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
898 #if defined(NLIB_SSE41)
899  _mm_store_si128(static_cast<i128*>(p), value);
900 #elif defined(NLIB_NEON)
901  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
902 #endif
903 }
904 
905 // p[i] = value[i], p is 8 bytes aligned
906 NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
907 #if defined(NLIB_SSE41)
908  _mm_storeu_si128(static_cast<i128*>(p), value);
909 #elif defined(NLIB_NEON)
910  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
911 #endif
912 }
913 
914 // p[i] = value[i], p is 4 bytes aligned
915 NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
916 #if defined(NLIB_SSE41)
917  _mm_storeu_si128(static_cast<i128*>(p), value);
918 #elif defined(NLIB_NEON)
919  vst1q_u32(static_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
920 #endif
921 }
922 
923 // p[i] = value[i], p is 2 bytes aligned
924 NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
925 #if defined(NLIB_SSE41)
926  _mm_storeu_si128(static_cast<i128*>(p), value);
927 #elif defined(NLIB_NEON)
928  vst1q_u16(static_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
929 #endif
930 }
931 
932 // p[i] = value[i]
933 NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
934 #if defined(NLIB_SSE41)
935  _mm_storeu_si128(static_cast<i128*>(p), value);
936 #elif defined(NLIB_NEON)
937  vst1q_s8(static_cast<int8_t*>(p), value);
938 #endif
939 }
940 
941 NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
942 #if defined(NLIB_SSE41)
943  _mm_storel_epi64(static_cast<i128*>(p), value);
944 #elif defined(NLIB_NEON)
945  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
946  vst1_u64(static_cast<uint64_t*>(p), x);
947 #endif
948 }
949 
950 NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
951 #if defined(NLIB_SSE41)
952  _mm_storel_epi64(static_cast<i128*>(p), value);
953 #elif defined(NLIB_NEON)
954  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
955  vst1_u32(static_cast<uint32_t*>(p), x);
956 #endif
957 }
958 
959 NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
960 #if defined(NLIB_SSE41)
961  _mm_storel_epi64(static_cast<i128*>(p), value);
962 #elif defined(NLIB_NEON)
963  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
964  vst1_u16(static_cast<uint16_t*>(p), x);
965 #endif
966 }
967 
968 NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
969 #if defined(NLIB_SSE41)
970  _mm_storel_epi64(static_cast<i128*>(p), value);
971 #elif defined(NLIB_NEON)
972  int8x8_t x = vget_low_s8(value);
973  vst1_s8(static_cast<int8_t*>(p), x);
974 #endif
975 }
976 
977 NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
978 #if defined(NLIB_SSE41)
979  _mm_storel_epi64(static_cast<i128*>(p),
980  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
981 #elif defined(NLIB_NEON)
982  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
983  vst1_u64(static_cast<uint64_t*>(p), x);
984 #endif
985 }
986 
987 NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
988 #if defined(NLIB_SSE41)
989  _mm_storel_epi64(static_cast<i128*>(p),
990  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
991 #elif defined(NLIB_NEON)
992  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
993  vst1_u32(static_cast<uint32_t*>(p), x);
994 #endif
995 }
996 
997 NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
998 #if defined(NLIB_SSE41)
999  _mm_storel_epi64(static_cast<i128*>(p),
1000  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
1001 #elif defined(NLIB_NEON)
1002  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
1003  vst1_u16(static_cast<uint16_t*>(p), x);
1004 #endif
1005 }
1006 
1007 NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
1008 #if defined(NLIB_SSE41)
1009  _mm_storel_epi64(static_cast<i128*>(p),
1010  _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
1011 #elif defined(NLIB_NEON)
1012  int8x8_t x = vget_high_s8(value);
1013  vst1_s8(static_cast<int8_t*>(p), x);
1014 #endif
1015 }
1016 
1017 template <size_t N>
1018 // r = value.u8[N]
1019 NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
1020  NLIB_STATIC_ASSERT(N < 16);
1021 #if defined(NLIB_SSE41)
1022  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
1023 #elif defined(NLIB_NEON)
1024  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
1025 #endif
1026 }
1027 
1028 template <size_t N>
1029 // r = value.u16[N]
1030 NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
1031  NLIB_STATIC_ASSERT(N < 8);
1032 #if defined(NLIB_SSE41)
1033  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
1034 #elif defined(NLIB_NEON)
1035  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
1036 #endif
1037 }
1038 
1039 template <size_t N>
1040 // r = value.u32[N]
1041 NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
1042  NLIB_STATIC_ASSERT(N < 4);
1043 #if defined(NLIB_SSE41)
1044  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
1045 #elif defined(NLIB_NEON)
1046  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
1047 #endif
1048 }
1049 
1050 template <size_t N>
1051 // r = value.u64[N]
1052 NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
1053  NLIB_STATIC_ASSERT(N < 2);
1054 #if defined(NLIB_SSE41)
1055 #ifdef NLIB_64BIT
1056  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
1057 #else
1058  NLIB_UNUSED(value);
1059  return 0; // dummy
1060 #endif
1061 #elif defined(NLIB_NEON)
1062  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
1063 #endif
1064 }
1065 
1066 #if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
1067 template <>
1068 NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
1069  uint64_t rval;
1070  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
1071  return rval;
1072 }
1073 template <>
1074 NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
1075  uint64_t rval;
1076  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
1077  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
1078  return rval;
1079 }
1080 #endif
1081 
1082 template <size_t N>
1083 // r = value, r.u8[N] = v
1084 NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
1085  NLIB_STATIC_ASSERT(N < 16);
1086 #if defined(NLIB_SSE41)
1087  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
1088 #elif defined(NLIB_NEON)
1089  return __builtin_constant_p(v) ?
1090  I128::Permute8<
1091  N == 0 ? 16 : 0,
1092  N == 1 ? 17 : 1,
1093  N == 2 ? 18 : 2,
1094  N == 3 ? 19 : 3,
1095  N == 4 ? 20 : 4,
1096  N == 5 ? 21 : 5,
1097  N == 6 ? 22 : 6,
1098  N == 7 ? 23 : 7,
1099  N == 8 ? 24 : 8,
1100  N == 9 ? 25 : 9,
1101  N == 10 ? 26 : 10,
1102  N == 11 ? 27 : 11,
1103  N == 12 ? 28 : 12,
1104  N == 13 ? 29 : 13,
1105  N == 14 ? 30 : 14,
1106  N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
1107  vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
1108 #endif
1109 }
1110 
1111 template <size_t N>
1112 // r = value, r.u16[N] = v
1113 NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
1114  NLIB_STATIC_ASSERT(N < 8);
1115 #if defined(NLIB_SSE41)
1116  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
1117 #elif defined(NLIB_NEON)
1118  return __builtin_constant_p(v) ?
1119  I128::Permute16<
1120  N == 0 ? 8 : 0,
1121  N == 1 ? 9 : 1,
1122  N == 2 ? 10 : 2,
1123  N == 3 ? 11 : 3,
1124  N == 4 ? 12 : 4,
1125  N == 5 ? 13 : 5,
1126  N == 6 ? 14 : 6,
1127  N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
1128  vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
1129 #endif
1130 }
1131 
1132 template <size_t N>
1133 // r = value, r.u32[N] = v
1134 NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
1135  NLIB_STATIC_ASSERT(N < 4);
1136 #if defined(NLIB_SSE41)
1137  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
1138 #elif defined(NLIB_NEON)
1139  return __builtin_constant_p(v) ?
1140  I128::Permute32<N == 0 ? 4 : 0,
1141  N == 1 ? 5 : 1,
1142  N == 2 ? 6 : 2,
1143  N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
1144  vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
1145 #endif
1146 }
1147 
1148 template <size_t N>
1149 // r = value, r.u64[N] = v
1150 NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
1151  NLIB_STATIC_ASSERT(N < 2);
1152 #if defined(NLIB_SSE41)
1153 #ifdef NLIB_64BIT
1154  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
1155 #else
1156  union {
1157  int32_t i32[2];
1158  int64_t i64;
1159  } tmp;
1160  tmp.i64 = static_cast<int64_t>(v);
1161  __m128i rval;
1162  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
1163  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
1164 #endif
1165 #elif defined(NLIB_NEON)
1166  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
1167 #endif
1168 }
1169 
1170 // r.i8[i] = a.i8[i] + b.i8[i]
1171 NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1172 #if defined(NLIB_SSE41)
1173  return _mm_add_epi8(a, b);
1174 #elif defined(NLIB_NEON)
1175  return vaddq_s8(a, b);
1176 #endif
1177 }
1178 
1179 // r.i16[i] = a.i16[i] + b.i16[i]
1180 NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1181 #if defined(NLIB_SSE41)
1182  return _mm_add_epi16(a, b);
1183 #elif defined(NLIB_NEON)
1184  return NLIB_OP2(vaddq, s16, a, b);
1185 #endif
1186 }
1187 
1188 // r.i32[i] = a.i32[i] + b.i32[i]
1189 NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1190 #if defined(NLIB_SSE41)
1191  return _mm_add_epi32(a, b);
1192 #elif defined(NLIB_NEON)
1193  return NLIB_OP2(vaddq, s32, a, b);
1194 #endif
1195 }
1196 
1197 // r.i64[i] = a.i64[i] + b.i64[i]
1198 NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1199 #if defined(NLIB_SSE41)
1200  return _mm_add_epi64(a, b);
1201 #elif defined(NLIB_NEON)
1202  return NLIB_OP2(vaddq, s64, a, b);
1203 #endif
1204 }
1205 
1206 // r.s8[i] = a.s8[i] + b.s8[i] saturated: 127 if the sum > 127, -128 if the sum < -128
1207 NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1208 #if defined(NLIB_SSE41)
1209  return _mm_adds_epi8(a, b);
1210 #elif defined(NLIB_NEON)
1211  return vqaddq_s8(a, b);
1212 #endif
1213 }
1214 
1215 // r.s16[i] = a.s16[i] + b.s16[i] saturated: 32767 if the sum > 32767, -32768 if the sum < -32768
1216 NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1217 #if defined(NLIB_SSE41)
1218  return _mm_adds_epi16(a, b);
1219 #elif defined(NLIB_NEON)
1220  return NLIB_OP2(vqaddq, s16, a, b);
1221 #endif
1222 }
1223 
1224 // r.u8[i] = a.u8[i] + b.u8[i] saturated: 255 if the sum > 255
1225 NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1226 #if defined(NLIB_SSE41)
1227  return _mm_adds_epu8(a, b);
1228 #elif defined(NLIB_NEON)
1229  return NLIB_OP2(vqaddq, u8, a, b);
1230 #endif
1231 }
1232 
1233 // r.u16[i] = a.u16[i] + b.u16[i] saturated: 65535 if the sum > 65535
1234 NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1235 #if defined(NLIB_SSE41)
1236  return _mm_adds_epu16(a, b);
1237 #elif defined(NLIB_NEON)
1238  return NLIB_OP2(vqaddq, u16, a, b);
1239 #endif
1240 }
1241 
1242 // r.i8[i] = a.i8[i] - b.i8[i]
1243 NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1244 #if defined(NLIB_SSE41)
1245  return _mm_sub_epi8(a, b);
1246 #elif defined(NLIB_NEON)
1247  return vsubq_s8(a, b);
1248 #endif
1249 }
1250 
1251 // r.i16[i] = a.i16[i] - b.i16[i]
1252 NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1253 #if defined(NLIB_SSE41)
1254  return _mm_sub_epi16(a, b);
1255 #elif defined(NLIB_NEON)
1256  return NLIB_OP2(vsubq, s16, a, b);
1257 #endif
1258 }
1259 
1260 // r.i32[i] = a.i32[i] - b.i32[i]
1261 NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1262 #if defined(NLIB_SSE41)
1263  return _mm_sub_epi32(a, b);
1264 #elif defined(NLIB_NEON)
1265  return NLIB_OP2(vsubq, s32, a, b);
1266 #endif
1267 }
1268 
1269 // r.i64[i] = a.i64[i] - b.i64[i]
1270 NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1271 #if defined(NLIB_SSE41)
1272  return _mm_sub_epi64(a, b);
1273 #elif defined(NLIB_NEON)
1274  return NLIB_OP2(vsubq, s64, a, b);
1275 #endif
1276 }
1277 
1278 // r.s8[i] = a.s8[i] - b.s8[i] saturated: 127 if the difference > 127, -128 if the difference < -128
1279 NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1280 #if defined(NLIB_SSE41)
1281  return _mm_subs_epi8(a, b);
1282 #elif defined(NLIB_NEON)
1283  return NLIB_OP2(vqsubq, s8, a, b);
1284 #endif
1285 }
1286 
1287 // r.s16[i] = a.s16[i] - b.s16[i] saturated: 32767 if the difference > 32767, -32768 if the difference < -32768
1288 NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1289 #if defined(NLIB_SSE41)
1290  return _mm_subs_epi16(a, b);
1291 #elif defined(NLIB_NEON)
1292  return NLIB_OP2(vqsubq, s16, a, b);
1293 #endif
1294 }
1295 
1296 // r.u8[i] = a.u8[i] - b.u8[i] saturated: 0 if the difference < 0
1297 NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1298 #if defined(NLIB_SSE41)
1299  return _mm_subs_epu8(a, b);
1300 #elif defined(NLIB_NEON)
1301  return NLIB_OP2(vqsubq, u8, a, b);
1302 #endif
1303 }
1304 
1305 // r.u16[i] = a.u16[i] - b.u16[i] saturated: 0 if the difference < 0
1306 NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
1307 #if defined(NLIB_SSE41)
1308  return _mm_subs_epu16(a, b);
1309 #elif defined(NLIB_NEON)
1310  return NLIB_OP2(vqsubq, u16, a, b);
1311 #endif
1312 }
1313 
1314 // r.i8[0..7] = a.i8[0]+a.i8[1], ..., a.i8[14]+a.i8[15]; r.i8[8..15] = b.i8[0]+b.i8[1], ..., b.i8[14]+b.i8[15]
1315 NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1316 #if defined(NLIB_SSE41)
1317  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
1318  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
1319  return I128::NarrowFrom16To8(ax, bx);
1320 #elif defined(NLIB_NEON)
1321 # ifdef __aarch64__
1322  return vpaddq_s8(a, b);
1323 # else
1324  int8x8_t al = vget_low_s8(a);
1325  int8x8_t ah = vget_high_s8(a);
1326  int8x8_t rl = vpadd_s8(al, ah);
1327  int8x8_t bl = vget_low_s8(b);
1328  int8x8_t bh = vget_high_s8(b);
1329  int8x8_t rh = vpadd_s8(bl, bh);
1330  return vcombine_s8(rl, rh);
1331 # endif
1332 #endif
1333 }
1334 
1335 // r.i16[0..3] = a.i16[0]+a.i16[1], ..., a.i16[6]+a.i16[7]; r.i16[4..7] = b.i16[0]+b.i16[1], ..., b.i16[6]+b.i16[7]
1336 NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1337 #if defined(NLIB_SSE41)
1338  return _mm_hadd_epi16(a, b);
1339 #elif defined(NLIB_NEON)
1340 # ifdef __aarch64__
1341  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
1342 # else
1343  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
1344  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
1345  int16x4_t rl = vpadd_s16(al, ah);
1346  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
1347  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
1348  int16x4_t rh = vpadd_s16(bl, bh);
1349  return NLIB_CMB(s16, rl, rh);
1350 # endif
1351 #endif
1352 }
1353 
1354 // r.i32[0] = a.i32[0]+a.i32[1], r.i32[1] = a.i32[2]+a.i32[3], r.i32[2] = b.i32[0]+b.i32[1], r.i32[3] = b.i32[2]+b.i32[3]
1355 NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1356 #if defined(NLIB_SSE41)
1357  return _mm_hadd_epi32(a, b);
1358 #elif defined(NLIB_NEON)
1359 # ifdef __aarch64__
1360  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
1361 # else
1362  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
1363  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
1364  int32x2_t rl = vpadd_s32(al, ah);
1365  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
1366  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
1367  int32x2_t rh = vpadd_s32(bl, bh);
1368  return NLIB_CMB(s32, rl, rh);
1369 # endif
1370 #endif
1371 }
1372 
1373 // r.i16[i] = a.i16[i] * b.i16[i]
1374 NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1375 #if defined(NLIB_SSE41)
1376  return _mm_mullo_epi16(a, b);
1377 #elif defined(NLIB_NEON)
1378  return NLIB_OP2(vmulq, s16, a, b);
1379 #endif
1380 }
1381 
1382 // r = c + a * b
1383 NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1384 #if defined(NLIB_SSE41)
1385  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
1386 #elif defined(NLIB_NEON)
1387  return NLIB_OP3(vmlaq, s16, c, a, b);
1388 #endif
1389 }
1390 
1391 // r = c - a * b
1392 NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1393 #if defined(NLIB_SSE41)
1394  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
1395 #elif defined(NLIB_NEON)
1396  return NLIB_OP3(vmlsq, s16, c, a, b);
1397 #endif
1398 }
1399 
1400 // r.i32[i] = a.i32[i] * b.i32[i]
1401 NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1402 #if defined(NLIB_SSE41)
1403  return _mm_mullo_epi32(a, b);
1404 #elif defined(NLIB_NEON)
1405  return NLIB_OP2(vmulq, s32, a, b);
1406 #endif
1407 }
1408 
1409 // r = c + a * b
1410 NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1411 #if defined(NLIB_SSE41)
1412  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
1413 #elif defined(NLIB_NEON)
1414  return NLIB_OP3(vmlaq, s32, c, a, b);
1415 #endif
1416 }
1417 
1418 // r = c - a * b
1419 NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
1420 #if defined(NLIB_SSE41)
1421  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
1422 #elif defined(NLIB_NEON)
1423  return NLIB_OP3(vmlsq, s32, c, a, b);
1424 #endif
1425 }
1426 
1427 // r.s8[i] = max(a.s8[i], b.s8[i])
1428 NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1429 #if defined(NLIB_SSE41)
1430  return _mm_max_epi8(a, b);
1431 #elif defined(NLIB_NEON)
1432  return NLIB_OP2(vmaxq, s8, a, b);
1433 #endif
1434 }
1435 
1436 // r.s16[i] = max(a.s16[i], b.s16[i])
1437 NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1438 #if defined(NLIB_SSE41)
1439  return _mm_max_epi16(a, b);
1440 #elif defined(NLIB_NEON)
1441  return NLIB_OP2(vmaxq, s16, a, b);
1442 #endif
1443 }
1444 
1445 // r.s32[i] = max(a.s32[i], b.s32[i])
1446 NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1447 #if defined(NLIB_SSE41)
1448  return _mm_max_epi32(a, b);
1449 #elif defined(NLIB_NEON)
1450  return NLIB_OP2(vmaxq, s32, a, b);
1451 #endif
1452 }
1453 
1454 // r.u8[i] = max(a.u8[i], b.u8[i])
1455 NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1456 #if defined(NLIB_SSE41)
1457  return _mm_max_epu8(a, b);
1458 #elif defined(NLIB_NEON)
1459  return NLIB_OP2(vmaxq, u8, a, b);
1460 #endif
1461 }
1462 
1463 // r.u16[i] = max(a.u16[i], b.u16[i])
1464 NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1465 #if defined(NLIB_SSE41)
1466  return _mm_max_epu16(a, b);
1467 #elif defined(NLIB_NEON)
1468  return NLIB_OP2(vmaxq, u16, a, b);
1469 #endif
1470 }
1471 
1472 // r.u32[i] = max(a.u32[i], b.u32[i])
1473 NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1474 #if defined(NLIB_SSE41)
1475  return _mm_max_epu32(a, b);
1476 #elif defined(NLIB_NEON)
1477  return NLIB_OP2(vmaxq, u32, a, b);
1478 #endif
1479 }
1480 
1481 // r.s8[i] = min(a.s8[i], b.s8[i])
1482 NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1483 #if defined(NLIB_SSE41)
1484  return _mm_min_epi8(a, b);
1485 #elif defined(NLIB_NEON)
1486  return NLIB_OP2(vminq, s8, a, b);
1487 #endif
1488 }
1489 
1490 // r.s16[i] = min(a.s16[i], b.s16[i])
1491 NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1492 #if defined(NLIB_SSE41)
1493  return _mm_min_epi16(a, b);
1494 #elif defined(NLIB_NEON)
1495  return NLIB_OP2(vminq, s16, a, b);
1496 #endif
1497 }
1498 
1499 // r.s32[i] = min(a.s32[i], b.s32[i])
1500 NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1501 #if defined(NLIB_SSE41)
1502  return _mm_min_epi32(a, b);
1503 #elif defined(NLIB_NEON)
1504  return NLIB_OP2(vminq, s32, a, b);
1505 #endif
1506 }
1507 
1508 // r.u8[i] = min(a.u8[i], b.u8[i])
1509 NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1510 #if defined(NLIB_SSE41)
1511  return _mm_min_epu8(a, b);
1512 #elif defined(NLIB_NEON)
1513  return NLIB_OP2(vminq, u8, a, b);
1514 #endif
1515 }
1516 
1517 // r.u16[i] = min(a.u16[i], b.u16[i])
1518 NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1519 #if defined(NLIB_SSE41)
1520  return _mm_min_epu16(a, b);
1521 #elif defined(NLIB_NEON)
1522  return NLIB_OP2(vminq, u16, a, b);
1523 #endif
1524 }
1525 
1526 // r.u32[i] = min(a.u32[i], b.u32[i])
1527 NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1528 #if defined(NLIB_SSE41)
1529  return _mm_min_epu32(a, b);
1530 #elif defined(NLIB_NEON)
1531  return NLIB_OP2(vminq, u32, a, b);
1532 #endif
1533 }
1534 
1535 // r.s8[i] = abs(value.s8[i])
1536 NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
1537 #if defined(NLIB_SSE41)
1538  return _mm_abs_epi8(value);
1539 #elif defined(NLIB_NEON)
1540  return NLIB_OP1(vabsq, s8, value);
1541 #endif
1542 }
1543 
1544 // r.s16[i] = abs(value.s16[i])
1545 NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
1546 #if defined(NLIB_SSE41)
1547  return _mm_abs_epi16(value);
1548 #elif defined(NLIB_NEON)
1549  return NLIB_OP1(vabsq, s16, value);
1550 #endif
1551 }
1552 
1553 // r.s32[i] = abs(value.s32[i])
1554 NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
1555 #if defined(NLIB_SSE41)
1556  return _mm_abs_epi32(value);
1557 #elif defined(NLIB_NEON)
1558  return NLIB_OP1(vabsq, s32, value);
1559 #endif
1560 }
1561 
1562 // r.s8[i] = abs(a.s8[i] - b.s8[i])
1563 NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1564 #if defined(NLIB_SSE41)
1565  return _mm_abs_epi8(_mm_sub_epi8(a, b));
1566 #elif defined(NLIB_NEON)
1567  return NLIB_OP2(vabdq, s8, a, b);
1568 #endif
1569 }
1570 
1571 // r.s16[i] = abs(a.s16[i] - b.s16[i])
1572 NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1573 #if defined(NLIB_SSE41)
1574  return _mm_abs_epi16(_mm_sub_epi16(a, b));
1575 #elif defined(NLIB_NEON)
1576  return NLIB_OP2(vabdq, s16, a, b);
1577 #endif
1578 }
1579 
1580 // r.s32[i] = abs(a.s32[i] - b.s32[i])
1581 NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1582 #if defined(NLIB_SSE41)
1583  return _mm_abs_epi32(_mm_sub_epi32(a, b));
1584 #elif defined(NLIB_NEON)
1585  return NLIB_OP2(vabdq, s32, a, b);
1586 #endif
1587 }
1588 
1589 // r.s8[i] = -value.s8[i]
1590 NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
1591 #if defined(NLIB_SSE41)
1592  return _mm_sub_epi8(_mm_setzero_si128(), value);
1593 #elif defined(NLIB_NEON)
1594  return NLIB_OP1(vnegq, s8, value);
1595 #endif
1596 }
1597 
1598 // r.s16[i] = -value.s16[i]
1599 NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
1600 #if defined(NLIB_SSE41)
1601  return _mm_sub_epi16(_mm_setzero_si128(), value);
1602 #elif defined(NLIB_NEON)
1603  return NLIB_OP1(vnegq, s16, value);
1604 #endif
1605 }
1606 
1607 // r.s32[i] = -value.s32[i]
1608 NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
1609 #if defined(NLIB_SSE41)
1610  return _mm_sub_epi32(_mm_setzero_si128(), value);
1611 #elif defined(NLIB_NEON)
1612  return NLIB_OP1(vnegq, s32, value);
1613 #endif
1614 }
1615 
1616 // r = a & b
1617 NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
1618 #if defined(NLIB_SSE41)
1619  return _mm_and_si128(a, b);
1620 #elif defined(NLIB_NEON)
1621  return NLIB_OP2(vandq, s8, a, b);
1622 #endif
1623 }
1624 
1625 // r = a | b
1626 NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
1627 #if defined(NLIB_SSE41)
1628  return _mm_or_si128(a, b);
1629 #elif defined(NLIB_NEON)
1630  return NLIB_OP2(vorrq, s8, a, b);
1631 #endif
1632 }
1633 
1634 // r = a ^ b
1635 NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
1636 #if defined(NLIB_SSE41)
1637  return _mm_xor_si128(a, b);
1638 #elif defined(NLIB_NEON)
1639  return NLIB_OP2(veorq, s8, a, b);
1640 #endif
1641 }
1642 
1643 // r = ~a
1644 NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
1645 #if defined(NLIB_SSE41)
1646  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1647 #elif defined(NLIB_NEON)
1648  return NLIB_OP1(vmvnq, s8, a);
1649 #endif
1650 }
1651 
1652 // r = ~a & b
1653 NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1654 #if defined(NLIB_SSE41)
1655  return _mm_andnot_si128(a, b);
1656 #elif defined(NLIB_NEON)
1657  return NLIB_OP2(vbicq, s8, b, a);
1658 #endif
1659 }
1660 
1661 // r = ~a | b
1662 NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1663 #if defined(NLIB_SSE41)
1664  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1665  return _mm_or_si128(not_a, b);
1666 #elif defined(NLIB_NEON)
1667  return NLIB_OP2(vornq, s8, b, a);
1668 #endif
1669 }
1670 
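// r.i8[i] = (a.i8[i] & b.i8[i]) != 0 ? 0xFF : 0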
1671 NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1672 #if defined(NLIB_NEON)
1673  return vtstq_s8(a, b);
1674 #else
1675  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
1676 #endif
1677 }
1678 
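// r.i16[i] = (a.i16[i] & b.i16[i]) != 0 ? 0xFFFF : 0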
1679 NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1680 #if defined(NLIB_NEON)
1681  return NLIB_OP2(vtstq, s16, a, b);
1682 #else
1683  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
1684 #endif
1685 }
1686 
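// r.i32[i] = (a.i32[i] & b.i32[i]) != 0 ? 0xFFFFFFFF : 0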
1687 NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1688 #if defined(NLIB_NEON)
1689  return NLIB_OP2(vtstq, s32, a, b);
1690 #else
1691  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
1692 #endif
1693 }
1694 
1695 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
1696 NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1697 #if defined(NLIB_SSE41)
1698  return _mm_cmpeq_epi8(a, b);
1699 #elif defined(NLIB_NEON)
1700  return NLIB_CMP(vceqq, s8, a, b, u8);
1701 #endif
1702 }
1703 
1704 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
1705 NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1706 #if defined(NLIB_SSE41)
1707  return _mm_cmpeq_epi16(a, b);
1708 #elif defined(NLIB_NEON)
1709  return NLIB_CMP(vceqq, s16, a, b, u16);
1710 #endif
1711 }
1712 
1713 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
1714 NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1715 #if defined(NLIB_SSE41)
1716  return _mm_cmpeq_epi32(a, b);
1717 #elif defined(NLIB_NEON)
1718  return NLIB_CMP(vceqq, s32, a, b, u32);
1719 #endif
1720 }
1721 
1722 // r.i64[i] = a.i64[i] == b.i64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1723 NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1724 #if defined(NLIB_SSE41)
1725  return _mm_cmpeq_epi64(a, b);
1726 #elif defined(NLIB_NEON)
1727 #ifdef __aarch64__
1728  return NLIB_CMP(vceqq, s64, a, b, u64);
1729 #else
1730  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
1731  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
1732  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
1733  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1734  return vreinterpretq_s8_s64(result);
1735 #endif
1736 #endif
1737 }
1738 
1739 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
1740 NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1741 #if defined(NLIB_SSE41)
1742  return _mm_cmplt_epi8(a, b);
1743 #elif defined(NLIB_NEON)
1744  return NLIB_CMP(vcltq, s8, a, b, u8);
1745 #endif
1746 }
1747 
1748 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
1749 NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1750 #if defined(NLIB_SSE41)
1751  return _mm_cmplt_epi16(a, b);
1752 #elif defined(NLIB_NEON)
1753  return NLIB_CMP(vcltq, s16, a, b, u16);
1754 #endif
1755 }
1756 
1757 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
1758 NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1759 #if defined(NLIB_SSE41)
1760  return _mm_cmplt_epi32(a, b);
1761 #elif defined(NLIB_NEON)
1762  return NLIB_CMP(vcltq, s32, a, b, u32);
1763 #endif
1764 }
1765 
1766 // r.s64[i] = a.s64[i] < b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1767 NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1768 #if defined(NLIB_SSE42)
1769  return _mm_cmpgt_epi64(b, a);
1770 #elif defined(NLIB_NEON)
1771 #ifdef __aarch64__
1772  return NLIB_CMP(vcltq, s64, a, b, u64);
1773 #else
1774  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
1775  vreinterpret_s32_s8(vget_high_s8(a)));
1776  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
1777  vreinterpret_s32_s8(vget_high_s8(b)));
1778  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
1779  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
1780  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1781  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1782  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1783  return vreinterpretq_s8_s64(result);
1784 #endif
1785 #else
1786  i128 cmp = I128::CmpLtInt32(a, b);
1787  i128 eq = I128::CmpEq32(a, b);
1788  i128 cmp_lt = I128::CmpLtUint32(a, b);
1789  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1790  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
1791  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1792  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1793 #endif
1794 }
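
// Note: where no native 64-bit compare exists, the fallback paths above
// assemble the result from 32-bit halves: a.s64[i] < b.s64[i] iff the upper
// halves compare less (signed), or the upper halves are equal and the lower
// halves compare less (unsigned). Permute32 broadcasts each half's verdict
// across its whole 64-bit lane before the results are combined with And/Or.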
1795 
1796 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
1797 NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1798 #if defined(NLIB_SSE41)
1799  return _mm_cmpgt_epi8(a, b);
1800 #elif defined(NLIB_NEON)
1801  return NLIB_CMP(vcgtq, s8, a, b, u8);
1802 #endif
1803 }
1804 
1805 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
1806 NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1807 #if defined(NLIB_SSE41)
1808  return _mm_cmpgt_epi16(a, b);
1809 #elif defined(NLIB_NEON)
1810  return NLIB_CMP(vcgtq, s16, a, b, u16);
1811 #endif
1812 }
1813 
1814 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
1815 NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1816 #if defined(NLIB_SSE41)
1817  return _mm_cmpgt_epi32(a, b);
1818 #elif defined(NLIB_NEON)
1819  return NLIB_CMP(vcgtq, s32, a, b, u32);
1820 #endif
1821 }
1822 
1823 // r.s64[i] = a.s64[i] > b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1824 NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1825 #if defined(NLIB_SSE42)
1826  return _mm_cmpgt_epi64(a, b);
1827 #elif defined(NLIB_NEON) && defined(__aarch64__)
1828  return NLIB_CMP(vcgtq, s64, a, b, u64);
1829 #else
1830  return I128::CmpLtInt64(b, a);
1831 #endif
1832 }
1833 
1834 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
1835 NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1836 #if defined(NLIB_SSE41)
1837  i128 ofs = I128::SetValue(0x80, each_uint8);
1838  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1839 #elif defined(NLIB_NEON)
1840  return NLIB_CMP(vcltq, u8, a, b, u8);
1841 #endif
1842 }
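
// Note: SSE4.1 has no unsigned byte compare, so both operands are biased by
// 0x80, which maps 0..255 onto -128..127 while preserving order; the signed
// compare then yields the unsigned result. For example, lanes 200 and 100
// become 72 and -28 after the bias, and 72 < -28 is false, just as 200 < 100
// is false. The same offset trick is used for the wider unsigned comparisons
// below.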
1843 
1844 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
1845 NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1846 #if defined(NLIB_SSE41)
1847  i128 ofs = I128::SetValue(0x80, each_uint8);
1848  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1849 #elif defined(NLIB_NEON)
1850  return NLIB_CMP(vcgtq, u8, a, b, u8);
1851 #endif
1852 }
1853 
1854 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
1855 NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1856 #if defined(NLIB_SSE41)
1857  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1858  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1859 #elif defined(NLIB_NEON)
1860  return NLIB_CMP(vcltq, u16, a, b, u16);
1861 #endif
1862 }
1863 
1864 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
1865 NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1866 #if defined(NLIB_SSE41)
1867  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1868  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1869 #elif defined(NLIB_NEON)
1870  return NLIB_CMP(vcgtq, u16, a, b, u16);
1871 #endif
1872 }
1873 
1874 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
1875 NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1876 #if defined(NLIB_SSE41)
1877  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1878  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1879 #elif defined(NLIB_NEON)
1880  return NLIB_CMP(vcltq, u32, a, b, u32);
1881 #endif
1882 }
1883 
1884 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
1885 NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1886 #if defined(NLIB_SSE41)
1887  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1888  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1889 #elif defined(NLIB_NEON)
1890  return NLIB_CMP(vcgtq, u32, a, b, u32);
1891 #endif
1892 }
1893 
1894 // r.u64[i] = a.u64[i] < b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1895 NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1896 #if defined(NLIB_SSE42)
1897  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1898  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
1899 #elif defined(NLIB_NEON)
1900 #ifdef __aarch64__
1901  return NLIB_CMP(vcltq, u64, a, b, u64);
1902 #else
1903  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
1904  vreinterpret_u32_s8(vget_high_s8(a)));
1905  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
1906  vreinterpret_u32_s8(vget_high_s8(b)));
1907  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
1908  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
1909  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1910  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1911  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1912  return vreinterpretq_s8_s64(result);
1913 #endif
1914 #else
1915  i128 cmp = I128::CmpLtUint32(a, b);
1916  i128 eq = I128::CmpEq32(a, b);
1917  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1918  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
1919  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1920  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1921 #endif
1922 }
1923 
1924 // r.u64[i] = a.u64[i] > b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1925 NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1926 #if defined(NLIB_SSE42)
1927  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1928  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
1929 #elif defined(NLIB_NEON) && defined(__aarch64__)
1930  return NLIB_CMP(vcgtq, u64, a, b, u64);
1931 #else
1932  return I128::CmpLtUint64(b, a);
1933 #endif
1934 }
1935 
1936 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
1937 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1938 #if defined(NLIB_SSE41)
1939  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1940 #elif defined(NLIB_NEON)
1941  return NLIB_CMP(vcleq, s8, a, b, u8);
1942 #endif
1943 }
1944 
1945 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
1946 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1947 #if defined(NLIB_SSE41)
1948  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1949 #elif defined(NLIB_NEON)
1950  return NLIB_CMP(vcleq, s16, a, b, u16);
1951 #endif
1952 }
1953 
1954 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
1955 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1956 #if defined(NLIB_SSE41)
1957  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1958 #elif defined(NLIB_NEON)
1959  return NLIB_CMP(vcleq, s32, a, b, u32);
1960 #endif
1961 }
1962 
1963 // r.s64[i] = a.s64[i] <= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1964 NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1965 #if defined(NLIB_SSE42)
1966  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1967 #elif defined(NLIB_NEON) && defined(__aarch64__)
1968  return NLIB_CMP(vcleq, s64, a, b, u64);
1969 #else
1970  return I128::Not(I128::CmpGtInt64(a, b));
1971 #endif
1972 }
1973 
1974 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
1975 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1976 #if defined(NLIB_SSE41)
1977  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1978 #elif defined(NLIB_NEON)
1979  return NLIB_CMP(vcgeq, s8, a, b, u8);
1980 #endif
1981 }
1982 
1983 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
1984 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1985 #if defined(NLIB_SSE41)
1986  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1987 #elif defined(NLIB_NEON)
1988  return NLIB_CMP(vcgeq, s16, a, b, u16);
1989 #endif
1990 }
1991 
1992 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
1993 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1994 #if defined(NLIB_SSE41)
1995  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1996 #elif defined(NLIB_NEON)
1997  return NLIB_CMP(vcgeq, s32, a, b, u32);
1998 #endif
1999 }
2000 
2001 // r.s64[i] = a.s64[i] >= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2002 NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2003 #if defined(NLIB_SSE42)
2004  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
2005 #elif defined(NLIB_NEON) && defined(__aarch64__)
2006  return NLIB_CMP(vcgeq, s64, a, b, u64);
2007 #else
2008  return I128::Not(I128::CmpLtInt64(a, b));
2009 #endif
2010 }
2011 
2012 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
2013 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2014 #if defined(NLIB_SSE41)
2015  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2016 #elif defined(NLIB_NEON)
2017  return NLIB_CMP(vcleq, u8, a, b, u8);
2018 #endif
2019 }
2020 
2021 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
2022 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2023 #if defined(NLIB_SSE41)
2024  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2025 #elif defined(NLIB_NEON)
2026  return NLIB_CMP(vcleq, u16, a, b, u16);
2027 #endif
2028 }
2029 
2030 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
2031 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2032 #if defined(NLIB_SSE41)
2033  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2034 #elif defined(NLIB_NEON)
2035  return NLIB_CMP(vcleq, u32, a, b, u32);
2036 #endif
2037 }
2038 
2039 // r.u64[i] = a.u64[i] <= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2040 NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2041 #if defined(NLIB_SSE42)
2042  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2043  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2044  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2045 #elif defined(NLIB_NEON) && defined(__aarch64__)
2046  return NLIB_CMP(vcleq, u64, a, b, u64);
2047 #else
2048  return I128::Not(I128::CmpGtUint64(a, b));
2049 #endif
2050 }
2051 
2052 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
2053 NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2054 #if defined(NLIB_SSE41)
2055  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
2056 #elif defined(NLIB_NEON)
2057  return NLIB_CMP(vcgeq, u8, a, b, u8);
2058 #endif
2059 }
2060 
2061 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
2062 NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2063 #if defined(NLIB_SSE41)
2064  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
2065 #elif defined(NLIB_NEON)
2066  return NLIB_CMP(vcgeq, u16, a, b, u16);
2067 #endif
2068 }
2069 
2070 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
2071 NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2072 #if defined(NLIB_SSE41)
2073  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
2074 #elif defined(NLIB_NEON)
2075  return NLIB_CMP(vcgeq, u32, a, b, u32);
2076 #endif
2077 }
2078 
2079 // r.u64[i] = a.u64[i] >= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2080 NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2081 #if defined(NLIB_SSE42)
2082  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2083  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
2084  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2085 #elif defined(NLIB_NEON) && defined(__aarch64__)
2086  return NLIB_CMP(vcgeq, u64, a, b, u64);
2087 #else
2088  return I128::Not(I128::CmpLtUint64(a, b));
2089 #endif
2090 }
2091 
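// r.i8[i] = value.i8[i] == 0 ? 0xFF : 0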
2092 NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
2093 #if defined(__aarch64__)
2094  return vceqzq_s8(value);
2095 #else
2096  return I128::CmpEq8(value, I128::SetZero());
2097 #endif
2098 }
2099 
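// r.i16[i] = value.i16[i] == 0 ? 0xFFFF : 0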
2100 NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
2101 #if defined(__aarch64__)
2102  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
2103 #else
2104  return I128::CmpEq16(value, I128::SetZero());
2105 #endif
2106 }
2107 
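// r.i32[i] = value.i32[i] == 0 ? 0xFFFFFFFF : 0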
2108 NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
2109 #if defined(__aarch64__)
2110  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
2111 #else
2112  return I128::CmpEq32(value, I128::SetZero());
2113 #endif
2114 }
2115 
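// r.i64[i] = value.i64[i] == 0 ? 0xFFFFFFFFFFFFFFFF : 0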
2116 NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
2117 #if defined(__aarch64__)
2118  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
2119 #else
2120  return I128::CmpEq64(value, I128::SetZero());
2121 #endif
2122 }
2123 
2124 // r.u8[i] = value.u8[i] << count
2125 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2126 #if defined(NLIB_SSE41)
2127  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2128  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
2129  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
2130  return I128::NarrowFrom16To8(xl, xh);
2131 #elif defined(NLIB_NEON)
2132  return NLIB_SFT(vshlq, u8, value, count, s8);
2133 #endif
2134 }
2135 
2136 // r.u8[i] = value.u8[i] >> count
2137 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2138 #if defined(NLIB_SSE41)
2139  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2140  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
2141  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
2142  return _mm_packus_epi16(xl, xh);
2143 #elif defined(NLIB_NEON)
2144  return NLIB_SFT(vshlq, u8, value, -count, s8);
2145 #endif
2146 }
2147 
2148 // r.s8[i] = value.s8[i] >> count
2149 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
2150 #if defined(NLIB_SSE41)
2151  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2152  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
2153  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
2154  return _mm_packs_epi16(xl, xh); // signed saturation keeps negative lanes intact
2155 #elif defined(NLIB_NEON)
2156  return NLIB_SFT(vshlq, s8, value, -count, s8);
2157 #endif
2158 }
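
// Note: SSE4.1 has no 8-bit shift instructions, so the three byte shifts above
// widen each half of the vector to 16-bit lanes, shift by the runtime count,
// and narrow back down to bytes. NEON shifts by a runtime count with vshlq,
// negating the count for right shifts.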
2159 
2160 // r.u16[i] = value.u16[i] << count
2161 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2162 #if defined(NLIB_SSE41)
2163  return _mm_slli_epi16(value, count);
2164 #elif defined(NLIB_NEON)
2165  return NLIB_SFT(vshlq, u16, value, count, s16);
2166 #endif
2167 }
2168 
2169 // r.u16[i] = value.u16[i] >> count
2170 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2171 #if defined(NLIB_SSE41)
2172  return _mm_srli_epi16(value, count);
2173 #elif defined(NLIB_NEON)
2174  return NLIB_SFT(vshlq, u16, value, -count, s16);
2175 #endif
2176 }
2177 
2178 // r.s16[i] = value.s16[i] >> count
2179 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
2180 #if defined(NLIB_SSE41)
2181  return _mm_srai_epi16(value, count);
2182 #elif defined(NLIB_NEON)
2183  return NLIB_SFT(vshlq, s16, value, -count, s16);
2184 #endif
2185 }
2186 
2187 // r.u32[i] = value.u32[i] << count
2188 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2189 #if defined(NLIB_SSE41)
2190  return _mm_slli_epi32(value, count);
2191 #elif defined(NLIB_NEON)
2192  return NLIB_SFT(vshlq, u32, value, count, s32);
2193 #endif
2194 }
2195 
2196 // r.u32[i] = value.u32[i] >> count
2197 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2198 #if defined(NLIB_SSE41)
2199  return _mm_srli_epi32(value, count);
2200 #elif defined(NLIB_NEON)
2201  return NLIB_SFT(vshlq, u32, value, -count, s32);
2202 #endif
2203 }
2204 
2205 // r.s32[i] = value.s32[i] >> count
2206 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
2207 #if defined(NLIB_SSE41)
2208  return _mm_srai_epi32(value, count);
2209 #elif defined(NLIB_NEON)
2210  return NLIB_SFT(vshlq, s32, value, -count, s32);
2211 #endif
2212 }
2213 
2214 // r.u64[i] = value.u64[i] << count
2215 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2216 #if defined(NLIB_SSE41)
2217  return _mm_slli_epi64(value, count);
2218 #elif defined(NLIB_NEON)
2219  return NLIB_SFT(vshlq, u64, value, count, s64);
2220 #endif
2221 }
2222 
2223 // r.u64[i] = value.u64[i] >> count
2224 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2225 #if defined(NLIB_SSE41)
2226  return _mm_srli_epi64(value, count);
2227 #elif defined(NLIB_NEON)
2228  return NLIB_SFT(vshlq, u64, value, -count, s64);
2229 #endif
2230 }
2231 
2232 template<size_t N>
2233 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
2234  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2235 #ifdef NLIB_NEON
2236  return vshlq_n_s8(value, N);
2237 #else
2238  return I128::ShiftLeftLogical8(value, N);
2239 #endif
2240 }
2241 
2242 template <size_t N>
2243 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
2244  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2245 #ifdef NLIB_NEON
2246  uint8x16_t tmp = vreinterpretq_u8_s8(value);
2247  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
2248 #else
2249  return I128::ShiftRightLogical8(value, N);
2250 #endif
2251 }
2252 
2253 template <size_t N>
2254 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
2255  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2256 #ifdef NLIB_NEON
2257  return vshrq_n_s8(value, N);
2258 #else
2259  return I128::ShiftRightArithmetic8(value, N);
2260 #endif
2261 }
2262 
2263 template <size_t N>
2264 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
2265  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2266 #ifdef NLIB_NEON
2267  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2268  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
2269 #else
2270  return I128::ShiftLeftLogical16(value, N);
2271 #endif
2272 }
2273 
2274 template <size_t N>
2275 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
2276  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2277 #ifdef NLIB_NEON
2278  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2279  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
2280 #else
2281  return I128::ShiftRightLogical16(value, N);
2282 #endif
2283 }
2284 
2285 template <size_t N>
2286 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
2287  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2288 #ifdef NLIB_NEON
2289  int16x8_t tmp = vreinterpretq_s16_s8(value);
2290  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
2291 #else
2292  return I128::ShiftRightArithmetic16(value, N);
2293 #endif
2294 }
2295 
2296 template <size_t N>
2297 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
2298  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2299 #ifdef NLIB_NEON
2300  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2301  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
2302 #else
2303  return I128::ShiftLeftLogical32(value, N);
2304 #endif
2305 }
2306 
2307 template <size_t N>
2308 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
2309  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2310 #ifdef NLIB_NEON
2311  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2312  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
2313 #else
2314  return I128::ShiftRightLogical32(value, N);
2315 #endif
2316 }
2317 
2318 template <size_t N>
2319 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
2320  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2321 #ifdef NLIB_NEON
2322  int32x4_t tmp = vreinterpretq_s32_s8(value);
2323  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
2324 #else
2325  return I128::ShiftRightArithmetic32(value, N);
2326 #endif
2327 }
2328 
2329 template <size_t N>
2330 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
2331  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2332 #ifdef NLIB_NEON
2333  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2334  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
2335 #else
2336  return I128::ShiftLeftLogical64(value, N);
2337 #endif
2338 }
2339 
2340 template <size_t N>
2341 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
2342  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2343 #ifdef NLIB_NEON
2344  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2345  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
2346 #else
2347  return I128::ShiftRightLogical64(value, N);
2348 #endif
2349 }
2350 
2351 #ifdef NLIB_NEON
2352 template<>
2353 NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
2354  NLIB_UNUSED(value);
2355  return I128::SetZero();
2356 }
2357 template<>
2358 NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
2359  return value;
2360 }
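// Assumed to be needed here (not in the original listing): vshrq_n_s8 does not
// accept a zero shift count, so the N == 0 case is specialized like the other
// <0> cases below.
template<>
NLIB_M(i128) I128::ShiftRightArithmetic8<0>(i128arg value) NLIB_NOEXCEPT {
 return value;
}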
2361 template<>
2362 NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
2363  NLIB_UNUSED(value);
2364  return I128::SetZero();
2365 }
2366 template<>
2367 NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
2368  return value;
2369 }
2370 template<>
2371 NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
2372  return value;
2373 }
2374 template<>
2375 NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
2376  NLIB_UNUSED(value);
2377  return I128::SetZero();
2378 }
2379 template<>
2380 NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
2381  return value;
2382 }
2383 template<>
2384 NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
2385  return value;
2386 }
2387 template<>
2388 NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
2389  NLIB_UNUSED(value);
2390  return I128::SetZero();
2391 }
2392 template<>
2393 NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
2394  return value;
2395 }
2396 #endif
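
// Note: the specializations above exist because the NEON immediate-shift
// intrinsics accept only a limited range (vshlq_n_* takes 0..bits-1 and
// vshrq_n_* takes 1..bits), so the boundary cases N == 0 for right shifts and
// N == bit width for left shifts must be handled without emitting a shift.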
2397 
2398 template <size_t N>
2399 // r.i8[i + N] = value.i8[i], fill zero
2400 NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
2401  NLIB_STATIC_ASSERT(N < 16);
2402 #if defined(NLIB_SSE41)
2403  return _mm_slli_si128(value, N);
2404 #elif defined(NLIB_NEON)
2405  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
2406 #endif
2407 }
2408 
2409 template <size_t N>
2410 // r.i8[i - N] = value.i8[i], fill zero
2411 NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
2412  NLIB_STATIC_ASSERT(N < 16);
2413 #if defined(NLIB_SSE41)
2414  return _mm_srli_si128(value, N);
2415 #elif defined(NLIB_NEON)
2416  return vextq_s8(value, vdupq_n_s8(0), N);
2417 #endif
2418 }
2419 
2420 template <size_t N>
2421 // left <- fedcba9876543210 -> right
2422 NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
2423  NLIB_STATIC_ASSERT(N < 16);
2424 #if defined(NLIB_SSE41)
2425  return _mm_alignr_epi8(value, value, N);
2426 #elif defined(NLIB_NEON)
2427  return vextq_s8(value, value, N);
2428 #endif
2429 }
2430 
2431 template <size_t N>
2432 // _mm_alignr_epi8(a, b, N), which returns ((a << 128) | b) >> (N * 8)
2433 NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
2434  NLIB_STATIC_ASSERT(N < 16);
2435 #if defined(NLIB_SSE41)
2436  return _mm_alignr_epi8(a, b, N);
2437 #elif defined(NLIB_NEON)
2438  return vextq_s8(b, a, N);
2439 #endif
2440 }
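
// Example (annotation): AlignR<4>(a, b) yields the bytes
// { b.i8[4..15], a.i8[0..3] }, i.e. the 256-bit concatenation a:b shifted
// right by four bytes, keeping the low 128 bits.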
2441 
2442 // r.u8[i] = value.u8[i * 2]
2443 NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2444 #if defined(NLIB_SSE41)
2445  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2446  __m128i lo_mask = _mm_and_si128(lo, mask);
2447  __m128i hi_mask = _mm_and_si128(hi, mask);
2448  return _mm_packus_epi16(lo_mask, hi_mask);
2449 #elif defined(NLIB_NEON)
2450 # ifdef __aarch64__
2451  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2452  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2453 # else
2454  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2455  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
2456  return NLIB_CMB(u8, l, h);
2457 # endif
2458 #endif
2459 }
2460 
2461 // r.u16[i] = value.u16[i * 2]
2462 NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2463 #if defined(NLIB_SSE41)
2464  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2465  __m128i lo_mask = _mm_and_si128(lo, mask);
2466  __m128i hi_mask = _mm_and_si128(hi, mask);
2467  return _mm_packus_epi32(lo_mask, hi_mask);
2468 #elif defined(NLIB_NEON)
2469 # ifdef __aarch64__
2470  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2471  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2472 # else
2473  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2474  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
2475  return NLIB_CMB(u16, l, h);
2476 # endif
2477 #endif
2478 }
2479 
2480 // r.u32[i] = value.u32[i * 2]
2481 NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2482 #if defined(NLIB_SSE41)
2483  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
2484  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
2485  return _mm_unpacklo_epi64(lo_, hi_);
2486 #elif defined(NLIB_NEON)
2487 # ifdef __aarch64__
2488  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2489  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
2490 # else
2491  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2492  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
2493  return NLIB_CMB(u32, l, h);
2494 # endif
2495 #endif
2496 }
2497 
2498 // r.u8[i] = 255 if value.u16[i] > 255
2499 NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2500 #if defined(NLIB_SSE41)
2501  i128 b00FF = I128::SetValue(0x00FFU, each_uint16);
2502  __m128i lotmp = _mm_min_epu16(lo, b00FF); // clamp to the u8 range before packing
2503  __m128i hitmp = _mm_min_epu16(hi, b00FF);
2504  return _mm_packus_epi16(lotmp, hitmp);
2505 #elif defined(NLIB_NEON)
2506 # ifdef __aarch64__
2507  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2508  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2509 # else
2510  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2511  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
2512  return NLIB_CMB(u8, l, h);
2513 # endif
2514 #endif
2515 }
2516 
2517 // r.s8[i] = 127 if value.s16[i] > 127, -128 if value.s16[i] < -128
2518 NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2519 #if defined(NLIB_SSE41)
2520  return _mm_packs_epi16(lo, hi);
2521 #elif defined(NLIB_NEON)
2522 # ifdef __aarch64__
2523  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2524  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
2525 # else
2526  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2527  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
2528  return NLIB_CMB(s8, l, h);
2529 # endif
2530 #endif
2531 }
2532 
2533 // r.u16[i] = 65535 if value.u32[i] > 65535
2534 NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2535 #if defined(NLIB_SSE41)
2536  i128 bFFFF = I128::SetValue(0xFFFFU, each_uint32);
2537  __m128i lotmp = _mm_min_epu32(lo, bFFFF); // clamp to the u16 range before packing
2538  __m128i hitmp = _mm_min_epu32(hi, bFFFF);
2539  return _mm_packus_epi32(lotmp, hitmp);
2540 #elif defined(NLIB_NEON)
2541 # ifdef __aarch64__
2542  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2543  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2544 # else
2545  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2546  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
2547  return NLIB_CMB(u16, l, h);
2548 # endif
2549 #endif
2550 }
2551 
2552 // r.s16[i] = 32767 if value.s32[i] > 32767, -32768 if value.s32[i] < -32768
2553 NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2554 #if defined(NLIB_SSE41)
2555  return _mm_packs_epi32(lo, hi);
2556 #elif defined(NLIB_NEON)
2557 # ifdef __aarch64__
2558  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2559  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
2560 # else
2561  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2562  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
2563  return NLIB_CMB(s16, l, h);
2564 # endif
2565 #endif
2566 }
2567 
2568 // r.s8[2 * i] = value.s8[i], r.u8[2 * i + 1] = value.s8[i] < 0 ? 0xFF : 0
2569 NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
2570 #if defined(NLIB_SSE41)
2571  return _mm_cvtepi8_epi16(value);
2572 #elif defined(NLIB_NEON)
2573  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
2574 #endif
2575 }
2576 
2577 // r.s8[2 * i] = value.s8[i + 8], r.u8[2 * i + 1] = value.s8[i + 8] < 0 ? 0xFF : 0
2578 NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
2579 #if defined(NLIB_SSE41)
2580  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2581 #elif defined(NLIB_NEON)
2582 # ifdef __aarch64__
2583  int16x8_t result = vmovl_high_s8(value);
2584 # else
2585  int16x8_t result = vmovl_s8(vget_high_s8(value));
2586 # endif
2587  return vreinterpretq_s8_s16(result);
2588 #endif
2589 }
2590 
2591 // r.s16[2 * i] = value.s16[i], r.u16[2 * i + 1] = value.s16[i] < 0 ? 0xFFFF : 0
2592 NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
2593 #if defined(NLIB_SSE41)
2594  return _mm_cvtepi16_epi32(value);
2595 #elif defined(NLIB_NEON)
2596  int16x8_t x = vreinterpretq_s16_s8(value);
2597  int32x4_t result = vmovl_s16(vget_low_s16(x));
2598  return vreinterpretq_s8_s32(result);
2599 #endif
2600 }
2601 
2602 // r.s16[2 * i] = value.s16[i + 4], r.u16[2 * i + 1] = value.s16[i + 4] < 0 ? 0xFFFF : 0
2603 NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
2604 #if defined(NLIB_SSE41)
2605  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2606 #elif defined(NLIB_NEON)
2607  int16x8_t x = vreinterpretq_s16_s8(value);
2608 # ifdef __aarch64__
2609  int32x4_t result = vmovl_high_s16(x);
2610 # else
2611  int32x4_t result = vmovl_s16(vget_high_s16(x));
2612 # endif
2613  return vreinterpretq_s8_s32(result);
2614 #endif
2615 }
2616 
2617 // r.s32[2 * i] = value.s32[i], r.u32[2 * i + 1] = value.s32[i] < 0 ? 0xFFFFFFFF : 0
2618 NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
2619 #if defined(NLIB_SSE41)
2620  return _mm_cvtepi32_epi64(value);
2621 #elif defined(NLIB_NEON)
2622  int32x4_t x = vreinterpretq_s32_s8(value);
2623  int64x2_t result = vmovl_s32(vget_low_s32(x));
2624  return vreinterpretq_s8_s64(result);
2625 #endif
2626 }
2627 
2628 // r.s32[2 * i] = value.s32[i + 2], r.u32[2 * i + 1] = value.s32[i + 2] < 0 ? 0xFFFFFFFF : 0
2629 NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
2630 #if defined(NLIB_SSE41)
2631  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2632 #elif defined(NLIB_NEON)
2633  int32x4_t x = vreinterpretq_s32_s8(value);
2634 # ifdef __aarch64__
2635  int64x2_t result = vmovl_high_s32(x);
2636 # else
2637  int64x2_t result = vmovl_s32(vget_high_s32(x));
2638 # endif
2639  return vreinterpretq_s8_s64(result);
2640 #endif
2641 }
2642 
2643 // r.u16[i] = value.u8[i]
2644 NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
2645 #if defined(NLIB_SSE41)
2646  return _mm_cvtepu8_epi16(value);
2647 #elif defined(NLIB_NEON)
2648  uint8x16_t x = vreinterpretq_u8_s8(value);
2649  uint16x8_t result = vmovl_u8(vget_low_u8(x));
2650  return vreinterpretq_s8_u16(result);
2651 #endif
2652 }
2653 
2654 // r.u16[i] = value.u8[i + 8]
2655 NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
2656 #if defined(NLIB_SSE41)
2657  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2658 #elif defined(NLIB_NEON)
2659  uint8x16_t x = vreinterpretq_u8_s8(value);
2660 # ifdef __aarch64__
2661  uint16x8_t result = vmovl_high_u8(x);
2662 # else
2663  uint16x8_t result = vmovl_u8(vget_high_u8(x));
2664 # endif
2665  return vreinterpretq_s8_u16(result);
2666 #endif
2667 }
2668 
2669 // r.u32[i] = value.u16[i]
2670 NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
2671 #if defined(NLIB_SSE41)
2672  return _mm_cvtepu16_epi32(value);
2673 #elif defined(NLIB_NEON)
2674  uint16x8_t x = vreinterpretq_u16_s8(value);
2675  uint32x4_t result = vmovl_u16(vget_low_u16(x));
2676  return vreinterpretq_s8_u32(result);
2677 #endif
2678 }
2679 
2680 // r.u32[i] = value.u16[i + 4]
2681 NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
2682 #if defined(NLIB_SSE41)
2683  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2684 #elif defined(NLIB_NEON)
2685  uint16x8_t x = vreinterpretq_u16_s8(value);
2686 # ifdef __aarch64__
2687  uint32x4_t result = vmovl_high_u16(x);
2688 # else
2689  uint32x4_t result = vmovl_u16(vget_high_u16(x));
2690 # endif
2691  return vreinterpretq_s8_u32(result);
2692 #endif
2693 }
2694 
2695 // r.u64[i] = value.u32[i]
2696 NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
2697 #if defined(NLIB_SSE41)
2698  return _mm_cvtepu32_epi64(value);
2699 #elif defined(NLIB_NEON)
2700  uint32x4_t x = vreinterpretq_u32_s8(value);
2701  uint64x2_t result = vmovl_u32(vget_low_u32(x));
2702  return vreinterpretq_s8_u64(result);
2703 #endif
2704 }
2705 
2706 // r.u64[i] = value.u32[i + 2]
2707 NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
2708 #if defined(NLIB_SSE41)
2709  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2710 #elif defined(NLIB_NEON)
2711  uint32x4_t x = vreinterpretq_u32_s8(value);
2712 # ifdef __aarch64__
2713  uint64x2_t result = vmovl_high_u32(x);
2714 # else
2715  uint64x2_t result = vmovl_u32(vget_high_u32(x));
2716 # endif
2717  return vreinterpretq_s8_u64(result);
2718 #endif
2719 }
2720 
2721 // 01234567 abcdefgh -> 0a1b2c3d4e5f6g7h
2722 NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2723 #if defined(NLIB_SSE41)
2724  return _mm_unpacklo_epi8(a, b);
2725 #elif defined(NLIB_NEON)
2726 # ifdef __aarch64__
2727  return vzip1q_s8(a, b);
2728 # else
2729  return vzipq_s8(a, b).val[0];
2730 # endif
2731 #endif
2732 }
2733 
2734 // r.i8[2 * i] = a.i8[i + 8], r.i8[2 * i + 1] = b.i8[i + 8]
2735 NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2736 #if defined(NLIB_SSE41)
2737  return _mm_unpackhi_epi8(a, b);
2738 #elif defined(NLIB_NEON)
2739 # ifdef __aarch64__
2740  return vzip2q_s8(a, b);
2741 # else
2742  return vzipq_s8(a, b).val[1];
2743 # endif
2744 #endif
2745 }
2746 
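// r.i8[i] = (i < 8) ? a.i8[2 * i] : b.i8[2 * (i - 8)]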
2747 NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2748 #if defined(NLIB_SSE41)
2749  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2750  __m128i lo_mask = _mm_and_si128(a, mask);
2751  __m128i hi_mask = _mm_and_si128(b, mask);
2752  return _mm_packus_epi16(lo_mask, hi_mask);
2753 #elif defined(NLIB_NEON)
2754 # ifdef __aarch64__
2755  return vuzp1q_s8(a, b);
2756 # else
2757  return vuzpq_s8(a, b).val[0];
2758 # endif
2759 #endif
2760 }
2761 
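// r.i8[i] = (i < 8) ? a.i8[2 * i + 1] : b.i8[2 * (i - 8) + 1]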
2762 NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2763 #if defined(NLIB_SSE41)
2764  i128 mask = I128::SetValue(0xFF00U, each_uint16);
2765  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
2766  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
2767  return _mm_packus_epi16(lo_mask, hi_mask);
2768 #elif defined(NLIB_NEON)
2769 # ifdef __aarch64__
2770  return vuzp2q_s8(a, b);
2771 # else
2772  return vuzpq_s8(a, b).val[1];
2773 # endif
2774 #endif
2775 }
2776 
2777 // 0123 abcd -> 0a1b2c3d
2778 NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2779 #if defined(NLIB_SSE41)
2780  return _mm_unpacklo_epi16(a, b);
2781 #elif defined(NLIB_NEON)
2782 # ifdef __aarch64__
2783  return NLIB_OP2(vzip1q, u16, a, b);
2784 # else
2785  return vreinterpretq_s8_u16(vzipq_u16(
2786  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2787 # endif
2788 #endif
2789 }
2790 
2791 // r.i16[2 * i] = a.i16[i + 4], r.i16[2 * i + 1] = b.i16[i + 4]
2792 NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2793 #if defined(NLIB_SSE41)
2794  return _mm_unpackhi_epi16(a, b);
2795 #elif defined(NLIB_NEON)
2796 # ifdef __aarch64__
2797  return NLIB_OP2(vzip2q, u16, a, b);
2798 # else
2799  return vreinterpretq_s8_u16(vzipq_u16(
2800  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2801 # endif
2802 #endif
2803 }
2804 
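// r.i16[i] = (i < 4) ? a.i16[2 * i] : b.i16[2 * (i - 4)]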
2805 NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2806 #if defined(NLIB_SSE41)
2807  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2808  __m128i lo_mask = _mm_and_si128(a, mask);
2809  __m128i hi_mask = _mm_and_si128(b, mask);
2810  return _mm_packus_epi32(lo_mask, hi_mask);
2811 #elif defined(NLIB_NEON)
2812 # ifdef __aarch64__
2813  return NLIB_OP2(vuzp1q, u16, a, b);
2814 # else
2815  return vreinterpretq_s8_u16(vuzpq_u16(
2816  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2817 # endif
2818 #endif
2819 }
2820 
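// r.i16[i] = (i < 4) ? a.i16[2 * i + 1] : b.i16[2 * (i - 4) + 1]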
2821 NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2822 #if defined(NLIB_SSE41)
2823  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
2824  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
2825  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
2826  return _mm_packus_epi32(lo_mask, hi_mask);
2827 #elif defined(NLIB_NEON)
2828 # ifdef __aarch64__
2829  return NLIB_OP2(vuzp2q, u16, a, b);
2830 # else
2831  return vreinterpretq_s8_u16(vuzpq_u16(
2832  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2833 # endif
2834 #endif
2835 }
2836 
2837 // 01 ab -> 0a1b
2838 NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2839 #if defined(NLIB_SSE41)
2840  return _mm_unpacklo_epi32(a, b);
2841 #elif defined(NLIB_NEON)
2842 # ifdef __aarch64__
2843  return NLIB_OP2(vzip1q, u32, a, b);
2844 # else
2845  return vreinterpretq_s8_u32(vzipq_u32(
2846  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2847 # endif
2848 #endif
2849 }
2850 
2851 // r.i32[2 * i] = a.i32[i + 2], r.i32[2 * i + 1] = b.i32[i + 2]
2852 NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2853 #if defined(NLIB_SSE41)
2854  return _mm_unpackhi_epi32(a, b);
2855 #elif defined(NLIB_NEON)
2856 # ifdef __aarch64__
2857  return NLIB_OP2(vzip2q, u32, a, b);
2858 # else
2859  return vreinterpretq_s8_u32(vzipq_u32(
2860  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2861 # endif
2862 #endif
2863 }
2864 
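// r.i32[i] = (i < 2) ? a.i32[2 * i] : b.i32[2 * (i - 2)]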
2865 NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2866 #if defined(NLIB_SSE41)
2867  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2868  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
2869  return _mm_blend_epi16(x0, x1, 0xF0);
2870 #elif defined(NLIB_NEON)
2871 # ifdef __aarch64__
2872  return NLIB_OP2(vuzp1q, u32, a, b);
2873 # else
2874  return vreinterpretq_s8_u32(vuzpq_u32(
2875  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2876 # endif
2877 #endif
2878 }
2879 
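// r.i32[i] = (i < 2) ? a.i32[2 * i + 1] : b.i32[2 * (i - 2) + 1]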
2880 NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2881 #if defined(NLIB_SSE41)
2882  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
2883  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2884  return _mm_blend_epi16(x0, x1, 0xF0);
2885 #elif defined(NLIB_NEON)
2886 # ifdef __aarch64__
2887  return NLIB_OP2(vuzp2q, u32, a, b);
2888 # else
2889  return vreinterpretq_s8_u32(vuzpq_u32(
2890  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2891 # endif
2892 #endif
2893 }
2894 
2895 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
2896  int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
2897 NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2898 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2899  return __builtin_shufflevector(
2900  a, b,
2901  V0, V1, V2, V3, V4, V5, V6, V7,
2902  V8, V9, V10, V11, V12, V13, V14, V15);
2903 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2904  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
2905  V0, V1, V2, V3, V4, V5, V6, V7,
2906  V8, V9, V10, V11, V12, V13, V14, V15);
2907 #else
2908  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2909  (V0 < 0 || V0 > 15) ? -128 : V0,
2910  (V1 < 0 || V1 > 15) ? -128 : V1,
2911  (V2 < 0 || V2 > 15) ? -128 : V2,
2912  (V3 < 0 || V3 > 15) ? -128 : V3,
2913  (V4 < 0 || V4 > 15) ? -128 : V4,
2914  (V5 < 0 || V5 > 15) ? -128 : V5,
2915  (V6 < 0 || V6 > 15) ? -128 : V6,
2916  (V7 < 0 || V7 > 15) ? -128 : V7,
2917  (V8 < 0 || V8 > 15) ? -128 : V8,
2918  (V9 < 0 || V9 > 15) ? -128 : V9,
2919  (V10 < 0 || V10 > 15) ? -128 : V10,
2920  (V11 < 0 || V11 > 15) ? -128 : V11,
2921  (V12 < 0 || V12 > 15) ? -128 : V12,
2922  (V13 < 0 || V13 > 15) ? -128 : V13,
2923  (V14 < 0 || V14 > 15) ? -128 : V14,
2924  (V15 < 0 || V15 > 15) ? -128 : V15
2925  };
2926  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2927  V0 < 16 ? -128 : (V0 - 16),
2928  V1 < 16 ? -128 : (V1 - 16),
2929  V2 < 16 ? -128 : (V2 - 16),
2930  V3 < 16 ? -128 : (V3 - 16),
2931  V4 < 16 ? -128 : (V4 - 16),
2932  V5 < 16 ? -128 : (V5 - 16),
2933  V6 < 16 ? -128 : (V6 - 16),
2934  V7 < 16 ? -128 : (V7 - 16),
2935  V8 < 16 ? -128 : (V8 - 16),
2936  V9 < 16 ? -128 : (V9 - 16),
2937  V10 < 16 ? -128 : (V10 - 16),
2938  V11 < 16 ? -128 : (V11 - 16),
2939  V12 < 16 ? -128 : (V12 - 16),
2940  V13 < 16 ? -128 : (V13 - 16),
2941  V14 < 16 ? -128 : (V14 - 16),
2942  V15 < 16 ? -128 : (V15 - 16)
2943  };
2944  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2945  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2946  return I128::Or(tmp_a, tmp_b);
2947 #endif
2948 }
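
// Note: the table-lookup fallback above builds two compile-time byte-index
// masks, one selecting from a and one selecting from b (indices rebased by
// 16). Slots that belong to the other source are set to -128 so that the
// underlying byte shuffle (pshufb / vtbl) writes zero there, and OR-ing the
// two lookups produces the final permutation. For example,
// Permute8<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a, b)
// interleaves the low bytes of a and b like Zip8Lo.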
2949 
2950 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
2951 NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2952 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2953  return vreinterpretq_s8_u16(__builtin_shufflevector(
2954  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
2955  V0, V1, V2, V3, V4, V5, V6, V7));
2956 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2957  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
2958  V0, V1, V2, V3, V4, V5, V6, V7);
2959 #else
2960  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2961  (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
2962  (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
2963  (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
2964  (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
2965  (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
2966  (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
2967  (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
2968  (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
2969  (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
2970  (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
2971  (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
2972  (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
2973  (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
2974  (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
2975  (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
2976  (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
2977  };
2978  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2979  V0 < 8 ? -128 : (V0 - 8) * 2,
2980  V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
2981  V1 < 8 ? -128 : (V1 - 8) * 2,
2982  V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
2983  V2 < 8 ? -128 : (V2 - 8) * 2,
2984  V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
2985  V3 < 8 ? -128 : (V3 - 8) * 2,
2986  V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
2987  V4 < 8 ? -128 : (V4 - 8) * 2,
2988  V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
2989  V5 < 8 ? -128 : (V5 - 8) * 2,
2990  V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
2991  V6 < 8 ? -128 : (V6 - 8) * 2,
2992  V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
2993  V7 < 8 ? -128 : (V7 - 8) * 2,
2994  V7 < 8 ? -128 : (V7 - 8) * 2 + 1
2995  };
2996  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2997  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2998  return I128::Or(tmp_a, tmp_b);
2999 #endif
3000 }
3001 
3002 template<int V0, int V1, int V2, int V3>
3003 NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3004 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
3005  return vreinterpretq_s8_u32(__builtin_shufflevector(
3006  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
3007  V0, V1, V2, V3));
3008 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
3009  return __builtin_shufflevector((__v4si)a, (__v4si)b,
3010  V0, V1, V2, V3);
3011 #else
3012  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
3013  (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
3014  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
3015  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
3016  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
3017  (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
3018  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
3019  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
3020  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
3021  (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
3022  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
3023  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
3024  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
3025  (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
3026  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
3027  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
3028  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
3029  };
3030  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
3031  V0 < 4 ? -128 : (V0 - 4) * 4,
3032  V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
3033  V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
3034  V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
3035  V1 < 4 ? -128 : (V1 - 4) * 4,
3036  V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
3037  V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
3038  V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
3039  V2 < 4 ? -128 : (V2 - 4) * 4,
3040  V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
3041  V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
3042  V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
3043  V3 < 4 ? -128 : (V3 - 4) * 4,
3044  V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
3045  V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
3046  V3 < 4 ? -128 : (V3 - 4) * 4 + 3
3047  };
3048  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
3049  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
3050  return I128::Or(tmp_a, tmp_b);
3051 #endif
3052 }
3053 
3054 
3055 // 0123456789abcdef -> 1032547698badcfe
3056 NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
3057 #if defined(NLIB_SSE41)
3058  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3059  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
3060  };
3061  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3062 #elif defined(NLIB_NEON)
3063  return NLIB_OP1(vrev16q, u8, value);
3064 #endif
3065 }
3066 
3067 // 0123456789abcdef -> 32107654ba98fedc
3068 NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
3069 #if defined(NLIB_SSE41)
3070  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3071  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
3072  };
3073  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3074 #elif defined(NLIB_NEON)
3075  return NLIB_OP1(vrev32q, u8, value);
3076 #endif
3077 }
3078 
3079 // 0123456789abcdef -> 76543210fedcba98
3080 NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
3081 #if defined(NLIB_SSE41)
3082  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3083  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
3084  };
3085  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3086 #elif defined(NLIB_NEON)
3087  return NLIB_OP1(vrev64q, u8, value);
3088 #endif
3089 }
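// Usage sketch (illustrative, not part of the original header): the Reverse16/32/64
// functions swap byte order within each 16-, 32-, or 64-bit lane, which is the usual
// building block for endianness conversion. For example, converting four big-endian
// 32-bit values loaded from a byte buffer (buffer is an assumed 16-byte aligned source):
//   i128 be = I128::LoadA16(buffer);   // bytes b0 b1 b2 b3 | b4 b5 b6 b7 | ...
//   i128 le = I128::Reverse32(be);     // bytes b3 b2 b1 b0 | b7 b6 b5 b4 | ...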
3090 
3091 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
3092 NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT { // NOLINT
3093 #if defined(NLIB_SSE41)
3094  return _mm_movemask_epi8(value);
3095 #elif defined(NLIB_NEON)
3096  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
3097  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
3098 # ifdef __aarch64__
3099  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
3100 # else
3101  uint8x8_t al = vget_low_u8(a);
3102  uint8x8_t ah = vget_high_u8(a);
3103  uint8x8_t tmp = vpadd_u8(al, ah);
3104  tmp = vpadd_u8(tmp, tmp);
3105  tmp = vpadd_u8(tmp, tmp);
3106  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
3107 # endif
3108 #endif
3109 }
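// Usage sketch (illustrative, not part of the original header): MoveMask8 packs the
// 16 byte lanes of a mask (each lane either all-ones or zero, e.g. the result of an
// 8-bit compare, called eq below) into the low 16 bits of an int, one bit per lane:
//   int bits = I128::MoveMask8(eq);    // bit i is set iff lane i was masked
//   bool any = (bits != 0);
// On NEON this is emulated by AND-ing each lane with its bit weight
// (0x01, 0x02, ..., 0x80 repeated) and summing the two halves horizontally.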
3110 
3111 // r = Or(ismasked(value.u16[i]) ? (1 << i) : 0)
3112 NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT { // NOLINT
3113 #if defined(NLIB_SSE41)
3114  __m128i tmp = _mm_packs_epi16(value, value);
3115  return _mm_movemask_epi8(tmp) & 255;
3116 #elif defined(NLIB_NEON)
3117  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3118  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3119  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3120  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
3121 # ifdef __aarch64__
3122  return vaddvq_u16(a);
3123 # else
3124  uint8x8_t tmp = vmovn_u16(a);
3125  tmp = vpadd_u8(tmp, tmp);
3126  tmp = vpadd_u8(tmp, tmp);
3127  tmp = vpadd_u8(tmp, tmp);
3128  return vget_lane_u8(tmp, 0);
3129 # endif
3130 #endif
3131 }
3132 
3133 // r = Or(ismasked(value.u32[i]) ? (1 << i) : 0)
3134 NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT { // NOLINT
3135 #if defined(NLIB_SSE41)
3136  __m128i tmp = _mm_packs_epi16(value, value);
3137  tmp = _mm_packs_epi16(tmp, tmp);
3138  return _mm_movemask_epi8(tmp) & 15;
3139 #elif defined(NLIB_NEON)
3140  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3141  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3142  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3143  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
3144 # ifdef __aarch64__
3145  return vaddvq_u32(a);
3146 # else
3147  uint16x4_t tmp = vmovn_u32(a);
3148  tmp = vpadd_u16(tmp, tmp);
3149  tmp = vpadd_u16(tmp, tmp);
3150  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
3151 # endif
3152 #endif
3153 }
3154 
3155 NLIB_M(i128) I128::SetMask8(int mask) NLIB_NOEXCEPT {
3156 #if defined(NLIB_NEON)
3157  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
3158  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
3159  int8x8_t s1 = vdup_n_s8(mask >> 8);
3160  return vreinterpretq_s8_u8(vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1)));
3161 #elif defined(NLIB_SSE41)
3162  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
3163  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
3164  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
3165  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
3166  return I128::Test8(m, s);
3167 #endif
3168 }
3169 
3170 NLIB_M(i128) I128::SetMask16(int mask) NLIB_NOEXCEPT {
3171 #if defined(NLIB_NEON)
3172  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3173  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3174  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3175  uint16x8_t s = vdupq_n_u16(mask);
3176  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
3177 #elif defined(NLIB_SSE41)
3178  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
3179  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
3180  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3181  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
3182  return I128::Test16(m, s);
3183 #endif
3184 }
3185 
3186 NLIB_M(i128) I128::SetMask32(int mask) NLIB_NOEXCEPT {
3187 #if defined(NLIB_NEON)
3188  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3189  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3190  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3191  uint32x4_t s = vdupq_n_u32(mask);
3192  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
3193 #elif defined(NLIB_SSE41)
3194  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
3195  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
3196  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3197  i128 s = I128::SetValue(mask, each_int32);
3198  return I128::Test32(m, s);
3199 #endif
3200 }
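// Usage sketch (illustrative, not part of the original header): SetMask8/16/32 are
// the inverse of MoveMask8/16/32; they expand an integer bitmask back into per-lane
// masks (all-ones where the corresponding bit is set):
//   i128 lanes = I128::SetMask8(0x0005);   // byte lanes 0 and 2 all-ones, rest zero
//   int bits = I128::MoveMask8(lanes);     // bits == 0x0005 again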
3201 
3202 // true if value == 0
3203 NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT { // NOLINT
3204 #if defined(NLIB_SSE41)
3205  return _mm_testz_si128(value, value) != 0;
3206 #elif defined(NLIB_NEON)
3207 # ifdef __aarch64__
3208  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
3209  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3210 # else
3211  // NOTE: there is a better way
3212  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics // NOLINT
3213  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
3214  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
3215 # endif
3216 #endif
3217 }
3218 
3219 // true if all bits on
3220 NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT { // NOLINT
3221 #if defined(NLIB_SSE41)
3222  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
3223 #elif defined(NLIB_NEON)
3224 # ifdef __aarch64__
3225  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
3226  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3227 # else
3228  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
3229  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
3230 # endif
3231 #endif
3232 }
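// Usage sketch (illustrative, not part of the original header): IsZero/IsFull give
// cheap whole-register early-outs, typically on a comparison result whose lanes are
// either all-ones or zero (eq below stands for such an assumed compare result):
//   if (I128::IsFull(eq)) { /* every lane matched */ }
//   if (I128::IsZero(eq)) { /* no lane matched */ }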
3233 
3234 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
3235 NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
3236 #if defined(NLIB_SSE41)
3237  return _mm_blendv_epi8(b, a, mask);
3238 #elif defined(NLIB_NEON)
3239  return NLIB_OP3(vbslq, u32, mask, a, b);
3240 #endif
3241 }
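// Usage sketch (illustrative, not part of the original header): Select is a
// branchless per-byte conditional; lanes where mask is set come from a, the rest
// from b. Combined with SetMask8 it can splice two vectors bytewise:
//   i128 m = I128::SetMask8(0x00FF);   // low 8 byte lanes set
//   i128 r = I128::Select(m, a, b);    // low 8 bytes from a, high 8 bytes from b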
3242 
3243 // r[i] = 0 if shuffle.u8[i] == 0x80, r[i] = value[shuffle.u8[i]] if 0 <= shuffle.u8[i] < 16
3244 NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
3245 #if defined(NLIB_SSE41)
3246  return _mm_shuffle_epi8(value, shuffle);
3247 #elif defined(NLIB_NEON)
3248 # ifdef __aarch64__
3249  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
3250 # else
3251  int8x8x2_t x;
3252  x.val[0] = vget_low_s8(value);
3253  x.val[1] = vget_high_s8(value);
3254  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
3255  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
3256  return vcombine_s8(lo, hi);
3257 # endif
3258 #endif
3259 }
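// Usage sketch (illustrative, not part of the original header): Shuffle8 is a
// 16-entry byte table lookup; each control byte selects a source byte and 0x80
// produces zero. A full byte reversal of the register, for example:
//   NLIB_ALIGNAS(16) const int8_t idx[16] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
//   };
//   i128 reversed = I128::Shuffle8(value, I128::LoadA16(idx));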
3260 
3261 // popcnt of the mask (returns nlib_popcnt16(I128::MoveMask8(value)))
3262 NLIB_ALWAYS_INLINE int __vectorcall I128::PopCntMask8(i128arg value) NLIB_NOEXCEPT {
3263 #if defined(NLIB_NEON)
3264 # ifdef __aarch64__
3265  int8x16_t tmp = vnegq_s8(value);
3266  return vaddvq_s8(tmp);
3267 # else
3268  int8x16_t tmp = vnegq_s8(value);
3269  int8x8_t lo = vget_low_s8(tmp);
3270  int8x8_t hi = vget_high_s8(tmp);
3271  lo = vadd_s8(lo, hi);
3272  lo = vpadd_s8(lo, lo);
3273  lo = vpadd_s8(lo, lo);
3274  lo = vpadd_s8(lo, lo);
3275  return vget_lane_s8(lo, 0);
3276 # endif
3277 #else
3278  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
3279 #endif
3280 }
3281 
3282 NLIB_ALWAYS_INLINE int __vectorcall I128::ClzMask8(i128arg value) NLIB_NOEXCEPT {
3283  return nlib_clz32(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;
3284 }
3285 
3286 NLIB_ALWAYS_INLINE int __vectorcall I128::CtzMask8(i128arg value) NLIB_NOEXCEPT {
3287  return nlib_ctz32(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
3288 }
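// Usage sketch (illustrative, not part of the original header): these helpers count
// and locate masked byte lanes, e.g. on an assumed 8-bit compare result eq:
//   int n = I128::PopCntMask8(eq);       // number of matching lanes (0..16)
//   int first = I128::CtzMask8(eq);      // index of the lowest matching lane, 16 if none
//   int last = 15 - I128::ClzMask8(eq);  // index of the highest matching lane (when any matched)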
3289 
3290 #ifdef NLIB_NEON
3291 # undef vreinterpretq_s8_s8
3292 # undef NLIB_OP1
3293 # undef NLIB_OP2
3294 # undef NLIB_OP3
3295 # undef NLIB_CMP
3296 # undef NLIB_SFT
3297 # undef NLIB_CMB
3298 #endif
3299 
3300 #endif // NLIB_DOXYGEN
3301 
3302 #undef NLIB_M
3303 #undef NLIB_M2
3304 
3305 #if defined(NLIB_SSE41)
3306 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3307  { \
3308  row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
3309  row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
3310  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
3311  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
3312  __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
3313  __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
3314  __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
3315  __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
3316  row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
3317  row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
3318  row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
3319  row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
3320 }
3321 #elif defined(NLIB_NEON)
3322 # ifdef __aarch64__
3323 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3324  { \
3325  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3326  vreinterpretq_u32_s8(row1)); \
3327  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3328  vreinterpretq_u32_s8(row3)); \
3329  uint64x2_t row0_, row1_, row2_, row3_; \
3330  row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3331  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3332  row0 = vreinterpretq_s8_u64(row0_); \
3333  row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3334  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3335  row1 = vreinterpretq_s8_u64(row1_); \
3336  row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3337  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3338  row2 = vreinterpretq_s8_u64(row2_); \
3339  row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3340  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3341  row3 = vreinterpretq_s8_u64(row3_); \
3342  }
3343 # else
3344 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3345  { \
3346  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3347  vreinterpretq_u32_s8(row1)); \
3348  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3349  vreinterpretq_u32_s8(row3)); \
3350  uint32x4_t row0_, row1_, row2_, row3_; \
3351  uint32x2_t lo, hi; \
3352  lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
3353  row0_ = vcombine_u32(lo, hi); \
3354  row0 = vreinterpretq_s8_u32(row0_); \
3355  lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
3356  row1_ = vcombine_u32(lo, hi); \
3357  row1 = vreinterpretq_s8_u32(row1_); \
3358  lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
3359  row2_ = vcombine_u32(lo, hi); \
3360  row2 = vreinterpretq_s8_u32(row2_); \
3361  lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
3362  row3_ = vcombine_u32(lo, hi); \
3363  row3 = vreinterpretq_s8_u32(row3_); \
3364  }
3365 # endif
3366 #endif
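// Usage sketch (illustrative, not part of the original header): NLIB_I128_TRANSPOSE32
// transposes a 4x4 matrix of 32-bit lanes held in four registers in place, so row k
// afterwards holds what was column k (m is an assumed 16-byte aligned int32_t[16]):
//   i128 r0 = I128::LoadA16(&m[0]);
//   i128 r1 = I128::LoadA16(&m[4]);
//   i128 r2 = I128::LoadA16(&m[8]);
//   i128 r3 = I128::LoadA16(&m[12]);
//   NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);  // r0 = column 0, r1 = column 1, ...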
3367 
3368 #endif // NLIB_SIMD
3369 
3370 } // namespace simd
3371 NLIB_NAMESPACE_END
3372 
3373 #endif // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

Referenced declarations:

The tag for representing a signed 32-bit integer with an empty structure. (Definition: SimdInt.h:35)
The tag for representing a signed 64-bit integer with an empty structure. (Definition: SimdInt.h:37)
#define NLIB_ALWAYS_INLINE: Indicates that the compiler is forced to perform inline expansion of functions. (Definition: Platform_unix.h:97)
constexpr const each_uint8_tag each_uint8: The tag for representing an unsigned 8-bit integer with an each_uint8_tag-type constant object... (Definition: SimdInt.h:51)
The tag for representing the selection of a lane divided into 8-bit units with an empty structure... (Definition: SimdInt.h:61)
The tag for representing the selection of a lane divided into 32-bit units with an empty structure... (Definition: SimdInt.h:57)
The tag for representing a signed 8-bit integer with an empty structure. (Definition: SimdInt.h:31)
#define NLIB_VIS_HIDDEN: Symbols for functions and classes are not made available outside of the library. (Definition: Platform_unix.h:88)
static int nlib_clz32(uint32_t x): Returns the number of consecutive zero bits, with respect to the most significant bit (MSB)... (Definition: Platform.h:2615)
constexpr const each_uint16_tag each_uint16: The tag for representing an unsigned 16-bit integer with an each_uint16_tag-type constant object... (Definition: SimdInt.h:52)
The tag for representing an unsigned 16-bit integer with an empty structure. (Definition: SimdInt.h:41)
The tag for representing an unsigned 64-bit integer with an empty structure. (Definition: SimdInt.h:45)
constexpr const each_int64_tag each_int64: The tag for representing a signed 64-bit integer with an each_int64_tag-type constant object... (Definition: SimdInt.h:50)
nlib_i128_t i128: nlib_i128_t is defined using typedef. (Definition: SimdInt.h:74)
constexpr const each_uint64_tag each_uint64: The tag for representing an unsigned 64-bit integer with an each_uint64_tag-type constant object... (Definition: SimdInt.h:54)
static int nlib_popcnt16(uint16_t x): Returns the number of bits that are 1. (Definition: Platform.h:2479)
The class for integer SIMD computations using 128-bit registers (XMM0-XMM15 for SSE, and Q0-Q15 for NEON). (Definition: SimdInt.h:82)
static int nlib_ctz32(uint32_t x): Returns the number of consecutive zero bits, with respect to the least significant bit (LSB)... (Definition: Platform.h:2616)
The tag for representing an unsigned 8-bit integer with an empty structure. (Definition: SimdInt.h:39)
The tag for representing a signed 16-bit integer with an empty structure. (Definition: SimdInt.h:33)
constexpr const each_int16_tag each_int16: The tag for representing a signed 16-bit integer with an each_int16_tag-type constant object... (Definition: SimdInt.h:48)
constexpr const each_uint32_tag each_uint32: The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object... (Definition: SimdInt.h:53)
#define NLIB_NOEXCEPT: Defines noexcept geared to the environment, or the equivalent. (Definition: Config.h:105)
The tag for representing the selection of a lane divided into 16-bit units with an empty structure... (Definition: SimdInt.h:59)
constexpr const each_select16_tag each_select16: The tag for representing the selection of a 16-bit lane with an each_select16_tag-type constant object... (Definition: SimdInt.h:64)
#define NLIB_CEXPR: Defines constexpr if it is available for use. If not, holds an empty string. (Definition: Config.h:107)
A file that contains the configuration information for each development environment.
__m128i nlib_i128_t: The type for a SIMD register for 128-bit integers. (Definition: SimdInt.h:22)
constexpr const each_select8_tag each_select8: The tag for representing the selection of an 8-bit lane with an each_select8_tag-type constant object... (Definition: SimdInt.h:65)
#define NLIB_ALIGNAS(x): Defines alignas(x) or the equivalent. (Definition: Config.h:255)
constexpr const each_int8_tag each_int8: The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object... (Definition: SimdInt.h:47)
constexpr const each_select32_tag each_select32: The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object... (Definition: SimdInt.h:63)
The tag for representing an unsigned 32-bit integer with an empty structure. (Definition: SimdInt.h:43)
constexpr const each_int32_tag each_int32: The tag for representing a signed 32-bit integer with an each_int32_tag-type constant object... (Definition: SimdInt.h:49)
#define NLIB_STATIC_ASSERT(exp): Defines a static assertion. Uses static_assert if it is available for use. (Definition: Config.h:170)