SimdInt.h

/*---------------------------------------------------------------------------*

 Project: CrossRoad
 Copyright (C)2012-2016 Nintendo. All rights reserved.

 These coded instructions, statements, and computer programs contain
 proprietary information of Nintendo of America Inc. and/or Nintendo
 Company Ltd., and are protected by Federal copyright law. They may
 not be disclosed to third parties or copied or duplicated in any form,
 in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/
#pragma once
#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#include "nn/nlib/Config.h"

#if defined(NLIB_SSE41)
typedef __m128i nlib_i128_t;
#elif defined(NLIB_NEON)
typedef int8x16_t nlib_i128_t;
#endif

NLIB_NAMESPACE_BEGIN
namespace simd {

// use each_int8 for the argument value
struct each_int8_tag {};
// use each_int16 for the argument value
struct each_int16_tag {};
// use each_int32 for the argument value
struct each_int32_tag {};
// use each_int64 for the argument value
struct each_int64_tag {};
// use each_uint8 for the argument value
struct each_uint8_tag {};
// use each_uint16 for the argument value
struct each_uint16_tag {};
// use each_uint32 for the argument value
struct each_uint32_tag {};
// use each_uint64 for the argument value
struct each_uint64_tag {};

// use each_select32 for the argument value
struct each_select32_tag {};
// use each_select16 for the argument value
struct each_select16_tag {};
// use each_select8 for the argument value
struct each_select8_tag {};
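
// Usage sketch (added illustration, not part of the original header): the
// empty tag structs above exist only to steer overload resolution in the
// I128 functions declared below, e.g.
//   i128 bytes = I128::SetValue(uint8_t(0xFF), each_uint8_tag());  // splat a u8
//   i128 lane2 = I128::SetValue<2>(v, each_select32_tag());        // splat v.u32[2]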

#if !defined(_MSC_VER) || _MSC_VER < 1800
#ifndef __vectorcall
#define __vectorcall
#endif
#endif

#if defined(NLIB_SIMD)

// __m128i(SSE), int8x16_t(NEON)
typedef nlib_i128_t i128;

#if _MSC_VER < 1800
typedef const i128& i128arg;
#else
typedef const i128 i128arg;
#endif

class I128 {
 public:
  static i128 __vectorcall SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
  static i128 __vectorcall SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT;

  template <size_t N>
  static i128 __vectorcall SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT;

  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;

  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;

#define NLIB_LOAD_REDIRECT(func) \
  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  } \
  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  }
  NLIB_LOAD_REDIRECT(LoadA16)
  NLIB_LOAD_REDIRECT(LoadA8)
  NLIB_LOAD_REDIRECT(LoadA4)
  NLIB_LOAD_REDIRECT(LoadA2)
  NLIB_LOAD_REDIRECT(LoadA1)
  NLIB_LOAD_REDIRECT(LoadLoA8)
  NLIB_LOAD_REDIRECT(LoadLoA4)
  NLIB_LOAD_REDIRECT(LoadLoA2)
  NLIB_LOAD_REDIRECT(LoadLoA1)
  NLIB_LOAD_REDIRECT(LoadHiA8)
  NLIB_LOAD_REDIRECT(LoadHiA4)
  NLIB_LOAD_REDIRECT(LoadHiA2)
  NLIB_LOAD_REDIRECT(LoadHiA1)
#undef NLIB_LOAD_REDIRECT

  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;

#define NLIB_STORE_REDIRECT(func) \
  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  } \
  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  }
  NLIB_STORE_REDIRECT(StoreA16)
  NLIB_STORE_REDIRECT(StoreA8)
  NLIB_STORE_REDIRECT(StoreA4)
  NLIB_STORE_REDIRECT(StoreA2)
  NLIB_STORE_REDIRECT(StoreA1)
  NLIB_STORE_REDIRECT(StoreLoA8)
  NLIB_STORE_REDIRECT(StoreLoA4)
  NLIB_STORE_REDIRECT(StoreLoA2)
  NLIB_STORE_REDIRECT(StoreLoA1)
  NLIB_STORE_REDIRECT(StoreHiA8)
  NLIB_STORE_REDIRECT(StoreHiA4)
  NLIB_STORE_REDIRECT(StoreHiA2)
  NLIB_STORE_REDIRECT(StoreHiA1)
#undef NLIB_STORE_REDIRECT
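
  // Illustrative round trip (added example): the A16/A8/A4/A2/A1 suffix is
  // the alignment in bytes that p must satisfy, so LoadA16/StoreA16 need a
  // 16-byte-aligned address:
  //   NLIB_ALIGNAS(16) uint8_t buf[16] = {0};
  //   i128 v = I128::LoadA16(buf);
  //   I128::StoreA16(buf, v);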

  //
  // Get/Set
  //
  template <size_t N>
  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
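
  // Example (added illustration): reading and replacing a single 32-bit lane.
  //   uint32_t x = I128::GetUint32FromLane<1>(v);  // x = v.u32[1]
  //   i128 w = I128::SetUint32ToLane<1>(v, 123u);  // w = v, but w.u32[1] = 123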

  //
  // Arithmetic Operations
  //
  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  // PairwiseMaxInt8, PairwiseMaxInt16, PairwiseMaxInt32
  // PairwiseMaxUint8, PairwiseMaxUint16, PairwiseMaxUint32
  // PairwiseMinInt8, PairwiseMinInt16, PairwiseMinInt32
  // PairwiseMinUint8, PairwiseMinUint16, PairwiseMinUint32

  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;

  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;

  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;

  //
  // Logical Operations
  //
  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;

  //
  // Comparison Operations
  //
  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;

  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;

  //
  // Bit Shift
  //
  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;

  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;

  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;

  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;

  //
  // Bit Shift(Constant)
  //
  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;

  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;

  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;

  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;

  //
  // 128bit wide Byte Shift/Rotate
  //
  template <size_t N>
  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;

  //
  // Conversion
  //
  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;

  static i128 __vectorcall
      ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;  // NOLINT
  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall
      ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;  // NOLINT
  static i128 __vectorcall
      ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;  // NOLINT

  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;

  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;

  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
            int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3>
  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
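
  // Example (added illustration): indices 0-3 select lanes of a and 4-7 select
  // lanes of b, so Permute32<0, 4, 1, 5>(a, b) interleaves the low halves:
  //   r.u32 = { a.u32[0], b.u32[0], a.u32[1], b.u32[1] }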

  //
  // Swap endian
  //
  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;

  //
  // Misc
  //
  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall SetMask8(int mask) NLIB_NOEXCEPT;
  static i128 __vectorcall SetMask16(int mask) NLIB_NOEXCEPT;
  static i128 __vectorcall SetMask32(int mask) NLIB_NOEXCEPT;
  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
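
  // Example (added illustration): MoveMask8 packs the most significant bit of
  // each of the 16 bytes into a 16-bit integer, so a vector comparison result
  // can be inspected on the scalar side; SetMask8 is the reverse expansion.
  //   int m = I128::MoveMask8(I128::CmpEq8(a, b));  // bit i set iff a.i8[i] == b.i8[i]
  //   bool all_equal = (m == 0xFFFF);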

 private:
  I128();  // forbidden
};
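
// Worked example (added illustration, not part of the original header):
// saturating byte addition followed by a whole-vector test.
//   i128 a = I128::SetValue(uint8_t(100), each_uint8_tag());
//   i128 b = I128::SetValue(uint8_t(200), each_uint8_tag());
//   i128 sum = I128::AddUint8Saturated(a, b);  // every byte clamps to 255
//   bool full = I128::IsFull(I128::CmpEq8(sum, I128::SetFull(sum)));  // true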

#ifndef NLIB_DOXYGEN

#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

#ifdef NLIB_NEON
# undef vreinterpretq_s8_s8
# undef NLIB_OP1
# undef NLIB_OP2
# undef NLIB_OP3
# undef NLIB_CMP
# undef NLIB_SFT
# undef NLIB_CMB

#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                       vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
#endif
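
// For reference (added note): the helpers above keep every i128 in its
// canonical int8x16_t form and reinterpret it around each intrinsic, which is
// free at runtime. For example, NLIB_OP2(vaddq, s16, a, b) expands to
//   vreinterpretq_s8_s16(vaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)))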

// r.s8[i] = v for all i
NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // faster than _mm_set1_epi8(v)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdupq_n_s8(v);
#endif
}

// r.s16[i] = v for all i
NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vdupq_n_s16(v));
#endif
}

// r.s32[i] = v for all i
NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vdupq_n_s32(v));
#endif
}

// r.s64[i] = v for all i
NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#ifdef _MSC_VER
  // conflicts with ffloor()?
  // Do not use MMX anyway
  // return _mm_set1_epi64(*reinterpret_cast<__m64*>(&v));
  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(v);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vdupq_n_s64(v));
#endif
}

// r.u8[i] = v for all i
NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // faster than _mm_set1_epi8(v)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vdupq_n_u8(v));
#endif
}

// r.u16[i] = v for all i
NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vdupq_n_u16(v));
#endif
}

// r.u32[i] = v for all i
NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vdupq_n_u32(v));
#endif
}

// r.u64[i] = v for all i
NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#ifdef _MSC_VER
  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(static_cast<int64_t>(v));
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vdupq_n_u64(v));
#endif
}

#if defined(NLIB_SSE41)
// r.u32[i] = value.u32[N] for all i
template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
}
#elif defined(NLIB_NEON)
# ifdef __aarch64__
// r.u32[i] = value.u32[N] for all i
template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
  uint32x4_t v = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
}
# else
template <>
NLIB_M(i128) I128::SetValue<0>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<1>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
template <>
NLIB_M(i128) I128::SetValue<2>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<3>(i128 value, each_select32_tag) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
# endif
#endif

#if defined(NLIB_SSE41)
// r.u16[i] = value.u16[N] for all i
template <size_t N>
NLIB_M2(i128) I128::SetValue(i128 value, each_select16_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
}
#elif defined(NLIB_NEON)
template <>
NLIB_M(i128) I128::SetValue<0>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
# else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<1>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
# else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<2>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
# else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<3>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
# else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<4>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
# else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<5>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
# else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<6>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
# else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
# endif
}

template <>
NLIB_M(i128) I128::SetValue<7>(i128 value, each_select16_tag) NLIB_NOEXCEPT {
# ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
# else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
# endif
}
#endif

#if defined(NLIB_SSE41)
// r.u8[i] = value.u8[N] for all i
template <size_t N>
NLIB_M2(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
}
#elif defined(NLIB_NEON)
namespace detail {
template <size_t N, bool IsLower>
struct SetValue8Helper {
  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_low_s8(value), N);
  }
};

template <size_t N>
struct SetValue8Helper<N, false> {
  NLIB_M(i128) operator()(i128 value) NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_high_s8(value), N - 8);
  }
};

}  // namespace detail

template <size_t N>
NLIB_M(i128) I128::SetValue(i128 value, each_select8_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
# ifdef __aarch64__
  return vdupq_laneq_s8(value, N);
# else
  return detail::SetValue8Helper<N, (N < 8)>()(value);
# endif
}
#endif

// set all bits to 0
NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdupq_n_s8(0);
#endif
}

// set all bits to 1
NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
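
// Note (added): CmpEq8(dummy, dummy) compares every byte with itself, so every
// lane becomes 0xFF and the result is all ones; the value of the argument is
// irrelevant.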

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
  return vreinterpretq_s8_u32(tmp);
#endif
}

// r[i] = p[i], p is 2 bytes aligned
NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
  return vreinterpretq_s8_u16(tmp);
#endif
}

// r[i] = p[i]
NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
#endif
}

// loads 8 bytes into the lower half, upper half = 0; p is 8 bytes aligned
NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

// loads 8 bytes into the lower half, upper half = 0; p is 4 bytes aligned
NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

// loads 8 bytes into the lower half, upper half = 0; p is 2 bytes aligned
NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

// loads 8 bytes into the lower half, upper half = 0; no alignment required
NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

// loads 8 bytes into the upper half, lower half = 0; p is 8 bytes aligned
NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

// loads 8 bytes into the upper half, lower half = 0; p is 4 bytes aligned
NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

// loads 8 bytes into the upper half, lower half = 0; p is 2 bytes aligned
NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

// loads 8 bytes into the upper half, lower half = 0; no alignment required
NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

// p[i] = value[i], p is 2 bytes aligned
NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

// p[i] = value[i]
NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
#endif
}

// stores the lower 8 bytes of value to p; p is 8 bytes aligned
NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

// stores the lower 8 bytes of value to p; p is 4 bytes aligned
NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

// stores the lower 8 bytes of value to p; p is 2 bytes aligned
NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

// stores the lower 8 bytes of value to p; no alignment required
NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  int8x8_t x = vget_low_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}

// stores the upper 8 bytes of value to p; p is 8 bytes aligned
NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

// stores the upper 8 bytes of value to p; p is 4 bytes aligned
NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

// stores the upper 8 bytes of value to p; p is 2 bytes aligned
NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

// stores the upper 8 bytes of value to p; no alignment required
NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int8x8_t x = vget_high_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}

// r = value.u8[N]
template <size_t N>
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

// r = value.u16[N]
template <size_t N>
NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

// r = value.u32[N]
template <size_t N>
NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

// r = value.u64[N]
template <size_t N>
NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 2);
#if defined(NLIB_SSE41)
#ifdef NLIB_64BIT
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#else
  NLIB_UNUSED(value);
  return 0;  // dummy; the N == 0 and N == 1 specializations below are used instead
#endif
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
#endif
}

#if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
  return rval;
}
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
  return rval;
}
#endif

// r = value, r.u8[N] = v
template <size_t N>
NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 16);
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute8<
          N == 0 ? 16 : 0,
          N == 1 ? 17 : 1,
          N == 2 ? 18 : 2,
          N == 3 ? 19 : 3,
          N == 4 ? 20 : 4,
          N == 5 ? 21 : 5,
          N == 6 ? 22 : 6,
          N == 7 ? 23 : 7,
          N == 8 ? 24 : 8,
          N == 9 ? 25 : 9,
          N == 10 ? 26 : 10,
          N == 11 ? 27 : 11,
          N == 12 ? 28 : 12,
          N == 13 ? 29 : 13,
          N == 14 ? 30 : 14,
          N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
      vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}
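
// Note (added; the rationale is an inference and the exact codegen is
// compiler-dependent): when v is a compile-time constant, the Permute8 path
// above hands the compiler a constant-foldable permute of the two inputs,
// while the vsetq_lane_u8 path serves runtime values.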

// r = value, r.u16[N] = v
template <size_t N>
NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 8);
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute16<
          N == 0 ? 8 : 0,
          N == 1 ? 9 : 1,
          N == 2 ? 10 : 2,
          N == 3 ? 11 : 3,
          N == 4 ? 12 : 4,
          N == 5 ? 13 : 5,
          N == 6 ? 14 : 6,
          N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
      vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

// r = value, r.u32[N] = v
template <size_t N>
NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute32<N == 0 ? 4 : 0,
                      N == 1 ? 5 : 1,
                      N == 2 ? 6 : 2,
                      N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
      vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}

// r = value, r.u64[N] = v
template <size_t N>
NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 2);
#if defined(NLIB_SSE41)
#ifdef NLIB_64BIT
  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
#else
  union {
    int32_t i32[2];
    int64_t i64;
  } tmp;
  tmp.i64 = static_cast<int64_t>(v);
  __m128i rval;
  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
#endif
}

// r.i8[i] = a.i8[i] + b.i8[i]
NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vaddq_s8(a, b);
#endif
}

// r.i16[i] = a.i16[i] + b.i16[i]
NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s16, a, b);
#endif
}

// r.i32[i] = a.i32[i] + b.i32[i]
NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s32, a, b);
#endif
}

// r.i64[i] = a.i64[i] + b.i64[i]
NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s64, a, b);
#endif
}

// r.s8[i] = a.s8[i] + b.s8[i], saturated: 127 if the sum > 127, -128 if < -128
NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqaddq_s8(a, b);
#endif
}

// r.s16[i] = a.s16[i] + b.s16[i], saturated: 32767 if the sum > 32767, -32768 if < -32768
NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, s16, a, b);
#endif
}

// r.u8[i] = a.u8[i] + b.u8[i], saturated: 255 if the sum > 255
NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u8, a, b);
#endif
}

// r.u16[i] = a.u16[i] + b.u16[i], saturated: 65535 if the sum > 65535
NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u16, a, b);
#endif
}

// r.i8[i] = a.i8[i] - b.i8[i]
NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsubq_s8(a, b);
#endif
}

// r.i16[i] = a.i16[i] - b.i16[i]
NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s16, a, b);
#endif
}

// r.i32[i] = a.i32[i] - b.i32[i]
NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s32, a, b);
#endif
}

// r.i64[i] = a.i64[i] - b.i64[i]
NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s64, a, b);
#endif
}

// r.s8[i] = a.s8[i] - b.s8[i], saturated: 127 if the difference > 127, -128 if < -128
NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s8, a, b);
#endif
}

// r.s16[i] = a.s16[i] - b.s16[i], saturated: 32767 if the difference > 32767, -32768 if < -32768
NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s16, a, b);
#endif
}

// r.u8[i] = a.u8[i] - b.u8[i], saturated: 0 if the difference < 0
NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u8, a, b);
#endif
}

// r.u16[i] = a.u16[i] - b.u16[i], saturated: 0 if the difference < 0
NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u16, a, b);
#endif
}

// r.i8[i] = a.i8[2*i] + a.i8[2*i+1] for i < 8, r.i8[8+i] = b.i8[2*i] + b.i8[2*i+1]
NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
  return I128::NarrowFrom16To8(ax, bx);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vpaddq_s8(a, b);
# else
  int8x8_t al = vget_low_s8(a);
  int8x8_t ah = vget_high_s8(a);
  int8x8_t rl = vpadd_s8(al, ah);
  int8x8_t bl = vget_low_s8(b);
  int8x8_t bh = vget_high_s8(b);
  int8x8_t rh = vpadd_s8(bl, bh);
  return vcombine_s8(rl, rh);
# endif
#endif
}
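
// Worked example (added illustration):
//   a.i8 = {1, 2, 3, 4, ...}, b.i8 = {10, 20, 30, 40, ...}
//   PairwiseAdd8(a, b).i8 = {1+2, 3+4, ..., 10+20, 30+40, ...}
// The low 8 results are adjacent-pair sums from a, the high 8 from b.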

// r.i16[i] = a.i16[2*i] + a.i16[2*i+1] for i < 4, r.i16[4+i] = b.i16[2*i] + b.i16[2*i+1]
NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
# else
  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
  int16x4_t rl = vpadd_s16(al, ah);
  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
  int16x4_t rh = vpadd_s16(bl, bh);
  return NLIB_CMB(s16, rl, rh);
# endif
#endif
}

// r.i32[0] = a.i32[0] + a.i32[1], r.i32[1] = a.i32[2] + a.i32[3], r.i32[2..3] likewise from b
NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
# else
  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
  int32x2_t rl = vpadd_s32(al, ah);
  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
  int32x2_t rh = vpadd_s32(bl, bh);
  return NLIB_CMB(s32, rl, rh);
# endif
#endif
}

// r.i16[i] = a.i16[i] * b.i16[i]
NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s16, a, b);
#endif
}

// r = c + a * b
NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s16, c, a, b);
#endif
}

// r = c - a * b
NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s16, c, a, b);
#endif
}

// r.i32[i] = a.i32[i] * b.i32[i]
NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s32, a, b);
#endif
}

// r = c + a * b
NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s32, c, a, b);
#endif
}

// r = c - a * b
NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s32, c, a, b);
#endif
}

// r.s8[i] = max(a.s8[i], b.s8[i])
NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s8, a, b);
#endif
}

// r.s16[i] = max(a.s16[i], b.s16[i])
NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s16, a, b);
#endif
}

// r.s32[i] = max(a.s32[i], b.s32[i])
NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s32, a, b);
#endif
}

// r.u8[i] = max(a.u8[i], b.u8[i])
NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u8, a, b);
#endif
}

// r.u16[i] = max(a.u16[i], b.u16[i])
NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u16, a, b);
#endif
}

// r.u32[i] = max(a.u32[i], b.u32[i])
NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u32, a, b);
#endif
}
1482 
1483 // r.s8[i] = min(a.s8[i], b.s8[i])
1484 NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1485 #if defined(NLIB_SSE41)
1486  return _mm_min_epi8(a, b);
1487 #elif defined(NLIB_NEON)
1488  return NLIB_OP2(vminq, s8, a, b);
1489 #endif
1490 }
1491 
1492 // r.s16[i] = min(a.s16[i], b.s16[i])
1493 NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1494 #if defined(NLIB_SSE41)
1495  return _mm_min_epi16(a, b);
1496 #elif defined(NLIB_NEON)
1497  return NLIB_OP2(vminq, s16, a, b);
1498 #endif
1499 }
1500 
1501 // r.s32[i] = min(a.s32[i], b.s32[i])
1502 NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1503 #if defined(NLIB_SSE41)
1504  return _mm_min_epi32(a, b);
1505 #elif defined(NLIB_NEON)
1506  return NLIB_OP2(vminq, s32, a, b);
1507 #endif
1508 }
1509 
1510 // r.u8[i] = min(a.u8[i], b.u8[i])
1511 NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1512 #if defined(NLIB_SSE41)
1513  return _mm_min_epu8(a, b);
1514 #elif defined(NLIB_NEON)
1515  return NLIB_OP2(vminq, u8, a, b);
1516 #endif
1517 }
1518 
1519 // r.u16[i] = min(a.u16[i], b.u16[i])
1520 NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1521 #if defined(NLIB_SSE41)
1522  return _mm_min_epu16(a, b);
1523 #elif defined(NLIB_NEON)
1524  return NLIB_OP2(vminq, u16, a, b);
1525 #endif
1526 }
1527 
1528 // r.u32[i] = min(a.u32[i], b.u32[i])
1529 NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1530 #if defined(NLIB_SSE41)
1531  return _mm_min_epu32(a, b);
1532 #elif defined(NLIB_NEON)
1533  return NLIB_OP2(vminq, u32, a, b);
1534 #endif
1535 }
1536 
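// Added usage sketch (nlib_example_clamp_u8 is illustrative, not part of the
// original API): clamp every unsigned byte of v into [16, 235], the classic
// video-range clamp, using the min/max primitives above.
inline i128 nlib_example_clamp_u8(i128arg v) NLIB_NOEXCEPT {
 i128 lo = I128::SetValue(static_cast<uint8_t>(16), each_uint8);
 i128 hi = I128::SetValue(static_cast<uint8_t>(235), each_uint8);
 return I128::MinUint8(I128::MaxUint8(v, lo), hi);
}
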
1537 // r.s8[i] = abs(value.s8[i])
1538 NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
1539 #if defined(NLIB_SSE41)
1540  return _mm_abs_epi8(value);
1541 #elif defined(NLIB_NEON)
1542  return NLIB_OP1(vabsq, s8, value);
1543 #endif
1544 }
1545 
1546 // r.s16[i] = abs(value.s16[i])
1547 NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
1548 #if defined(NLIB_SSE41)
1549  return _mm_abs_epi16(value);
1550 #elif defined(NLIB_NEON)
1551  return NLIB_OP1(vabsq, s16, value);
1552 #endif
1553 }
1554 
1555 // r.s32[i] = abs(value.s32[i])
1556 NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
1557 #if defined(NLIB_SSE41)
1558  return _mm_abs_epi32(value);
1559 #elif defined(NLIB_NEON)
1560  return NLIB_OP1(vabsq, s32, value);
1561 #endif
1562 }
1563 
1564 // r.s8[i] = abs(a.s8[i] - b.s8[i])
1565 NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1566 #if defined(NLIB_SSE41)
1567  return _mm_abs_epi8(_mm_sub_epi8(a, b));
1568 #elif defined(NLIB_NEON)
1569  return NLIB_OP2(vabdq, s8, a, b);
1570 #endif
1571 }
1572 
1573 // r.s16[i] = abs(a.s16[i] - b.s16[i])
1574 NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1575 #if defined(NLIB_SSE41)
1576  return _mm_abs_epi16(_mm_sub_epi16(a, b));
1577 #elif defined(NLIB_NEON)
1578  return NLIB_OP2(vabdq, s16, a, b);
1579 #endif
1580 }
1581 
1582 // r.s32[i] = abs(a.s32[i] - b.s32[i])
1583 NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1584 #if defined(NLIB_SSE41)
1585  return _mm_abs_epi32(_mm_sub_epi32(a, b));
1586 #elif defined(NLIB_NEON)
1587  return NLIB_OP2(vabdq, s32, a, b);
1588 #endif
1589 }
1590 
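// Added note: the SSE paths compute abs(a - b) with wrapping subtraction while
// NEON vabdq takes the absolute difference before truncating, so AbsDiffInt*
// can disagree between backends when the true |a - b| overflows the element
// type (e.g. a.s8[i] = 127, b.s8[i] = -128). Keep operands within half range
// if portable results are required.
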
1591 // r.s8[i] = -value.s8[i]
1592 NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
1593 #if defined(NLIB_SSE41)
1594  return _mm_sub_epi8(_mm_setzero_si128(), value);
1595 #elif defined(NLIB_NEON)
1596  return NLIB_OP1(vnegq, s8, value);
1597 #endif
1598 }
1599 
1600 // r.s16[i] = -value.s16[i]
1601 NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
1602 #if defined(NLIB_SSE41)
1603  return _mm_sub_epi16(_mm_setzero_si128(), value);
1604 #elif defined(NLIB_NEON)
1605  return NLIB_OP1(vnegq, s16, value);
1606 #endif
1607 }
1608 
1609 // r.s32[i] = -value.s32[i]
1610 NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
1611 #if defined(NLIB_SSE41)
1612  return _mm_sub_epi32(_mm_setzero_si128(), value);
1613 #elif defined(NLIB_NEON)
1614  return NLIB_OP1(vnegq, s32, value);
1615 #endif
1616 }
1617 
1618 // r = a & b
1619 NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
1620 #if defined(NLIB_SSE41)
1621  return _mm_and_si128(a, b);
1622 #elif defined(NLIB_NEON)
1623  return NLIB_OP2(vandq, s8, a, b);
1624 #endif
1625 }
1626 
1627 // r = a | b
1628 NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
1629 #if defined(NLIB_SSE41)
1630  return _mm_or_si128(a, b);
1631 #elif defined(NLIB_NEON)
1632  return NLIB_OP2(vorrq, s8, a, b);
1633 #endif
1634 }
1635 
1636 // r = a ^ b
1637 NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
1638 #if defined(NLIB_SSE41)
1639  return _mm_xor_si128(a, b);
1640 #elif defined(NLIB_NEON)
1641  return NLIB_OP2(veorq, s8, a, b);
1642 #endif
1643 }
1644 
1645 // r = ~a
1646 NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
1647 #if defined(NLIB_SSE41)
1648  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1649 #elif defined(NLIB_NEON)
1650  return NLIB_OP1(vmvnq, s8, a);
1651 #endif
1652 }
1653 
1654 // r = ~a & b
1655 NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1656 #if defined(NLIB_SSE41)
1657  return _mm_andnot_si128(a, b);
1658 #elif defined(NLIB_NEON)
1659  return NLIB_OP2(vbicq, s8, b, a);
1660 #endif
1661 }
1662 
1663 // r = ~a | b
1664 NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
1665 #if defined(NLIB_SSE41)
1666  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
1667  return _mm_or_si128(not_a, b);
1668 #elif defined(NLIB_NEON)
1669  return NLIB_OP2(vornq, s8, b, a);
1670 #endif
1671 }
1672 
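// Test8 (added doc): r.i8[i] = (a.i8[i] & b.i8[i]) != 0 ? 0xFF : 0;
// Test16/Test32 likewise at 16/32-bit width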
1673 NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1674 #if defined(NLIB_NEON)
1675  return vtstq_s8(a, b);
1676 #else
1677  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
1678 #endif
1679 }
1680 
1681 NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1682 #if defined(NLIB_NEON)
1683  return NLIB_OP2(vtstq, s16, a, b);
1684 #else
1685  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
1686 #endif
1687 }
1688 
1689 NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1690 #if defined(NLIB_NEON)
1691  return NLIB_OP2(vtstq, s32, a, b);
1692 #else
1693  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
1694 #endif
1695 }
1696 
1697 // r.i8[i] = a.i8[i] == b.i8[i] ? 0xFF : 0
1698 NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1699 #if defined(NLIB_SSE41)
1700  return _mm_cmpeq_epi8(a, b);
1701 #elif defined(NLIB_NEON)
1702  return NLIB_CMP(vceqq, s8, a, b, u8);
1703 #endif
1704 }
1705 
1706 // r.i16[i] = a.i16[i] == b.i16[i] ? 0xFFFF : 0
1707 NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1708 #if defined(NLIB_SSE41)
1709  return _mm_cmpeq_epi16(a, b);
1710 #elif defined(NLIB_NEON)
1711  return NLIB_CMP(vceqq, s16, a, b, u16);
1712 #endif
1713 }
1714 
1715 // r.i32[i] = a.i32[i] == b.i32[i] ? 0xFFFFFFFF : 0
1716 NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1717 #if defined(NLIB_SSE41)
1718  return _mm_cmpeq_epi32(a, b);
1719 #elif defined(NLIB_NEON)
1720  return NLIB_CMP(vceqq, s32, a, b, u32);
1721 #endif
1722 }
1723 
1724 // r.i64[i] = a.i64[i] == b.i64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1725 NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1726 #if defined(NLIB_SSE41)
1727  return _mm_cmpeq_epi64(a, b);
1728 #elif defined(NLIB_NEON)
1729 #ifdef __aarch64__
1730  return NLIB_CMP(vceqq, s64, a, b, u64);
1731 #else
1732  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
1733  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
1734  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
1735  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1736  return vreinterpretq_s8_s64(result);
1737 #endif
1738 #endif
1739 }
1740 
1741 // r.s8[i] = a.s8[i] < b.s8[i] ? 0xFF : 0
1742 NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1743 #if defined(NLIB_SSE41)
1744  return _mm_cmplt_epi8(a, b);
1745 #elif defined(NLIB_NEON)
1746  return NLIB_CMP(vcltq, s8, a, b, u8);
1747 #endif
1748 }
1749 
1750 // r.s16[i] = a.s16[i] < b.s16[i] ? 0xFFFF : 0
1751 NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1752 #if defined(NLIB_SSE41)
1753  return _mm_cmplt_epi16(a, b);
1754 #elif defined(NLIB_NEON)
1755  return NLIB_CMP(vcltq, s16, a, b, u16);
1756 #endif
1757 }
1758 
1759 // r.s32[i] = a.s32[i] < b.s32[i] ? 0xFFFFFFFF : 0
1760 NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1761 #if defined(NLIB_SSE41)
1762  return _mm_cmplt_epi32(a, b);
1763 #elif defined(NLIB_NEON)
1764  return NLIB_CMP(vcltq, s32, a, b, u32);
1765 #endif
1766 }
1767 
1768 // r.s64[i] = a.s64[i] < b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1769 NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1770 #if defined(NLIB_SSE42)
1771  return _mm_cmpgt_epi64(b, a);
1772 #elif defined(NLIB_NEON)
1773 #ifdef __aarch64__
1774  return NLIB_CMP(vcltq, s64, a, b, u64);
1775 #else
1776  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
1777  vreinterpret_s32_s8(vget_high_s8(a)));
1778  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
1779  vreinterpret_s32_s8(vget_high_s8(b)));
1780  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
1781  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
1782  uint32x2_t lower_lt = vclt_u32(vreinterpret_u32_s32(trn_a.val[0]), vreinterpret_u32_s32(trn_b.val[0]));
1783  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1784  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1785  return vreinterpretq_s8_s64(result);
1786 #endif
1787 #else
1788  i128 cmp = I128::CmpLtInt32(a, b);
1789  i128 eq = I128::CmpEq32(a, b);
1790  i128 cmp_lt = I128::CmpLtUint32(a, b);
1791  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1792  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
1793  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1794  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1795 #endif
1796 }
1797 
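// Added note: the 32-bit fallbacks above build a signed 64-bit less-than from
// halves as (hi_a < hi_b) || (hi_a == hi_b && lo_a < lo_b), comparing the high
// halves signed and the low halves unsigned.
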
1798 // r.s8[i] = a.s8[i] > b.s8[i] ? 0xFF : 0
1799 NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1800 #if defined(NLIB_SSE41)
1801  return _mm_cmpgt_epi8(a, b);
1802 #elif defined(NLIB_NEON)
1803  return NLIB_CMP(vcgtq, s8, a, b, u8);
1804 #endif
1805 }
1806 
1807 // r.s16[i] = a.s16[i] > b.s16[i] ? 0xFFFF : 0
1808 NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1809 #if defined(NLIB_SSE41)
1810  return _mm_cmpgt_epi16(a, b);
1811 #elif defined(NLIB_NEON)
1812  return NLIB_CMP(vcgtq, s16, a, b, u16);
1813 #endif
1814 }
1815 
1816 // r.s32[i] = a.s32[i] > b.s32[i] ? 0xFFFFFFFF : 0
1817 NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1818 #if defined(NLIB_SSE41)
1819  return _mm_cmpgt_epi32(a, b);
1820 #elif defined(NLIB_NEON)
1821  return NLIB_CMP(vcgtq, s32, a, b, u32);
1822 #endif
1823 }
1824 
1825 // r.s64[i] = a.s64[i] > b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1826 NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1827 #if defined(NLIB_SSE42)
1828  return _mm_cmpgt_epi64(a, b);
1829 #elif defined(NLIB_NEON) && defined(__aarch64__)
1830  return NLIB_CMP(vcgtq, s64, a, b, u64);
1831 #else
1832  return I128::CmpLtInt64(b, a);
1833 #endif
1834 }
1835 
1836 // r.u8[i] = a.u8[i] < b.u8[i] ? 0xFF : 0
1837 NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1838 #if defined(NLIB_SSE41)
1839  i128 ofs = I128::SetValue(0x80, each_uint8);
1840  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1841 #elif defined(NLIB_NEON)
1842  return NLIB_CMP(vcltq, u8, a, b, u8);
1843 #endif
1844 }
1845 
1846 // r.u8[i] = a.u8[i] > b.u8[i] ? 0xFF : 0
1847 NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1848 #if defined(NLIB_SSE41)
1849  i128 ofs = I128::SetValue(0x80, each_uint8);
1850  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
1851 #elif defined(NLIB_NEON)
1852  return NLIB_CMP(vcgtq, u8, a, b, u8);
1853 #endif
1854 }
1855 
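// Added note: SSE4.1 has no unsigned element compares, so the unsigned
// variants here bias both operands by the sign bit (0x80, 0x8000, ...);
// x + bias maps unsigned order onto signed order, after which the signed
// compare yields the unsigned result.
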
1856 // r.u16[i] = a.u16[i] < b.u16[i] ? 0xFFFF : 0
1857 NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1858 #if defined(NLIB_SSE41)
1859  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1860  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1861 #elif defined(NLIB_NEON)
1862  return NLIB_CMP(vcltq, u16, a, b, u16);
1863 #endif
1864 }
1865 
1866 // r.u16[i] = a.u16[i] > b.u16[i] ? 0xFFFF : 0
1867 NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1868 #if defined(NLIB_SSE41)
1869  i128 ofs = I128::SetValue(0x8000U, each_uint16);
1870  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
1871 #elif defined(NLIB_NEON)
1872  return NLIB_CMP(vcgtq, u16, a, b, u16);
1873 #endif
1874 }
1875 
1876 // r.u32[i] = a.u32[i] < b.u32[i] ? 0xFFFFFFFF : 0
1877 NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1878 #if defined(NLIB_SSE41)
1879  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1880  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1881 #elif defined(NLIB_NEON)
1882  return NLIB_CMP(vcltq, u32, a, b, u32);
1883 #endif
1884 }
1885 
1886 // r.u32[i] = a.u32[i] > b.u32[i] ? 0xFFFFFFFF : 0
1887 NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1888 #if defined(NLIB_SSE41)
1889  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
1890  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
1891 #elif defined(NLIB_NEON)
1892  return NLIB_CMP(vcgtq, u32, a, b, u32);
1893 #endif
1894 }
1895 
1896 // r.u64[i] = a.u64[i] < b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1897 NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1898 #if defined(NLIB_SSE42)
1899  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1900  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
1901 #elif defined(NLIB_NEON)
1902 #ifdef __aarch64__
1903  return NLIB_CMP(vcltq, u64, a, b, u64);
1904 #else
1905  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
1906  vreinterpret_u32_s8(vget_high_s8(a)));
1907  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
1908  vreinterpret_u32_s8(vget_high_s8(b)));
1909  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
1910  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
1911  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
1912  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
1913  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
1914  return vreinterpretq_s8_s64(result);
1915 #endif
1916 #else
1917  i128 cmp = I128::CmpLtUint32(a, b);
1918  i128 eq = I128::CmpEq32(a, b);
1919  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
1920  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
1921  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
1922  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
1923 #endif
1924 }
1925 
1926 // r.u64[i] = a.u64[i] > b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1927 NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1928 #if defined(NLIB_SSE42)
1929  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
1930  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
1931 #elif defined(NLIB_NEON) && defined(__aarch64__)
1932  return NLIB_CMP(vcgtq, u64, a, b, u64);
1933 #else
1934  return I128::CmpLtUint64(b, a);
1935 #endif
1936 }
1937 
1938 // r.s8[i] = a.s8[i] <= b.s8[i] ? 0xFF : 0
1939 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1940 #if defined(NLIB_SSE41)
1941  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1942 #elif defined(NLIB_NEON)
1943  return NLIB_CMP(vcleq, s8, a, b, u8);
1944 #endif
1945 }
1946 
1947 // r.s16[i] = a.s16[i] <= b.s16[i] ? 0xFFFF : 0
1948 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1949 #if defined(NLIB_SSE41)
1950  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1951 #elif defined(NLIB_NEON)
1952  return NLIB_CMP(vcleq, s16, a, b, u16);
1953 #endif
1954 }
1955 
1956 // r.s32[i] = a.s32[i] <= b.s32[i] ? 0xFFFFFFFF : 0
1957 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1958 #if defined(NLIB_SSE41)
1959  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1960 #elif defined(NLIB_NEON)
1961  return NLIB_CMP(vcleq, s32, a, b, u32);
1962 #endif
1963 }
1964 
1965 // r.s64[i] = a.s64[i] <= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
1966 NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
1967 #if defined(NLIB_SSE42)
1968  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1969 #elif defined(NLIB_NEON) && defined(__aarch64__)
1970  return NLIB_CMP(vcleq, s64, a, b, u64);
1971 #else
1972  return I128::Not(I128::CmpGtInt64(a, b));
1973 #endif
1974 }
1975 
1976 // r.s8[i] = a.s8[i] >= b.s8[i] ? 0xFF : 0
1977 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
1978 #if defined(NLIB_SSE41)
1979  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1980 #elif defined(NLIB_NEON)
1981  return NLIB_CMP(vcgeq, s8, a, b, u8);
1982 #endif
1983 }
1984 
1985 // r.s16[i] = a.s16[i] >= b.s16[i] ? 0xFFFF : 0
1986 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
1987 #if defined(NLIB_SSE41)
1988  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1989 #elif defined(NLIB_NEON)
1990  return NLIB_CMP(vcgeq, s16, a, b, u16);
1991 #endif
1992 }
1993 
1994 // r.s32[i] = a.s32[i] >= b.s32[i] ? 0xFFFFFFFF : 0
1995 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
1996 #if defined(NLIB_SSE41)
1997  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1998 #elif defined(NLIB_NEON)
1999  return NLIB_CMP(vcgeq, s32, a, b, u32);
2000 #endif
2001 }
2002 
2003 // r.s64[i] = a.s64[i] >= b.s64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2004 NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2005 #if defined(NLIB_SSE42)
2006  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
2007 #elif defined(NLIB_NEON) && defined(__aarch64__)
2008  return NLIB_CMP(vcgeq, s64, a, b, u64);
2009 #else
2010  return I128::Not(I128::CmpLtInt64(a, b));
2011 #endif
2012 }
2013 
2014 // r.u8[i] = a.u8[i] <= b.u8[i] ? 0xFF : 0
2015 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2016 #if defined(NLIB_SSE41)
2017  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2018 #elif defined(NLIB_NEON)
2019  return NLIB_CMP(vcleq, u8, a, b, u8);
2020 #endif
2021 }
2022 
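// Added note: the SSE paths of CmpLeUint*/CmpGeUint* use the identity
// a <= b <=> min(a, b) == a (and a >= b <=> max(a, b) == a), which needs one
// compare and no sign-bit bias.
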
2023 // r.u16[i] = a.u16[i] <= b.u16[i] ? 0xFFFF : 0
2024 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2025 #if defined(NLIB_SSE41)
2026  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2027 #elif defined(NLIB_NEON)
2028  return NLIB_CMP(vcleq, u16, a, b, u16);
2029 #endif
2030 }
2031 
2032 // r.u32[i] = a.u32[i] <= b.u32[i] ? 0xFFFFFFFF : 0
2033 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2034 #if defined(NLIB_SSE41)
2035  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2036 #elif defined(NLIB_NEON)
2037  return NLIB_CMP(vcleq, u32, a, b, u32);
2038 #endif
2039 }
2040 
2041 // r.u64[i] = a.u64[i] <= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2042 NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2043 #if defined(NLIB_SSE42)
2044  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2045  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2046  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2047 #elif defined(NLIB_NEON) && defined(__aarch64__)
2048  return NLIB_CMP(vcleq, u64, a, b, u64);
2049 #else
2050  return I128::Not(I128::CmpGtUint64(a, b));
2051 #endif
2052 }
2053 
2054 // r.u8[i] = a.u8[i] >= b.u8[i] ? 0xFF : 0
2055 NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2056 #if defined(NLIB_SSE41)
2057  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
2058 #elif defined(NLIB_NEON)
2059  return NLIB_CMP(vcgeq, u8, a, b, u8);
2060 #endif
2061 }
2062 
2063 // r.u16[i] = a.u16[i] >= b.u16[i] ? 0xFFFF : 0
2064 NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2065 #if defined(NLIB_SSE41)
2066  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
2067 #elif defined(NLIB_NEON)
2068  return NLIB_CMP(vcgeq, u16, a, b, u16);
2069 #endif
2070 }
2071 
2072 // r.u32[i] = a.u32[i] >= b.u32[i] ? 0xFFFFFFFF : 0
2073 NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
2074 #if defined(NLIB_SSE41)
2075  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
2076 #elif defined(NLIB_NEON)
2077  return NLIB_CMP(vcgeq, u32, a, b, u32);
2078 #endif
2079 }
2080 
2081 // r.u64[i] = a.u64[i] >= b.u64[i] ? 0xFFFFFFFFFFFFFFFF : 0
2082 NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
2083 #if defined(NLIB_SSE42)
2084  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2085  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
2086  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2087 #elif defined(NLIB_NEON) && defined(__aarch64__)
2088  return NLIB_CMP(vcgeq, u64, a, b, u64);
2089 #else
2090  return I128::Not(I128::CmpLtUint64(a, b));
2091 #endif
2092 }
2093 
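// CmpEqZero8 (added doc): r.i8[i] = value.i8[i] == 0 ? 0xFF : 0; the
// 16/32/64-bit variants likewise. AArch64 has dedicated vceqz instructions,
// other targets compare against zero.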
2094 NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
2095 #if defined(__aarch64__)
2096  return vceqzq_s8(value);
2097 #else
2098  return I128::CmpEq8(value, I128::SetZero());
2099 #endif
2100 }
2101 
2102 NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
2103 #if defined(__aarch64__)
2104  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
2105 #else
2106  return I128::CmpEq16(value, I128::SetZero());
2107 #endif
2108 }
2109 
2110 NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
2111 #if defined(__aarch64__)
2112  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
2113 #else
2114  return I128::CmpEq32(value, I128::SetZero());
2115 #endif
2116 }
2117 
2118 NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
2119 #if defined(__aarch64__)
2120  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
2121 #else
2122  return I128::CmpEq64(value, I128::SetZero());
2123 #endif
2124 }
2125 
2126 // r.u8[i] = value.u8[i] << count
2127 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2128 #if defined(NLIB_SSE41)
2129  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2130  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
2131  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
2132  return I128::NarrowFrom16To8(xl, xh);
2133 #elif defined(NLIB_NEON)
2134  return NLIB_SFT(vshlq, u8, value, count, s8);
2135 #endif
2136 }
2137 
2138 // r.u8[i] = value.u8[i] >> count
2139 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
2140 #if defined(NLIB_SSE41)
2141  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2142  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
2143  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
2144  return _mm_packus_epi16(xl, xh);
2145 #elif defined(NLIB_NEON)
2146  return NLIB_SFT(vshlq, u8, value, -count, s8);
2147 #endif
2148 }
2149 
2150 // r.s8[i] = value.s8[i] >> count
2151 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
2152 #if defined(NLIB_SSE41)
2153  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
2154  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
2155  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
2156  return _mm_packs_epi16(xl, xh);  // signed pack keeps negative shifted bytes; packus would clamp them to 0
2157 #elif defined(NLIB_NEON)
2158  return NLIB_SFT(vshlq, s8, value, -count, s8);
2159 #endif
2160 }
2161 
2162 // r.u16[i] = value.u16[i] << count
2163 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2164 #if defined(NLIB_SSE41)
2165  return _mm_slli_epi16(value, count);
2166 #elif defined(NLIB_NEON)
2167  return NLIB_SFT(vshlq, u16, value, count, s16);
2168 #endif
2169 }
2170 
2171 // r.u16[i] = value.u16[i] >> count
2172 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
2173 #if defined(NLIB_SSE41)
2174  return _mm_srli_epi16(value, count);
2175 #elif defined(NLIB_NEON)
2176  return NLIB_SFT(vshlq, u16, value, -count, s16);
2177 #endif
2178 }
2179 
2180 // r.s16[i] = value.s16[i] >> count
2181 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
2182 #if defined(NLIB_SSE41)
2183  return _mm_srai_epi16(value, count);
2184 #elif defined(NLIB_NEON)
2185  return NLIB_SFT(vshlq, s16, value, -count, s16);
2186 #endif
2187 }
2188 
2189 // r.u32[i] = value.u32[i] << count
2190 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2191 #if defined(NLIB_SSE41)
2192  return _mm_slli_epi32(value, count);
2193 #elif defined(NLIB_NEON)
2194  return NLIB_SFT(vshlq, u32, value, count, s32);
2195 #endif
2196 }
2197 
2198 // r.u32[i] = value.u32[i] >> count
2199 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
2200 #if defined(NLIB_SSE41)
2201  return _mm_srli_epi32(value, count);
2202 #elif defined(NLIB_NEON)
2203  return NLIB_SFT(vshlq, u32, value, -count, s32);
2204 #endif
2205 }
2206 
2207 // r.s32[i] = value.s32[i] >> count
2208 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
2209 #if defined(NLIB_SSE41)
2210  return _mm_srai_epi32(value, count);
2211 #elif defined(NLIB_NEON)
2212  return NLIB_SFT(vshlq, s32, value, -count, s32);
2213 #endif
2214 }
2215 
2216 // r.u64[i] = value.u64[i] << count
2217 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2218 #if defined(NLIB_SSE41)
2219  return _mm_slli_epi64(value, count);
2220 #elif defined(NLIB_NEON)
2221  return NLIB_SFT(vshlq, u64, value, count, s64);
2222 #endif
2223 }
2224 
2225 // r.u64[i] = value.u64[i] >> count
2226 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
2227 #if defined(NLIB_SSE41)
2228  return _mm_srli_epi64(value, count);
2229 #elif defined(NLIB_NEON)
2230  return NLIB_SFT(vshlq, u64, value, -count, s64);
2231 #endif
2232 }
2233 
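// Added note: NEON issues right shifts by a runtime count as vshlq with a
// negated count, and the SSE 8-bit variants widen to 16 bits, shift, and
// narrow back. Example (illustrative): halving every u16 lane.
inline i128 nlib_example_halve_u16(i128arg v) NLIB_NOEXCEPT {
 return I128::ShiftRightLogical16(v, 1);  // r.u16[i] = v.u16[i] >> 1
}
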
2234 template <size_t N>
2235 NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
2236  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2237 #ifdef NLIB_NEON
2238  return vshlq_n_s8(value, N);
2239 #else
2240  return I128::ShiftLeftLogical8(value, N);
2241 #endif
2242 }
2243 
2244 template <size_t N>
2245 NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
2246  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2247 #ifdef NLIB_NEON
2248  uint8x16_t tmp = vreinterpretq_u8_s8(value);
2249  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
2250 #else
2251  return I128::ShiftRightLogical8(value, N);
2252 #endif
2253 }
2254 
2255 template <size_t N>
2256 NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
2257  NLIB_STATIC_ASSERT(N >= 0 && N <= 8);
2258 #ifdef NLIB_NEON
2259  return vshrq_n_s8(value, N);
2260 #else
2261  return I128::ShiftRightArithmetic8(value, N);
2262 #endif
2263 }
2264 
2265 template <size_t N>
2266 NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
2267  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2268 #ifdef NLIB_NEON
2269  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2270  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
2271 #else
2272  return I128::ShiftLeftLogical16(value, N);
2273 #endif
2274 }
2275 
2276 template <size_t N>
2277 NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
2278  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2279 #ifdef NLIB_NEON
2280  uint16x8_t tmp = vreinterpretq_u16_s8(value);
2281  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
2282 #else
2283  return I128::ShiftRightLogical16(value, N);
2284 #endif
2285 }
2286 
2287 template <size_t N>
2288 NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
2289  NLIB_STATIC_ASSERT(N >= 0 && N <= 16);
2290 #ifdef NLIB_NEON
2291  int16x8_t tmp = vreinterpretq_s16_s8(value);
2292  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
2293 #else
2294  return I128::ShiftRightArithmetic16(value, N);
2295 #endif
2296 }
2297 
2298 template <size_t N>
2299 NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
2300  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2301 #ifdef NLIB_NEON
2302  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2303  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
2304 #else
2305  return I128::ShiftLeftLogical32(value, N);
2306 #endif
2307 }
2308 
2309 template <size_t N>
2310 NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
2311  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2312 #ifdef NLIB_NEON
2313  uint32x4_t tmp = vreinterpretq_u32_s8(value);
2314  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
2315 #else
2316  return I128::ShiftRightLogical32(value, N);
2317 #endif
2318 }
2319 
2320 template <size_t N>
2321 NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
2322  NLIB_STATIC_ASSERT(N >= 0 && N <= 32);
2323 #ifdef NLIB_NEON
2324  int32x4_t tmp = vreinterpretq_s32_s8(value);
2325  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
2326 #else
2327  return I128::ShiftRightArithmetic32(value, N);
2328 #endif
2329 }
2330 
2331 template <size_t N>
2332 NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
2333  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2334 #ifdef NLIB_NEON
2335  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2336  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
2337 #else
2338  return I128::ShiftLeftLogical64(value, N);
2339 #endif
2340 }
2341 
2342 template <size_t N>
2343 NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
2344  NLIB_STATIC_ASSERT(N >= 0 && N <= 64);
2345 #ifdef NLIB_NEON
2346  uint64x2_t tmp = vreinterpretq_u64_s8(value);
2347  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
2348 #else
2349  return I128::ShiftRightLogical64(value, N);
2350 #endif
2351 }
2352 
2353 #ifdef NLIB_NEON
2354 template<>
2355 NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
2356  NLIB_UNUSED(value);
2357  return I128::SetZero();
2358 }
2359 template<>
2360 NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
2361  return value;
2362 }
template<>
NLIB_M(i128) I128::ShiftRightArithmetic8<0>(i128arg value) NLIB_NOEXCEPT {
 // vshrq_n_s8 does not accept a shift count of 0, so specialize it away
 // like the logical shifts
 return value;
}
2363 template<>
2364 NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
2365  NLIB_UNUSED(value);
2366  return I128::SetZero();
2367 }
2368 template<>
2369 NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
2370  return value;
2371 }
2372 template<>
2373 NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
2374  return value;
2375 }
2376 template<>
2377 NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
2378  NLIB_UNUSED(value);
2379  return I128::SetZero();
2380 }
2381 template<>
2382 NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
2383  return value;
2384 }
2385 template<>
2386 NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
2387  return value;
2388 }
2389 template<>
2390 NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
2391  NLIB_UNUSED(value);
2392  return I128::SetZero();
2393 }
2394 template<>
2395 NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
2396  return value;
2397 }
2398 #endif
2399 
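// Added note: the specializations above exist because the NEON immediate-shift
// intrinsics restrict their immediates (vshlq_n_*: 0..bits-1, vshrq_n_*:
// 1..bits), so the boundary counts are folded to "all zero" or "identity".
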
2400 template <size_t N>
2401 // r.i8[i + N] = r.i8[i], fill zero
2402 NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
2403  NLIB_STATIC_ASSERT(N < 16);
2404 #if defined(NLIB_SSE41)
2405  return _mm_slli_si128(value, N);
2406 #elif defined(NLIB_NEON)
2407  return N == 0 ? value : vextq_s8(vdupq_n_s8(0), value, (16 - N) & 15);  // vextq_s8 immediate must stay in [0, 15]
2408 #endif
2409 }
2410 
2411 template <size_t N>
2412 // r.i8[i - N] = value.i8[i], the high N bytes are filled with zero
2413 NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
2414  NLIB_STATIC_ASSERT(N < 16);
2415 #if defined(NLIB_SSE41)
2416  return _mm_srli_si128(value, N);
2417 #elif defined(NLIB_NEON)
2418  return vextq_s8(value, vdupq_n_s8(0), N);
2419 #endif
2420 }
2421 
2422 template <size_t N>
2423 // r.i8[i] = value.i8[(i + N) % 16], bytes rotate toward the lower indices
2424 NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
2425  NLIB_STATIC_ASSERT(N < 16);
2426 #if defined(NLIB_SSE41)
2427  return _mm_alignr_epi8(value, value, N);
2428 #elif defined(NLIB_NEON)
2429  return vextq_s8(value, value, N);
2430 #endif
2431 }
2432 
2433 template <size_t N>
2434 // _mm_alignr_epi8(a, b, N), which returns ((a << 128) | b) >> (N * 8)
2435 NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
2436  NLIB_STATIC_ASSERT(N < 16);
2437 #if defined(NLIB_SSE41)
2438  return _mm_alignr_epi8(a, b, N);
2439 #elif defined(NLIB_NEON)
2440  return vextq_s8(b, a, N);
2441 #endif
2442 }
2443 
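// Added usage sketch (nlib_example_window is illustrative, not part of the
// original API): AlignR slides a 16-byte window across two adjacent aligned
// loads; here the result holds bytes 3..18 of the buffer at p.
inline i128 nlib_example_window(const uint8_t* p) NLIB_NOEXCEPT {
 i128 lo = I128::LoadA16(p);        // bytes 0..15
 i128 hi = I128::LoadA16(p + 16);   // bytes 16..31
 return I128::AlignR<3>(hi, lo);    // ((hi:lo) >> 24) -> bytes 3..18
}
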
2444 // r.u8[i] = lo.u16[i] & 0xFF, r.u8[i + 8] = hi.u16[i] & 0xFF
2445 NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2446 #if defined(NLIB_SSE41)
2447  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2448  __m128i lo_mask = _mm_and_si128(lo, mask);
2449  __m128i hi_mask = _mm_and_si128(hi, mask);
2450  return _mm_packus_epi16(lo_mask, hi_mask);
2451 #elif defined(NLIB_NEON)
2452 # ifdef __aarch64__
2453  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2454  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2455 # else
2456  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
2457  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
2458  return NLIB_CMB(u8, l, h);
2459 # endif
2460 #endif
2461 }
2462 
2463 // r.u16[i] = lo.u32[i] & 0xFFFF, r.u16[i + 4] = hi.u32[i] & 0xFFFF
2464 NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2465 #if defined(NLIB_SSE41)
2466  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2467  __m128i lo_mask = _mm_and_si128(lo, mask);
2468  __m128i hi_mask = _mm_and_si128(hi, mask);
2469  return _mm_packus_epi32(lo_mask, hi_mask);
2470 #elif defined(NLIB_NEON)
2471 # ifdef __aarch64__
2472  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2473  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2474 # else
2475  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
2476  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
2477  return NLIB_CMB(u16, l, h);
2478 # endif
2479 #endif
2480 }
2481 
2482 // r.u32[i] = lo.u64[i] & 0xFFFFFFFF, r.u32[i + 2] = hi.u64[i] & 0xFFFFFFFF
2483 NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2484 #if defined(NLIB_SSE41)
2485  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
2486  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
2487  return _mm_unpacklo_epi64(lo_, hi_);
2488 #elif defined(NLIB_NEON)
2489 # ifdef __aarch64__
2490  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2491  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
2492 # else
2493  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
2494  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
2495  return NLIB_CMB(u32, l, h);
2496 # endif
2497 #endif
2498 }
2499 
2500 // r.u8[i] = min(lo.u16[i], 255), r.u8[i + 8] = min(hi.u16[i], 255)
2501 NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2502 #if defined(NLIB_SSE41)
2503  i128 b00FF = I128::SetValue(0x00FFU, each_uint16);
2504  __m128i lotmp = _mm_min_epu16(lo, b00FF);  // clamp first: packus alone treats lanes as signed
2505  __m128i hitmp = _mm_min_epu16(hi, b00FF);
2506  return _mm_packus_epi16(lotmp, hitmp);
2507 #elif defined(NLIB_NEON)
2508 # ifdef __aarch64__
2509  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2510  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
2511 # else
2512  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
2513  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
2514  return NLIB_CMB(u8, l, h);
2515 # endif
2516 #endif
2517 }
2518 
2519 // r.s8[i] = clamp(lo.s16[i], -128, 127), r.s8[i + 8] = clamp(hi.s16[i], -128, 127)
2520 NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2521 #if defined(NLIB_SSE41)
2522  return _mm_packs_epi16(lo, hi);
2523 #elif defined(NLIB_NEON)
2524 # ifdef __aarch64__
2525  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2526  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
2527 # else
2528  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
2529  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
2530  return NLIB_CMB(s8, l, h);
2531 # endif
2532 #endif
2533 }
2534 
2535 // r.u16[i] = min(lo.u32[i], 65535), r.u16[i + 4] = min(hi.u32[i], 65535)
2536 NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2537 #if defined(NLIB_SSE41)
2538  i128 b0000FFFF = I128::SetValue(0xFFFFU, each_uint32);
2539  __m128i lotmp = _mm_min_epu32(lo, b0000FFFF);  // clamp first: packus alone treats lanes as signed
2540  __m128i hitmp = _mm_min_epu32(hi, b0000FFFF);
2541  return _mm_packus_epi32(lotmp, hitmp);
2542 #elif defined(NLIB_NEON)
2543 # ifdef __aarch64__
2544  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2545  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
2546 # else
2547  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
2548  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
2549  return NLIB_CMB(u16, l, h);
2550 # endif
2551 #endif
2552 }
2553 
2554 // r.s16[i] = clamp(lo.s32[i], -32768, 32767), r.s16[i + 4] = clamp(hi.s32[i], -32768, 32767)
2555 NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
2556 #if defined(NLIB_SSE41)
2557  return _mm_packs_epi32(lo, hi);
2558 #elif defined(NLIB_NEON)
2559 # ifdef __aarch64__
2560  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2561  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
2562 # else
2563  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
2564  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
2565  return NLIB_CMB(s16, l, h);
2566 # endif
2567 #endif
2568 }
2569 
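// Added usage sketch (nlib_example_pack_s32_to_s8 is illustrative, not part of
// the original API): four s32 vectors narrow to 16 saturated signed bytes in
// two steps using the converters above.
inline i128 nlib_example_pack_s32_to_s8(i128arg x0, i128arg x1, i128arg x2,
                                        i128arg x3) NLIB_NOEXCEPT {
 i128 w01 = I128::ConvertFromInt32ToInt16Saturated(x0, x1);
 i128 w23 = I128::ConvertFromInt32ToInt16Saturated(x2, x3);
 return I128::ConvertFromInt16ToInt8Saturated(w01, w23);
}
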
2570 // r.s8[2 * i] = value.s8[i], r.u8[2 * i + 1] = value.s8[i] < 0 ? 0xFF : 0
2571 NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
2572 #if defined(NLIB_SSE41)
2573  return _mm_cvtepi8_epi16(value);
2574 #elif defined(NLIB_NEON)
2575  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
2576 #endif
2577 }
2578 
2579 // r.s8[2 * i] = value.s8[i + 8], r.u8[2 * i + 1] = value.s8[i + 8] < 0 ? 0xFF : 0
2580 NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
2581 #if defined(NLIB_SSE41)
2582  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2583 #elif defined(NLIB_NEON)
2584 # ifdef __aarch64__
2585  int16x8_t result = vmovl_high_s8(value);
2586 # else
2587  int16x8_t result = vmovl_s8(vget_high_s8(value));
2588 # endif
2589  return vreinterpretq_s8_s16(result);
2590 #endif
2591 }
2592 
2593 // r.s16[2 * i] = value.s16[i], r.u16[2 * i + 1] = value.s16[i] < 0 ? 0xFFFF : 0
2594 NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
2595 #if defined(NLIB_SSE41)
2596  return _mm_cvtepi16_epi32(value);
2597 #elif defined(NLIB_NEON)
2598  int16x8_t x = vreinterpretq_s16_s8(value);
2599  int32x4_t result = vmovl_s16(vget_low_s16(x));
2600  return vreinterpretq_s8_s32(result);
2601 #endif
2602 }
2603 
2604 // r.s16[2 * i] = value.s16[i + 4], r.u16[2 * i + 1] = value.s16[i + 4] < 0 ? 0xFFFF : 0
2605 NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
2606 #if defined(NLIB_SSE41)
2607  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2608 #elif defined(NLIB_NEON)
2609  int16x8_t x = vreinterpretq_s16_s8(value);
2610 # ifdef __aarch64__
2611  int32x4_t result = vmovl_high_s16(x);
2612 # else
2613  int32x4_t result = vmovl_s16(vget_high_s16(x));
2614 # endif
2615  return vreinterpretq_s8_s32(result);
2616 #endif
2617 }
2618 
2619 // r.s32[2 * i] = value.s32[i], r.u32[2 * i + 1] = value.s32[i] < 0 ? 0xFFFFFFFF : 0
2620 NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
2621 #if defined(NLIB_SSE41)
2622  return _mm_cvtepi32_epi64(value);
2623 #elif defined(NLIB_NEON)
2624  int32x4_t x = vreinterpretq_s32_s8(value);
2625  int64x2_t result = vmovl_s32(vget_low_s32(x));
2626  return vreinterpretq_s8_s64(result);
2627 #endif
2628 }
2629 
2630 // r.s32[2 * i] = value.s32[i + 2], r.u32[2 * i + 1] = value.s32[i + 2] < 0 ? 0xFFFFFFFF : 0
2631 NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
2632 #if defined(NLIB_SSE41)
2633  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2634 #elif defined(NLIB_NEON)
2635  int32x4_t x = vreinterpretq_s32_s8(value);
2636 # ifdef __aarch64__
2637  int64x2_t result = vmovl_high_s32(x);
2638 # else
2639  int64x2_t result = vmovl_s32(vget_high_s32(x));
2640 # endif
2641  return vreinterpretq_s8_s64(result);
2642 #endif
2643 }
2644 
2645 // r.u16[i] = value.u8[i]
2646 NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
2647 #if defined(NLIB_SSE41)
2648  return _mm_cvtepu8_epi16(value);
2649 #elif defined(NLIB_NEON)
2650  uint8x16_t x = vreinterpretq_u8_s8(value);
2651  uint16x8_t result = vmovl_u8(vget_low_u8(x));
2652  return vreinterpretq_s8_u16(result);
2653 #endif
2654 }
2655 
2656 // r.u16[i] = value.u8[i + 8]
2657 NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
2658 #if defined(NLIB_SSE41)
2659  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2660 #elif defined(NLIB_NEON)
2661  uint8x16_t x = vreinterpretq_u8_s8(value);
2662 # ifdef __aarch64__
2663  uint16x8_t result = vmovl_high_u8(x);
2664 # else
2665  uint16x8_t result = vmovl_u8(vget_high_u8(x));
2666 # endif
2667  return vreinterpretq_s8_u16(result);
2668 #endif
2669 }
2670 
2671 // r.u32[i] = value.u16[i]
2672 NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
2673 #if defined(NLIB_SSE41)
2674  return _mm_cvtepu16_epi32(value);
2675 #elif defined(NLIB_NEON)
2676  uint16x8_t x = vreinterpretq_u16_s8(value);
2677  uint32x4_t result = vmovl_u16(vget_low_u16(x));
2678  return vreinterpretq_s8_u32(result);
2679 #endif
2680 }
2681 
2682 // r.u32[i] = value.u16[i + 4]
2683 NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
2684 #if defined(NLIB_SSE41)
2685  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2686 #elif defined(NLIB_NEON)
2687  uint16x8_t x = vreinterpretq_u16_s8(value);
2688 # ifdef __aarch64__
2689  uint32x4_t result = vmovl_high_u16(x);
2690 # else
2691  uint32x4_t result = vmovl_u16(vget_high_u16(x));
2692 # endif
2693  return vreinterpretq_s8_u32(result);
2694 #endif
2695 }
2696 
2697 // r.u64[i] = value.u32[i]
2698 NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
2699 #if defined(NLIB_SSE41)
2700  return _mm_cvtepu32_epi64(value);
2701 #elif defined(NLIB_NEON)
2702  uint32x4_t x = vreinterpretq_u32_s8(value);
2703  uint64x2_t result = vmovl_u32(vget_low_u32(x));
2704  return vreinterpretq_s8_u64(result);
2705 #endif
2706 }
2707 
2708 // r.u64[i] = value.u32[i + 2]
2709 NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
2710 #if defined(NLIB_SSE41)
2711  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
2712 #elif defined(NLIB_NEON)
2713  uint32x4_t x = vreinterpretq_u32_s8(value);
2714 # ifdef __aarch64__
2715  uint64x2_t result = vmovl_high_u32(x);
2716 # else
2717  uint64x2_t result = vmovl_u32(vget_high_u32(x));
2718 # endif
2719  return vreinterpretq_s8_u64(result);
2720 #endif
2721 }
2722 
2723 // r.i8[2 * i] = a.i8[i], r.i8[2 * i + 1] = b.i8[i] (i = 0..7)
2724 NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2725 #if defined(NLIB_SSE41)
2726  return _mm_unpacklo_epi8(a, b);
2727 #elif defined(NLIB_NEON)
2728 # ifdef __aarch64__
2729  return vzip1q_s8(a, b);
2730 # else
2731  return vzipq_s8(a, b).val[0];
2732 # endif
2733 #endif
2734 }
2735 
2736 // r.i8[2 * i] = a.i8[i + 8], r.i8[2 * i + 1] = b.i8[i + 8] (i = 0..7)
2737 NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2738 #if defined(NLIB_SSE41)
2739  return _mm_unpackhi_epi8(a, b);
2740 #elif defined(NLIB_NEON)
2741 # ifdef __aarch64__
2742  return vzip2q_s8(a, b);
2743 # else
2744  return vzipq_s8(a, b).val[1];
2745 # endif
2746 #endif
2747 }
2748 
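// Unzip8Lo/Unzip8Hi (added doc): Lo gathers the even-indexed and Hi the
// odd-indexed 8-bit elements of the concatenation (a, b); Unzip16* and
// Unzip32* below do the same at 16- and 32-bit width.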
2749 NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2750 #if defined(NLIB_SSE41)
2751  i128 mask = I128::SetValue(0x00FFU, each_uint16);
2752  __m128i lo_mask = _mm_and_si128(a, mask);
2753  __m128i hi_mask = _mm_and_si128(b, mask);
2754  return _mm_packus_epi16(lo_mask, hi_mask);
2755 #elif defined(NLIB_NEON)
2756 # ifdef __aarch64__
2757  return vuzp1q_s8(a, b);
2758 # else
2759  return vuzpq_s8(a, b).val[0];
2760 # endif
2761 #endif
2762 }
2763 
2764 NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2765 #if defined(NLIB_SSE41)
2766  i128 mask = I128::SetValue(0xFF00U, each_uint16);
2767  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
2768  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
2769  return _mm_packus_epi16(lo_mask, hi_mask);
2770 #elif defined(NLIB_NEON)
2771 # ifdef __aarch64__
2772  return vuzp2q_s8(a, b);
2773 # else
2774  return vuzpq_s8(a, b).val[1];
2775 # endif
2776 #endif
2777 }
2778 
2779 // r.i16[2 * i] = a.i16[i], r.i16[2 * i + 1] = b.i16[i] (i = 0..3)
2780 NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2781 #if defined(NLIB_SSE41)
2782  return _mm_unpacklo_epi16(a, b);
2783 #elif defined(NLIB_NEON)
2784 # ifdef __aarch64__
2785  return NLIB_OP2(vzip1q, u16, a, b);
2786 # else
2787  return vreinterpretq_s8_u16(vzipq_u16(
2788  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2789 # endif
2790 #endif
2791 }
2792 
2793 // r.i16[2 * i] = a.i16[i + 4], r.i16[2 * i + 1] = b.i16[i + 4] (i = 0..3)
2794 NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2795 #if defined(NLIB_SSE41)
2796  return _mm_unpackhi_epi16(a, b);
2797 #elif defined(NLIB_NEON)
2798 # ifdef __aarch64__
2799  return NLIB_OP2(vzip2q, u16, a, b);
2800 # else
2801  return vreinterpretq_s8_u16(vzipq_u16(
2802  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2803 # endif
2804 #endif
2805 }
2806 
2807 NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2808 #if defined(NLIB_SSE41)
2809  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
2810  __m128i lo_mask = _mm_and_si128(a, mask);
2811  __m128i hi_mask = _mm_and_si128(b, mask);
2812  return _mm_packus_epi32(lo_mask, hi_mask);
2813 #elif defined(NLIB_NEON)
2814 # ifdef __aarch64__
2815  return NLIB_OP2(vuzp1q, u16, a, b);
2816 # else
2817  return vreinterpretq_s8_u16(vuzpq_u16(
2818  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
2819 # endif
2820 #endif
2821 }
2822 
2823 NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2824 #if defined(NLIB_SSE41)
2825  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
2826  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
2827  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
2828  return _mm_packus_epi32(lo_mask, hi_mask);
2829 #elif defined(NLIB_NEON)
2830 # ifdef __aarch64__
2831  return NLIB_OP2(vuzp2q, u16, a, b);
2832 # else
2833  return vreinterpretq_s8_u16(vuzpq_u16(
2834  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
2835 # endif
2836 #endif
2837 }
2838 
2839 // r.i32[2 * i] = a.i32[i], r.i32[2 * i + 1] = b.i32[i] (i = 0, 1)
2840 NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2841 #if defined(NLIB_SSE41)
2842  return _mm_unpacklo_epi32(a, b);
2843 #elif defined(NLIB_NEON)
2844 # ifdef __aarch64__
2845  return NLIB_OP2(vzip1q, u32, a, b);
2846 # else
2847  return vreinterpretq_s8_u32(vzipq_u32(
2848  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2849 # endif
2850 #endif
2851 }
2852 
2853 // r.i32[2 * i] = a.i32[i + 2], r.i32[2 * i + 1] = b.i32[i + 2] (i = 0, 1)
2854 NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2855 #if defined(NLIB_SSE41)
2856  return _mm_unpackhi_epi32(a, b);
2857 #elif defined(NLIB_NEON)
2858 # ifdef __aarch64__
2859  return NLIB_OP2(vzip2q, u32, a, b);
2860 # else
2861  return vreinterpretq_s8_u32(vzipq_u32(
2862  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2863 # endif
2864 #endif
2865 }
2866 
2867 NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
2868 #if defined(NLIB_SSE41)
2869  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
2870  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
2871  return _mm_blend_epi16(x0, x1, 0xF0);
2872 #elif defined(NLIB_NEON)
2873 # ifdef __aarch64__
2874  return NLIB_OP2(vuzp1q, u32, a, b);
2875 # else
2876  return vreinterpretq_s8_u32(vuzpq_u32(
2877  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
2878 # endif
2879 #endif
2880 }
2881 
2882 NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
2883 #if defined(NLIB_SSE41)
2884  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
2885  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
2886  return _mm_blend_epi16(x0, x1, 0xF0);
2887 #elif defined(NLIB_NEON)
2888 # ifdef __aarch64__
2889  return NLIB_OP2(vuzp2q, u32, a, b);
2890 # else
2891  return vreinterpretq_s8_u32(vuzpq_u32(
2892  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
2893 # endif
2894 #endif
2895 }
2896 
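// Added note: Zip and Unzip are inverses at each width, e.g.
// Unzip8Lo(Zip8Lo(a, b), Zip8Hi(a, b)) == a and
// Unzip8Hi(Zip8Lo(a, b), Zip8Hi(a, b)) == b,
// the usual interleave/deinterleave pair for AoS-style byte data.
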
2897 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
2898  int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
2899 NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
2900 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2901  return __builtin_shufflevector(
2902  a, b,
2903  V0, V1, V2, V3, V4, V5, V6, V7,
2904  V8, V9, V10, V11, V12, V13, V14, V15);
2905 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2906  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
2907  V0, V1, V2, V3, V4, V5, V6, V7,
2908  V8, V9, V10, V11, V12, V13, V14, V15);
2909 #else
2910  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2911  (V0 < 0 || V0 > 15) ? -128 : V0,
2912  (V1 < 0 || V1 > 15) ? -128 : V1,
2913  (V2 < 0 || V2 > 15) ? -128 : V2,
2914  (V3 < 0 || V3 > 15) ? -128 : V3,
2915  (V4 < 0 || V4 > 15) ? -128 : V4,
2916  (V5 < 0 || V5 > 15) ? -128 : V5,
2917  (V6 < 0 || V6 > 15) ? -128 : V6,
2918  (V7 < 0 || V7 > 15) ? -128 : V7,
2919  (V8 < 0 || V8 > 15) ? -128 : V8,
2920  (V9 < 0 || V9 > 15) ? -128 : V9,
2921  (V10 < 0 || V10 > 15) ? -128 : V10,
2922  (V11 < 0 || V11 > 15) ? -128 : V11,
2923  (V12 < 0 || V12 > 15) ? -128 : V12,
2924  (V13 < 0 || V13 > 15) ? -128 : V13,
2925  (V14 < 0 || V14 > 15) ? -128 : V14,
2926  (V15 < 0 || V15 > 15) ? -128 : V15
2927  };
2928  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2929  V0 < 16 ? -128 : (V0 - 16),
2930  V1 < 16 ? -128 : (V1 - 16),
2931  V2 < 16 ? -128 : (V2 - 16),
2932  V3 < 16 ? -128 : (V3 - 16),
2933  V4 < 16 ? -128 : (V4 - 16),
2934  V5 < 16 ? -128 : (V5 - 16),
2935  V6 < 16 ? -128 : (V6 - 16),
2936  V7 < 16 ? -128 : (V7 - 16),
2937  V8 < 16 ? -128 : (V8 - 16),
2938  V9 < 16 ? -128 : (V9 - 16),
2939  V10 < 16 ? -128 : (V10 - 16),
2940  V11 < 16 ? -128 : (V11 - 16),
2941  V12 < 16 ? -128 : (V12 - 16),
2942  V13 < 16 ? -128 : (V13 - 16),
2943  V14 < 16 ? -128 : (V14 - 16),
2944  V15 < 16 ? -128 : (V15 - 16)
2945  };
2946  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2947  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
2948  return I128::Or(tmp_a, tmp_b);
2949 #endif
2950 }
2951 
2952 template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
2953 NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
2954 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
2955  return vreinterpretq_s8_u16(__builtin_shufflevector(
2956  vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
2957  V0, V1, V2, V3, V4, V5, V6, V7));
2958 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
2959  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
2960  V0, V1, V2, V3, V4, V5, V6, V7);
2961 #else
2962  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
2963  (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
2964  (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
2965  (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
2966  (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
2967  (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
2968  (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
2969  (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
2970  (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
2971  (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
2972  (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
2973  (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
2974  (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
2975  (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
2976  (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
2977  (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
2978  (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
2979  };
2980  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
2981  V0 < 8 ? -128 : (V0 - 8) * 2,
2982  V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
2983  V1 < 8 ? -128 : (V1 - 8) * 2,
2984  V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
2985  V2 < 8 ? -128 : (V2 - 8) * 2,
2986  V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
2987  V3 < 8 ? -128 : (V3 - 8) * 2,
2988  V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
2989  V4 < 8 ? -128 : (V4 - 8) * 2,
2990  V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
2991  V5 < 8 ? -128 : (V5 - 8) * 2,
2992  V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
2993  V6 < 8 ? -128 : (V6 - 8) * 2,
2994  V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
2995  V7 < 8 ? -128 : (V7 - 8) * 2,
2996  V7 < 8 ? -128 : (V7 - 8) * 2 + 1
2997  };
2998  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
2999  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
3000  return I128::Or(tmp_a, tmp_b);
3001 #endif
3002 }
3003 
3004 template<int V0, int V1, int V2, int V3>
3005 NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
3006 #if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
3007  return vreinterpretq_s8_u32(__builtin_shufflevector(
3008  vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
3009  V0, V1, V2, V3));
3010 #elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
3011  return __builtin_shufflevector((__v4si)a, (__v4si)b,
3012  V0, V1, V2, V3);
3013 #else
3014  NLIB_ALIGNAS(16) int8_t mask_a[16] = {
3015  (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
3016  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
3017  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
3018  (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
3019  (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
3020  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
3021  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
3022  (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
3023  (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
3024  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
3025  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
3026  (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
3027  (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
3028  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
3029  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
3030  (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
3031  };
3032  NLIB_ALIGNAS(16) int8_t mask_b[16] = {
3033  V0 < 4 ? -128 : (V0 - 4) * 4,
3034  V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
3035  V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
3036  V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
3037  V1 < 4 ? -128 : (V1 - 4) * 4,
3038  V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
3039  V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
3040  V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
3041  V2 < 4 ? -128 : (V2 - 4) * 4,
3042  V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
3043  V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
3044  V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
3045  V3 < 4 ? -128 : (V3 - 4) * 4,
3046  V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
3047  V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
3048  V3 < 4 ? -128 : (V3 - 4) * 4 + 3
3049  };
3050  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
3051  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
3052  return I128::Or(tmp_a, tmp_b);
3053 #endif
3054 }
3055 
3056 
3057 // 0123456789abcdef -> 1032547698badcfe
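// Added note: the portable Permute fallback above builds two Shuffle8 control
// masks; a selector byte with the high bit set (-128) produces zero for that
// lane (pshufb/vtbl convention), so OR-ing the two shuffled vectors merges
// the lanes taken from a with the lanes taken from b.
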
3058 NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
3059 #if defined(NLIB_SSE41)
3060  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3061  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
3062  };
3063  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3064 #elif defined(NLIB_NEON)
3065  return NLIB_OP1(vrev16q, u8, value);
3066 #endif
3067 }
3068 
3069 // 0123456789abcdef -> 32107654ba98fedc
3070 NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
3071 #if defined(NLIB_SSE41)
3072  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3073  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
3074  };
3075  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3076 #elif defined(NLIB_NEON)
3077  return NLIB_OP1(vrev32q, u8, value);
3078 #endif
3079 }
3080 
3081 // 0123456789abcdef -> 76543210fedcba98
3082 NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
3083 #if defined(NLIB_SSE41)
3084  NLIB_ALIGNAS(16) const int8_t mask_[16] = {
3085  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
3086  };
3087  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
3088 #elif defined(NLIB_NEON)
3089  return NLIB_OP1(vrev64q, u8, value);
3090 #endif
3091 }
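// Usage sketch (editor's illustration): ReverseN swaps the bytes inside each
// N-bit lane, which is exactly an endianness conversion for packed integers.
// StoreA16 is assumed here to be the 16-byte-aligned store matching LoadA16.
static inline void ExampleSwapEndian32(void* p) NLIB_NOEXCEPT {
    // p must point to 16 bytes with 16-byte alignment (four uint32_t values)
    I128::StoreA16(p, I128::Reverse32(I128::LoadA16(p)));  // StoreA16 assumed
}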
3092 
3093 // r = Or(ismasked(value.u8[i]) ? (1 << i) : 0)
3094 NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT { // NOLINT
3095 #if defined(NLIB_SSE41)
3096  return _mm_movemask_epi8(value);
3097 #elif defined(NLIB_NEON)
3098  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
3099  uint8x16_t a = vandq_u8(vreinterpretq_u8_s8(value), powers);
3100 # ifdef __aarch64__
3101  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
3102 # else
3103  uint8x8_t al = vget_low_u8(a);
3104  uint8x8_t ah = vget_high_u8(a);
3105  uint8x8_t tmp = vpadd_u8(al, ah);
3106  tmp = vpadd_u8(tmp, tmp);
3107  tmp = vpadd_u8(tmp, tmp);
3108  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
3109 # endif
3110 #endif
3111 }
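// Usage sketch (editor's illustration): MoveMask8 turns a byte-lane mask into
// a 16-bit scalar, one bit per lane. Combined with a byte-equality compare
// (I128::CmpEq8 is assumed to exist with the obvious meaning) it gives the
// classic "find a byte in a 16-byte block" idiom.
static inline int ExampleFindByte(const void* p, uint8_t c) NLIB_NOEXCEPT {
    i128 eq = I128::CmpEq8(I128::LoadA16(p), I128::SetValue(c, each_uint8));
    int mask = I128::MoveMask8(eq);  // bit i is set iff byte i matched
    return mask != 0 ? nlib_ctz(static_cast<uint32_t>(mask)) : -1;
}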
3112 
3113 // r = Or(ismasked(value.u16[i]) ? (1 << i) : 0)
3114 NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT { // NOLINT
3115 #if defined(NLIB_SSE41)
3116  __m128i tmp = _mm_packs_epi16(value, value);
3117  return _mm_movemask_epi8(tmp) & 255;
3118 #elif defined(NLIB_NEON)
3119  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3120  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3121  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3122  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
3123 # ifdef __aarch64__
3124  return vaddvq_u16(a);
3125 # else
3126  uint8x8_t tmp = vmovn_u16(a);
3127  tmp = vpadd_u8(tmp, tmp);
3128  tmp = vpadd_u8(tmp, tmp);
3129  tmp = vpadd_u8(tmp, tmp);
3130  return vget_lane_u8(tmp, 0);
3131 # endif
3132 #endif
3133 }
3134 
3135 // r = Or(ismasked(value.u32[i]) ? (1 << i) : 0)
3136 NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT { // NOLINT
3137 #if defined(NLIB_SSE41)
3138  __m128i tmp = _mm_packs_epi16(value, value);
3139  tmp = _mm_packs_epi16(tmp, tmp);
3140  return _mm_movemask_epi8(tmp) & 15;
3141 #elif defined(NLIB_NEON)
3142  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3143  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3144  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3145  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
3146 # ifdef __aarch64__
3147  return vaddvq_u32(a);
3148 # else
3149  uint16x4_t tmp = vmovn_u32(a);
3150  tmp = vpadd_u16(tmp, tmp);
3151  tmp = vpadd_u16(tmp, tmp);
3152  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
3153 # endif
3154 #endif
3155 }
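// Usage sketch (editor's illustration): the low four bits of MoveMask32 map
// one-to-one onto the 32-bit lanes, so testing a single lane of a lane mask
// is a bit test on the scalar result.
static inline bool ExampleLaneMasked32(i128arg mask, int lane) NLIB_NOEXCEPT {
    return (I128::MoveMask32(mask) & (1 << lane)) != 0;  // lane in [0, 3]
}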
3156 
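// r.u8[i] = (mask & (1 << i)) ? 0xFF : 0x00 (inverse of MoveMask8)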
3157 NLIB_M(i128) I128::SetMask8(int mask) NLIB_NOEXCEPT {
3158 #if defined(NLIB_NEON)
3159  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
3160  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
3161  int8x8_t s1 = vdup_n_s8(mask >> 8);
3162  return vreinterpretq_s8_u8(vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1)));
3163 #elif defined(NLIB_SSE41)
3164  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
3165  i128 s0 = I128::SetValue(static_cast<int8_t>(mask & 0xFF), each_int8);
3166  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
3167  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
3168  return I128::Test8(m, s);
3169 #endif
3170 }
3171 
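// r.u16[i] = (mask & (1 << i)) ? 0xFFFF : 0 (inverse of MoveMask16)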
3172 NLIB_M(i128) I128::SetMask16(int mask) NLIB_NOEXCEPT {
3173 #if defined(NLIB_NEON)
3174  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
3175  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
3176  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
3177  uint16x8_t s = vdupq_n_u16(mask);
3178  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
3179 #elif defined(NLIB_SSE41)
3180  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
3181  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
3182  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3183  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
3184  return I128::Test16(m, s);
3185 #endif
3186 }
3187 
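// r.u32[i] = (mask & (1 << i)) ? 0xFFFFFFFF : 0 (inverse of MoveMask32)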
3188 NLIB_M(i128) I128::SetMask32(int mask) NLIB_NOEXCEPT {
3189 #if defined(NLIB_NEON)
3190  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
3191  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
3192  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
3193  uint32x4_t s = vdupq_n_u32(mask);
3194  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
3195 #elif defined(NLIB_SSE41)
3196  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
3197  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
3198  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
3199  i128 s = I128::SetValue(mask, each_int32);
3200  return I128::Test32(m, s);
3201 #endif
3202 }
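// Usage sketch (editor's illustration): SetMaskN and MoveMaskN are inverses,
// so the round trip below reproduces the original scalar mask.
static inline bool ExampleMaskRoundTrip32(int mask) NLIB_NOEXCEPT {
    // only the low 4 bits participate, hence the & 15 on the right-hand side
    return I128::MoveMask32(I128::SetMask32(mask)) == (mask & 15);
}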
3203 
3204 // true if value == 0
3205 NLIB_M(bool) I128::IsZero(i128arg value) NLIB_NOEXCEPT { // NOLINT
3206 #if defined(NLIB_SSE41)
3207  return _mm_testz_si128(value, value) != 0;
3208 #elif defined(NLIB_NEON)
3209 # ifdef __aarch64__
3210  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
3211  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3212 # else
3213  // NOTE: there is a better way
3214  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics // NOLINT
3215  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
3216  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
3217 # endif
3218 #endif
3219 }
3220 
3221 // true if all bits are set
3222 NLIB_M(bool) I128::IsFull(i128arg value) NLIB_NOEXCEPT { // NOLINT
3223 #if defined(NLIB_SSE41)
3224  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
3225 #elif defined(NLIB_NEON)
3226 # ifdef __aarch64__
3227  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
3228  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
3229 # else
3230  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
3231  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
3232 # endif
3233 #endif
3234 }
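// Usage sketch (editor's illustration): whole-register equality can be built
// from IsZero; I128::Xor is assumed here to exist alongside the Or used above.
static inline bool ExampleEqual128(i128arg a, i128arg b) NLIB_NOEXCEPT {
    return I128::IsZero(I128::Xor(a, b));  // a == b iff a XOR b is all zero
}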
3235 
3236 // r.i8[i] = ismasked(mask.i8[i]) ? a.i8[i] : b.i8[i]
3237 NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
3238 #if defined(NLIB_SSE41)
3239  return _mm_blendv_epi8(b, a, mask);
3240 #elif defined(NLIB_NEON)
3241  return NLIB_OP3(vbslq, u32, mask, a, b);
3242 #endif
3243 }
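// Usage sketch (editor's illustration): Select is a branchless per-byte
// conditional, so any SetMask8 pattern becomes a merge rule. 0x5555 keeps the
// even-indexed bytes of a and the odd-indexed bytes of b.
static inline i128 ExampleMergeEvenOdd(i128arg a, i128arg b) NLIB_NOEXCEPT {
    return I128::Select(I128::SetMask8(0x5555), a, b);
}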
3244 
3245 // r[i] = 0 if shuffle.u8[i] == 0x80, r[i] = value[shuffle.u8[i]] if 0 <= shuffle.u8[i] < 16
3246 NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
3247 #if defined(NLIB_SSE41)
3248  return _mm_shuffle_epi8(value, shuffle);
3249 #elif defined(NLIB_NEON)
3250 # ifdef __aarch64__
3251  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
3252 # else
3253  int8x8x2_t x;
3254  x.val[0] = vget_low_s8(value);
3255  x.val[1] = vget_high_s8(value);
3256  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
3257  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
3258  return vcombine_s8(lo, hi);
3259 # endif
3260 #endif
3261 }
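// Usage sketch (editor's illustration): Shuffle8 is the general byte permute
// underlying the ReverseN functions above; reversing the entire register is
// just one more index table.
static inline i128 ExampleReverse128(i128arg value) NLIB_NOEXCEPT {
    NLIB_ALIGNAS(16) static const int8_t idx[16] = {
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    };
    return I128::Shuffle8(value, I128::LoadA16(idx));
}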
3262 
3263 // popcnt of the lane mask (returns nlib_popcnt16(I128::MoveMask8(value)))
3264 NLIB_ALWAYS_INLINE int __vectorcall I128::PopCntMask8(i128arg value) NLIB_NOEXCEPT {
3265 #if defined(NLIB_NEON)
3266 # ifdef __aarch64__
3267  int8x16_t tmp = vnegq_s8(value);
3268  return vaddvq_s8(tmp);
3269 # else
3270  int8x16_t tmp = vnegq_s8(value);
3271  int8x8_t lo = vget_low_s8(tmp);
3272  int8x8_t hi = vget_high_s8(tmp);
3273  lo = vadd_s8(lo, hi);
3274  lo = vpadd_s8(lo, lo);
3275  lo = vpadd_s8(lo, lo);
3276  lo = vpadd_s8(lo, lo);
3277  return vget_lane_s8(lo, 0);
3278 # endif
3279 #else
3280  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
3281 #endif
3282 }
3283 
3284 NLIB_ALWAYS_INLINE int __vectorcall I128::ClzMask8(i128arg value) NLIB_NOEXCEPT {
3285  return nlib_clz(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;
3286 }
3287 
3288 NLIB_ALWAYS_INLINE int __vectorcall I128::CtzMask8(i128arg value) NLIB_NOEXCEPT {
3289  return nlib_ctz(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
3290 }
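// Usage sketch (editor's illustration): PopCntMask8 counts the masked bytes,
// CtzMask8 gives the index of the first masked byte, and ClzMask8 counts the
// unmasked bytes from the top; the 0x10000 guard makes CtzMask8 return 16
// when no byte is masked, which gives a cheap emptiness test.
static inline bool ExampleAnyMasked(i128arg mask) NLIB_NOEXCEPT {
    return I128::CtzMask8(mask) != 16;  // true iff at least one byte is masked
}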
3291 
3292 #ifdef NLIB_NEON
3293 # undef vreinterpretq_s8_s8
3294 # undef NLIB_OP1
3295 # undef NLIB_OP2
3296 # undef NLIB_OP3
3297 # undef NLIB_CMP
3298 # undef NLIB_SFT
3299 # undef NLIB_CMB
3300 #endif
3301 
3302 #endif // NLIB_DOXYGEN
3303 
3304 #undef NLIB_M
3305 #undef NLIB_M2
3306 
3307 #if defined(NLIB_SSE41)
3308 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3309  { \
3310  row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
3311  row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
3312  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
3313  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
3314  __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
3315  __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
3316  __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
3317  __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
3318  row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
3319  row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
3320  row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
3321  row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
3322 }
3323 #elif defined(NLIB_NEON)
3324 # ifdef __aarch64__
3325 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3326  { \
3327  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3328  vreinterpretq_u32_s8(row1)); \
3329  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3330  vreinterpretq_u32_s8(row3)); \
3331  uint64x2_t row0_, row1_, row2_, row3_; \
3332  row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3333  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3334  row0 = vreinterpretq_s8_u64(row0_); \
3335  row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3336  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3337  row1 = vreinterpretq_s8_u64(row1_); \
3338  row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
3339  vreinterpretq_u64_u32(trn_f1_.val[0])); \
3340  row2 = vreinterpretq_s8_u64(row2_); \
3341  row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
3342  vreinterpretq_u64_u32(trn_f1_.val[1])); \
3343  row3 = vreinterpretq_s8_u64(row3_); \
3344  }
3345 # else
3346 #define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
3347  { \
3348  uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
3349  vreinterpretq_u32_s8(row1)); \
3350  uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
3351  vreinterpretq_u32_s8(row3)); \
3352  uint32x4_t row0_, row1_, row2_, row3_; \
3353  uint32x2_t lo, hi; \
3354  lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
3355  row0_ = vcombine_u32(lo, hi); \
3356  row0 = vreinterpretq_s8_u32(row0_); \
3357  lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
3358  row1_ = vcombine_u32(lo, hi); \
3359  row1 = vreinterpretq_s8_u32(row1_); \
3360  lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
3361  row2_ = vcombine_u32(lo, hi); \
3362  row2 = vreinterpretq_s8_u32(row2_); \
3363  lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
3364  row3_ = vcombine_u32(lo, hi); \
3365  row3 = vreinterpretq_s8_u32(row3_); \
3366  }
3367 # endif
3368 #endif
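// Usage sketch (editor's illustration): NLIB_I128_TRANSPOSE32 transposes a
// 4x4 matrix of 32-bit lanes held in four registers, in place. StoreA16 is
// assumed here to be the aligned store matching LoadA16, and m must be
// 16-byte aligned for the aligned load/store pair to be valid.
static inline void ExampleTranspose4x4(uint32_t m[4][4]) NLIB_NOEXCEPT {
    i128 r0 = I128::LoadA16(&m[0][0]);
    i128 r1 = I128::LoadA16(&m[1][0]);
    i128 r2 = I128::LoadA16(&m[2][0]);
    i128 r3 = I128::LoadA16(&m[3][0]);
    NLIB_I128_TRANSPOSE32(r0, r1, r2, r3);
    // r0 now holds { m[0][0], m[1][0], m[2][0], m[3][0] }, and so on
    I128::StoreA16(&m[0][0], r0);  // StoreA16 assumed (see lead-in comment)
    I128::StoreA16(&m[1][0], r1);
    I128::StoreA16(&m[2][0], r2);
    I128::StoreA16(&m[3][0], r3);
}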
3369 
3370 #endif // NLIB_SIMD
3371 
3372 } // namespace simd
3373 NLIB_NAMESPACE_END
3374 
3375 #endif // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_